diff --git a/arangod/Agency/AgencyFeature.cpp b/arangod/Agency/AgencyFeature.cpp index c6b20ce8e9..4c54c29f15 100644 --- a/arangod/Agency/AgencyFeature.cpp +++ b/arangod/Agency/AgencyFeature.cpp @@ -44,7 +44,7 @@ AgencyFeature::AgencyFeature(application_features::ApplicationServer* server) _notify(false), _supervision(false), _waitForSync(true), - _supervisionFrequency(1.0), + _supervisionFrequency(5.0), _compactionStepSize(1000) { setOptional(true); requiresElevatedPrivileges(false); diff --git a/arangod/Agency/CleanOutServer.cpp b/arangod/Agency/CleanOutServer.cpp index e6164fdd39..df8fc1cd38 100644 --- a/arangod/Agency/CleanOutServer.cpp +++ b/arangod/Agency/CleanOutServer.cpp @@ -133,9 +133,11 @@ bool CleanOutServer::start() const { // Check if we can get things done in the first place if (!checkFeasibility()) { finish("DBServers/" + _server); - return false; + return false; } + + // Schedule shard relocations scheduleMoveShards(); diff --git a/arangod/Agency/Supervision.cpp b/arangod/Agency/Supervision.cpp index 7666c8c6b6..a254060acd 100644 --- a/arangod/Agency/Supervision.cpp +++ b/arangod/Agency/Supervision.cpp @@ -224,60 +224,49 @@ bool Supervision::doChecks(bool timedout) { void Supervision::run() { - // We do a try/catch around everything to prevent agency crashes until - // debugging of the Supervision is finished: - // try { + CONDITION_LOCKER(guard, _cv); + TRI_ASSERT(_agent != nullptr); + bool timedout = false; - CONDITION_LOCKER(guard, _cv); - TRI_ASSERT(_agent != nullptr); - bool timedout = false; - - while (!this->isStopping()) { - - // Get agency prefix after cluster init - if (_jobId == 0) { - // We need the agency prefix to work, but it is only initialized by - // some other server in the cluster. Since the supervision does not - // make sense at all without other ArangoDB servers, we wait pretty - // long here before giving up: - if (!updateAgencyPrefix(1000, 1)) { - LOG_TOPIC(ERR, Logger::AGENCY) - << "Cannot get prefix from Agency. Stopping supervision for good."; - break; - } + while (!this->isStopping()) { + + // Get agency prefix after cluster init + if (_jobId == 0) { + // We need the agency prefix to work, but it is only initialized by + // some other server in the cluster. Since the supervision does not + // make sense at all without other ArangoDB servers, we wait pretty + // long here before giving up: + if (!updateAgencyPrefix(1000, 1)) { + LOG_TOPIC(ERR, Logger::AGENCY) + << "Cannot get prefix from Agency. Stopping supervision for good."; + break; } - - // Get bunch of job IDs from agency for future jobs - if (_jobId == 0 || _jobId == _jobIdMax) { - getUniqueIds(); // cannot fail but only hang - } - - // Do nothing unless leader - if (_agent->leading()) { - timedout = _cv.wait(_frequency * 1000000); // quarter second - } else { - _cv.wait(); - } - - // Do supervision - doChecks(timedout); - workJobs(); - - } - /*} - catch (std::exception const& e) { - LOG_TOPIC(ERR, Logger::AGENCY) - << "Supervision thread has caught an exception and is terminated: " - << e.what(); - }*/ + + // Get bunch of job IDs from agency for future jobs + if (_jobId == 0 || _jobId == _jobIdMax) { + getUniqueIds(); // cannot fail but only hang + } + + // Do nothing unless leader + if (_agent->leading()) { + timedout = _cv.wait(_frequency * 1000000); // quarter second + } else { + _cv.wait(); + } + + // Do supervision + doChecks(timedout); + workJobs(); + + } + } void Supervision::workJobs() { + _snapshot = _agent->readDB().get("/"); Node::Children const& todos = _snapshot(toDoPrefix).children(); - Node::Children const& pends = _snapshot(pendingPrefix).children(); - if (!todos.empty()) { for (auto const& todoEnt : todos) { Node const& job = *todoEnt.second; @@ -292,6 +281,7 @@ void Supervision::workJobs() { } } + Node::Children const& pends = _snapshot(pendingPrefix).children(); if (!pends.empty()) { for (auto const& pendEnt : pends) { Node const& job = *pendEnt.second; @@ -305,7 +295,6 @@ void Supervision::workJobs() { } } } - } // Start thread diff --git a/js/actions/api-cluster.js b/js/actions/api-cluster.js index 752b1cb6d4..4bd5ce7d5a 100644 --- a/js/actions/api-cluster.js +++ b/js/actions/api-cluster.js @@ -196,7 +196,7 @@ actions.defineHttp({ var DBserver = req.parameters.DBserver; var coord = { coordTransactionID: ArangoClusterInfo.uniqid() }; var options = { coordTransactionID: coord.coordTransactionID, timeout:10 }; - var op = ArangoClusterComm.asyncRequest("GET","tcp://"+local,"_system", + var op = ArangoClusterComm.asyncRequest("GET","server:"+local,"_system", "/_admin/statistics","",{},options); var r = ArangoClusterComm.wait(op); res.contentType = "application/json; charset=utf-8"; diff --git a/scripts/startLocalCluster.sh b/scripts/startLocalCluster.sh index 8ce6c5ed32..fe90ada8b2 100755 --- a/scripts/startLocalCluster.sh +++ b/scripts/startLocalCluster.sh @@ -47,7 +47,7 @@ build/bin/arangod \ --agency.size 1 \ --agency.wait-for-sync false \ --agency.supervision true \ - --agency.supervision-frequency 5 \ + --agency.supervision-frequency 1 \ --database.directory cluster/data4001 \ --javascript.app-path ./js/apps \ --javascript.startup-directory ./js \