From 00d6111a3ebf95db0ad49c80fba87c6d7f7adfe6 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Fri, 3 Jun 2016 14:27:04 +0200 Subject: [PATCH] server health for aardvark --- arangod/Agency/CleanOutServer.cpp | 16 +++++---- arangod/Agency/Supervision.cpp | 59 ++++++++++++++++++------------- arangod/Agency/Supervision.h | 3 ++ js/actions/api-cluster.js | 2 +- 4 files changed, 49 insertions(+), 31 deletions(-) diff --git a/arangod/Agency/CleanOutServer.cpp b/arangod/Agency/CleanOutServer.cpp index df8fc1cd38..35c5dc1102 100644 --- a/arangod/Agency/CleanOutServer.cpp +++ b/arangod/Agency/CleanOutServer.cpp @@ -129,17 +129,21 @@ bool CleanOutServer::start() const { if (res.accepted && res.indices.size()==1 && res.indices[0]) { LOG_TOPIC(INFO, Logger::AGENCY) << "Pending: Clean out server " + _server; + LOG(WARN) << __FILE__<<__LINE__ ; // Check if we can get things done in the first place if (!checkFeasibility()) { - finish("DBServers/" + _server); - return false; + finish("DBServers/" + _server, false); + LOG(WARN) << __FILE__<<__LINE__ ; + return false; } - + LOG(WARN) << __FILE__<<__LINE__ ; + // Schedule shard relocations scheduleMoveShards(); + LOG(WARN) << __FILE__<<__LINE__ ; return true; @@ -244,9 +248,9 @@ bool CleanOutServer::checkFeasibility () const { } LOG_TOPIC(ERR, Logger::AGENCY) - << "Cannot accomodate all shards " << collections.str() - << " with replication factors " << factors.str() - << " after cleaning out server " << _server; + << "Cannot accomodate shards " << collections.str() + << "with replication factors " << factors.str() + << "after cleaning out server " << _server; return false; } diff --git a/arangod/Agency/Supervision.cpp b/arangod/Agency/Supervision.cpp index 0796191634..6f100bfde9 100644 --- a/arangod/Agency/Supervision.cpp +++ b/arangod/Agency/Supervision.cpp @@ -206,22 +206,23 @@ std::vector Supervision::checkCoordinators() { return ret; } - -bool Supervision::doChecks(bool timedout) { +bool Supervision::updateSnapshot() { if (_agent == nullptr || this->isStopping()) { return false; } - _snapshot = _agent->readDB().get(_agencyPrefix); - - LOG_TOPIC(DEBUG, Logger::AGENCY) << "Sanity checks"; - /*std::vector ret = */checkDBServers(); - checkCoordinators(); - return true; } +bool Supervision::doChecks(bool timedout) { + + checkDBServers(); + checkCoordinators(); + return true; + +} + void Supervision::run() { CONDITION_LOCKER(guard, _cv); @@ -256,6 +257,7 @@ void Supervision::run() { } // Do supervision + updateSnapshot(); doChecks(timedout); workJobs(); @@ -267,36 +269,45 @@ void Supervision::workJobs() { Node::Children const& todos = _snapshot(toDoPrefix).children(); Node::Children const& pends = _snapshot(pendingPrefix).children(); + if (!todos.empty()) { for (auto const& todoEnt : todos) { Node const& job = *todoEnt.second; - LOG(WARN) << __FILE__<<__LINE__ << job.toJson(); - std::string jobType = job("type").getString(), - jobId = job("jobId").getString(), - creator = job("creator").getString(); - if (jobType == "failedServer") { - FailedServer fs(_snapshot, _agent, jobId, creator, _agencyPrefix); - } else if (jobType == "cleanOutServer") { - CleanOutServer cos(_snapshot, _agent, jobId, creator, _agencyPrefix); + try { + std::string jobType = job("type").getString(), + jobId = job("jobId").getString(), + creator = job("creator").getString(); + if (jobType == "failedServer") { + FailedServer fs(_snapshot, _agent, jobId, creator, _agencyPrefix); + } else if (jobType == "cleanOutServer") { + CleanOutServer cos(_snapshot, _agent, jobId, creator, _agencyPrefix); + } + } catch (std::exception const& e) { + LOG_TOPIC(ERR, Logger::AGENCY) << e.what() << " " << __FILE__ << __LINE__; } } } + if (!pends.empty()) { for (auto const& pendEnt : pends) { Node const& job = *pendEnt.second; - LOG(WARN) << __FILE__<<__LINE__ << job.toJson(); - std::string jobType = job("type").getString(), - jobId = job("jobId").getString(), - creator = job("creator").getString(); - if (jobType == "failedServer") { - FailedServer fs(_snapshot, _agent, jobId, creator, _agencyPrefix); - } else if (jobType == "cleanOutServer") { - CleanOutServer cos(_snapshot, _agent, jobId, creator, _agencyPrefix); + try { + std::string jobType = job("type").getString(), + jobId = job("jobId").getString(), + creator = job("creator").getString(); + if (jobType == "failedServer") { + FailedServer fs(_snapshot, _agent, jobId, creator, _agencyPrefix); + } else if (jobType == "cleanOutServer") { + CleanOutServer cos(_snapshot, _agent, jobId, creator, _agencyPrefix); + } + } catch (std::exception const& e) { + LOG_TOPIC(ERR, Logger::AGENCY) << e.what() << " " << __FILE__ << __LINE__; } } } + } // Start thread diff --git a/arangod/Agency/Supervision.h b/arangod/Agency/Supervision.h index a5e8d27d2c..6511fede4f 100644 --- a/arangod/Agency/Supervision.h +++ b/arangod/Agency/Supervision.h @@ -141,6 +141,9 @@ class Supervision : public arangodb::Thread { /// @brief Perform sanity checking bool doChecks(bool); + /// @brief update my local agency snapshot + bool updateSnapshot(); + Agent* _agent; /**< @brief My agent */ Node _snapshot; diff --git a/js/actions/api-cluster.js b/js/actions/api-cluster.js index 4bd5ce7d5a..58005060c4 100644 --- a/js/actions/api-cluster.js +++ b/js/actions/api-cluster.js @@ -196,7 +196,7 @@ actions.defineHttp({ var DBserver = req.parameters.DBserver; var coord = { coordTransactionID: ArangoClusterInfo.uniqid() }; var options = { coordTransactionID: coord.coordTransactionID, timeout:10 }; - var op = ArangoClusterComm.asyncRequest("GET","server:"+local,"_system", + var op = ArangoClusterComm.asyncRequest("GET","server:"+DBserver,"_system", "/_admin/statistics","",{},options); var r = ArangoClusterComm.wait(op); res.contentType = "application/json; charset=utf-8";