mirror of https://gitee.com/bigwinds/arangodb
Better shutdown handling
This commit is contained in:
parent
a8780ff4bb
commit
47a0f8602a
|
@ -408,7 +408,7 @@ void Constituent::beginShutdown() {
|
|||
/// Start operation
|
||||
bool Constituent::start(TRI_vocbase_t* vocbase,
|
||||
aql::QueryRegistry* queryRegistry) {
|
||||
|
||||
TRI_ASSERT(vocbase != nullptr);
|
||||
_vocbase = vocbase;
|
||||
_queryRegistry = queryRegistry;
|
||||
|
||||
|
@ -420,7 +420,7 @@ bool Constituent::start(TRI_vocbase_t* vocbase,
|
|||
/// Get persisted information and run election process
|
||||
void Constituent::run() {
|
||||
|
||||
LOG(WARN) << "Starting constituent";
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Starting Constituent";
|
||||
_id = _agent->config().id();
|
||||
|
||||
TRI_ASSERT(_vocbase != nullptr);
|
||||
|
|
|
@ -132,7 +132,7 @@ std::vector<check_t> Supervision::checkDBServers() {
|
|||
report->add("LastHeartbeatAcked",
|
||||
VPackValue(
|
||||
timepointToString(std::chrono::system_clock::now())));
|
||||
report->add("Status", VPackValue("GOOD"));
|
||||
report->add("Status", VPackValue(Supervision::HEALTH_STATUS_GOOD));
|
||||
} else {
|
||||
std::chrono::seconds t{0};
|
||||
t = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
|
@ -254,17 +254,17 @@ std::vector<check_t> Supervision::checkCoordinators() {
|
|||
report->add("LastHeartbeatAcked",
|
||||
VPackValue(
|
||||
timepointToString(std::chrono::system_clock::now())));
|
||||
report->add("Status", VPackValue("GOOD"));
|
||||
report->add("Status", VPackValue(Supervision::HEALTH_STATUS_GOOD));
|
||||
} else {
|
||||
std::chrono::seconds t{0};
|
||||
t = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
std::chrono::system_clock::now()-stringToTimepoint(lastHeartbeatAcked));
|
||||
if (t.count() > _gracePeriod) { // Failure
|
||||
if (lastStatus == "BAD") {
|
||||
report->add("Status", VPackValue("FAILED"));
|
||||
if (lastStatus == Supervision::HEALTH_STATUS_BAD) {
|
||||
report->add("Status", VPackValue(Supervision::HEALTH_STATUS_FAILED));
|
||||
}
|
||||
} else {
|
||||
report->add("Status", VPackValue("BAD"));
|
||||
report->add("Status", VPackValue(Supervision::HEALTH_STATUS_BAD));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -346,6 +346,11 @@ void Supervision::run() {
|
|||
|
||||
while (!this->isStopping()) {
|
||||
updateSnapshot();
|
||||
// mop: always do health checks so shutdown is able to detect if a server failed otherwise
|
||||
if (_agent->leading()) {
|
||||
doChecks();
|
||||
}
|
||||
|
||||
if (isShuttingDown()) {
|
||||
handleShutdown();
|
||||
} else if (_agent->leading()) {
|
||||
|
@ -365,16 +370,32 @@ bool Supervision::isShuttingDown() {
|
|||
}
|
||||
}
|
||||
|
||||
bool Supervision::serverGood(const std::string& serverName) {
|
||||
try {
|
||||
const std::string status = _snapshot(healthPrefix + "/" + serverName + "/Status").getString();
|
||||
return status == Supervision::HEALTH_STATUS_GOOD;
|
||||
} catch (...) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void Supervision::handleShutdown() {
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Initiating shutdown";
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Waiting for clients to shut down";
|
||||
Node::Children const& serversRegistered = _snapshot(currentServersRegisteredPrefix).children();
|
||||
bool serversCleared = true;
|
||||
for (auto const& server : serversRegistered) {
|
||||
if (server.first == "Version") {
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY)
|
||||
<< "Waiting for " << server.first << " to shutdown";
|
||||
|
||||
if (!serverGood(server.first)) {
|
||||
LOG_TOPIC(WARN, Logger::AGENCY)
|
||||
<< "Server " << server.first << " did not shutdown properly it seems!";
|
||||
continue;
|
||||
}
|
||||
serversCleared = false;
|
||||
}
|
||||
|
||||
|
@ -390,7 +411,6 @@ bool Supervision::handleJobs() {
|
|||
}
|
||||
|
||||
// Do supervision
|
||||
doChecks();
|
||||
shrinkCluster();
|
||||
workJobs();
|
||||
|
||||
|
@ -398,7 +418,6 @@ bool Supervision::handleJobs() {
|
|||
}
|
||||
|
||||
void Supervision::workJobs() {
|
||||
|
||||
Node::Children const& todos = _snapshot(toDoPrefix).children();
|
||||
Node::Children const& pends = _snapshot(pendingPrefix).children();
|
||||
|
||||
|
|
|
@ -108,6 +108,9 @@ class Supervision : public arangodb::Thread {
|
|||
void wakeUp();
|
||||
|
||||
private:
|
||||
/// @brief Values stored in the "Status" field of a server health report
/// (added to the report by the checks and compared against in serverGood)
static constexpr const char* HEALTH_STATUS_GOOD = "GOOD";
|
||||
static constexpr const char* HEALTH_STATUS_BAD = "BAD";
|
||||
static constexpr const char* HEALTH_STATUS_FAILED = "FAILED";
|
||||
|
||||
/// @brief Update agency prefix from agency itself
|
||||
bool updateAgencyPrefix (size_t nTries = 10, int intervalSec = 1);
|
||||
|
@ -165,6 +168,8 @@ class Supervision : public arangodb::Thread {
|
|||
uint64_t _jobId;
|
||||
uint64_t _jobIdMax;
|
||||
|
||||
/// @brief Whether the named server's "Status" entry under the health prefix
/// of the snapshot equals HEALTH_STATUS_GOOD; false if it cannot be read
bool serverGood(const std::string&);
|
||||
|
||||
static std::string _agencyPrefix;
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue