mirror of https://gitee.com/bigwinds/arangodb
Bug fix 3.3: Fix supervisor thread crash (#4165)
* Port the supervisor thread death fix from the devel branch to 3.3
This commit is contained in:
parent
ef8fcd101c
commit
392ddde251
|
@ -405,8 +405,13 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
|||
// Do actual monitoring
|
||||
for (auto const& machine : machinesPlanned) {
|
||||
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
|
||||
lastStatus, serverID(machine.first),
|
||||
shortName(_snapshot(targetShortID + serverID + "/ShortName").getString());
|
||||
lastStatus, serverID(machine.first), shortName;
|
||||
|
||||
// short name arrives asynchronously to machine registration, make sure
|
||||
// it has arrived before trying to use it
|
||||
if (LEAF == _snapshot(targetShortID + serverID + "/ShortName").type()) {
|
||||
|
||||
shortName = _snapshot(targetShortID + serverID + "/ShortName").getString();
|
||||
|
||||
// Endpoint
|
||||
std::string endpoint;
|
||||
|
@ -518,8 +523,12 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
|||
transient(_agent, *tReport);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
|
||||
"Short name for << " << serverID << " not yet available. Skipping health check.";
|
||||
} // else
|
||||
|
||||
}
|
||||
} // for
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -621,9 +630,17 @@ void Supervision::run() {
|
|||
// Do nothing unless leader for over 10 seconds
|
||||
auto secondsSinceLeader = std::chrono::duration<double>(
|
||||
std::chrono::steady_clock::now() - _agent->leaderSince()).count();
|
||||
|
||||
// 10 seconds should be plenty of time for all servers to send
|
||||
// heartbeat status to new leader (heartbeat is once per second)
|
||||
if (secondsSinceLeader > 10.0) {
|
||||
try {
|
||||
doChecks();
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION) << e.what() << " " << __FILE__ << " " << __LINE__;
|
||||
} catch (...) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
||||
"Supervision::doChecks() generated an uncaught exception.";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -989,7 +1006,7 @@ void Supervision::getUniqueIds() {
|
|||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
||||
<< "Failed to acquire job IDs from agency: "
|
||||
<< e.what() << __FILE__ << __LINE__;
|
||||
<< e.what() << __FILE__ << " " << __LINE__;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue