mirror of https://gitee.com/bigwinds/arangodb
Bug fix 3.3: Fix supervisor thread crash (#4165)
* Port the supervisor thread death fix from the devel branch to 3.3
This commit is contained in:
parent
ef8fcd101c
commit
392ddde251
|
@ -405,8 +405,13 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
||||||
// Do actual monitoring
|
// Do actual monitoring
|
||||||
for (auto const& machine : machinesPlanned) {
|
for (auto const& machine : machinesPlanned) {
|
||||||
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
|
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
|
||||||
lastStatus, serverID(machine.first),
|
lastStatus, serverID(machine.first), shortName;
|
||||||
shortName(_snapshot(targetShortID + serverID + "/ShortName").getString());
|
|
||||||
|
// short name arrives asynchronously to machine registration, make sure
|
||||||
|
// it has arrived before trying to use it
|
||||||
|
if (LEAF == _snapshot(targetShortID + serverID + "/ShortName").type()) {
|
||||||
|
|
||||||
|
shortName = _snapshot(targetShortID + serverID + "/ShortName").getString();
|
||||||
|
|
||||||
// Endpoint
|
// Endpoint
|
||||||
std::string endpoint;
|
std::string endpoint;
|
||||||
|
@ -518,8 +523,12 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
||||||
transient(_agent, *tReport);
|
transient(_agent, *tReport);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
|
||||||
|
"Short name for << " << serverID << " not yet available. Skipping health check.";
|
||||||
|
} // else
|
||||||
|
|
||||||
}
|
} // for
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -621,9 +630,17 @@ void Supervision::run() {
|
||||||
// Do nothing unless leader for over 10 seconds
|
// Do nothing unless leader for over 10 seconds
|
||||||
auto secondsSinceLeader = std::chrono::duration<double>(
|
auto secondsSinceLeader = std::chrono::duration<double>(
|
||||||
std::chrono::steady_clock::now() - _agent->leaderSince()).count();
|
std::chrono::steady_clock::now() - _agent->leaderSince()).count();
|
||||||
|
// 10 seconds should be plenty of time for all servers to send
|
||||||
|
// heartbeat status to new leader (heartbeat is once per second)
|
||||||
if (secondsSinceLeader > 10.0) {
|
if (secondsSinceLeader > 10.0) {
|
||||||
|
try {
|
||||||
doChecks();
|
doChecks();
|
||||||
|
} catch (std::exception const& e) {
|
||||||
|
LOG_TOPIC(ERR, Logger::SUPERVISION) << e.what() << " " << __FILE__ << " " << __LINE__;
|
||||||
|
} catch (...) {
|
||||||
|
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
||||||
|
"Supervision::doChecks() generated an uncaught exception.";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -989,7 +1006,7 @@ void Supervision::getUniqueIds() {
|
||||||
} catch (std::exception const& e) {
|
} catch (std::exception const& e) {
|
||||||
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
||||||
<< "Failed to acquire job IDs from agency: "
|
<< "Failed to acquire job IDs from agency: "
|
||||||
<< e.what() << __FILE__ << __LINE__;
|
<< e.what() << __FILE__ << " " << __LINE__;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue