mirror of https://gitee.com/bigwinds/arangodb
correct race condition leading to infinite job execution (#5201)
* Fix infinite loop by setting _lastSyncTime within runBackgroundJob(). Add code to make the agency callback ignore the _lastSyncTime limit. * Create change notes for this PR and the previous PR 5114.
This commit is contained in:
parent
c4b0ccb56f
commit
a67df088b0
|
@ -1,6 +1,9 @@
|
|||
devel
|
||||
-----
|
||||
* pull request 5201: eliminate race scenario where handlePlanChange could run an infinite number of times
|
||||
after an execution exceeded the 7.4 second time span
|
||||
|
||||
* pull request 5114: detect shutdown more quickly on heartbeat thread of coordinator and dbserver
|
||||
|
||||
* fixed issue #3811: gharial api is now checking existence of _from and _to vertices
|
||||
during edge creation
|
||||
|
|
|
@ -172,7 +172,8 @@ void HeartbeatThread::runBackgroundJob() {
|
|||
_launchAnotherBackgroundJob = false;
|
||||
|
||||
// the JobGuard is in the operator() of HeartbeatBackgroundJob
|
||||
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), TRI_microtime()));
|
||||
_lastSyncTime = TRI_microtime();
|
||||
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), _lastSyncTime));
|
||||
} else {
|
||||
_backgroundJobScheduledOrRunning = false;
|
||||
_launchAnotherBackgroundJob = false;
|
||||
|
@ -265,7 +266,7 @@ void HeartbeatThread::runDBServer() {
|
|||
}
|
||||
|
||||
if (doSync) {
|
||||
syncDBServerStatusQuo();
|
||||
syncDBServerStatusQuo(true);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -289,6 +290,13 @@ void HeartbeatThread::runDBServer() {
|
|||
int const currentCountStart = 1; // set to 1 by Max to speed up discovery
|
||||
int currentCount = currentCountStart;
|
||||
|
||||
// Loop priorities / goals
|
||||
// 0. send state to agency server
|
||||
// 1. schedule handlePlanChange immediately when agency callback occurs
|
||||
// 2. poll for plan change, schedule handlePlanChange immediately if change detected
|
||||
// 3. force handlePlanChange every 7.4 seconds just in case
|
||||
// (7.4 seconds is just less than half the 15 seconds agency uses to declare dead server)
|
||||
// 4. if handlePlanChange runs long (greater than 7.4 seconds), have another start immediately after
|
||||
|
||||
while (!isStopping()) {
|
||||
logThreadDeaths();
|
||||
|
@ -1140,7 +1148,7 @@ bool HeartbeatThread::handlePlanChangeCoordinator(uint64_t currentPlanVersion) {
|
|||
/// and every few heartbeats if the Current/Version has changed.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void HeartbeatThread::syncDBServerStatusQuo() {
|
||||
void HeartbeatThread::syncDBServerStatusQuo(bool asyncPush) {
|
||||
bool shouldUpdate = false;
|
||||
bool becauseOfPlan = false;
|
||||
bool becauseOfCurrent = false;
|
||||
|
@ -1162,8 +1170,10 @@ void HeartbeatThread::syncDBServerStatusQuo() {
|
|||
becauseOfCurrent = true;
|
||||
}
|
||||
|
||||
// 7.4 seconds is just less than half the 15 seconds agency uses to declare dead server,
|
||||
// perform a safety execution of the job in case other plan changes were somehow incomplete or undetected
|
||||
double now = TRI_microtime();
|
||||
if (now > _lastSyncTime + 7.4) {
|
||||
if (now > _lastSyncTime + 7.4 || asyncPush) {
|
||||
shouldUpdate = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -169,7 +169,7 @@ class HeartbeatThread : public CriticalThread,
|
|||
/// @brief bring the db server in sync with the desired state
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void syncDBServerStatusQuo();
|
||||
void syncDBServerStatusQuo(bool asyncPush = false);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief update the local agent pool from the slice
|
||||
|
|
Loading…
Reference in New Issue