1
0
Fork 0

correct race condition leading to infinite job execution (#5201)

* fix infinite loop by setting _lastSyncTime within runBackgroundJob(). Add code to make the agency callback ignore the _lastSyncTime limit.
* create change notes for this PR and previous PR 5114
This commit is contained in:
Matthew Von-Maszewski 2018-04-27 07:43:22 -04:00 committed by Max Neunhöffer
parent c4b0ccb56f
commit a67df088b0
3 changed files with 24 additions and 11 deletions

View File

@ -1,6 +1,9 @@
devel
-----
* pull request 5201: eliminate race scenario where handlePlanChange could run an infinite number of times
after an execution exceeded the 7.4 second time span
* pull request 5114: detect shutdown more quickly on heartbeat thread of coordinator and dbserver
* fixed issue #3811: gharial api is now checking existence of _from and _to vertices
during edge creation
@ -218,7 +221,7 @@ v3.3.7 (2018-04-11)
query = "FOR doc IN mycollection FILTER doc.value > 42 RETURN doc";
require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query);
Entitled users can send the generated file to the ArangoDB support to facilitate
Entitled users can send the generated file to the ArangoDB support to facilitate
reproduction and debugging.
* added hidden option `--server.ask-jwt-secret`. This is an internal option
@ -250,7 +253,7 @@ v3.3.5 (2018-03-28)
can be applied for queries that access a collection only once in the query, and that
do not use traversals, shortest path queries and that do not access collection data
dynamically using the `DOCUMENT`, `FULLTEXT`, `NEAR` or `WITHIN` AQL functions.
Additionally, the optimizer will only pull off this optimization if it can safely
Additionally, the optimizer will only pull off this optimization if it can safely
determine the values of all the collection's shard keys from the query, and when the
shard keys are covered by a single index (this is always true if the shard key is
the default `_key`)
@ -271,8 +274,8 @@ v3.3.5 (2018-03-28)
without grouping being executed on the DB servers and the coordinator only summing
up the counts from the individual shards
* fixed issue #4900: Nested FOR query uses index but ignores other filters
* fixed issue #4900: Nested FOR query uses index but ignores other filters
* properly exit v8::Context in one place where it was missing before
* added hidden option `--cluster.index-create-timeout` for controlling the
@ -281,7 +284,7 @@ v3.3.5 (2018-03-28)
* increase default timeout for index creation in cluster to 3600s
* fixed issue #4843: Query-Result has more Docs than the Collection itself
* fixed issue #4843: Query-Result has more Docs than the Collection itself
* fixed the behavior of ClusterInfo when waiting for current to catch
up with plan in create collection.

View File

@ -172,7 +172,8 @@ void HeartbeatThread::runBackgroundJob() {
_launchAnotherBackgroundJob = false;
// the JobGuard is in the operator() of HeartbeatBackgroundJob
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), TRI_microtime()));
_lastSyncTime = TRI_microtime();
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), _lastSyncTime));
} else {
_backgroundJobScheduledOrRunning = false;
_launchAnotherBackgroundJob = false;
@ -265,7 +266,7 @@ void HeartbeatThread::runDBServer() {
}
if (doSync) {
syncDBServerStatusQuo();
syncDBServerStatusQuo(true);
}
return true;
@ -289,6 +290,13 @@ void HeartbeatThread::runDBServer() {
int const currentCountStart = 1; // set to 1 by Max to speed up discovery
int currentCount = currentCountStart;
// Loop priorities / goals
// 0. send state to agency server
// 1. schedule handlePlanChange immediately when agency callback occurs
// 2. poll for plan change, schedule handlePlanChange immediately if change detected
// 3. force handlePlanChange every 7.4 seconds just in case
// (7.4 seconds is just less than half the 15 seconds agency uses to declare dead server)
// 4. if handlePlanChange runs long (greater than 7.4 seconds), have another start immediately after
while (!isStopping()) {
logThreadDeaths();
@ -612,7 +620,7 @@ void HeartbeatThread::runSingleServer() {
// wait for everything to calm down for good measure
std::this_thread::sleep_for(std::chrono::seconds(10));
}
TRI_voc_tick_t lastTick = 0; // we always want to set lastTick
auto sendTransient = [&]() {
VPackBuilder builder;
@ -1140,7 +1148,7 @@ bool HeartbeatThread::handlePlanChangeCoordinator(uint64_t currentPlanVersion) {
/// and every few heartbeats if the Current/Version has changed.
////////////////////////////////////////////////////////////////////////////////
void HeartbeatThread::syncDBServerStatusQuo() {
void HeartbeatThread::syncDBServerStatusQuo(bool asyncPush) {
bool shouldUpdate = false;
bool becauseOfPlan = false;
bool becauseOfCurrent = false;
@ -1162,8 +1170,10 @@ void HeartbeatThread::syncDBServerStatusQuo() {
becauseOfCurrent = true;
}
// 7.4 seconds is just less than half the 15 seconds the agency uses to declare a dead server,
// perform a safety execution of the job in case other plan changes are somehow incomplete or undetected
double now = TRI_microtime();
if (now > _lastSyncTime + 7.4) {
if (now > _lastSyncTime + 7.4 || asyncPush) {
shouldUpdate = true;
}

View File

@ -169,7 +169,7 @@ class HeartbeatThread : public CriticalThread,
/// @brief bring the db server in sync with the desired state
//////////////////////////////////////////////////////////////////////////////
void syncDBServerStatusQuo();
void syncDBServerStatusQuo(bool asyncPush = false);
//////////////////////////////////////////////////////////////////////////////
/// @brief update the local agent pool from the slice