mirror of https://gitee.com/bigwinds/arangodb
correct race condition leading to infinite job execution (#5201)
* Fix the infinite loop by setting _lastSyncTime within runBackgroundJob(). Add code to make the agency callback ignore the _lastSyncTime limit. * Create change notes for this PR and the previous PR 5114.
This commit is contained in:
parent
c4b0ccb56f
commit
a67df088b0
13
CHANGELOG
13
CHANGELOG
|
@ -1,6 +1,9 @@
|
||||||
devel
|
devel
|
||||||
-----
|
-----
|
||||||
|
* pull request 5201: eliminate race scenario where handlePlanChange could run indefinitely
|
||||||
|
after an execution exceeded the 7.4 second time span
|
||||||
|
|
||||||
|
* pull request 5114: detect shutdown more quickly on heartbeat thread of coordinator and dbserver
|
||||||
|
|
||||||
* fixed issue #3811: gharial api is now checking existence of _from and _to vertices
|
* fixed issue #3811: gharial api is now checking existence of _from and _to vertices
|
||||||
during edge creation
|
during edge creation
|
||||||
|
@ -218,7 +221,7 @@ v3.3.7 (2018-04-11)
|
||||||
query = "FOR doc IN mycollection FILTER doc.value > 42 RETURN doc";
|
query = "FOR doc IN mycollection FILTER doc.value > 42 RETURN doc";
|
||||||
require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query);
|
require("@arangodb/aql/explainer").debugDump("/tmp/query-debug-info", query);
|
||||||
|
|
||||||
Entitled users can send the generated file to the ArangoDB support to facilitate
|
Entitled users can send the generated file to the ArangoDB support to facilitate
|
||||||
reproduction and debugging.
|
reproduction and debugging.
|
||||||
|
|
||||||
* added hidden option `--server.ask-jwt-secret`. This is an internal option
|
* added hidden option `--server.ask-jwt-secret`. This is an internal option
|
||||||
|
@ -250,7 +253,7 @@ v3.3.5 (2018-03-28)
|
||||||
can be applied for queries that access a collection only once in the query, and that
|
can be applied for queries that access a collection only once in the query, and that
|
||||||
do not use traversals, shortest path queries and that do not access collection data
|
do not use traversals, shortest path queries and that do not access collection data
|
||||||
dynamically using the `DOCUMENT`, `FULLTEXT`, `NEAR` or `WITHIN` AQL functions.
|
dynamically using the `DOCUMENT`, `FULLTEXT`, `NEAR` or `WITHIN` AQL functions.
|
||||||
Additionally, the optimizer will only pull off this optimization if it can safely
|
Additionally, the optimizer will only pull off this optimization if it can safely
|
||||||
determine the values of all the collection's shard keys from the query, and when the
|
determine the values of all the collection's shard keys from the query, and when the
|
||||||
shard keys are covered by a single index (this is always true if the shard key is
|
shard keys are covered by a single index (this is always true if the shard key is
|
||||||
the default `_key`)
|
the default `_key`)
|
||||||
|
@ -271,8 +274,8 @@ v3.3.5 (2018-03-28)
|
||||||
without grouping being executed on the DB servers and the coordinator only summing
|
without grouping being executed on the DB servers and the coordinator only summing
|
||||||
up the counts from the individual shards
|
up the counts from the individual shards
|
||||||
|
|
||||||
* fixed issue #4900: Nested FOR query uses index but ignores other filters
|
* fixed issue #4900: Nested FOR query uses index but ignores other filters
|
||||||
|
|
||||||
* properly exit v8::Context in one place where it was missing before
|
* properly exit v8::Context in one place where it was missing before
|
||||||
|
|
||||||
* added hidden option `--cluster.index-create-timeout` for controlling the
|
* added hidden option `--cluster.index-create-timeout` for controlling the
|
||||||
|
@ -281,7 +284,7 @@ v3.3.5 (2018-03-28)
|
||||||
|
|
||||||
* increase default timeout for index creation in cluster to 3600s
|
* increase default timeout for index creation in cluster to 3600s
|
||||||
|
|
||||||
* fixed issue #4843: Query-Result has more Docs than the Collection itself
|
* fixed issue #4843: Query-Result has more Docs than the Collection itself
|
||||||
|
|
||||||
* fixed the behavior of ClusterInfo when waiting for current to catch
|
* fixed the behavior of ClusterInfo when waiting for current to catch
|
||||||
up with plan in create collection.
|
up with plan in create collection.
|
||||||
|
|
|
@ -172,7 +172,8 @@ void HeartbeatThread::runBackgroundJob() {
|
||||||
_launchAnotherBackgroundJob = false;
|
_launchAnotherBackgroundJob = false;
|
||||||
|
|
||||||
// the JobGuard is in the operator() of HeartbeatBackgroundJob
|
// the JobGuard is in the operator() of HeartbeatBackgroundJob
|
||||||
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), TRI_microtime()));
|
_lastSyncTime = TRI_microtime();
|
||||||
|
SchedulerFeature::SCHEDULER->post(HeartbeatBackgroundJob(shared_from_this(), _lastSyncTime));
|
||||||
} else {
|
} else {
|
||||||
_backgroundJobScheduledOrRunning = false;
|
_backgroundJobScheduledOrRunning = false;
|
||||||
_launchAnotherBackgroundJob = false;
|
_launchAnotherBackgroundJob = false;
|
||||||
|
@ -265,7 +266,7 @@ void HeartbeatThread::runDBServer() {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (doSync) {
|
if (doSync) {
|
||||||
syncDBServerStatusQuo();
|
syncDBServerStatusQuo(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -289,6 +290,13 @@ void HeartbeatThread::runDBServer() {
|
||||||
int const currentCountStart = 1; // set to 1 by Max to speed up discovery
|
int const currentCountStart = 1; // set to 1 by Max to speed up discovery
|
||||||
int currentCount = currentCountStart;
|
int currentCount = currentCountStart;
|
||||||
|
|
||||||
|
// Loop priorities / goals
|
||||||
|
// 0. send state to agency server
|
||||||
|
// 1. schedule handlePlanChange immediately when agency callback occurs
|
||||||
|
// 2. poll for plan change, schedule handlePlanChange immediately if change detected
|
||||||
|
// 3. force handlePlanChange every 7.4 seconds just in case
|
||||||
|
// (7.4 seconds is just less than half the 15 seconds the agency uses to declare a server dead)
|
||||||
|
// 4. if handlePlanChange runs long (greater than 7.4 seconds), have another start immediately after
|
||||||
|
|
||||||
while (!isStopping()) {
|
while (!isStopping()) {
|
||||||
logThreadDeaths();
|
logThreadDeaths();
|
||||||
|
@ -612,7 +620,7 @@ void HeartbeatThread::runSingleServer() {
|
||||||
// wait for everything to calm down for good measure
|
// wait for everything to calm down for good measure
|
||||||
std::this_thread::sleep_for(std::chrono::seconds(10));
|
std::this_thread::sleep_for(std::chrono::seconds(10));
|
||||||
}
|
}
|
||||||
|
|
||||||
TRI_voc_tick_t lastTick = 0; // we always want to set lastTick
|
TRI_voc_tick_t lastTick = 0; // we always want to set lastTick
|
||||||
auto sendTransient = [&]() {
|
auto sendTransient = [&]() {
|
||||||
VPackBuilder builder;
|
VPackBuilder builder;
|
||||||
|
@ -1140,7 +1148,7 @@ bool HeartbeatThread::handlePlanChangeCoordinator(uint64_t currentPlanVersion) {
|
||||||
/// and every few heartbeats if the Current/Version has changed.
|
/// and every few heartbeats if the Current/Version has changed.
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void HeartbeatThread::syncDBServerStatusQuo() {
|
void HeartbeatThread::syncDBServerStatusQuo(bool asyncPush) {
|
||||||
bool shouldUpdate = false;
|
bool shouldUpdate = false;
|
||||||
bool becauseOfPlan = false;
|
bool becauseOfPlan = false;
|
||||||
bool becauseOfCurrent = false;
|
bool becauseOfCurrent = false;
|
||||||
|
@ -1162,8 +1170,10 @@ void HeartbeatThread::syncDBServerStatusQuo() {
|
||||||
becauseOfCurrent = true;
|
becauseOfCurrent = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 7.4 seconds is just less than half the 15 seconds the agency uses to declare a server dead;
|
||||||
|
// perform a safety execution of the job in case other plan changes were somehow incomplete or undetected
|
||||||
double now = TRI_microtime();
|
double now = TRI_microtime();
|
||||||
if (now > _lastSyncTime + 7.4) {
|
if (now > _lastSyncTime + 7.4 || asyncPush) {
|
||||||
shouldUpdate = true;
|
shouldUpdate = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -169,7 +169,7 @@ class HeartbeatThread : public CriticalThread,
|
||||||
/// @brief bring the db server in sync with the desired state
|
/// @brief bring the db server in sync with the desired state
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void syncDBServerStatusQuo();
|
void syncDBServerStatusQuo(bool asyncPush = false);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief update the local agent pool from the slice
|
/// @brief update the local agent pool from the slice
|
||||||
|
|
Loading…
Reference in New Issue