
Fix move leader shard. (#7445)

* Ungreylist move shard test.
* Move leader shard: wait until all but the old leader are in sync.
* Increase moveShard timeout to 10000 seconds.
* Add CHANGELOG entry.
* Fix compilation.
* Fix a misleading comment.
Max Neunhöffer 2018-11-26 15:04:04 +01:00 committed by GitHub
parent 346a0a15a4
commit d72e51ed8f
4 changed files with 29 additions and 10 deletions


@@ -90,6 +90,8 @@ devel
 * use `-std=c++14` for ArangoDB compilation
+* fix move leader shard: wait until all but the old leader are in sync.
+  This fixes some unstable tests.
 v3.4.0-rc.4 (XXXX-XX-XX)
 ------------------------


@@ -427,7 +427,7 @@ JOB_STATUS MoveShard::pendingLeader() {
       = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
   Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
   Supervision::TimePoint now(std::chrono::system_clock::now());
-  if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
+  if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
     abort();
     return true;
   }
@@ -549,12 +549,34 @@ JOB_STATUS MoveShard::pendingLeader() {
         trx.add(pre.slice());
       }
     } else if (plan[0].copyString() == _to) {
-      // New leader in Plan, let's check that it has assumed leadership:
-      size_t done = 0;  // count the number of shards for which leader has retired
+      // New leader in Plan, let's check that it has assumed leadership and
+      // that all but the old leader are in sync:
+      size_t done = 0;
       doForAllShards(_snapshot, _database, shardsLikeMe,
         [this, &done](Slice plan, Slice current, std::string& planPath) {
           if (current.length() > 0 && current[0].copyString() == _to) {
-            ++done;
+            if (plan.length() < 3) {
+              // This only happens for replicationFactor == 1, in which case
+              // there are exactly 2 servers in the Plan at this stage.
+              // But then we do not have to wait for any follower to get in sync.
+              ++done;
+            } else {
+              // New leader has assumed leadership, now check all but the
+              // old leader:
+              size_t found = 0;
+              for (size_t i = 1; i < plan.length() - 1; ++i) {
+                VPackSlice p = plan[i];
+                for (auto const& c : VPackArrayIterator(current)) {
+                  if (arangodb::basics::VelocyPackHelper::compare(p, c, true) == 0) {
+                    ++found;
+                    break;
+                  }
+                }
+              }
+              if (found >= plan.length() - 2) {
+                ++done;
+              }
+            }
           }
         });
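
The core of the new condition is easier to see without the agency plumbing. The following is a minimal, self-contained sketch of the same check using plain std::vector<std::string> in place of the VelocyPack slices; the function name and signature are illustrative and not part of the ArangoDB sources. Here `plan` is the planned server list for a shard, which at this stage of the job reads [new leader, followers..., old leader], and `current` is the list of servers currently in sync:

#include <algorithm>
#include <string>
#include <vector>

// Sketch only: true once every planned follower except the old leader
// (the last Plan entry) also shows up in `current`.
bool allButOldLeaderInSync(std::vector<std::string> const& plan,
                           std::vector<std::string> const& current) {
  if (plan.size() < 3) {
    // replicationFactor == 1: only the new and the old leader are in the
    // Plan, so there is no follower that has to catch up.
    return true;
  }
  size_t found = 0;
  // Skip plan[0] (the new leader) and the last entry (the old leader):
  for (size_t i = 1; i + 1 < plan.size(); ++i) {
    if (std::find(current.begin(), current.end(), plan[i]) != current.end()) {
      ++found;
    }
  }
  return found >= plan.size() - 2;
}

In the actual lambda the shard is only counted as done when current[0] equals _to, i.e. the new leader has really taken over, and the condition has to hold for the shard itself and for all shards that follow it via distributeShardsLike (shardsLikeMe).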
@@ -645,7 +667,7 @@ JOB_STATUS MoveShard::pendingFollower() {
       = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
   Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
   Supervision::TimePoint now(std::chrono::system_clock::now());
-  if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
+  if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
     abort();
     return FAILED;
   }
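
Both hunks above raise the job timeout from 3600 to 10000 seconds. std::chrono::duration<double> has a period of one second by default, so the comparison mixes the system clock's native tick type with floating-point seconds and lets <chrono> convert implicitly. A tiny standalone sketch of the same pattern (the two-hour value is made up for illustration, not taken from the job):

#include <chrono>
#include <iostream>

int main() {
  using Clock = std::chrono::system_clock;
  // Pretend the job was created two hours (7200 s) ago:
  Clock::time_point timeCreated = Clock::now() - std::chrono::hours(2);
  Clock::time_point now = Clock::now();
  // duration<double> defaults to seconds, so 10000.0 is roughly 2.8 hours:
  bool timedOut = (now - timeCreated) > std::chrono::duration<double>(10000.0);
  std::cout << std::boolalpha << timedOut << "\n";  // false: 7200 s < 10000 s
  return 0;
}

With the old limit of 3600.0, the same check would have aborted the job after one hour.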


@@ -4,8 +4,3 @@ In this file we collect information about which tests are currently
 greylisted. Please add a reason and date, and possibly links to issues
 or PRs.
-- `./tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js`
-  as of now this test has frequent failures. The issue seems to be that
-  the MoveShard operation from one leader to a different one does not
-  reliably drop the old leader as a follower from the Plan in the end.
-  See https://github.com/arangodb/release-3.4/issues/125