mirror of https://gitee.com/bigwinds/arangodb
Fix move leader shard. (#7445)
* Ungreylist move shard test. * Move leader shard: wait until all but the old leader are in sync. * Increate moveShard timeout to 10000 seconds. * Add CHANGELOG entry. * Fix compilation. * Fix a misleading comment.
This commit is contained in:
parent
346a0a15a4
commit
d72e51ed8f
|
@ -90,6 +90,8 @@ devel
|
|||
|
||||
* use `-std=c++14` for ArangoDB compilation
|
||||
|
||||
* fix move leader shard: wait until all but the old leader are in sync.
|
||||
This fixes some unstable tests.
|
||||
|
||||
v3.4.0-rc.4 (XXXX-XX-XX)
|
||||
------------------------
|
||||
|
|
|
@ -427,7 +427,7 @@ JOB_STATUS MoveShard::pendingLeader() {
|
|||
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
||||
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
||||
Supervision::TimePoint now(std::chrono::system_clock::now());
|
||||
if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
|
||||
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
|
||||
abort();
|
||||
return true;
|
||||
}
|
||||
|
@ -549,12 +549,34 @@ JOB_STATUS MoveShard::pendingLeader() {
|
|||
trx.add(pre.slice());
|
||||
}
|
||||
} else if (plan[0].copyString() == _to) {
|
||||
// New leader in Plan, let's check that it has assumed leadership:
|
||||
size_t done = 0; // count the number of shards for which leader has retired
|
||||
// New leader in Plan, let's check that it has assumed leadership and
|
||||
// all but except the old leader are in sync:
|
||||
size_t done = 0;
|
||||
doForAllShards(_snapshot, _database, shardsLikeMe,
|
||||
[this, &done](Slice plan, Slice current, std::string& planPath) {
|
||||
if (current.length() > 0 && current[0].copyString() == _to) {
|
||||
++done;
|
||||
if (plan.length() < 3) {
|
||||
// This only happens for replicationFactor == 1, in which case
|
||||
// there are exactly 2 servers in the Plan at this stage.
|
||||
// But then we do not have to wait for any follower to get in sync.
|
||||
++done;
|
||||
} else {
|
||||
// New leader has assumed leadership, now check all but the
|
||||
// old leader:
|
||||
size_t found = 0;
|
||||
for (size_t i = 1; i < plan.length() - 1; ++i) {
|
||||
VPackSlice p = plan[i];
|
||||
for (auto const& c : VPackArrayIterator(current)) {
|
||||
if (arangodb::basics::VelocyPackHelper::compare(p, c, true)) {
|
||||
++found;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (found >= plan.length() - 2) {
|
||||
++done;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -645,7 +667,7 @@ JOB_STATUS MoveShard::pendingFollower() {
|
|||
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
||||
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
||||
Supervision::TimePoint now(std::chrono::system_clock::now());
|
||||
if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
|
||||
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
|
||||
abort();
|
||||
return FAILED;
|
||||
}
|
||||
|
|
|
@ -4,8 +4,3 @@ In this file we collect information about which tests are currently
|
|||
greylisted. Please add a reason and date, and possibly links to issues
|
||||
or PRs.
|
||||
|
||||
- `./tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js`
|
||||
as of now this test has frequent failures. The issue seems to be that
|
||||
the MoveShard operation from one leader to a different one does not
|
||||
reliably drop the old leader as a follower from the Plan in the end.
|
||||
See https://github.com/arangodb/release-3.4/issues/125
|
||||
|
|
Loading…
Reference in New Issue