mirror of https://gitee.com/bigwinds/arangodb
Fix move leader shard. (#7445)
* Ungreylist move shard test. * Move leader shard: wait until all but the old leader are in sync. * Increase moveShard timeout to 10000 seconds. * Add CHANGELOG entry. * Fix compilation. * Fix a misleading comment.
This commit is contained in:
parent
346a0a15a4
commit
d72e51ed8f
|
@ -90,6 +90,8 @@ devel
|
||||||
|
|
||||||
* use `-std=c++14` for ArangoDB compilation
|
* use `-std=c++14` for ArangoDB compilation
|
||||||
|
|
||||||
|
* fix move leader shard: wait until all but the old leader are in sync.
|
||||||
|
This fixes some unstable tests.
|
||||||
|
|
||||||
v3.4.0-rc.4 (XXXX-XX-XX)
|
v3.4.0-rc.4 (XXXX-XX-XX)
|
||||||
------------------------
|
------------------------
|
||||||
|
|
|
@ -427,7 +427,7 @@ JOB_STATUS MoveShard::pendingLeader() {
|
||||||
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
||||||
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
||||||
Supervision::TimePoint now(std::chrono::system_clock::now());
|
Supervision::TimePoint now(std::chrono::system_clock::now());
|
||||||
if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
|
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
|
||||||
abort();
|
abort();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -549,12 +549,34 @@ JOB_STATUS MoveShard::pendingLeader() {
|
||||||
trx.add(pre.slice());
|
trx.add(pre.slice());
|
||||||
}
|
}
|
||||||
} else if (plan[0].copyString() == _to) {
|
} else if (plan[0].copyString() == _to) {
|
||||||
// New leader in Plan, let's check that it has assumed leadership:
|
// New leader in Plan, let's check that it has assumed leadership and
|
||||||
size_t done = 0; // count the number of shards for which leader has retired
|
// all but the old leader are in sync:
|
||||||
|
size_t done = 0;
|
||||||
doForAllShards(_snapshot, _database, shardsLikeMe,
|
doForAllShards(_snapshot, _database, shardsLikeMe,
|
||||||
[this, &done](Slice plan, Slice current, std::string& planPath) {
|
[this, &done](Slice plan, Slice current, std::string& planPath) {
|
||||||
if (current.length() > 0 && current[0].copyString() == _to) {
|
if (current.length() > 0 && current[0].copyString() == _to) {
|
||||||
++done;
|
if (plan.length() < 3) {
|
||||||
|
// This only happens for replicationFactor == 1, in which case
|
||||||
|
// there are exactly 2 servers in the Plan at this stage.
|
||||||
|
// But then we do not have to wait for any follower to get in sync.
|
||||||
|
++done;
|
||||||
|
} else {
|
||||||
|
// New leader has assumed leadership, now check all but the
|
||||||
|
// old leader:
|
||||||
|
size_t found = 0;
|
||||||
|
for (size_t i = 1; i < plan.length() - 1; ++i) {
|
||||||
|
VPackSlice p = plan[i];
|
||||||
|
for (auto const& c : VPackArrayIterator(current)) {
|
||||||
|
if (arangodb::basics::VelocyPackHelper::compare(p, c, true)) {
|
||||||
|
++found;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (found >= plan.length() - 2) {
|
||||||
|
++done;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -645,7 +667,7 @@ JOB_STATUS MoveShard::pendingFollower() {
|
||||||
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
= _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
|
||||||
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
|
||||||
Supervision::TimePoint now(std::chrono::system_clock::now());
|
Supervision::TimePoint now(std::chrono::system_clock::now());
|
||||||
if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
|
if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
|
||||||
abort();
|
abort();
|
||||||
return FAILED;
|
return FAILED;
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,8 +4,3 @@ In this file we collect information about which tests are currently
|
||||||
greylisted. Please add a reason and date, and possibly links to issues
|
greylisted. Please add a reason and date, and possibly links to issues
|
||||||
or PRs.
|
or PRs.
|
||||||
|
|
||||||
- `./tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js`
|
|
||||||
as of now this test has frequent failures. The issue seems to be that
|
|
||||||
the MoveShard operation from one leader to a different one does not
|
|
||||||
reliably drop the old leader as a follower from the Plan in the end.
|
|
||||||
See https://github.com/arangodb/release-3.4/issues/125
|
|
||||||
|
|
Loading…
Reference in New Issue