diff --git a/CHANGELOG b/CHANGELOG index c83d9c5707..1b0fea1877 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -90,6 +90,8 @@ devel * use `-std=c++14` for ArangoDB compilation +* fix move leader shard: wait until all but the old leader are in sync. + This fixes some unstable tests. v3.4.0-rc.4 (XXXX-XX-XX) ------------------------ diff --git a/arangod/Agency/MoveShard.cpp b/arangod/Agency/MoveShard.cpp index 4695e7faf2..fb589f91d4 100644 --- a/arangod/Agency/MoveShard.cpp +++ b/arangod/Agency/MoveShard.cpp @@ -427,7 +427,7 @@ JOB_STATUS MoveShard::pendingLeader() { = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first; Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString); Supervision::TimePoint now(std::chrono::system_clock::now()); - if (now - timeCreated > std::chrono::duration(3600.0)) { + if (now - timeCreated > std::chrono::duration(10000.0)) { abort(); return true; } @@ -549,12 +549,34 @@ JOB_STATUS MoveShard::pendingLeader() { trx.add(pre.slice()); } } else if (plan[0].copyString() == _to) { - // New leader in Plan, let's check that it has assumed leadership: - size_t done = 0; // count the number of shards for which leader has retired + // New leader in Plan, let's check that it has assumed leadership and + // all but except the old leader are in sync: + size_t done = 0; doForAllShards(_snapshot, _database, shardsLikeMe, [this, &done](Slice plan, Slice current, std::string& planPath) { if (current.length() > 0 && current[0].copyString() == _to) { - ++done; + if (plan.length() < 3) { + // This only happens for replicationFactor == 1, in which case + // there are exactly 2 servers in the Plan at this stage. + // But then we do not have to wait for any follower to get in sync. + ++done; + } else { + // New leader has assumed leadership, now check all but the + // old leader: + size_t found = 0; + for (size_t i = 1; i < plan.length() - 1; ++i) { + VPackSlice p = plan[i]; + for (auto const& c : VPackArrayIterator(current)) { + if (arangodb::basics::VelocyPackHelper::compare(p, c, true)) { + ++found; + break; + } + } + } + if (found >= plan.length() - 2) { + ++done; + } + } } }); @@ -645,7 +667,7 @@ JOB_STATUS MoveShard::pendingFollower() { = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first; Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString); Supervision::TimePoint now(std::chrono::system_clock::now()); - if (now - timeCreated > std::chrono::duration(3600.0)) { + if (now - timeCreated > std::chrono::duration(10000.0)) { abort(); return FAILED; } diff --git a/tests/Greylist.txt b/tests/Greylist.txt index acf0d909a9..fd0a71e799 100644 --- a/tests/Greylist.txt +++ b/tests/Greylist.txt @@ -4,8 +4,3 @@ In this file we collect information about which tests are currently greylisted. Please add a reason and date, and possibly links to issues or PRs. - - `./tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js` - as of now this test has frequent failures. The issue seems to be that - the MoveShard operation from one leader to a different one does not - reliably drop the old leader as a follower from the Plan in the end. - See https://github.com/arangodb/release-3.4/issues/125 diff --git a/tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster-grey.js b/tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js similarity index 100% rename from tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster-grey.js rename to tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js