
Fix move leader shard. (#7445)

* Ungreylist move shard test.
* Move leader shard: wait until all but the old leader are in sync.
* Increase moveShard timeout to 10000 seconds.
* Add CHANGELOG entry.
* Fix compilation.
* Fix a misleading comment.
Max Neunhöffer 2018-11-26 15:04:04 +01:00 committed by GitHub
parent 346a0a15a4
commit d72e51ed8f
4 changed files with 29 additions and 10 deletions


@@ -90,6 +90,8 @@ devel
 * use `-std=c++14` for ArangoDB compilation
+* fix move leader shard: wait until all but the old leader are in sync.
+  This fixes some unstable tests.
 v3.4.0-rc.4 (XXXX-XX-XX)
 ------------------------


@@ -427,7 +427,7 @@ JOB_STATUS MoveShard::pendingLeader() {
       = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
   Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
   Supervision::TimePoint now(std::chrono::system_clock::now());
-  if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
+  if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
     abort();
     return true;
   }
@@ -549,12 +549,34 @@ JOB_STATUS MoveShard::pendingLeader() {
         trx.add(pre.slice());
       }
     } else if (plan[0].copyString() == _to) {
-      // New leader in Plan, let's check that it has assumed leadership:
-      size_t done = 0;  // count the number of shards for which leader has retired
+      // New leader in Plan, let's check that it has assumed leadership and
+      // that all but the old leader are in sync:
+      size_t done = 0;
       doForAllShards(_snapshot, _database, shardsLikeMe,
         [this, &done](Slice plan, Slice current, std::string& planPath) {
           if (current.length() > 0 && current[0].copyString() == _to) {
-            ++done;
+            if (plan.length() < 3) {
+              // This only happens for replicationFactor == 1, in which case
+              // there are exactly 2 servers in the Plan at this stage.
+              // But then we do not have to wait for any follower to get in sync.
+              ++done;
+            } else {
+              // New leader has assumed leadership, now check all but the
+              // old leader:
+              size_t found = 0;
+              for (size_t i = 1; i < plan.length() - 1; ++i) {
+                VPackSlice p = plan[i];
+                for (auto const& c : VPackArrayIterator(current)) {
+                  if (arangodb::basics::VelocyPackHelper::compare(p, c, true) == 0) {
+                    ++found;
+                    break;
+                  }
+                }
+              }
+              if (found >= plan.length() - 2) {
+                ++done;
+              }
+            }
           }
         });
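
The core of the new condition is easier to see without the agency plumbing. The following is a minimal, self-contained sketch of the same check using plain std::vector<std::string> in place of the VelocyPack slices; the function name and signature are illustrative and not part of the ArangoDB sources. Here `plan` is the planned server list for a shard, which at this stage of the job reads [new leader, followers..., old leader], and `current` is the list of servers currently in sync:

#include <algorithm>
#include <string>
#include <vector>

// Sketch only: true once every planned follower except the old leader
// (the last Plan entry) also shows up in `current`.
bool allButOldLeaderInSync(std::vector<std::string> const& plan,
                           std::vector<std::string> const& current) {
  if (plan.size() < 3) {
    // replicationFactor == 1: only the new and the old leader are in the
    // Plan, so there is no follower that has to catch up.
    return true;
  }
  size_t found = 0;
  // Skip plan[0] (the new leader) and the last entry (the old leader):
  for (size_t i = 1; i + 1 < plan.size(); ++i) {
    if (std::find(current.begin(), current.end(), plan[i]) != current.end()) {
      ++found;
    }
  }
  return found >= plan.size() - 2;
}

In the actual lambda the shard is only counted as done when current[0] equals _to, i.e. the new leader has really taken over, and the condition has to hold for the shard itself and for all shards that follow it via distributeShardsLike (shardsLikeMe).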
@@ -645,7 +667,7 @@ JOB_STATUS MoveShard::pendingFollower() {
       = _snapshot.hasAsString(pendingPrefix + _jobId + "/timeCreated").first;
   Supervision::TimePoint timeCreated = stringToTimepoint(timeCreatedString);
   Supervision::TimePoint now(std::chrono::system_clock::now());
-  if (now - timeCreated > std::chrono::duration<double>(3600.0)) {
+  if (now - timeCreated > std::chrono::duration<double>(10000.0)) {
     abort();
     return FAILED;
   }
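
Both hunks above raise the job timeout from 3600 to 10000 seconds. std::chrono::duration<double> has a period of one second by default, so the comparison mixes the system clock's native tick type with floating-point seconds and lets <chrono> convert implicitly. A tiny standalone sketch of the same pattern (the two-hour value is made up for illustration, not taken from the job):

#include <chrono>
#include <iostream>

int main() {
  using Clock = std::chrono::system_clock;
  // Pretend the job was created two hours (7200 s) ago:
  Clock::time_point timeCreated = Clock::now() - std::chrono::hours(2);
  Clock::time_point now = Clock::now();
  // duration<double> defaults to seconds, so 10000.0 is roughly 2.8 hours:
  bool timedOut = (now - timeCreated) > std::chrono::duration<double>(10000.0);
  std::cout << std::boolalpha << timedOut << "\n";  // false: 7200 s < 10000 s
  return 0;
}

With the old limit of 3600.0, the same check would have aborted the job after one hour.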


@@ -4,8 +4,3 @@ In this file we collect information about which tests are currently
 greylisted. Please add a reason and date, and possibly links to issues
 or PRs.
-- `./tests/js/server/resilience/moving-shards-with-arangosearch-view-cluster.js`
-  as of now this test has frequent failures. The issue seems to be that
-  the MoveShard operation from one leader to a different one does not
-  reliably drop the old leader as a follower from the Plan in the end.
-  See https://github.com/arangodb/release-3.4/issues/125