[3.5] coordinator proper wait for dbservers after hot restore (#10049)

* rebootIds instead of boot stamps
* noexcept is of course wrong
* wrong noexcept here. we're copying.
* change log
* Update CHANGELOG
2019-09-30 10:07:04 +02:00 · 2019-09-30 10:07:04 +02:00 · f15fe22c7c
parent 587cead36f
commit f15fe22c7c
5 changed files with 39 additions and 11 deletions
--- a/2
+++ b/2
@ -1,6 +1,8 @@
 v3.5.1 (XXXX-XX-XX)
 -------------------

+* Rely on reboot ids for declaring end of cluster hot restore on coordinators.
+
 * Obtain new unique IDs via a background thread.

 * Fixed issue #10078: FULLTEXT with sort on same field not working.
--- a/arangod/Cluster/ClusterInfo.cpp
+++ b/arangod/Cluster/ClusterInfo.cpp
@ -3567,6 +3567,15 @@ void ClusterInfo::loadServers() {
      << " errorMessage: " << result.errorMessage() << " body: " << result.body();
 }

+////////////////////////////////////////////////////////////////////////
+/// @brief Hand out copy of reboot ids
+////////////////////////////////////////////////////////////////////////////////
+
+std::unordered_map<ServerID, RebootId> ClusterInfo::rebootIds() const {
+  MUTEX_LOCKER(mutexLocker, _serversProt.mutex);
+  return _serversKnown.rebootIds();
+}
+
 ////////////////////////////////////////////////////////////////////////
 /// @brief find the endpoint of a server from its ID.
 /// If it is not found in the cache, the cache is reloaded once, if
@ -4552,7 +4561,7 @@ ClusterInfo::ServersKnown::serversKnown() const noexcept {
  return _serversKnown;
 }

-std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const noexcept {
+std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const {
  std::unordered_map<ServerID, RebootId> rebootIds;
  for (auto const& it : _serversKnown) {
    rebootIds.emplace(it.first, it.second.rebootId());
--- a/arangod/Cluster/ClusterInfo.h
+++ b/arangod/Cluster/ClusterInfo.h
@ -363,7 +363,7 @@ class ClusterInfo final {
     public:
      explicit constexpr KnownServer(RebootId rebootId) : _rebootId(rebootId) {}

-      RebootId rebootId() const noexcept { return _rebootId; }
+      RebootId rebootId() const { return _rebootId; }

     private:
      RebootId _rebootId;
@ -371,7 +371,7 @@ class ClusterInfo final {

    std::unordered_map<ServerID, KnownServer> const& serversKnown() const noexcept;

-    std::unordered_map<ServerID, RebootId> rebootIds() const noexcept;
+    std::unordered_map<ServerID, RebootId> rebootIds() const;

   private:
    std::unordered_map<ServerID, KnownServer> _serversKnown;
@ -798,6 +798,8 @@ class ClusterInfo final {

  std::unordered_map<ServerID, std::string> getServerTimestamps();

+  std::unordered_map<ServerID, RebootId> rebootIds() const;
+
  uint64_t getPlanVersion() {
    READ_LOCKER(guard, _planProt.lock);
    return _planVersion;
@ -903,7 +905,7 @@ class ClusterInfo final {

  struct ProtectionData {
    std::atomic<bool> isValid;
-    Mutex mutex;
+    mutable Mutex mutex;
    std::atomic<uint64_t> wantedVersion;
    std::atomic<uint64_t> doneVersion;
    arangodb::basics::ReadWriteLock lock;
--- a/arangod/Cluster/ClusterMethods.cpp
+++ b/arangod/Cluster/ClusterMethods.cpp
@ -22,6 +22,8 @@
 /// @author Kaveh Vahedipour
 ////////////////////////////////////////////////////////////////////////////////

+
+#include "Cluster/ClusterTypes.h"
 #include "ClusterMethods.h"

 #include "Agency/TimeString.h"
@ -3662,9 +3664,8 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r

  // We keep the currently registered timestamps in Current/ServersRegistered,
  // such that we can wait until all have reregistered and are up:
-  ci->loadServers();
-  std::unordered_map<std::string, std::string> serverTimestamps =
-      ci->getServerTimestamps();
+  ci->loadCurrentDBServers();
+  auto const preServersKnown = ci->rebootIds();

  // Restore all db servers
  std::string previous;
@ -3684,13 +3685,17 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r
      return arangodb::Result(TRI_ERROR_HOT_RESTORE_INTERNAL,
                              "Not all DBservers came back in time!");
    }
-    ci->loadServers();
-    std::unordered_map<std::string, std::string> newServerTimestamps =
-        ci->getServerTimestamps();
+    ci->loadCurrentDBServers();
+    auto const postServersKnown = ci->rebootIds();
+    if (ci->getCurrentDBServers().size() < dbServers.size()) {
+      LOG_TOPIC("8dce7", INFO, Logger::BACKUP) << "Waiting for all db servers to return";
+      continue;
+    }
+
    // Check timestamps of all dbservers:
    size_t good = 0;  // Count restarted servers
    for (auto const& dbs : dbServers) {
-      if (serverTimestamps[dbs] != newServerTimestamps[dbs]) {
+      if (postServersKnown.at(dbs) != preServersKnown.at(dbs)) {
        ++good;
      }
    }
--- a/arangod/Cluster/ClusterTypes.h
+++ b/arangod/Cluster/ClusterTypes.h
@ -25,6 +25,7 @@

 #include <limits>
 #include <string>
+#include <iostream>

 namespace arangodb {

@ -66,10 +67,19 @@ class RebootId {
    return RebootId{std::numeric_limits<decltype(_value)>::max()};
  }

+  std::ostream& print(std::ostream& o) const {
+    o << _value;
+    return o;
+  }
+
 private:
  uint64_t _value;
 };

 }  // namespace arangodb

+inline std::ostream& operator<< (std::ostream& o, arangodb::RebootId const& r) {
+  return r.print(o);
+}
+
 #endif  // ARANGOD_CLUSTER_CLUSTERTYPES_H