diff --git a/CHANGELOG b/CHANGELOG index d8637c104a..a16f9d9b43 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,8 @@ v3.5.1 (XXXX-XX-XX) ------------------- +* Rely on reboot IDs to declare the end of a cluster hot restore on coordinators. + * Obtain new unique IDs via a background thread. * Fixed issue #10078: FULLTEXT with sort on same field not working. diff --git a/arangod/Cluster/ClusterInfo.cpp b/arangod/Cluster/ClusterInfo.cpp index c5bd8b435d..13638ebf80 100644 --- a/arangod/Cluster/ClusterInfo.cpp +++ b/arangod/Cluster/ClusterInfo.cpp @@ -3567,6 +3567,15 @@ void ClusterInfo::loadServers() { << " errorMessage: " << result.errorMessage() << " body: " << result.body(); } +//////////////////////////////////////////////////////////////////////// +/// @brief Hand out a copy of the reboot ids +//////////////////////////////////////////////////////////////////////////////// + +std::unordered_map ClusterInfo::rebootIds() const { + MUTEX_LOCKER(mutexLocker, _serversProt.mutex); + return _serversKnown.rebootIds(); +} + //////////////////////////////////////////////////////////////////////// /// @brief find the endpoint of a server from its ID. 
/// If it is not found in the cache, the cache is reloaded once, if @@ -4552,7 +4561,7 @@ ClusterInfo::ServersKnown::serversKnown() const noexcept { return _serversKnown; } -std::unordered_map ClusterInfo::ServersKnown::rebootIds() const noexcept { +std::unordered_map ClusterInfo::ServersKnown::rebootIds() const { std::unordered_map rebootIds; for (auto const& it : _serversKnown) { rebootIds.emplace(it.first, it.second.rebootId()); diff --git a/arangod/Cluster/ClusterInfo.h b/arangod/Cluster/ClusterInfo.h index 182494e062..82cdffb0e8 100644 --- a/arangod/Cluster/ClusterInfo.h +++ b/arangod/Cluster/ClusterInfo.h @@ -363,7 +363,7 @@ class ClusterInfo final { public: explicit constexpr KnownServer(RebootId rebootId) : _rebootId(rebootId) {} - RebootId rebootId() const noexcept { return _rebootId; } + RebootId rebootId() const { return _rebootId; } private: RebootId _rebootId; @@ -371,7 +371,7 @@ class ClusterInfo final { std::unordered_map const& serversKnown() const noexcept; - std::unordered_map rebootIds() const noexcept; + std::unordered_map rebootIds() const; private: std::unordered_map _serversKnown; @@ -798,6 +798,8 @@ class ClusterInfo final { std::unordered_map getServerTimestamps(); + std::unordered_map rebootIds() const; + uint64_t getPlanVersion() { READ_LOCKER(guard, _planProt.lock); return _planVersion; @@ -903,7 +905,7 @@ class ClusterInfo final { struct ProtectionData { std::atomic isValid; - Mutex mutex; + mutable Mutex mutex; std::atomic wantedVersion; std::atomic doneVersion; arangodb::basics::ReadWriteLock lock; diff --git a/arangod/Cluster/ClusterMethods.cpp b/arangod/Cluster/ClusterMethods.cpp index 52f377fcd3..ea65f80fef 100644 --- a/arangod/Cluster/ClusterMethods.cpp +++ b/arangod/Cluster/ClusterMethods.cpp @@ -22,6 +22,8 @@ /// @author Kaveh Vahedipour //////////////////////////////////////////////////////////////////////////////// + +#include "Cluster/ClusterTypes.h" #include "ClusterMethods.h" #include "Agency/TimeString.h" @@ -3662,9 
+3664,8 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r // We keep the currently registered timestamps in Current/ServersRegistered, // such that we can wait until all have reregistered and are up: - ci->loadServers(); - std::unordered_map serverTimestamps = - ci->getServerTimestamps(); + ci->loadCurrentDBServers(); + auto const preServersKnown = ci->rebootIds(); // Restore all db servers std::string previous; @@ -3684,13 +3685,17 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r return arangodb::Result(TRI_ERROR_HOT_RESTORE_INTERNAL, "Not all DBservers came back in time!"); } - ci->loadServers(); - std::unordered_map newServerTimestamps = - ci->getServerTimestamps(); + ci->loadCurrentDBServers(); + auto const postServersKnown = ci->rebootIds(); + if (ci->getCurrentDBServers().size() < dbServers.size()) { + LOG_TOPIC("8dce7", INFO, Logger::BACKUP) << "Waiting for all db servers to return"; + continue; + } + // Check timestamps of all dbservers: size_t good = 0; // Count restarted servers for (auto const& dbs : dbServers) { - if (serverTimestamps[dbs] != newServerTimestamps[dbs]) { + if (postServersKnown.at(dbs) != preServersKnown.at(dbs)) { ++good; } } diff --git a/arangod/Cluster/ClusterTypes.h b/arangod/Cluster/ClusterTypes.h index 5a13021aeb..fa00f8ecbf 100644 --- a/arangod/Cluster/ClusterTypes.h +++ b/arangod/Cluster/ClusterTypes.h @@ -25,6 +25,7 @@ #include #include +#include namespace arangodb { @@ -66,10 +67,19 @@ class RebootId { return RebootId{std::numeric_limits::max()}; } + std::ostream& print(std::ostream& o) const { + o << _value; + return o; + } + private: uint64_t _value; }; } // namespace arangodb +inline std::ostream& operator<< (std::ostream& o, arangodb::RebootId const& r) { + return r.print(o); +} + #endif // ARANGOD_CLUSTER_CLUSTERTYPES_H