mirror of https://gitee.com/bigwinds/arangodb
[3.5] coordinator proper wait for dbservers after hot restore (#10049)
* rebootIds instead of boot stamps * noexcept is of course wrong * wrong noexcept here. we're copying. * change log * Update CHANGELOG
This commit is contained in:
parent
587cead36f
commit
f15fe22c7c
|
@ -1,6 +1,8 @@
|
|||
v3.5.1 (XXXX-XX-XX)
|
||||
-------------------
|
||||
|
||||
* Rely on reboot ids for declaring end of cluster hot restore on coordinators.
|
||||
|
||||
* Obtain new unique IDs via a background thread.
|
||||
|
||||
* Fixed issue #10078: FULLTEXT with sort on same field not working.
|
||||
|
|
|
@ -3567,6 +3567,15 @@ void ClusterInfo::loadServers() {
|
|||
<< " errorMessage: " << result.errorMessage() << " body: " << result.body();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Hand out copy of reboot ids
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::unordered_map<ServerID, RebootId> ClusterInfo::rebootIds() const {
|
||||
MUTEX_LOCKER(mutexLocker, _serversProt.mutex);
|
||||
return _serversKnown.rebootIds();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
/// @brief find the endpoint of a server from its ID.
|
||||
/// If it is not found in the cache, the cache is reloaded once, if
|
||||
|
@ -4552,7 +4561,7 @@ ClusterInfo::ServersKnown::serversKnown() const noexcept {
|
|||
return _serversKnown;
|
||||
}
|
||||
|
||||
std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const noexcept {
|
||||
std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const {
|
||||
std::unordered_map<ServerID, RebootId> rebootIds;
|
||||
for (auto const& it : _serversKnown) {
|
||||
rebootIds.emplace(it.first, it.second.rebootId());
|
||||
|
|
|
@ -363,7 +363,7 @@ class ClusterInfo final {
|
|||
public:
|
||||
explicit constexpr KnownServer(RebootId rebootId) : _rebootId(rebootId) {}
|
||||
|
||||
RebootId rebootId() const noexcept { return _rebootId; }
|
||||
RebootId rebootId() const { return _rebootId; }
|
||||
|
||||
private:
|
||||
RebootId _rebootId;
|
||||
|
@ -371,7 +371,7 @@ class ClusterInfo final {
|
|||
|
||||
std::unordered_map<ServerID, KnownServer> const& serversKnown() const noexcept;
|
||||
|
||||
std::unordered_map<ServerID, RebootId> rebootIds() const noexcept;
|
||||
std::unordered_map<ServerID, RebootId> rebootIds() const;
|
||||
|
||||
private:
|
||||
std::unordered_map<ServerID, KnownServer> _serversKnown;
|
||||
|
@ -798,6 +798,8 @@ class ClusterInfo final {
|
|||
|
||||
std::unordered_map<ServerID, std::string> getServerTimestamps();
|
||||
|
||||
std::unordered_map<ServerID, RebootId> rebootIds() const;
|
||||
|
||||
uint64_t getPlanVersion() {
|
||||
READ_LOCKER(guard, _planProt.lock);
|
||||
return _planVersion;
|
||||
|
@ -903,7 +905,7 @@ class ClusterInfo final {
|
|||
|
||||
struct ProtectionData {
|
||||
std::atomic<bool> isValid;
|
||||
Mutex mutex;
|
||||
mutable Mutex mutex;
|
||||
std::atomic<uint64_t> wantedVersion;
|
||||
std::atomic<uint64_t> doneVersion;
|
||||
arangodb::basics::ReadWriteLock lock;
|
||||
|
|
|
@ -22,6 +22,8 @@
|
|||
/// @author Kaveh Vahedipour
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include "Cluster/ClusterTypes.h"
|
||||
#include "ClusterMethods.h"
|
||||
|
||||
#include "Agency/TimeString.h"
|
||||
|
@ -3662,9 +3664,8 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r
|
|||
|
||||
// We keep the reboot ids of all currently known servers,
|
||||
// such that we can wait until all have rebooted and are up again:
|
||||
ci->loadServers();
|
||||
std::unordered_map<std::string, std::string> serverTimestamps =
|
||||
ci->getServerTimestamps();
|
||||
ci->loadCurrentDBServers();
|
||||
auto const preServersKnown = ci->rebootIds();
|
||||
|
||||
// Restore all db servers
|
||||
std::string previous;
|
||||
|
@ -3684,13 +3685,17 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r
|
|||
return arangodb::Result(TRI_ERROR_HOT_RESTORE_INTERNAL,
|
||||
"Not all DBservers came back in time!");
|
||||
}
|
||||
ci->loadServers();
|
||||
std::unordered_map<std::string, std::string> newServerTimestamps =
|
||||
ci->getServerTimestamps();
|
||||
ci->loadCurrentDBServers();
|
||||
auto const postServersKnown = ci->rebootIds();
|
||||
if (ci->getCurrentDBServers().size() < dbServers.size()) {
|
||||
LOG_TOPIC("8dce7", INFO, Logger::BACKUP) << "Waiting for all db servers to return";
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check reboot ids of all dbservers:
|
||||
size_t good = 0; // Count restarted servers
|
||||
for (auto const& dbs : dbServers) {
|
||||
if (serverTimestamps[dbs] != newServerTimestamps[dbs]) {
|
||||
if (postServersKnown.at(dbs) != preServersKnown.at(dbs)) {
|
||||
++good;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
|
||||
namespace arangodb {
|
||||
|
||||
|
@ -66,10 +67,19 @@ class RebootId {
|
|||
return RebootId{std::numeric_limits<decltype(_value)>::max()};
|
||||
}
|
||||
|
||||
/// @brief Write the numeric reboot id to the given stream.
/// @param o stream to write to
/// @return the same stream, to allow chaining
std::ostream& print(std::ostream& o) const {
  return o << _value;
}
|
||||
|
||||
private:
|
||||
uint64_t _value;
|
||||
};
|
||||
|
||||
} // namespace arangodb
|
||||
|
||||
inline std::ostream& operator<< (std::ostream& o, arangodb::RebootId const& r) {
|
||||
return r.print(o);
|
||||
}
|
||||
|
||||
#endif // ARANGOD_CLUSTER_CLUSTERTYPES_H
|
||||
|
|
Loading…
Reference in New Issue