1
0
Fork 0

[3.5] Coordinator properly waits for DB servers after hot restore (#10049)

* rebootIds instead of boot stamps

* noexcept is of course wrong

* wrong noexcept here. we're copying.

* change log

* Update CHANGELOG
This commit is contained in:
Kaveh Vahedipour 2019-09-30 10:07:04 +02:00 committed by KVS85
parent 587cead36f
commit f15fe22c7c
5 changed files with 39 additions and 11 deletions

View File

@ -1,6 +1,8 @@
v3.5.1 (XXXX-XX-XX)
-------------------
* Rely on reboot ids for declaring end of cluster hot restore on coordinators.
* Obtain new unique IDs via a background thread.
* Fixed issue #10078: FULLTEXT with sort on same field not working.

View File

@ -3567,6 +3567,15 @@ void ClusterInfo::loadServers() {
<< " errorMessage: " << result.errorMessage() << " body: " << result.body();
}
////////////////////////////////////////////////////////////////////////
/// @brief Hand out copy of reboot ids
////////////////////////////////////////////////////////////////////////////////
std::unordered_map<ServerID, RebootId> ClusterInfo::rebootIds() const {
MUTEX_LOCKER(mutexLocker, _serversProt.mutex);
return _serversKnown.rebootIds();
}
////////////////////////////////////////////////////////////////////////
/// @brief find the endpoint of a server from its ID.
/// If it is not found in the cache, the cache is reloaded once, if
@ -4552,7 +4561,7 @@ ClusterInfo::ServersKnown::serversKnown() const noexcept {
return _serversKnown;
}
std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const noexcept {
std::unordered_map<ServerID, RebootId> ClusterInfo::ServersKnown::rebootIds() const {
std::unordered_map<ServerID, RebootId> rebootIds;
for (auto const& it : _serversKnown) {
rebootIds.emplace(it.first, it.second.rebootId());

View File

@ -363,7 +363,7 @@ class ClusterInfo final {
public:
explicit constexpr KnownServer(RebootId rebootId) : _rebootId(rebootId) {}
RebootId rebootId() const noexcept { return _rebootId; }
RebootId rebootId() const { return _rebootId; }
private:
RebootId _rebootId;
@ -371,7 +371,7 @@ class ClusterInfo final {
std::unordered_map<ServerID, KnownServer> const& serversKnown() const noexcept;
std::unordered_map<ServerID, RebootId> rebootIds() const noexcept;
std::unordered_map<ServerID, RebootId> rebootIds() const;
private:
std::unordered_map<ServerID, KnownServer> _serversKnown;
@ -798,6 +798,8 @@ class ClusterInfo final {
std::unordered_map<ServerID, std::string> getServerTimestamps();
std::unordered_map<ServerID, RebootId> rebootIds() const;
uint64_t getPlanVersion() {
READ_LOCKER(guard, _planProt.lock);
return _planVersion;
@ -903,7 +905,7 @@ class ClusterInfo final {
struct ProtectionData {
std::atomic<bool> isValid;
Mutex mutex;
mutable Mutex mutex;
std::atomic<uint64_t> wantedVersion;
std::atomic<uint64_t> doneVersion;
arangodb::basics::ReadWriteLock lock;

View File

@ -22,6 +22,8 @@
/// @author Kaveh Vahedipour
////////////////////////////////////////////////////////////////////////////////
#include "Cluster/ClusterTypes.h"
#include "ClusterMethods.h"
#include "Agency/TimeString.h"
@ -3662,9 +3664,8 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r
// We keep the currently registered timestamps in Current/ServersRegistered,
// such that we can wait until all have reregistered and are up:
ci->loadServers();
std::unordered_map<std::string, std::string> serverTimestamps =
ci->getServerTimestamps();
ci->loadCurrentDBServers();
auto const preServersKnown = ci->rebootIds();
// Restore all db servers
std::string previous;
@ -3684,13 +3685,17 @@ arangodb::Result hotRestoreCoordinator(VPackSlice const payload, VPackBuilder& r
return arangodb::Result(TRI_ERROR_HOT_RESTORE_INTERNAL,
"Not all DBservers came back in time!");
}
ci->loadServers();
std::unordered_map<std::string, std::string> newServerTimestamps =
ci->getServerTimestamps();
ci->loadCurrentDBServers();
auto const postServersKnown = ci->rebootIds();
if (ci->getCurrentDBServers().size() < dbServers.size()) {
LOG_TOPIC("8dce7", INFO, Logger::BACKUP) << "Waiting for all db servers to return";
continue;
}
// Check reboot ids of all dbservers:
size_t good = 0; // Count restarted servers
for (auto const& dbs : dbServers) {
if (serverTimestamps[dbs] != newServerTimestamps[dbs]) {
if (postServersKnown.at(dbs) != preServersKnown.at(dbs)) {
++good;
}
}

View File

@ -25,6 +25,7 @@
#include <limits>
#include <string>
#include <iostream>
namespace arangodb {
@ -66,10 +67,19 @@ class RebootId {
return RebootId{std::numeric_limits<decltype(_value)>::max()};
}
/// @brief Write the stored reboot id value to the given output stream.
/// @param o stream to write to
/// @return the same stream, so that calls can be chained
std::ostream& print(std::ostream& o) const {
  return o << _value;
}
private:
uint64_t _value;
};
} // namespace arangodb
/// @brief Stream insertion for arangodb::RebootId; forwards to
/// RebootId::print() and hands back the stream for chaining.
// NOTE(review): this overload is declared after namespace arangodb has been
// closed, i.e. in the global namespace. Argument-dependent lookup for
// RebootId searches namespace arangodb, not the global one - confirm that
// all intended call sites actually find this overload.
inline std::ostream& operator<<(std::ostream& o, arangodb::RebootId const& r) {
  r.print(o);
  return o;
}
#endif // ARANGOD_CLUSTER_CLUSTERTYPES_H