diff --git a/CHANGELOG b/CHANGELOG index 36dfcc814c..e1092d5ce3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -346,6 +346,8 @@ devel * fix log spam in agency supervision when leader resigned +* make AddFollower less aggressive + v3.4.1 (XXXX-XX-XX) ------------------- diff --git a/arangod/Agency/AddFollower.cpp b/arangod/Agency/AddFollower.cpp index 893e7fb56c..5960afca4f 100644 --- a/arangod/Agency/AddFollower.cpp +++ b/arangod/Agency/AddFollower.cpp @@ -155,7 +155,7 @@ bool AddFollower::start(bool&) { if (replFact2.second && replFact2.first == "satellite") { // satellites => distribute to every server auto available = Job::availableServers(_snapshot); - desiredReplFactor = Job::countGoodServersInList(_snapshot, available); + desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available); } } @@ -171,7 +171,7 @@ bool AddFollower::start(bool&) { } } size_t actualReplFactor - = 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice()); - // Leader plus good followers in plan + = 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice()); + // Leader plus GOOD or BAD followers in plan if (actualReplFactor >= desiredReplFactor) { finish("", "", true, "job no longer necessary, have enough replicas"); diff --git a/arangod/Agency/Job.cpp b/arangod/Agency/Job.cpp index 7d603b290d..db544540c2 100644 --- a/arangod/Agency/Job.cpp +++ b/arangod/Agency/Job.cpp @@ -227,8 +227,8 @@ std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclud } // The following counts in a given server list how many of the servers are -// in Status "GOOD". -size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) { +// in Status "GOOD" or "BAD". 
+size_t Job::countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList) { size_t count = 0; if (!serverList.isArray()) { // No array, strange, return 0 @@ -248,7 +248,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis // Only check if found std::shared_ptr healthNode = it->second; // Check its status: - if (healthNode->hasAsString("Status").first == "GOOD") { + auto status = healthNode->hasAsString("Status"); + if (status.first == "GOOD" || status.first == "BAD") { ++count; } } @@ -259,8 +260,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis } // The following counts in a given server list how many of the servers are -// in Status "GOOD". -size_t Job::countGoodServersInList(Node const& snap, std::vector const& serverList) { +// in Status "GOOD" or "BAD". +size_t Job::countGoodOrBadServersInList(Node const& snap, std::vector const& serverList) { size_t count = 0; auto const& health = snap.hasAsChildren(healthPrefix); // Do we have a Health substructure? 
@@ -273,7 +274,8 @@ size_t Job::countGoodServersInList(Node const& snap, std::vector co // Only check if found std::shared_ptr healthNode = it->second; // Check its status: - if (healthNode->hasAsString("Status").first == "GOOD") { + auto status = healthNode->hasAsString("Status"); + if (status.first == "GOOD" || status.first == "BAD") { ++count; } } diff --git a/arangod/Agency/Job.h b/arangod/Agency/Job.h index 41c48a81db..c1d022fa2c 100644 --- a/arangod/Agency/Job.h +++ b/arangod/Agency/Job.h @@ -125,8 +125,8 @@ struct Job { static std::string randomIdleAvailableServer(Node const& snap, std::vector const& exclude); static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude); - static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList); - static size_t countGoodServersInList(Node const& snap, std::vector const& serverList); + static size_t countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList); + static size_t countGoodOrBadServersInList(Node const& snap, std::vector const& serverList); static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray); /// @brief Get servers from plan, which are not failed or cleaned out diff --git a/arangod/Agency/RemoveFollower.cpp b/arangod/Agency/RemoveFollower.cpp index 65bf96755b..31a7c6aba8 100644 --- a/arangod/Agency/RemoveFollower.cpp +++ b/arangod/Agency/RemoveFollower.cpp @@ -158,7 +158,7 @@ bool RemoveFollower::start(bool&) { if (replFact2.second && replFact2.first == "satellite") { // satellites => distribute to every server auto available = Job::availableServers(_snapshot); - desiredReplFactor = Job::countGoodServersInList(_snapshot, available); + desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available); } } diff --git a/arangod/Agency/Supervision.cpp b/arangod/Agency/Supervision.cpp index cce50efc32..9c82221ba2 100644 --- a/arangod/Agency/Supervision.cpp +++ 
b/arangod/Agency/Supervision.cpp @@ -1257,6 +1257,28 @@ void Supervision::readyOrphanedIndexCreations() { void Supervision::enforceReplication() { _lock.assertLockedByCurrentThread(); + + // First check the number of AddFollower and RemoveFollower jobs in ToDo: + // We always maintain that we have at most maxNrAddRemoveJobsInTodo + // AddFollower or RemoveFollower jobs in ToDo. These are all long-term + // cleanup jobs, so they can be done over time. This is to ensure that + // there is no overload on the Agency job system. Therefore, if this + // number is at least maxNrAddRemoveJobsInTodo, we skip the rest of + // the function: + int const maxNrAddRemoveJobsInTodo = 15; + + auto const& todos = _snapshot.hasAsChildren(toDoPrefix).first; + int nrAddRemoveJobsInTodo = 0; + for (auto const& todo : todos) { + auto const& jobNode = *(todo.second); // by reference: do not copy the Node + auto t = jobNode.hasAsString("type"); + if (t.second && (t.first == "addFollower" || t.first == "removeFollower")) { + if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) { + return; + } + } + } + auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first; for (const auto& db_ : plannedDBs) { // Planned databases @@ -1273,7 +1295,7 @@ void Supervision::enforceReplication() { if (replFact2.second && replFact2.first == "satellite") { // satellites => distribute to every server auto available = Job::availableServers(_snapshot); - replicationFactor = Job::countGoodServersInList(_snapshot, available); + replicationFactor = Job::countGoodOrBadServersInList(_snapshot, available); } else { LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "no replicationFactor entry in " << col.toJson(); @@ -1298,7 +1320,7 @@ void Supervision::enforceReplication() { } } size_t actualReplicationFactor - = 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice()); - // leader plus GOOD followers + = 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice()); + // leader plus GOOD or BAD followers size_t apparentReplicationFactor = shard.slice().length(); 
@@ -1334,11 +1356,17 @@ void Supervision::enforceReplication() { AddFollower(_snapshot, _agent, std::to_string(_jobId++), "supervision", db_.first, col_.first, shard_.first) .create(); + if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) { + return; // ToDo cap reached: stop creating jobs in this call + } } else if (apparentReplicationFactor > replicationFactor && actualReplicationFactor >= replicationFactor) { RemoveFollower(_snapshot, _agent, std::to_string(_jobId++), "supervision", db_.first, col_.first, shard_.first) .create(); + if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) { + return; // ToDo cap reached: stop creating jobs in this call + } } } }