1
0
Fork 0

Make addfollower jobs less aggressive. (#8490)

* Make addfollower jobs less aggressive.
* CHANGELOG.
This commit is contained in:
Max Neunhöffer 2019-03-21 15:24:31 +01:00 committed by GitHub
parent 59ad583796
commit 55706e3c74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 13 deletions

View File

@ -346,6 +346,8 @@ devel
* fix log spam in agency supervision when leader resigned * fix log spam in agency supervision when leader resigned
* make AddFollower less aggressive
v3.4.1 (XXXX-XX-XX) v3.4.1 (XXXX-XX-XX)
------------------- -------------------

View File

@ -155,7 +155,7 @@ bool AddFollower::start(bool&) {
if (replFact2.second && replFact2.first == "satellite") { if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server // satellites => distribute to every server
auto available = Job::availableServers(_snapshot); auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available); desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
} }
} }
@ -171,7 +171,7 @@ bool AddFollower::start(bool&) {
} }
} }
size_t actualReplFactor size_t actualReplFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice()); = 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
// Leader plus good or bad followers in plan // Leader plus good or bad followers in plan
if (actualReplFactor >= desiredReplFactor) { if (actualReplFactor >= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have enough replicas"); finish("", "", true, "job no longer necessary, have enough replicas");

View File

@ -227,8 +227,8 @@ std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclud
} }
// The following counts in a given server list how many of the servers are // The following counts in a given server list how many of the servers are
// in Status "GOOD". // in Status "GOOD" or "BAD".
size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) { size_t Job::countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList) {
size_t count = 0; size_t count = 0;
if (!serverList.isArray()) { if (!serverList.isArray()) {
// No array, strange, return 0 // No array, strange, return 0
@ -248,7 +248,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
// Only check if found // Only check if found
std::shared_ptr<Node> healthNode = it->second; std::shared_ptr<Node> healthNode = it->second;
// Check its status: // Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") { auto status = healthNode->hasAsString("Status");
if (status.first == "GOOD" || status.first == "BAD") {
++count; ++count;
} }
} }
@ -259,8 +260,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
} }
// The following counts in a given server list how many of the servers are // The following counts in a given server list how many of the servers are
// in Status "GOOD". // in Status "GOOD" or "BAD".
size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList) { size_t Job::countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList) {
size_t count = 0; size_t count = 0;
auto const& health = snap.hasAsChildren(healthPrefix); auto const& health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure? // Do we have a Health substructure?
@ -273,7 +274,8 @@ size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> co
// Only check if found // Only check if found
std::shared_ptr<Node> healthNode = it->second; std::shared_ptr<Node> healthNode = it->second;
// Check its status: // Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") { auto status = healthNode->hasAsString("Status");
if (status.first == "GOOD" || status.first == "BAD") {
++count; ++count;
} }
} }

View File

@ -125,8 +125,8 @@ struct Job {
static std::string randomIdleAvailableServer(Node const& snap, static std::string randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude); std::vector<std::string> const& exclude);
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude); static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList); static size_t countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList);
static size_t countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList); static size_t countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList);
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray); static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
/// @brief Get servers from plan, which are not failed or cleaned out /// @brief Get servers from plan, which are not failed or cleaned out

View File

@ -158,7 +158,7 @@ bool RemoveFollower::start(bool&) {
if (replFact2.second && replFact2.first == "satellite") { if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server // satellites => distribute to every server
auto available = Job::availableServers(_snapshot); auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available); desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
} }
} }

View File

@ -1257,6 +1257,28 @@ void Supervision::readyOrphanedIndexCreations() {
void Supervision::enforceReplication() { void Supervision::enforceReplication() {
_lock.assertLockedByCurrentThread(); _lock.assertLockedByCurrentThread();
// First check the number of AddFollower and RemoveFollower jobs in ToDo:
// We always maintain that we have at most maxNrAddRemoveJobsInTodo
// AddFollower or RemoveFollower jobs in ToDo. These are all long-term
// cleanup jobs, so they can be done over time. This is to ensure that
// there is no overload on the Agency job system. Therefore, if this
// number is at least maxNrAddRemoveJobsInTodo, we skip the rest of
// the function:
int const maxNrAddRemoveJobsInTodo = 15;
auto todos = _snapshot.hasAsChildren(toDoPrefix).first;
int nrAddRemoveJobsInTodo = 0;
for (auto it = todos.begin(); it != todos.end(); ++it) {
auto jobNode = *(it->second);
auto t = jobNode.hasAsString("type");
if (t.second && (t.first == "addFollower" || t.first == "removeFollower")) {
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
}
}
auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first; auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first;
for (const auto& db_ : plannedDBs) { // Planned databases for (const auto& db_ : plannedDBs) { // Planned databases
@ -1273,7 +1295,7 @@ void Supervision::enforceReplication() {
if (replFact2.second && replFact2.first == "satellite") { if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server // satellites => distribute to every server
auto available = Job::availableServers(_snapshot); auto available = Job::availableServers(_snapshot);
replicationFactor = Job::countGoodServersInList(_snapshot, available); replicationFactor = Job::countGoodOrBadServersInList(_snapshot, available);
} else { } else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION) LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson(); << "no replicationFactor entry in " << col.toJson();
@ -1298,7 +1320,7 @@ void Supervision::enforceReplication() {
} }
} }
size_t actualReplicationFactor size_t actualReplicationFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice()); = 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
// leader plus GOOD or BAD followers // leader plus GOOD or BAD followers
size_t apparentReplicationFactor = shard.slice().length(); size_t apparentReplicationFactor = shard.slice().length();
@ -1334,11 +1356,17 @@ void Supervision::enforceReplication() {
AddFollower(_snapshot, _agent, std::to_string(_jobId++), AddFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first) "supervision", db_.first, col_.first, shard_.first)
.create(); .create();
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
} else if (apparentReplicationFactor > replicationFactor && } else if (apparentReplicationFactor > replicationFactor &&
actualReplicationFactor >= replicationFactor) { actualReplicationFactor >= replicationFactor) {
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++), RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first) "supervision", db_.first, col_.first, shard_.first)
.create(); .create();
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
} }
} }
} }