1
0
Fork 0

Make addfollower jobs less aggressive. (#8490)

* Make addfollower jobs less aggressive.
* CHANGELOG.
This commit is contained in:
Max Neunhöffer 2019-03-21 15:24:31 +01:00 committed by GitHub
parent 59ad583796
commit 55706e3c74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 13 deletions

View File

@ -346,6 +346,8 @@ devel
* fix log spam in agency supervision when leader resigned
* make AddFollower less aggressive
v3.4.1 (XXXX-XX-XX)
-------------------

View File

@ -155,7 +155,7 @@ bool AddFollower::start(bool&) {
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
}
}
@ -171,7 +171,7 @@ bool AddFollower::start(bool&) {
}
}
size_t actualReplFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
= 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
// Leader plus GOOD or BAD followers in plan
if (actualReplFactor >= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have enough replicas");

View File

@ -227,8 +227,8 @@ std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclud
}
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) {
// in Status "GOOD" or "BAD".
size_t Job::countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList) {
size_t count = 0;
if (!serverList.isArray()) {
// No array, strange, return 0
@ -248,7 +248,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
auto status = healthNode->hasAsString("Status");
if (status.first == "GOOD" || status.first == "BAD") {
++count;
}
}
@ -259,8 +260,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
}
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList) {
// in Status "GOOD" or "BAD".
size_t Job::countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList) {
size_t count = 0;
auto const& health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure?
@ -273,7 +274,8 @@ size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> co
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
auto status = healthNode->hasAsString("Status");
if (status.first == "GOOD" || status.first == "BAD") {
++count;
}
}

View File

@ -125,8 +125,8 @@ struct Job {
static std::string randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude);
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList);
static size_t countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList);
static size_t countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList);
static size_t countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList);
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
/// @brief Get servers from plan, which are not failed or cleaned out

View File

@ -158,7 +158,7 @@ bool RemoveFollower::start(bool&) {
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
}
}

View File

@ -1257,6 +1257,28 @@ void Supervision::readyOrphanedIndexCreations() {
void Supervision::enforceReplication() {
_lock.assertLockedByCurrentThread();
// First check the number of AddFollower and RemoveFollower jobs in ToDo:
// We always maintain that we have at most maxNrAddRemoveJobsInTodo
// AddFollower or RemoveFollower jobs in ToDo. These are all long-term
// cleanup jobs, so they can be done over time. This is to ensure that
// there is no overload on the Agency job system. Therefore, if this
// number is at least maxNrAddRemoveJobsInTodo, we skip the rest of
// the function:
int const maxNrAddRemoveJobsInTodo = 15;
auto todos = _snapshot.hasAsChildren(toDoPrefix).first;
int nrAddRemoveJobsInTodo = 0;
for (auto it = todos.begin(); it != todos.end(); ++it) {
auto jobNode = *(it->second);
auto t = jobNode.hasAsString("type");
if (t.second && (t.first == "addFollower" || t.first == "removeFollower")) {
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
}
}
auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first;
for (const auto& db_ : plannedDBs) { // Planned databases
@ -1273,7 +1295,7 @@ void Supervision::enforceReplication() {
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
replicationFactor = Job::countGoodServersInList(_snapshot, available);
replicationFactor = Job::countGoodOrBadServersInList(_snapshot, available);
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson();
@ -1298,7 +1320,7 @@ void Supervision::enforceReplication() {
}
}
size_t actualReplicationFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
= 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
// leader plus GOOD or BAD followers
size_t apparentReplicationFactor = shard.slice().length();
@ -1334,11 +1356,17 @@ void Supervision::enforceReplication() {
AddFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.create();
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
} else if (apparentReplicationFactor > replicationFactor &&
actualReplicationFactor >= replicationFactor) {
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.create();
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
return;
}
}
}
}