mirror of https://gitee.com/bigwinds/arangodb
Make addfollower jobs less aggressive. (#8490)
* Make addfollower jobs less aggressive. * CHANGELOG.
This commit is contained in:
parent
59ad583796
commit
55706e3c74
|
@ -346,6 +346,8 @@ devel
|
||||||
|
|
||||||
* fix log spam in agency supervision when leader resigned
|
* fix log spam in agency supervision when leader resigned
|
||||||
|
|
||||||
|
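* make AddFollower less aggressive: followers in status BAD now count as existing replicas, and supervision keeps at most 15 AddFollower/RemoveFollower jobs in ToDo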
|
||||||
|
|
||||||
v3.4.1 (XXXX-XX-XX)
|
v3.4.1 (XXXX-XX-XX)
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|
|
@ -155,7 +155,7 @@ bool AddFollower::start(bool&) {
|
||||||
if (replFact2.second && replFact2.first == "satellite") {
|
if (replFact2.second && replFact2.first == "satellite") {
|
||||||
// satellites => distribute to every server
|
// satellites => distribute to every server
|
||||||
auto available = Job::availableServers(_snapshot);
|
auto available = Job::availableServers(_snapshot);
|
||||||
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
|
desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -171,7 +171,7 @@ bool AddFollower::start(bool&) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
size_t actualReplFactor
|
size_t actualReplFactor
|
||||||
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
|
= 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
|
||||||
// Leader plus good followers in plan
|
// Leader plus GOOD or BAD followers in plan
|
||||||
if (actualReplFactor >= desiredReplFactor) {
|
if (actualReplFactor >= desiredReplFactor) {
|
||||||
finish("", "", true, "job no longer necessary, have enough replicas");
|
finish("", "", true, "job no longer necessary, have enough replicas");
|
||||||
|
|
|
@ -227,8 +227,8 @@ std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclud
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following counts in a given server list how many of the servers are
|
// The following counts in a given server list how many of the servers are
|
||||||
// in Status "GOOD".
|
// in Status "GOOD" or "BAD".
|
||||||
size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) {
|
size_t Job::countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList) {
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
if (!serverList.isArray()) {
|
if (!serverList.isArray()) {
|
||||||
// No array, strange, return 0
|
// No array, strange, return 0
|
||||||
|
@ -248,7 +248,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
|
||||||
// Only check if found
|
// Only check if found
|
||||||
std::shared_ptr<Node> healthNode = it->second;
|
std::shared_ptr<Node> healthNode = it->second;
|
||||||
// Check its status:
|
// Check its status:
|
||||||
if (healthNode->hasAsString("Status").first == "GOOD") {
|
auto status = healthNode->hasAsString("Status");
|
||||||
|
if (status.first == "GOOD" || status.first == "BAD") {
|
||||||
++count;
|
++count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -259,8 +260,8 @@ size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverLis
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following counts in a given server list how many of the servers are
|
// The following counts in a given server list how many of the servers are
|
||||||
// in Status "GOOD".
|
// in Status "GOOD" or "BAD".
|
||||||
size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList) {
|
size_t Job::countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList) {
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
auto const& health = snap.hasAsChildren(healthPrefix);
|
auto const& health = snap.hasAsChildren(healthPrefix);
|
||||||
// Do we have a Health substructure?
|
// Do we have a Health substructure?
|
||||||
|
@ -273,7 +274,8 @@ size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> co
|
||||||
// Only check if found
|
// Only check if found
|
||||||
std::shared_ptr<Node> healthNode = it->second;
|
std::shared_ptr<Node> healthNode = it->second;
|
||||||
// Check its status:
|
// Check its status:
|
||||||
if (healthNode->hasAsString("Status").first == "GOOD") {
|
auto status = healthNode->hasAsString("Status");
|
||||||
|
if (status.first == "GOOD" || status.first == "BAD") {
|
||||||
++count;
|
++count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -125,8 +125,8 @@ struct Job {
|
||||||
static std::string randomIdleAvailableServer(Node const& snap,
|
static std::string randomIdleAvailableServer(Node const& snap,
|
||||||
std::vector<std::string> const& exclude);
|
std::vector<std::string> const& exclude);
|
||||||
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
|
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
|
||||||
static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList);
|
static size_t countGoodOrBadServersInList(Node const& snap, VPackSlice const& serverList);
|
||||||
static size_t countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList);
|
static size_t countGoodOrBadServersInList(Node const& snap, std::vector<std::string> const& serverList);
|
||||||
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
|
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
|
||||||
|
|
||||||
/// @brief Get servers from plan, which are not failed or cleaned out
|
/// @brief Get servers from plan, which are not failed or cleaned out
|
||||||
|
|
|
@ -158,7 +158,7 @@ bool RemoveFollower::start(bool&) {
|
||||||
if (replFact2.second && replFact2.first == "satellite") {
|
if (replFact2.second && replFact2.first == "satellite") {
|
||||||
// satellites => distribute to every server
|
// satellites => distribute to every server
|
||||||
auto available = Job::availableServers(_snapshot);
|
auto available = Job::availableServers(_snapshot);
|
||||||
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
|
desiredReplFactor = Job::countGoodOrBadServersInList(_snapshot, available);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1257,6 +1257,28 @@ void Supervision::readyOrphanedIndexCreations() {
|
||||||
|
|
||||||
void Supervision::enforceReplication() {
|
void Supervision::enforceReplication() {
|
||||||
_lock.assertLockedByCurrentThread();
|
_lock.assertLockedByCurrentThread();
|
||||||
|
|
||||||
|
// First check the number of AddFollower and RemoveFollower jobs in ToDo:
|
||||||
|
// We always maintain that we have at most maxNrAddRemoveJobsInTodo
|
||||||
|
// AddFollower or RemoveFollower jobs in ToDo. These are all long-term
|
||||||
|
// cleanup jobs, so they can be done over time. This is to ensure that
|
||||||
|
// there is no overload on the Agency job system. Therefore, if this
|
||||||
|
// number is at least maxNrAddRemoveJobsInTodo, we skip the rest of
|
||||||
|
// the function:
|
||||||
|
int const maxNrAddRemoveJobsInTodo = 15;
|
||||||
|
|
||||||
|
auto todos = _snapshot.hasAsChildren(toDoPrefix).first;
|
||||||
|
int nrAddRemoveJobsInTodo = 0;
|
||||||
|
for (auto it = todos.begin(); it != todos.end(); ++it) {
|
||||||
|
auto jobNode = *(it->second);
|
||||||
|
auto t = jobNode.hasAsString("type");
|
||||||
|
if (t.second && (t.first == "addFollower" || t.first == "removeFollower")) {
|
||||||
|
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first;
|
auto const& plannedDBs = _snapshot.hasAsChildren(planColPrefix).first;
|
||||||
|
|
||||||
for (const auto& db_ : plannedDBs) { // Planned databases
|
for (const auto& db_ : plannedDBs) { // Planned databases
|
||||||
|
@ -1273,7 +1295,7 @@ void Supervision::enforceReplication() {
|
||||||
if (replFact2.second && replFact2.first == "satellite") {
|
if (replFact2.second && replFact2.first == "satellite") {
|
||||||
// satellites => distribute to every server
|
// satellites => distribute to every server
|
||||||
auto available = Job::availableServers(_snapshot);
|
auto available = Job::availableServers(_snapshot);
|
||||||
replicationFactor = Job::countGoodServersInList(_snapshot, available);
|
replicationFactor = Job::countGoodOrBadServersInList(_snapshot, available);
|
||||||
} else {
|
} else {
|
||||||
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
|
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
|
||||||
<< "no replicationFactor entry in " << col.toJson();
|
<< "no replicationFactor entry in " << col.toJson();
|
||||||
|
@ -1298,7 +1320,7 @@ void Supervision::enforceReplication() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
size_t actualReplicationFactor
|
size_t actualReplicationFactor
|
||||||
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
|
= 1 + Job::countGoodOrBadServersInList(_snapshot, onlyFollowers.slice());
|
||||||
// leader plus GOOD followers
|
// leader plus GOOD or BAD followers
|
||||||
size_t apparentReplicationFactor = shard.slice().length();
|
size_t apparentReplicationFactor = shard.slice().length();
|
||||||
|
|
||||||
|
@ -1334,11 +1356,17 @@ void Supervision::enforceReplication() {
|
||||||
AddFollower(_snapshot, _agent, std::to_string(_jobId++),
|
AddFollower(_snapshot, _agent, std::to_string(_jobId++),
|
||||||
"supervision", db_.first, col_.first, shard_.first)
|
"supervision", db_.first, col_.first, shard_.first)
|
||||||
.create();
|
.create();
|
||||||
|
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
} else if (apparentReplicationFactor > replicationFactor &&
|
} else if (apparentReplicationFactor > replicationFactor &&
|
||||||
actualReplicationFactor >= replicationFactor) {
|
actualReplicationFactor >= replicationFactor) {
|
||||||
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
|
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
|
||||||
"supervision", db_.first, col_.first, shard_.first)
|
"supervision", db_.first, col_.first, shard_.first)
|
||||||
.create();
|
.create();
|
||||||
|
if (++nrAddRemoveJobsInTodo >= maxNrAddRemoveJobsInTodo) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue