
Various agency improvements. (#8380)

* Ignore satellite collections in shrinkCluster in agency.
* Abort RemoveFollower job if there are not enough in-sync followers or the leader has failed.
* Break quick wait loop in supervision if leadership is lost.
* In case of resigned leader, set isReady=false in clusterInventory.
* Fix catch tests.
Max Neunhöffer 2019-03-12 15:25:16 +01:00 committed by GitHub
parent 30adf5e2d9
commit 2a4f606df2
5 changed files with 16 additions and 11 deletions


@@ -235,6 +235,7 @@ bool RemoveFollower::start(bool&) {
         << " does not have a leader that has confirmed leadership, waiting, "
            "jobId="
         << _jobId;
+    finish("", "", false, "job no longer sensible, leader has gone bad");
     return false;
   }
@@ -245,7 +246,8 @@ bool RemoveFollower::start(bool&) {
         << " does not have enough in sync followers to remove one, waiting, "
            "jobId="
         << _jobId;
-    return false;
+    finish("", "", false, "job no longer sensible, do not have few enough replicas");
+    return true;
   }
   // We now know actualReplFactor >= inSyncCount + noGoodCount and

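The effect of these two hunks: instead of returning false and retrying forever, RemoveFollower::start now finishes the job with a reason once it can no longer succeed. Below is a minimal, self-contained sketch of that wait-versus-abort pattern; the Job type, its parameters, and the simplified finish() signature are invented for illustration and only mirror the structure visible in the diff above.

#include <iostream>
#include <string>

struct Job {
  std::string id;

  // Record a terminal state for the job; success=false means it was aborted.
  void finish(bool success, std::string const& reason) {
    std::cout << "job " << id << (success ? " finished: " : " aborted: ")
              << reason << std::endl;
  }

  // Returns true if the job reached a decision in this round.
  bool start(bool leaderConfirmed, bool enoughInSyncFollowers) {
    if (!leaderConfirmed) {
      // New behaviour: abort instead of waiting for a leader that is gone.
      finish(false, "job no longer sensible, leader has gone bad");
      return false;
    }
    if (!enoughInSyncFollowers) {
      // New behaviour: abort instead of retrying until enough followers
      // are in sync, which may never happen.
      finish(false, "job no longer sensible, not enough in-sync followers");
      return true;
    }
    return true;  // safe to pick a follower and remove it
  }
};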

@@ -862,8 +862,9 @@ void Supervision::run() {
       }
       auto result = _agent->waitFor(leaderIndex);
-      if (result == Agent::raft_commit_t::UNKNOWN ||
-          result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
+      if (result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
+        // Note that we can get UNKNOWN if we have lost leadership or
+        // if we are shutting down. In both cases we just leave the loop.
         LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
         continue;
       } else { // Good we can continue
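
In short: TIMEOUT still means "keep waiting", while UNKNOWN now means "leadership lost or shutting down", so the loop is left instead of spinning. A rough standalone sketch of that control flow; CommitResult and supervisionLoop are made-up names standing in for Agent::raft_commit_t and the loop inside Supervision::run.

#include <functional>
#include <iostream>

enum class CommitResult { OK, TIMEOUT, UNKNOWN };

// One waiting phase of a supervision-style loop: retry on TIMEOUT,
// bail out on UNKNOWN, proceed on OK.
void supervisionLoop(std::function<CommitResult()> const& waitForCommit) {
  while (true) {
    CommitResult result = waitForCommit();
    if (result == CommitResult::TIMEOUT) {
      std::cout << "Waiting for commits to be done ...\n";
      continue;  // still leader, commits are just slow: keep waiting
    }
    if (result == CommitResult::UNKNOWN) {
      break;     // lost leadership or shutting down: leave the loop quickly
    }
    // result == CommitResult::OK: run one supervision round here, then loop
    break;       // (sketch only: stop after one successful round)
  }
}
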
@@ -1457,11 +1458,8 @@ void Supervision::shrinkCluster() {
         if (replFact > maxReplFact) {
           maxReplFact = replFact;
         }
-      } else {
-        LOG_TOPIC(WARN, Logger::SUPERVISION)
-            << "Cannot retrieve replication factor for collection " << collptr.first;
-        return;
       }
+      // Note that this could be a satellite collection, in any case, ignore:
     }
   }

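The net effect: a collection without a readable replicationFactor (for example a satellite collection) no longer aborts the whole shrinkCluster computation; it is simply skipped. A small sketch of that pattern with an invented data layout, where std::optional stands in for "the key may be absent":

#include <algorithm>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

// Compute the maximum replication factor over all collections, ignoring
// entries that do not carry one (e.g. satellite collections) instead of
// giving up on the first such entry.
uint64_t maxReplicationFactor(
    std::map<std::string, std::optional<uint64_t>> const& collections) {
  uint64_t maxReplFact = 1;
  for (auto const& it : collections) {
    if (it.second) {
      maxReplFact = std::max(maxReplFact, *it.second);
    }
    // else: could be a satellite collection, in any case ignore it
  }
  return maxReplFact;
}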

@@ -699,7 +699,8 @@ void RestReplicationHandler::handleCommandClusterInventory() {
   for (auto const& p : *shardMap) {
     auto currentServerList = cic->servers(p.first /* shardId */);
     if (currentServerList.size() == 0 || p.second.size() == 0 ||
-        currentServerList[0] != p.second[0]) {
+        currentServerList[0] != p.second[0] ||
+        (!p.second[0].empty() && p.second[0][0] == '_')) {
       isReady = false;
     }
     if (!ClusterHelpers::compareServerLists(p.second, currentServerList)) {

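This is the "resigned leader" case from the commit message: a resigned leader is encoded by prefixing its server id with an underscore, and such a shard must not be reported as ready. A standalone sketch of the extended predicate; shardIsReady is an invented helper name, but the condition mirrors the hunk above.

#include <string>
#include <vector>

// A shard counts as ready only if both server lists have a leader, the
// leaders agree, and the planned leader has not resigned (a resigned
// leader carries a leading '_' in its server id).
bool shardIsReady(std::vector<std::string> const& plannedServers,
                  std::vector<std::string> const& currentServers) {
  if (currentServers.empty() || plannedServers.empty() ||
      currentServers[0] != plannedServers[0] ||
      (!plannedServers[0].empty() && plannedServers[0][0] == '_')) {
    return false;
  }
  return true;
}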

@@ -399,7 +399,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
       REQUIRE(typeName(q->slice()) == "array");
       REQUIRE(q->slice().length() == 1);
       REQUIRE(typeName(q->slice()[0]) == "array");
-      REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
+      REQUIRE(q->slice()[0].length() == 2); // precondition
       REQUIRE(typeName(q->slice()[0][0]) == "object");
       auto writes = q->slice()[0][0];
@@ -407,8 +407,12 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
       REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
       CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
       CHECK(writes.get("/arango/Target/Finished/1").get("collection").copyString() == COLLECTION);
       CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
       CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "none");
+      auto precond = q->slice()[0][1];
+      REQUIRE(typeName(precond) == "object");
+      REQUIRE(typeName(precond.get("/arango/Supervision/Health/follower1/Status")) == "object");
       return fakeWriteResult;
     }
   );

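What the updated assertions encode: the agency transaction produced by RemoveFollower is no longer a bare write, it is a [write, precondition] pair, and the precondition refers to the health status of the follower about to be removed. The raw string below is illustrative only, in the same style as the test fixtures; the keys mirror the paths checked above, while the precondition value is a placeholder, not taken from the test.

// Illustrative envelope shape: writes at [0][0], precondition at [0][1].
char const* exampleEnvelope = R"=([
  [
    {
      "/arango/Target/ToDo/1":    { "op": "delete" },
      "/arango/Target/Pending/1": { "op": "delete" }
    },
    {
      "/arango/Supervision/Health/follower1/Status": { "old": "..." }
    }
  ]
])=";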

@@ -11,7 +11,7 @@ R"=(
   },
   "collection2": {
     "s2": {
-      "servers": ["leader"]
+      "servers": ["leader", "follower1"]
     }
   },
   "collection3": {