1
0
Fork 0

Various agency improvements. (#8380)

* Ignore satellite collections in shrinkCluster in agency.
* Abort the RemoveFollower job if there are not enough in-sync followers or the leader has failed.
* Break quick wait loop in supervision if leadership is lost.
* In case of resigned leader, set isReady=false in clusterInventory.
* Fix catch tests.
This commit is contained in:
Max Neunhöffer 2019-03-12 15:25:16 +01:00 committed by GitHub
parent 30adf5e2d9
commit 2a4f606df2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 16 additions and 11 deletions

View File

@ -235,6 +235,7 @@ bool RemoveFollower::start(bool&) {
<< " does not have a leader that has confirmed leadership, waiting, " << " does not have a leader that has confirmed leadership, waiting, "
"jobId=" "jobId="
<< _jobId; << _jobId;
finish("", "", false, "job no longer sensible, leader has gone bad");
return false; return false;
} }
@ -245,7 +246,8 @@ bool RemoveFollower::start(bool&) {
<< " does not have enough in sync followers to remove one, waiting, " << " does not have enough in sync followers to remove one, waiting, "
"jobId=" "jobId="
<< _jobId; << _jobId;
return false; finish("", "", false, "job no longer sensible, do not have few enough replicas");
return true;
} }
// We now know actualReplFactor >= inSyncCount + noGoodCount and // We now know actualReplFactor >= inSyncCount + noGoodCount and

View File

@ -862,8 +862,9 @@ void Supervision::run() {
} }
auto result = _agent->waitFor(leaderIndex); auto result = _agent->waitFor(leaderIndex);
if (result == Agent::raft_commit_t::UNKNOWN || if (result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
result == Agent::raft_commit_t::TIMEOUT) { // Oh snap // Note that we can get UNKNOWN if we have lost leadership or
// if we are shutting down. In both cases we just leave the loop.
LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... "; LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
continue; continue;
} else { // Good we can continue } else { // Good we can continue
@ -1457,11 +1458,8 @@ void Supervision::shrinkCluster() {
if (replFact > maxReplFact) { if (replFact > maxReplFact) {
maxReplFact = replFact; maxReplFact = replFact;
} }
} else {
LOG_TOPIC(WARN, Logger::SUPERVISION)
<< "Cannot retrieve replication factor for collection " << collptr.first;
return;
} }
// Note that this could be a satellite collection, in any case, ignore:
} }
} }

View File

@ -699,7 +699,8 @@ void RestReplicationHandler::handleCommandClusterInventory() {
for (auto const& p : *shardMap) { for (auto const& p : *shardMap) {
auto currentServerList = cic->servers(p.first /* shardId */); auto currentServerList = cic->servers(p.first /* shardId */);
if (currentServerList.size() == 0 || p.second.size() == 0 || if (currentServerList.size() == 0 || p.second.size() == 0 ||
currentServerList[0] != p.second[0]) { currentServerList[0] != p.second[0] ||
(!p.second[0].empty() && p.second[0][0] == '_')) {
isReady = false; isReady = false;
} }
if (!ClusterHelpers::compareServerLists(p.second, currentServerList)) { if (!ClusterHelpers::compareServerLists(p.second, currentServerList)) {

View File

@ -399,7 +399,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
REQUIRE(typeName(q->slice()) == "array"); REQUIRE(typeName(q->slice()) == "array");
REQUIRE(q->slice().length() == 1); REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array"); REQUIRE(typeName(q->slice()[0]) == "array");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions... REQUIRE(q->slice()[0].length() == 2); // precondition
REQUIRE(typeName(q->slice()[0][0]) == "object"); REQUIRE(typeName(q->slice()[0][0]) == "object");
auto writes = q->slice()[0][0]; auto writes = q->slice()[0][0];
@ -407,8 +407,12 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string"); REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete"); CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
CHECK(writes.get("/arango/Target/Finished/1").get("collection").copyString() == COLLECTION); CHECK(writes.get("/arango/Target/Finished/1").get("collection").copyString() == COLLECTION);
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "none"); CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "none");
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond) == "object");
REQUIRE(typeName(precond.get("/arango/Supervision/Health/follower1/Status")) == "object");
return fakeWriteResult; return fakeWriteResult;
} }
); );

View File

@ -11,7 +11,7 @@ R"=(
}, },
"collection2": { "collection2": {
"s2": { "s2": {
"servers": ["leader"] "servers": ["leader", "follower1"]
} }
}, },
"collection3": { "collection3": {