mirror of https://gitee.com/bigwinds/arangodb
Various agency improvements. (#8380)
* Ignore satellite collections in shrinkCluster in the agency.
* Abort the RemoveFollower job if there are not enough in-sync followers or the leader has failed.
* Break the quick wait loop in supervision if leadership is lost.
* In case of a resigned leader, set isReady=false in clusterInventory.
* Fix catch tests.
This commit is contained in:
parent
30adf5e2d9
commit
2a4f606df2
|
@ -235,6 +235,7 @@ bool RemoveFollower::start(bool&) {
|
||||||
<< " does not have a leader that has confirmed leadership, waiting, "
|
<< " does not have a leader that has confirmed leadership, waiting, "
|
||||||
"jobId="
|
"jobId="
|
||||||
<< _jobId;
|
<< _jobId;
|
||||||
|
finish("", "", false, "job no longer sensible, leader has gone bad");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -245,7 +246,8 @@ bool RemoveFollower::start(bool&) {
|
||||||
<< " does not have enough in sync followers to remove one, waiting, "
|
<< " does not have enough in sync followers to remove one, waiting, "
|
||||||
"jobId="
|
"jobId="
|
||||||
<< _jobId;
|
<< _jobId;
|
||||||
return false;
|
finish("", "", false, "job no longer sensible, do not have few enough replicas");
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We now know actualReplFactor >= inSyncCount + noGoodCount and
|
// We now know actualReplFactor >= inSyncCount + noGoodCount and
|
||||||
|
|
|
@ -862,8 +862,9 @@ void Supervision::run() {
|
||||||
}
|
}
|
||||||
|
|
||||||
auto result = _agent->waitFor(leaderIndex);
|
auto result = _agent->waitFor(leaderIndex);
|
||||||
if (result == Agent::raft_commit_t::UNKNOWN ||
|
if (result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
|
||||||
result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
|
// Note that we can get UNKNOWN if we have lost leadership or
|
||||||
|
// if we are shutting down. In both cases we just leave the loop.
|
||||||
LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
|
LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
|
||||||
continue;
|
continue;
|
||||||
} else { // Good we can continue
|
} else { // Good we can continue
|
||||||
|
@ -1457,11 +1458,8 @@ void Supervision::shrinkCluster() {
|
||||||
if (replFact > maxReplFact) {
|
if (replFact > maxReplFact) {
|
||||||
maxReplFact = replFact;
|
maxReplFact = replFact;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
LOG_TOPIC(WARN, Logger::SUPERVISION)
|
|
||||||
<< "Cannot retrieve replication factor for collection " << collptr.first;
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
// Note that this could be a satellite collection, in any case, ignore:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -699,7 +699,8 @@ void RestReplicationHandler::handleCommandClusterInventory() {
|
||||||
for (auto const& p : *shardMap) {
|
for (auto const& p : *shardMap) {
|
||||||
auto currentServerList = cic->servers(p.first /* shardId */);
|
auto currentServerList = cic->servers(p.first /* shardId */);
|
||||||
if (currentServerList.size() == 0 || p.second.size() == 0 ||
|
if (currentServerList.size() == 0 || p.second.size() == 0 ||
|
||||||
currentServerList[0] != p.second[0]) {
|
currentServerList[0] != p.second[0] ||
|
||||||
|
(!p.second[0].empty() && p.second[0][0] == '_')) {
|
||||||
isReady = false;
|
isReady = false;
|
||||||
}
|
}
|
||||||
if (!ClusterHelpers::compareServerLists(p.second, currentServerList)) {
|
if (!ClusterHelpers::compareServerLists(p.second, currentServerList)) {
|
||||||
|
|
|
@ -399,7 +399,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
|
||||||
REQUIRE(typeName(q->slice()) == "array");
|
REQUIRE(typeName(q->slice()) == "array");
|
||||||
REQUIRE(q->slice().length() == 1);
|
REQUIRE(q->slice().length() == 1);
|
||||||
REQUIRE(typeName(q->slice()[0]) == "array");
|
REQUIRE(typeName(q->slice()[0]) == "array");
|
||||||
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
|
REQUIRE(q->slice()[0].length() == 2); // precondition
|
||||||
REQUIRE(typeName(q->slice()[0][0]) == "object");
|
REQUIRE(typeName(q->slice()[0][0]) == "object");
|
||||||
|
|
||||||
auto writes = q->slice()[0][0];
|
auto writes = q->slice()[0][0];
|
||||||
|
@ -407,8 +407,12 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
|
||||||
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
|
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
|
||||||
CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
|
CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
|
||||||
CHECK(writes.get("/arango/Target/Finished/1").get("collection").copyString() == COLLECTION);
|
CHECK(writes.get("/arango/Target/Finished/1").get("collection").copyString() == COLLECTION);
|
||||||
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
|
|
||||||
CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "none");
|
CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "none");
|
||||||
|
|
||||||
|
auto precond = q->slice()[0][1];
|
||||||
|
REQUIRE(typeName(precond) == "object");
|
||||||
|
REQUIRE(typeName(precond.get("/arango/Supervision/Health/follower1/Status")) == "object");
|
||||||
|
|
||||||
return fakeWriteResult;
|
return fakeWriteResult;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
|
@ -11,7 +11,7 @@ R"=(
|
||||||
},
|
},
|
||||||
"collection2": {
|
"collection2": {
|
||||||
"s2": {
|
"s2": {
|
||||||
"servers": ["leader"]
|
"servers": ["leader", "follower1"]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"collection3": {
|
"collection3": {
|
||||||
|
|
Loading…
Reference in New Issue