
This ports from 3.4 a cleanup of the Current entry when a follower is removed from the Plan. (#6718)

* Fix cleanup of the Current entry in case a follower is removed from the Plan. (#6623)
* Properly remove unplanned followers on the leader and in Current (see the sketch below).
* Add a Catch test.
* Fix tests.
* Fix a bug with a temporary object.
* Protect against an exception when getCollection does not find the collection.
* New Maintenance test data.
Max Neunhöffer 2018-10-09 15:29:42 +02:00 committed by GitHub
parent 2452dcc5d0
commit 79bade7e6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 7290 additions and 7002 deletions
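The core of the change is a set difference: take the followers that are currently in sync on the leader, remove every server the Plan still lists for the shard, and pass whatever remains to the leader as a comma-separated followersToDrop value. A minimal standalone sketch of that computation, using plain standard containers instead of the VelocyPack slices in the hunks below (the function name is illustrative):

#include <string>
#include <unordered_set>
#include <vector>

// Followers that are in sync locally but no longer listed for the shard in
// the Plan are returned as a comma-separated string, matching the format of
// the followersToDrop value carried by the UpdateCollection action below.
std::string computeFollowersToDrop(std::vector<std::string> const& inSyncFollowers,
                                   std::vector<std::string> const& planServers) {
  std::unordered_set<std::string> toDrop(inSyncFollowers.begin(), inSyncFollowers.end());
  for (auto const& p : planServers) {
    toDrop.erase(p);  // anything still planned must not be dropped
  }
  std::string result;
  for (auto const& r : toDrop) {
    if (!result.empty()) {
      result.push_back(',');
    }
    result += r;
  }
  return result;
}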

View File

@@ -89,8 +89,20 @@ Result DBServerAgencySync::getLocalCollections(VPackBuilder& collections) {
collections.add(VPackValue(colname));
VPackObjectBuilder col(&collections);
collection->toVelocyPack(collections,true,false);
collections.add(
"theLeader", VPackValue(collection->followers()->getLeader()));
auto const& folls = collection->followers();
auto const theLeader = folls->getLeader();
collections.add("theLeader", VPackValue(theLeader));
if (theLeader.empty()) { // we are the leader ourselves
// In this case we report our in-sync followers here in the format
// of the agency: [ leader, follower1, follower2, ... ]
collections.add(VPackValue("servers"));
{ VPackArrayBuilder guard(&collections);
collections.add(VPackValue(arangodb::ServerState::instance()->getId()));
for (auto const& s : *folls->get()) {
collections.add(VPackValue(s));
}
}
}
}
}
} catch (std::exception const& e) {
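For reference, the "servers" entry that a leader now adds per shard uses the agency format [ leader, follower1, follower2, ... ]. A minimal sketch of just that part, reusing the VPackBuilder calls from the hunk above (serverId and inSyncFollowers are illustrative parameters):

#include <string>
#include <vector>
#include <velocypack/Builder.h>
#include <velocypack/velocypack-aliases.h>

// Writes "servers": [ <own id>, <in-sync follower>, ... ] into the builder,
// which must currently have an open object.
void addServersEntry(VPackBuilder& collections, std::string const& serverId,
                     std::vector<std::string> const& inSyncFollowers) {
  collections.add(VPackValue("servers"));
  VPackArrayBuilder guard(&collections);   // opens and later closes the array
  collections.add(VPackValue(serverId));   // the leader itself comes first
  for (auto const& s : inSyncFollowers) {
    collections.add(VPackValue(s));        // then every in-sync follower
  }
}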

View File

@@ -205,18 +205,54 @@ void handlePlanShard(
auto fullShardLabel = dbname + "/" + colname + "/" + shname;
// Check if there is some in-sync-follower which is no longer in the Plan:
std::string followersToDropString;
if (leading && shouldBeLeading) {
VPackSlice shards = cprops.get("shards");
if (shards.isObject()) {
VPackSlice planServers = shards.get(shname);
if (planServers.isArray()) {
VPackSlice inSyncFollowers = lcol.get("servers");
if (inSyncFollowers.isArray()) {
// Now we have two server lists, we are looking for a server
// which does not occur in the plan, but is in the followers
// at an index > 0:
std::unordered_set<std::string> followersToDrop;
for (auto const& q : VPackArrayIterator(inSyncFollowers)) {
followersToDrop.insert(q.copyString());
}
for (auto const& p : VPackArrayIterator(planServers)) {
if (p.isString()) {
followersToDrop.erase(p.copyString());
}
}
// Everything remaining in followersToDrop is something we
// need to act on
for (auto const& r : followersToDrop) {
if (!followersToDropString.empty()) {
followersToDropString.push_back(',');
}
followersToDropString += r;
}
}
}
}
}
// If comparison has brought any updates
if (properties->slice() != VPackSlice::emptyObjectSlice()
|| leading != shouldBeLeading) {
|| leading != shouldBeLeading || !followersToDropString.empty()) {
if (errors.shards.find(fullShardLabel) ==
errors.shards.end()) {
actions.emplace_back(
ActionDescription(
{{NAME, UPDATE_COLLECTION}, {DATABASE, dbname}, {COLLECTION, colname},
{SHARD, shname}, {THE_LEADER, shouldBeLeading ? std::string() : leaderId},
{SERVER_ID, serverId}, {LOCAL_LEADER, lcol.get(THE_LEADER).copyString()}},
properties));
std::map<std::string,std::string> {
{NAME, UPDATE_COLLECTION}, {DATABASE, dbname}, {COLLECTION, colname},
{SHARD, shname}, {THE_LEADER, shouldBeLeading ? std::string() : leaderId},
{SERVER_ID, serverId}, {LOCAL_LEADER, lcol.get(THE_LEADER).copyString()},
{FOLLOWERS_TO_DROP, followersToDropString}},
properties));
} else {
LOG_TOPIC(DEBUG, Logger::MAINTENANCE)
<< "Previous failure exists for local shard " << dbname
@@ -688,14 +724,10 @@ static VPackBuilder assembleLocalCollectionInfo(
// planServers may be `none` in the case that the shard is not contained
// in Plan, but in local.
if (planServers.isArray()) {
auto current = *(collection->followers()->get());
// This method is only called when we are the leader for that shard,
// hence we are not contained in `current`, i.e. followers.
for (auto const& server : VPackArrayIterator(planServers)) {
if (std::find(current.begin(), current.end(), server.copyString())
!= current.end()) {
ret.add(server);
}
std::shared_ptr<std::vector<std::string> const> current
= collection->followers()->get();
for (auto const& server : *current) {
ret.add(VPackValue(server));
}
}
}
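Also in this file, an UpdateCollection action is now scheduled not only when properties or leadership change but also when the computed followersToDrop string is non-empty, and the string is carried inside the action description. A simplified sketch of that trigger (SimpleAction reduces ActionDescription to a plain map; the keys shown are illustrative):

#include <map>
#include <string>
#include <vector>

using SimpleAction = std::map<std::string, std::string>;  // stand-in for ActionDescription

void maybeScheduleUpdateCollection(bool propertiesChanged, bool leadershipChanged,
                                   std::string const& followersToDrop,
                                   std::vector<SimpleAction>& actions) {
  // Any of the three conditions schedules an UpdateCollection action; the
  // followersToDrop list travels with the action even when it is empty.
  if (propertiesChanged || leadershipChanged || !followersToDrop.empty()) {
    actions.push_back({{"name", "UpdateCollection"},
                       {"followersToDrop", followersToDrop}});
  }
}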

View File

@@ -42,6 +42,7 @@ constexpr char const* EDGE = "edge";
constexpr char const* ENSURE_INDEX = "EnsureIndex";
constexpr char const* FIELDS = "fields";
constexpr char const* FOLLOWER_ID = "followerId";
constexpr char const* FOLLOWERS_TO_DROP = "followersToDrop";
constexpr char const* GLOB_UID = "globallyUniqueId";
constexpr char const* ID = "id";
constexpr char const* INDEX = "index";

View File

@@ -679,8 +679,21 @@ bool SynchronizeShard::first() {
return false;
}
std::shared_ptr<LogicalCollection> ci =
clusterInfo->getCollection(database, planId);
std::shared_ptr<LogicalCollection> ci;
try { // ci->getCollection can throw
ci = clusterInfo->getCollection(database, planId);
} catch(...) {
auto const endTime = system_clock::now();
std::stringstream msg;
msg << "exception in getCollection, " << database << "/"
<< shard << ", " << database
<< "/" << planId << ", started " << startTimeStr << ", ended "
<< timepointToString(endTime);
LOG_TOPIC(DEBUG, Logger::MAINTENANCE) << "SynchronizeOneShard: "
<< msg.str();
_result.reset(TRI_ERROR_FAILED, msg.str());
return false;
}
TRI_ASSERT(ci != nullptr);
std::string const cid = std::to_string(ci->id());
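ClusterInfo::getCollection throws when the collection cannot be found, so the lookup above is now wrapped and converted into a failed result instead of letting the exception escape the maintenance action. The guard pattern in isolation (lookupCollection, Collection, and the error string are illustrative stand-ins):

#include <memory>
#include <stdexcept>
#include <string>

struct Collection {};  // stand-in for LogicalCollection

// Illustrative lookup that mimics ClusterInfo::getCollection by throwing
// when the collection is unknown.
std::shared_ptr<Collection> lookupCollection(std::string const& name) {
  throw std::runtime_error("collection not found: " + name);
}

bool synchronizeOneShard(std::string const& database, std::string const& planId,
                         std::string& error) {
  std::shared_ptr<Collection> ci;
  try {  // the lookup can throw, e.g. if the collection was dropped meanwhile
    ci = lookupCollection(database + "/" + planId);
  } catch (...) {
    error = "exception in getCollection, " + database + "/" + planId;
    return false;  // report the failure as an error result instead of rethrowing
  }
  return ci != nullptr;
}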

View File

@@ -73,6 +73,11 @@ UpdateCollection::UpdateCollection(
}
TRI_ASSERT(desc.has(LOCAL_LEADER));
if (!desc.has(FOLLOWERS_TO_DROP)) {
error << "followersToDrop must be specified. ";
}
TRI_ASSERT(desc.has(FOLLOWERS_TO_DROP));
if (!error.str().empty()) {
LOG_TOPIC(ERR, Logger::MAINTENANCE) << "UpdateCollection: " << error.str();
_result.reset(TRI_ERROR_INTERNAL, error.str());
@@ -83,7 +88,7 @@ UpdateCollection::UpdateCollection(
void handleLeadership(
LogicalCollection& collection, std::string const& localLeader,
std::string const& plannedLeader) {
std::string const& plannedLeader, std::string const& followersToDrop) {
auto& followers = collection.followers();
@@ -97,8 +102,14 @@ void handleLeadership(
// will not notice until it fails to replicate an operation
// to the old follower. This here is to drop such a follower
// from the local list of followers. Will be reported
// to Current in due course. This is not needed for
// correctness but is a performance optimization.
// to Current in due course.
if (!followersToDrop.empty()) {
std::vector<std::string> ftd = arangodb::basics::StringUtils::split(
followersToDrop, ',');
for (auto const& s : ftd) {
followers->remove(s);
}
}
}
} else { // Planned to follow
if (localLeader.empty()) {
@@ -130,6 +141,7 @@ bool UpdateCollection::first() {
auto const& shard = _description.get(SHARD);
auto const& plannedLeader = _description.get(THE_LEADER);
auto const& localLeader = _description.get(LOCAL_LEADER);
auto const& followersToDrop = _description.get(FOLLOWERS_TO_DROP);
auto const& props = properties();
try {
@@ -148,7 +160,7 @@ bool UpdateCollection::first() {
// resignation case is not handled here, since then
// ourselves does not appear in shards[shard] but only
// "_" + ourselves.
handleLeadership(*coll, localLeader, plannedLeader);
handleLeadership(*coll, localLeader, plannedLeader, followersToDrop);
_result = Collections::updateProperties(coll.get(), props);
if (!_result.ok()) {
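On the receiving side, handleLeadership splits the comma-separated followersToDrop value and removes each entry from the local follower list. A standalone sketch of the parsing step (dropFollower stands in for the followers->remove(...) call in the hunk above):

#include <functional>
#include <sstream>
#include <string>

void dropUnplannedFollowers(std::string const& followersToDrop,
                            std::function<void(std::string const&)> const& dropFollower) {
  std::istringstream in(followersToDrop);
  std::string server;
  while (std::getline(in, server, ',')) {  // split on ','
    if (!server.empty()) {
      dropFollower(server);  // e.g. followers->remove(server)
    }
  }
}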

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -804,6 +804,48 @@ TEST_CASE("ActionPhaseOne", "[cluster][maintenance]") {
}
SECTION( "Removed follower in Plan must be dropped" ) {
plan = originalPlan;
std::string const dbname("_system");
std::string const colname("bar");
auto cid = collectionMap(plan).at(dbname + "/" + colname);
Node::Children& shards = plan({"Collections",dbname,cid,"shards"}).children();
auto firstShard = shards.begin();
VPackBuilder b = firstShard->second->toBuilder();
std::string const shname = firstShard->first;
std::string const leaderName = b.slice()[0].copyString();
std::string const followerName = b.slice()[1].copyString();
firstShard->second->handle<POP>(
arangodb::velocypack::Slice::emptyObjectSlice());
for (auto const& node : localNodes) {
std::vector<ActionDescription> actions;
arangodb::maintenance::diffPlanLocal (
plan.toBuilder().slice(), node.second.toBuilder().slice(), node.first,
errors, feature, actions);
if (node.first == followerName) {
// Must see an action dropping the shard
REQUIRE(actions.size() == 1);
REQUIRE(actions.front().name() == "DropCollection");
REQUIRE(actions.front().get(DATABASE) == dbname);
REQUIRE(actions.front().get(COLLECTION) == shname);
} else if (node.first == leaderName) {
// Must see an UpdateCollection action to drop the follower
REQUIRE(actions.size() == 1);
REQUIRE(actions.front().name() == "UpdateCollection");
REQUIRE(actions.front().get(DATABASE) == dbname);
REQUIRE(actions.front().get(SHARD) == shname);
REQUIRE(actions.front().get(FOLLOWERS_TO_DROP) == followerName);
} else {
// No actions required
REQUIRE(actions.size() == 0);
}
}
}
}
TEST_CASE("ActionPhaseTwo", "[cluster][maintenance]") {

File diff suppressed because it is too large

View File

@@ -1,57 +1,57 @@
R"=(
{
"DBServers": {},
"Shards": {},
"Health": {
"PRMR-50478eb8-afe1-41b7-a78c-351bfe02b31c": {
"CRDN-7f7de680-14b9-4337-8c6f-207ef9a366a6": {
"AdvertisedEndpoint": "",
"Timestamp": "2018-10-01T09:38:59Z",
"SyncStatus": "SERVING",
"Host": "ac8ddefc7d1f4364ba655b4debcd076f",
"Endpoint": "tcp://[::1]:11097",
"Timestamp": "2018-10-09T09:19:24Z",
"Status": "GOOD",
"ShortName": "DBServer0003",
"Endpoint": "tcp://[::1]:11198"
"ShortName": "Coordinator0001",
"Host": "0cbe86cef70f4e50a024274ac9d90a7f"
},
"PRMR-c18eb708-81bf-446f-977d-61b2c28e407c": {
"CRDN-067429e5-38b7-4917-8b51-0eb170d73079": {
"AdvertisedEndpoint": "",
"Timestamp": "2018-10-01T09:38:59Z",
"SyncStatus": "SERVING",
"Host": "ac8ddefc7d1f4364ba655b4debcd076f",
"Endpoint": "tcp://[::1]:11098",
"Timestamp": "2018-10-09T09:19:24Z",
"Status": "GOOD",
"ShortName": "DBServer0002",
"Endpoint": "tcp://[::1]:11197"
"ShortName": "Coordinator0002",
"Host": "0cbe86cef70f4e50a024274ac9d90a7f"
},
"PRMR-f3a3c23f-6cc1-4d7e-ab17-e37fb662b4f5": {
"PRMR-62940941-7b13-4333-86a4-819deacff9e3": {
"AdvertisedEndpoint": "",
"Timestamp": "2018-10-01T09:38:59Z",
"SyncStatus": "SERVING",
"Host": "ac8ddefc7d1f4364ba655b4debcd076f",
"Endpoint": "tcp://[::1]:11196",
"Timestamp": "2018-10-09T09:19:24Z",
"Status": "GOOD",
"ShortName": "DBServer0001",
"Endpoint": "tcp://[::1]:11196"
"Host": "0cbe86cef70f4e50a024274ac9d90a7f"
},
"CRDN-57234819-26be-4b6e-a91e-521527fbe208": {
"PRMR-3418c83d-0201-4742-a8cd-7d3894811cc2": {
"AdvertisedEndpoint": "",
"SyncStatus": "SERVING",
"Timestamp": "2018-10-01T09:39:00Z",
"Host": "ac8ddefc7d1f4364ba655b4debcd076f",
"ShortName": "Coordinator0002",
"Endpoint": "tcp://[::1]:11197",
"Timestamp": "2018-10-09T09:19:24Z",
"Status": "GOOD",
"Endpoint": "tcp://[::1]:11098"
"ShortName": "DBServer0002",
"Host": "0cbe86cef70f4e50a024274ac9d90a7f"
},
"CRDN-4416fe9e-904c-4306-ad26-95c2b92c5020": {
"PRMR-37174faa-1b91-4ba3-9d22-cf10513ca4e5": {
"AdvertisedEndpoint": "",
"SyncStatus": "SERVING",
"Timestamp": "2018-10-01T09:39:01Z",
"Host": "ac8ddefc7d1f4364ba655b4debcd076f",
"ShortName": "Coordinator0001",
"Endpoint": "tcp://[::1]:11198",
"Timestamp": "2018-10-09T09:19:24Z",
"Status": "GOOD",
"Endpoint": "tcp://[::1]:11097"
"ShortName": "DBServer0003",
"Host": "0cbe86cef70f4e50a024274ac9d90a7f"
}
},
"DBServers": {},
"State": {
"Mode": "Normal",
"Timestamp": "2018-10-01T09:38:59Z"
},
"Shards": {}
"Timestamp": "2018-10-09T09:19:23Z"
}
}
)="