1
0
Fork 0

actually honor the return value of FollowerInfo::addFollower (#9358)

This commit is contained in:
Jan 2019-06-28 18:31:15 +02:00 committed by GitHub
parent 1653e9698a
commit d842b877a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 260 additions and 59 deletions

View File

@ -263,7 +263,8 @@ Result commitAbortTransaction(transaction::Methods& trx, transaction::Status sta
state.allCollections([&](TransactionCollection& tc) {
auto cc = tc.collection();
if (cc) {
if (cc->followers()->remove(follower)) {
Result r = cc->followers()->remove(follower);
if (r.ok()) {
// TODO: what happens if a server is re-added during a transaction ?
LOG_TOPIC("709c9", WARN, Logger::REPLICATION)
<< "synchronous replication: dropping follower " << follower
@ -271,7 +272,7 @@ Result commitAbortTransaction(transaction::Methods& trx, transaction::Status sta
} else {
LOG_TOPIC("4971f", ERR, Logger::REPLICATION)
<< "synchronous replication: could not drop follower "
<< follower << " for shard " << tc.collectionName();
<< follower << " for shard " << tc.collectionName() << ": " << r.errorMessage();
res.reset(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
return false; // cancel transaction
}
@ -373,13 +374,14 @@ Result ClusterTrxMethods::beginTransactionOnFollowers(transaction::Methods& trx,
LOG_TOPIC("217e3", INFO, Logger::REPLICATION)
<< "dropping follower because it did not start trx "
<< state.id() << ", error: '" << r.errorMessage() << "'";
if (info.remove(followers[i])) {
r = info.remove(followers[i]);
if (r.ok()) {
// TODO: what happens if a server is re-added during a transaction ?
LOG_TOPIC("c70a6", WARN, Logger::REPLICATION)
<< "synchronous replication: dropping follower " << followers[i];
} else {
LOG_TOPIC("fe8e1", ERR, Logger::REPLICATION)
<< "synchronous replication: could not drop follower " << followers[i];
<< "synchronous replication: could not drop follower " << followers[i] << ": " << r.errorMessage();
res.reset(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}

View File

@ -89,7 +89,10 @@ static VPackBuilder newShardEntry(VPackSlice oldValue, ServerID const& sid, bool
/// `/Current` but in asynchronous "fire-and-forget" way.
////////////////////////////////////////////////////////////////////////////////
void FollowerInfo::add(ServerID const& sid) {
Result FollowerInfo::add(ServerID const& sid) {
TRI_IF_FAILURE("FollowerInfo::add") {
return {TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED, "unable to add follower"};
}
MUTEX_LOCKER(locker, _agencyMutex);
@ -100,7 +103,7 @@ void FollowerInfo::add(ServerID const& sid) {
// First check if there is anything to do:
for (auto const& s : *_followers) {
if (s == sid) {
return; // Do nothing, if follower already there
return {TRI_ERROR_NO_ERROR}; // Do nothing, if follower already there
}
}
// Fully copy the vector:
@ -109,7 +112,7 @@ void FollowerInfo::add(ServerID const& sid) {
_followers = v; // will cast to std::vector<ServerID> const
#ifdef DEBUG_SYNC_REPLICATION
if (!AgencyCommManager::MANAGER) {
return;
return {TRI_ERROR_NO_ERROR};
}
#endif
}
@ -123,7 +126,6 @@ void FollowerInfo::add(ServerID const& sid) {
path += _docColl->name();
AgencyComm ac;
double startTime = TRI_microtime();
bool success = false;
do {
AgencyCommResult res = ac.getValues(path);
@ -153,23 +155,23 @@ void FollowerInfo::add(ServerID const& sid) {
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
if (res2.successful()) {
success = true;
break; //
} else {
LOG_TOPIC("1d5fe", WARN, Logger::CLUSTER)
<< "FollowerInfo::add, could not cas key " << path;
// we are finished!
return {TRI_ERROR_NO_ERROR};
}
}
} else {
LOG_TOPIC("dcf54", ERR, Logger::CLUSTER)
<< "FollowerInfo::add, could not read " << path << " in agency.";
<< "FollowerInfo::add, could not read " << path << " in agency";
}
std::this_thread::sleep_for(std::chrono::microseconds(500000));
} while (TRI_microtime() < startTime + 30);
if (!success) {
LOG_TOPIC("6295b", ERR, Logger::CLUSTER)
<< "FollowerInfo::add, timeout in agency operation for key " << path;
}
} while (TRI_microtime() < startTime + 30 &&
application_features::ApplicationServer::isRetryOK());
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
std::string errorMessage = "unable to add follower in agency, timeout in agency CAS operation for key " + path + ": " + TRI_errno_string(errorCode);
LOG_TOPIC("6295b", ERR, Logger::CLUSTER) << errorMessage;
return {errorCode, std::move(errorMessage)};
}
////////////////////////////////////////////////////////////////////////////////
@ -180,11 +182,15 @@ void FollowerInfo::add(ServerID const& sid) {
/// since been dropped (see `dropFollowerInfo` below).
////////////////////////////////////////////////////////////////////////////////
bool FollowerInfo::remove(ServerID const& sid) {
Result FollowerInfo::remove(ServerID const& sid) {
TRI_IF_FAILURE("FollowerInfo::remove") {
return {TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED, "unable to remove follower"};
}
if (application_features::ApplicationServer::isStopping()) {
// If we are already shutting down, we cannot be trusted any more with
// such an important decision like dropping a follower.
return false;
return {TRI_ERROR_SHUTTING_DOWN};
}
LOG_TOPIC("ce460", DEBUG, Logger::CLUSTER)
@ -204,7 +210,7 @@ bool FollowerInfo::remove(ServerID const& sid) {
}
}
if (!found) {
return true; // nothing to do
return {TRI_ERROR_NO_ERROR}; // nothing to do
}
auto v = std::make_shared<std::vector<ServerID>>();
@ -220,7 +226,7 @@ bool FollowerInfo::remove(ServerID const& sid) {
_followers = v; // will cast to std::vector<ServerID> const
#ifdef DEBUG_SYNC_REPLICATION
if (!AgencyCommManager::MANAGER) {
return true;
return {TRI_ERROR_NO_ERROR};
}
#endif
// Now tell the agency, path is
@ -233,7 +239,6 @@ bool FollowerInfo::remove(ServerID const& sid) {
path += _docColl->name();
AgencyComm ac;
double startTime = TRI_microtime();
bool success = false;
do {
AgencyCommResult res = ac.getValues(path);
if (res.successful()) {
@ -262,13 +267,10 @@ bool FollowerInfo::remove(ServerID const& sid) {
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
if (res2.successful()) {
success = true;
break; //
} else {
LOG_TOPIC("2db38", WARN, Logger::CLUSTER)
<< "FollowerInfo::remove, could not cas key " << path
<< ". status code: " << res2._statusCode
<< ", incriminating body: " << res2.bodyRef();
// we are finished
LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from "
<< _docColl->name() << "succeeded";
return {TRI_ERROR_NO_ERROR};
}
}
} else {
@ -278,16 +280,15 @@ bool FollowerInfo::remove(ServerID const& sid) {
std::this_thread::sleep_for(std::chrono::microseconds(500000));
} while (TRI_microtime() < startTime + 30 &&
application_features::ApplicationServer::isRetryOK());
if (!success) {
_followers = _oldFollowers;
LOG_TOPIC("c0e76", ERR, Logger::CLUSTER)
<< "FollowerInfo::remove, timeout in agency operation for key " << path;
}
// rollback
_followers = _oldFollowers;
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
std::string errorMessage = "unable to remove follower from agency, timeout in agency CAS operation for key " + path + ": " + TRI_errno_string(errorCode);
LOG_TOPIC("a0dcc", ERR, Logger::CLUSTER) << errorMessage;
LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from "
<< _docColl->name() << "succeeded: " << success;
return success;
return {errorCode, std::move(errorMessage)};
}
//////////////////////////////////////////////////////////////////////////////
@ -296,8 +297,7 @@ bool FollowerInfo::remove(ServerID const& sid) {
void FollowerInfo::clear() {
WRITE_LOCKER(writeLocker, _dataLock);
auto v = std::make_shared<std::vector<ServerID>>();
_followers = v; // will cast to std::vector<ServerID> const
_followers = std::make_shared<std::vector<ServerID>>();
}
//////////////////////////////////////////////////////////////////////////////
@ -306,10 +306,6 @@ void FollowerInfo::clear() {
bool FollowerInfo::contains(ServerID const& sid) const {
READ_LOCKER(readLocker, _dataLock);
for (auto const& s : *_followers) {
if (s == sid) {
return true;
}
}
return false;
auto const& f = *_followers;
return std::find(f.begin(), f.end(), sid) != f.end();
}

View File

@ -28,6 +28,7 @@
#include "ClusterInfo.h"
#include "Basics/Mutex.h"
#include "Basics/ReadWriteLock.h"
#include "Basics/Result.h"
#include "Basics/WriteLocker.h"
namespace arangodb {
@ -51,7 +52,7 @@ class FollowerInfo {
public:
explicit FollowerInfo(arangodb::LogicalCollection* d)
: _followers(new std::vector<ServerID>()), _docColl(d), _theLeaderTouched(false) {}
: _followers(std::make_shared<std::vector<ServerID>>()), _docColl(d), _theLeaderTouched(false) {}
////////////////////////////////////////////////////////////////////////////////
/// @brief get information about current followers of a shard.
@ -70,7 +71,7 @@ class FollowerInfo {
/// (see `dropFollowerInfo` below).
//////////////////////////////////////////////////////////////////////////////
void add(ServerID const& s);
Result add(ServerID const& s);
//////////////////////////////////////////////////////////////////////////////
/// @brief remove a follower from a shard, this is only done by the
@ -79,7 +80,7 @@ class FollowerInfo {
/// way.
//////////////////////////////////////////////////////////////////////////////
bool remove(ServerID const& s);
Result remove(ServerID const& s);
//////////////////////////////////////////////////////////////////////////////
/// @brief clear follower list, no changes in agency necesary

View File

@ -2233,7 +2233,12 @@ void RestReplicationHandler::handleCommandAddFollower() {
<< "Compare with shortCut Leader: " << nr
<< " == Follower: " << checksumSlice.copyString();
if (nr == 0 && checksumSlice.isEqualString("0")) {
col->followers()->add(followerId);
Result res = col->followers()->add(followerId);
if (res.fail()) {
// this will create an error response with the appropriate message
THROW_ARANGO_EXCEPTION(res);
}
VPackBuilder b;
{
@ -2293,7 +2298,12 @@ void RestReplicationHandler::handleCommandAddFollower() {
return;
}
col->followers()->add(followerId);
Result res = col->followers()->add(followerId);
if (res.fail()) {
// this will create an error response with the appropriate message
THROW_ARANGO_EXCEPTION(res);
}
VPackBuilder b;
{
@ -2302,7 +2312,7 @@ void RestReplicationHandler::handleCommandAddFollower() {
}
LOG_TOPIC("c13d4", DEBUG, Logger::REPLICATION) << followerId << " is now following on shard "
<< _vocbase.name() << "/" << col->name();
<< _vocbase.name() << "/" << col->name();
generateResult(rest::ResponseCode::OK, b.slice());
}
@ -2340,7 +2350,13 @@ void RestReplicationHandler::handleCommandRemoveFollower() {
"did not find collection");
return;
}
col->followers()->remove(followerId.copyString());
Result res = col->followers()->remove(followerId.copyString());
if (res.fail()) {
// this will create an error response with the appropriate message
THROW_ARANGO_EXCEPTION(res);
}
VPackBuilder b;
{

View File

@ -2687,7 +2687,8 @@ OperationResult transaction::Methods::truncateLocal(std::string const& collectio
requests[i].result.answer_code == rest::ResponseCode::OK);
if (!replicationWorked) {
auto const& followerInfo = collection->followers();
if (followerInfo->remove((*followers)[i])) {
Result res = followerInfo->remove((*followers)[i]);
if (res.ok()) {
_state->removeKnownServer((*followers)[i]);
LOG_TOPIC("0e2e0", WARN, Logger::REPLICATION)
<< "truncateLocal: dropping follower " << (*followers)[i]
@ -2695,7 +2696,7 @@ OperationResult transaction::Methods::truncateLocal(std::string const& collectio
} else {
LOG_TOPIC("359bc", ERR, Logger::REPLICATION)
<< "truncateLocal: could not drop follower " << (*followers)[i]
<< " for shard " << collectionName;
<< " for shard " << collectionName << ": " << res.errorMessage();
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}
@ -3459,7 +3460,8 @@ Result Methods::replicateOperations(LogicalCollection const& collection,
}
if (!replicationWorked) {
auto const& followerInfo = collection.followers();
if (followerInfo->remove((*followers)[i])) {
Result res = followerInfo->remove((*followers)[i]);
if (res.ok()) {
// TODO: what happens if a server is re-added during a transaction ?
_state->removeKnownServer((*followers)[i]);
LOG_TOPIC("12d8c", WARN, Logger::REPLICATION)
@ -3468,7 +3470,7 @@ Result Methods::replicateOperations(LogicalCollection const& collection,
} else {
LOG_TOPIC("db473", ERR, Logger::REPLICATION)
<< "synchronous replication: could not drop follower "
<< (*followers)[i] << " for shard " << collection.name();
<< (*followers)[i] << " for shard " << collection.name() << ": " << res.errorMessage();
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}

View File

@ -0,0 +1,184 @@
/*jshint globalstrict:false, strict:false */
/*global fail, assertFalse, assertTrue, assertEqual */
////////////////////////////////////////////////////////////////////////////////
/// @brief test add/drop followers
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2010-2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Jan Steemann
/// @author Copyright 2013, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
const jsunity = require("jsunity");
let db = require("@arangodb").db;
let internal = require("internal");
function FollowersSuite () {
'use strict';
const cn = "UnitTestsCollection";
return {
setUp : function () {
internal.debugClearFailAt();
db._drop(cn);
},
tearDown : function () {
internal.debugClearFailAt();
db._drop(cn);
},
testNoReplication : function () {
let c = db._create(cn, { numberOfShards: 5, replicationFactor: 1 });
let result = require("@arangodb/cluster").shardDistribution().results[cn];
// validate Plan
assertTrue(result.hasOwnProperty("Plan"));
let shards = Object.keys(result.Plan);
shards.forEach(function(shard) {
let data = result.Plan[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
// supposed to have a no follower
assertEqual(0, data.followers.length);
});
// now check Current
assertTrue(result.hasOwnProperty("Current"));
shards = Object.keys(result.Current);
assertEqual(5, shards.length);
shards.forEach(function(shard) {
let data = result.Current[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
assertEqual(0, data.followers.length);
});
},
testWithReplication : function () {
let c = db._create(cn, { numberOfShards: 5, replicationFactor: 2 });
let result = require("@arangodb/cluster").shardDistribution().results[cn];
// validate Plan
assertTrue(result.hasOwnProperty("Plan"));
let shards = Object.keys(result.Plan);
assertEqual(5, shards.length);
shards.forEach(function(shard) {
let data = result.Plan[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
// supposed to have a single follower
assertEqual(1, data.followers.length);
// follower must be != leader
assertEqual(-1, data.followers.indexOf(data.leader));
});
// now check Current
assertTrue(result.hasOwnProperty("Current"));
let tries = 0;
let found = 0;
while (++tries < 60) {
found = 0;
shards = Object.keys(result.Current);
assertEqual(5, shards.length);
shards.forEach(function(shard) {
let data = result.Current[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
// supposed to have a single follower
if (data.followers.length !== 1) {
return;
}
assertEqual(1, data.followers.length);
// follower must be != leader
assertEqual(-1, data.followers.indexOf(data.leader));
++found;
});
if (found === shards.length) {
break;
}
internal.wait(0.5, false);
result = require("@arangodb/cluster").shardDistribution().results[cn];
}
assertEqual(shards.length, found);
},
testWithReplicationAndFailure : function () {
if (!internal.debugCanUseFailAt()) {
return;
}
internal.debugSetFailAt("FollowerInfo::add");
let c = db._create(cn, { numberOfShards: 5, replicationFactor: 2 });
let result = require("@arangodb/cluster").shardDistribution().results[cn];
// validate Plan
assertTrue(result.hasOwnProperty("Plan"));
let shards = Object.keys(result.Plan);
assertEqual(5, shards.length);
shards.forEach(function(shard) {
let data = result.Plan[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
// supposed to have a single follower
assertEqual(1, data.followers.length);
// follower must be != leader
assertEqual(-1, data.followers.indexOf(data.leader));
});
// now check Current
assertTrue(result.hasOwnProperty("Current"));
let tries = 0;
// try for 10 seconds, and in this period no followers must show up
while (++tries < 20) {
shards = Object.keys(result.Current);
assertEqual(5, shards.length);
shards.forEach(function(shard) {
let data = result.Current[shard];
assertTrue(data.hasOwnProperty("leader"));
assertTrue(data.hasOwnProperty("followers"));
// supposed to have a no followers
assertEqual(0, data.followers.length);
});
internal.wait(0.5, false);
result = require("@arangodb/cluster").shardDistribution().results[cn];
}
}
};
}
jsunity.run(FollowersSuite);
return jsunity.done();