1
0
Fork 0

Fix behaviour of synchronous replication in dropFollower case.

If we are already in shutdown, we do not drop a follower.
If we cannot drop a follower (no contact to agency), we error out.
This commit is contained in:
Max Neunhoeffer 2017-06-09 13:06:02 +02:00
parent f1db8666ac
commit 29921d32a8
7 changed files with 71 additions and 19 deletions

View File

@ -24,6 +24,7 @@
#include "FollowerInfo.h"
#include "ApplicationFeatures/ApplicationServer.h"
#include "Cluster/ServerState.h"
#include "VocBase/LogicalCollection.h"
@ -183,7 +184,12 @@ void FollowerInfo::add(ServerID const& sid) {
/// since been dropped (see `dropFollowerInfo` below).
////////////////////////////////////////////////////////////////////////////////
void FollowerInfo::remove(ServerID const& sid) {
bool FollowerInfo::remove(ServerID const& sid) {
if (application_features::ApplicationServer::isStopping()) {
// If we are already shutting down, we cannot be trusted any more with
// such an important decision like dropping a follower.
return false;
}
MUTEX_LOCKER(locker, _mutex);
// First check if there is anything to do:
@ -195,7 +201,7 @@ void FollowerInfo::remove(ServerID const& sid) {
}
}
if (!found) {
return; // nothing to do
return true; // nothing to do
}
auto v = std::make_shared<std::vector<ServerID>>();
@ -207,10 +213,11 @@ void FollowerInfo::remove(ServerID const& sid) {
}
}
}
auto _oldFollowers = _followers;
_followers = v; // will cast to std::vector<ServerID> const
#ifdef DEBUG_SYNC_REPLICATION
if (!AgencyCommManager::MANAGER) {
return;
return true;
}
#endif
// Now tell the agency, path is
@ -267,13 +274,15 @@ void FollowerInfo::remove(ServerID const& sid) {
usleep(500000);
} while (TRI_microtime() < startTime + 30);
if (!success) {
_followers = _oldFollowers;
LOG_TOPIC(ERR, Logger::CLUSTER)
<< "FollowerInfo::remove, timeout in agency operation for key " << path;
}
return success;
}
//////////////////////////////////////////////////////////////////////////////
/// @brief clear follower list, no changes in agency necesary
/// @brief clear follower list, no changes in agency necessary
//////////////////////////////////////////////////////////////////////////////
void FollowerInfo::clear() {

View File

@ -67,7 +67,7 @@ class FollowerInfo {
/// way.
//////////////////////////////////////////////////////////////////////////////
void remove(ServerID const& s);
bool remove(ServerID const& s);
//////////////////////////////////////////////////////////////////////////////
/// @brief clear follower list, no changes in agency necesary

View File

@ -1534,10 +1534,16 @@ OperationResult transaction::Methods::insertLocal(
}
if (!replicationWorked) {
auto const& followerInfo = collection->followers();
followerInfo->remove((*followers)[i]);
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "insertLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
if (followerInfo->remove((*followers)[i])) {
LOG_TOPIC(WARN, Logger::REPLICATION)
<< "insertLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
} else {
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "insertLocal: could not drop follower "
<< (*followers)[i] << " for shard " << collectionName;
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}
}
}
@ -1850,7 +1856,16 @@ OperationResult transaction::Methods::modifyLocal(
}
if (!replicationWorked) {
auto const& followerInfo = collection->followers();
followerInfo->remove((*followers)[i]);
if (followerInfo->remove((*followers)[i])) {
LOG_TOPIC(WARN, Logger::REPLICATION)
<< "modifyLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
} else {
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "modifyLocal: could not drop follower "
<< (*followers)[i] << " for shard " << collectionName;
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "modifyLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
@ -2094,10 +2109,16 @@ OperationResult transaction::Methods::removeLocal(
}
if (!replicationWorked) {
auto const& followerInfo = collection->followers();
followerInfo->remove((*followers)[i]);
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "removeLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
if (followerInfo->remove((*followers)[i])) {
LOG_TOPIC(WARN, Logger::REPLICATION)
<< "removeLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
} else {
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "removeLocal: could not drop follower "
<< (*followers)[i] << " for shard " << collectionName;
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}
}
}
@ -2270,10 +2291,16 @@ OperationResult transaction::Methods::truncateLocal(
requests[i].result.answer_code == rest::ResponseCode::OK);
if (!replicationWorked) {
auto const& followerInfo = collection->followers();
followerInfo->remove((*followers)[i]);
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "truncateLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
if (followerInfo->remove((*followers)[i])) {
LOG_TOPIC(WARN, Logger::REPLICATION)
<< "truncateLocal: dropping follower " << (*followers)[i]
<< " for shard " << collectionName;
} else {
LOG_TOPIC(ERR, Logger::REPLICATION)
<< "truncateLocal: could not drop follower "
<< (*followers)[i] << " for shard " << collectionName;
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
}
}
}
}

View File

@ -166,6 +166,7 @@
"ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE" : { "code" : 1485, "message" : "must not drop collection while another has a distributeShardsLike attribute pointing to it" },
"ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE" : { "code" : 1486, "message" : "must not have a distributeShardsLike attribute pointing to an unknown collection" },
"ERROR_CLUSTER_INSUFFICIENT_DBSERVERS" : { "code" : 1487, "message" : "the number of current dbservers is lower than the requested replicationFactor" },
"ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER" : { "code" : 1488, "message" : "a follower could not be dropped in agency" },
"ERROR_QUERY_KILLED" : { "code" : 1500, "message" : "query killed" },
"ERROR_QUERY_PARSE" : { "code" : 1501, "message" : "%s" },
"ERROR_QUERY_EMPTY" : { "code" : 1502, "message" : "query is empty" },

View File

@ -202,7 +202,7 @@ ERROR_CLUSTER_CHAIN_OF_DISTRIBUTESHARDSLIKE,1484,"chain of distributeShardsLike
ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE,1485,"must not drop collection while another has a distributeShardsLike attribute pointing to it","Will be raised if one tries to drop a collection to which another collection points with its distributeShardsLike attribute."
ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE,1486,"must not have a distributeShardsLike attribute pointing to an unknown collection","Will be raised if one tries to create a collection which points to an unknown collection in its distributeShardsLike attribute."
ERROR_CLUSTER_INSUFFICIENT_DBSERVERS,1487,"the number of current dbservers is lower than the requested replicationFactor","Will be raised if one tries to create a collection with a replicationFactor greater than the available number of DBServers."
ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER,1488,"a follower could not be dropped in agency","Will be raised if a follower that ought to be dropped could not be dropped in the agency (under Current)."
################################################################################
## ArangoDB query errors

View File

@ -162,6 +162,7 @@ void TRI_InitializeErrorMessages () {
REG_ERROR(ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE, "must not drop collection while another has a distributeShardsLike attribute pointing to it");
REG_ERROR(ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE, "must not have a distributeShardsLike attribute pointing to an unknown collection");
REG_ERROR(ERROR_CLUSTER_INSUFFICIENT_DBSERVERS, "the number of current dbservers is lower than the requested replicationFactor");
REG_ERROR(ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER, "a follower could not be dropped in agency");
REG_ERROR(ERROR_QUERY_KILLED, "query killed");
REG_ERROR(ERROR_QUERY_PARSE, "%s");
REG_ERROR(ERROR_QUERY_EMPTY, "query is empty");

View File

@ -400,6 +400,9 @@
/// - 1487: @LIT{the number of current dbservers is lower than the requested replicationFactor}
/// Will be raised if one tries to create a collection with a
/// replicationFactor greater than the available number of DBServers.
/// - 1488: @LIT{a follower could not be dropped in agency}
/// Will be raised if a follower that ought to be dropped could not be
/// dropped in the agency (under Current).
/// - 1500: @LIT{query killed}
/// Will be raised when a running query is killed by an explicit admin
/// command.
@ -2372,6 +2375,17 @@ void TRI_InitializeErrorMessages ();
#define TRI_ERROR_CLUSTER_INSUFFICIENT_DBSERVERS (1487)
////////////////////////////////////////////////////////////////////////////////
/// @brief 1488: ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER
///
/// a follower could not be dropped in agency
///
/// Will be raised if a follower that ought to be dropped could not be dropped
/// in the agency (under Current).
////////////////////////////////////////////////////////////////////////////////
#define TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER (1488)
////////////////////////////////////////////////////////////////////////////////
/// @brief 1500: ERROR_QUERY_KILLED
///