1
0
Fork 0

[devel] Bug fix/bad leader report current (#7585)

* Bug fix 3.4/bad leader report current (#7574)
* Initialize theLeader non-empty, thus not assuming leadership.
* Correct ClusterInfo to look into Target/CleanedServers.
* Prevent usage of to be cleaned out servers in new collections.
* After a restart, do not assume to be leader for a shard.
* Do nothing in phaseTwo if leader has not been touched. (#7579)
* Drop follower if it refuses to cooperate.

This is important since a dbserver that is follower for a shard will
after a reboot think that it is a leader, at least for a short amount
of time. If it came back quickly enough, the leader might not have
noticed that it was away.
This commit is contained in:
Lars Maier 2018-12-03 10:20:30 +01:00 committed by Max Neunhöffer
parent 4eb37c348e
commit dd07d74d69
7 changed files with 122 additions and 55 deletions

View File

@ -123,6 +123,12 @@ JOB_STATUS CleanOutServer::status() {
reportTrx.add("op", VPackValue("push")); reportTrx.add("op", VPackValue("push"));
reportTrx.add("new", VPackValue(_server)); reportTrx.add("new", VPackValue(_server));
} }
reportTrx.add(VPackValue("/Target/ToBeCleanedServers"));
{
VPackObjectBuilder guard4(&reportTrx);
reportTrx.add("op", VPackValue("erase"));
reportTrx.add("val", VPackValue(_server));
}
addRemoveJobFromSomewhere(reportTrx, "Pending", _jobId); addRemoveJobFromSomewhere(reportTrx, "Pending", _jobId);
Builder job; Builder job;
_snapshot.hasAsBuilder(pendingPrefix + _jobId, job); _snapshot.hasAsBuilder(pendingPrefix + _jobId, job);
@ -312,6 +318,14 @@ bool CleanOutServer::start() {
addBlockServer(*pending, _server, _jobId); addBlockServer(*pending, _server, _jobId);
// Put ourselves in list of servers to be cleaned:
pending->add(VPackValue("/Target/ToBeCleanedServers"));
{
VPackObjectBuilder guard4(pending.get());
pending->add("op", VPackValue("push"));
pending->add("new", VPackValue(_server));
}
// Schedule shard relocations // Schedule shard relocations
if (!scheduleMoveShards(pending)) { if (!scheduleMoveShards(pending)) {
finish("", "", false, "Could not schedule MoveShard."); finish("", "", false, "Could not schedule MoveShard.");

View File

@ -3368,6 +3368,7 @@ void ClusterInfo::loadCurrentDBServers() {
velocypack::Slice currentDBServers; velocypack::Slice currentDBServers;
velocypack::Slice failedDBServers; velocypack::Slice failedDBServers;
velocypack::Slice cleanedDBServers; velocypack::Slice cleanedDBServers;
velocypack::Slice toBeCleanedDBServers;
if (result.slice().length() > 0) { if (result.slice().length() > 0) {
currentDBServers = result.slice()[0].get(std::vector<std::string>( currentDBServers = result.slice()[0].get(std::vector<std::string>(
@ -3379,7 +3380,10 @@ void ClusterInfo::loadCurrentDBServers() {
{AgencyCommManager::path(), "Target", "FailedServers"})); {AgencyCommManager::path(), "Target", "FailedServers"}));
cleanedDBServers = cleanedDBServers =
target.slice()[0].get(std::vector<std::string>( target.slice()[0].get(std::vector<std::string>(
{AgencyCommManager::path(), "Target", "CleanedOutServers"})); {AgencyCommManager::path(), "Target", "CleanedServers"}));
toBeCleanedDBServers =
target.slice()[0].get(std::vector<std::string>(
{AgencyCommManager::path(), "Target", "ToBeCleanedServers"}));
} }
if (currentDBServers.isObject() && failedDBServers.isObject()) { if (currentDBServers.isObject() && failedDBServers.isObject()) {
decltype(_DBServers) newDBServers; decltype(_DBServers) newDBServers;
@ -3405,7 +3409,21 @@ void ClusterInfo::loadCurrentDBServers() {
VPackArrayIterator(cleanedDBServers)) { VPackArrayIterator(cleanedDBServers)) {
if (dbserver.key == cleanedServer) { if (dbserver.key == cleanedServer) {
found = true; found = true;
continue; break;
}
}
if (found) {
continue;
}
}
if (toBeCleanedDBServers.isArray()) {
bool found = false;
for (auto const& toBeCleanedServer :
VPackArrayIterator(toBeCleanedDBServers)) {
if (dbserver.key == toBeCleanedServer) {
found = true;
break;
} }
} }
if (found) { if (found) {

View File

@ -519,8 +519,8 @@ static std::shared_ptr<std::unordered_map<std::string, std::vector<std::string>>
auto shards = std::make_shared<std::unordered_map<std::string, std::vector<std::string>>>(); auto shards = std::make_shared<std::unordered_map<std::string, std::vector<std::string>>>();
ci->loadCurrentDBServers();
if (dbServers.size() == 0) { if (dbServers.size() == 0) {
ci->loadCurrentDBServers();
dbServers = ci->getCurrentDBServers(); dbServers = ci->getCurrentDBServers();
if (dbServers.empty()) { if (dbServers.empty()) {
return shards; return shards;
@ -2565,6 +2565,7 @@ std::shared_ptr<LogicalCollection> ClusterMethods::persistCollectionInAgency(
std::string distributeShardsLike = col->distributeShardsLike(); std::string distributeShardsLike = col->distributeShardsLike();
std::vector<std::string> avoid = col->avoidServers(); std::vector<std::string> avoid = col->avoidServers();
ClusterInfo* ci = ClusterInfo::instance(); ClusterInfo* ci = ClusterInfo::instance();
ci->loadCurrentDBServers();
std::vector<std::string> dbServers = ci->getCurrentDBServers(); std::vector<std::string> dbServers = ci->getCurrentDBServers();
std::shared_ptr<std::unordered_map<std::string, std::vector<std::string>>> shards = nullptr; std::shared_ptr<std::unordered_map<std::string, std::vector<std::string>>> shards = nullptr;

View File

@ -90,20 +90,26 @@ Result DBServerAgencySync::getLocalCollections(VPackBuilder& collections) {
std::string const colname = collection->name(); std::string const colname = collection->name();
collections.add(VPackValue(colname)); collections.add(VPackValue(colname));
VPackObjectBuilder col(&collections); VPackObjectBuilder col(&collections);
collection->properties(collections,true,false); collection->properties(collections,true,false);
auto const& folls = collection->followers(); auto const& folls = collection->followers();
auto const theLeader = folls->getLeader(); std::string const theLeader = folls->getLeader();
bool theLeaderTouched = folls->getLeaderTouched();
collections.add("theLeader", VPackValue(theLeader)); // Note that whenever theLeader was set explicitly since the collection
// object was created, we believe it. Otherwise, we do not accept
// that we are the leader. This is to circumvent the problem that
// after a restart we would implicitly be assumed to be the leader.
collections.add("theLeader", VPackValue(theLeaderTouched ? theLeader : "NOT_YET_TOUCHED"));
collections.add("theLeaderTouched", VPackValue(theLeaderTouched));
if (theLeader.empty()) { // we are the leader ourselves if (theLeader.empty() && theLeaderTouched) {
// we are the leader ourselves
// In this case we report our in-sync followers here in the format // In this case we report our in-sync followers here in the format
// of the agency: [ leader, follower1, follower2, ... ] // of the agency: [ leader, follower1, follower2, ... ]
collections.add(VPackValue("servers")); collections.add(VPackValue("servers"));
{ VPackArrayBuilder guard(&collections); { VPackArrayBuilder guard(&collections);
collections.add(VPackValue(arangodb::ServerState::instance()->getId())); collections.add(VPackValue(arangodb::ServerState::instance()->getId()));
@ -213,14 +219,14 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
std::vector<std::string> path = {maintenance::PHASE_TWO, "agency"}; std::vector<std::string> path = {maintenance::PHASE_TWO, "agency"};
if (report.hasKey(path) && report.get(path).isObject()) { if (report.hasKey(path) && report.get(path).isObject()) {
auto agency = report.get(path); auto agency = report.get(path);
LOG_TOPIC(DEBUG, Logger::MAINTENANCE) LOG_TOPIC(DEBUG, Logger::MAINTENANCE)
<< "DBServerAgencySync reporting to Current: " << agency.toJson(); << "DBServerAgencySync reporting to Current: " << agency.toJson();
// Report to current // Report to current
if (!agency.isEmptyObject()) { if (!agency.isEmptyObject()) {
std::vector<AgencyOperation> operations; std::vector<AgencyOperation> operations;
for (auto const& ao : VPackObjectIterator(agency)) { for (auto const& ao : VPackObjectIterator(agency)) {
auto const key = ao.key.copyString(); auto const key = ao.key.copyString();
@ -248,7 +254,7 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
} }
} }
if (tmp.ok()) { if (tmp.ok()) {
result = DBServerAgencySyncResult( result = DBServerAgencySyncResult(
true, true,

View File

@ -39,6 +39,7 @@ class FollowerInfo {
arangodb::LogicalCollection* _docColl; arangodb::LogicalCollection* _docColl;
std::string _theLeader; std::string _theLeader;
// if the latter is empty, then we are leading // if the latter is empty, then we are leading
bool _theLeaderTouched;
public: public:
@ -83,6 +84,7 @@ class FollowerInfo {
void setTheLeader(std::string const& who) { void setTheLeader(std::string const& who) {
MUTEX_LOCKER(locker, _mutex); MUTEX_LOCKER(locker, _mutex);
_theLeader = who; _theLeader = who;
_theLeaderTouched = true;
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@ -94,6 +96,15 @@ class FollowerInfo {
return _theLeader; return _theLeader;
} }
//////////////////////////////////////////////////////////////////////////////
/// @brief see if leader was explicitly set
//////////////////////////////////////////////////////////////////////////////
bool getLeaderTouched() const {
MUTEX_LOCKER(locker, _mutex);
return _theLeaderTouched;
}
}; };
} // end namespace arangodb } // end namespace arangodb

View File

@ -860,27 +860,35 @@ arangodb::Result arangodb::maintenance::reportInCurrent(
if (cur.hasKey(servers)) { if (cur.hasKey(servers)) {
auto s = cur.get(servers); auto s = cur.get(servers);
if (s.isArray() && cur.get(servers)[0].copyString() == serverId) { if (s.isArray() && cur.get(servers)[0].copyString() == serverId) {
// we were previously leader and we are done resigning.
// update current and let supervision handle the rest // We are in the situation after a restart, that we do not know
VPackBuilder ns; // who the leader is because FollowerInfo is not updated yet.
{ VPackArrayBuilder a(&ns); // Hence, in the case we are the Leader in Plan but do not
if (s.isArray()) { // know it yet, do nothing here.
bool front = true; if (shSlice.get("theLeaderTouched").isTrue()) {
for (auto const& i : VPackArrayIterator(s)) {
ns.add(VPackValue((!front) ? i.copyString() : UNDERSCORE + i.copyString())); // we were previously leader and we are done resigning.
front = false; // update current and let supervision handle the rest
} VPackBuilder ns;
}} { VPackArrayBuilder a(&ns);
report.add( if (s.isArray()) {
VPackValue( bool front = true;
CURRENT_COLLECTIONS + dbName + "/" + colName + "/" + shName for (auto const& i : VPackArrayIterator(s)) {
+ "/" + SERVERS)); ns.add(VPackValue((!front) ? i.copyString() : UNDERSCORE + i.copyString()));
front = false;
}
}}
report.add(
VPackValue(
CURRENT_COLLECTIONS + dbName + "/" + colName + "/" + shName
+ "/" + SERVERS));
{ VPackObjectBuilder o(&report); { VPackObjectBuilder o(&report);
report.add(OP, VP_SET); report.add(OP, VP_SET);
report.add("payload", ns.slice()); } report.add("payload", ns.slice()); }
} }
} }
}
} }
} }
} }

View File

@ -278,7 +278,7 @@ bool transaction::Methods::removeStatusChangeCallback(
} else if (!_state) { } else if (!_state) {
return false; // nothing to add to return false; // nothing to add to
} }
auto* statusChangeCallbacks = getStatusChangeCallbacks(*_state, false); auto* statusChangeCallbacks = getStatusChangeCallbacks(*_state, false);
if (statusChangeCallbacks) { if (statusChangeCallbacks) {
auto it = std::find(statusChangeCallbacks->begin(), statusChangeCallbacks->end(), callback); auto it = std::find(statusChangeCallbacks->begin(), statusChangeCallbacks->end(), callback);
@ -554,8 +554,8 @@ bool transaction::Methods::sortOrs(
root->removeMemberUnchecked(0); root->removeMemberUnchecked(0);
} }
std::unordered_set<std::string> seenIndexConditions; std::unordered_set<std::string> seenIndexConditions;
// and rebuild // and rebuild
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
if (parts[i].operatorType == if (parts[i].operatorType ==
@ -781,7 +781,7 @@ transaction::Methods::Methods(
).release(); ).release();
TRI_ASSERT(_state != nullptr); TRI_ASSERT(_state != nullptr);
TRI_ASSERT(_state->isTopLevelTransaction()); TRI_ASSERT(_state->isTopLevelTransaction());
// register the transaction in the context // register the transaction in the context
_transactionContextPtr->registerTransaction(_state); _transactionContextPtr->registerTransaction(_state);
} }
@ -1649,7 +1649,7 @@ OperationResult transaction::Methods::insertLocal(
// If we maybe will overwrite, we cannot do single document operations, thus: // If we maybe will overwrite, we cannot do single document operations, thus:
// options.overwrite => !needsLock // options.overwrite => !needsLock
TRI_ASSERT(!options.overwrite || !needsLock); TRI_ASSERT(!options.overwrite || !needsLock);
bool const isMMFiles = EngineSelectorFeature::isMMFiles(); bool const isMMFiles = EngineSelectorFeature::isMMFiles();
// Assert my assumption that we don't have a lock only with mmfiles single // Assert my assumption that we don't have a lock only with mmfiles single
@ -1754,7 +1754,7 @@ OperationResult transaction::Methods::insertLocal(
VPackBuilder resultBuilder; VPackBuilder resultBuilder;
ManagedDocumentResult documentResult; ManagedDocumentResult documentResult;
TRI_voc_tick_t maxTick = 0; TRI_voc_tick_t maxTick = 0;
auto workForOneDocument = [&](VPackSlice const value) -> Result { auto workForOneDocument = [&](VPackSlice const value) -> Result {
if (!value.isObject()) { if (!value.isObject()) {
return Result(TRI_ERROR_ARANGO_DOCUMENT_TYPE_INVALID); return Result(TRI_ERROR_ARANGO_DOCUMENT_TYPE_INVALID);
@ -1856,7 +1856,7 @@ OperationResult transaction::Methods::insertLocal(
return OperationResult{std::move(res), options}; return OperationResult{std::move(res), options};
} }
} }
// wait for operation(s) to be synced to disk here. On rocksdb maxTick == 0 // wait for operation(s) to be synced to disk here. On rocksdb maxTick == 0
if (res.ok() && options.waitForSync && maxTick > 0 && if (res.ok() && options.waitForSync && maxTick > 0 &&
isSingleOperationTransaction()) { isSingleOperationTransaction()) {
@ -2334,7 +2334,7 @@ OperationResult transaction::Methods::removeLocal(
TRI_ASSERT(followers == nullptr); TRI_ASSERT(followers == nullptr);
followers = collection->followers()->get(); followers = collection->followers()->get();
} }
// we may need to lock individual keys here so we can ensure that even with concurrent // we may need to lock individual keys here so we can ensure that even with concurrent
// operations on the same keys we have the same order of data application on leader // operations on the same keys we have the same order of data application on leader
// and followers // and followers
@ -2469,7 +2469,7 @@ OperationResult transaction::Methods::removeLocal(
return OperationResult{std::move(res), options}; return OperationResult{std::move(res), options};
} }
} }
// wait for operation(s) to be synced to disk here. On rocksdb maxTick == 0 // wait for operation(s) to be synced to disk here. On rocksdb maxTick == 0
if (res.ok() && options.waitForSync && maxTick > 0 && if (res.ok() && options.waitForSync && maxTick > 0 &&
isSingleOperationTransaction()) { isSingleOperationTransaction()) {
@ -2594,7 +2594,7 @@ OperationResult transaction::Methods::truncateLocal(
if (!options.isSynchronousReplicationFrom.empty()) { if (!options.isSynchronousReplicationFrom.empty()) {
return OperationResult(TRI_ERROR_CLUSTER_SHARD_LEADER_REFUSES_REPLICATION); return OperationResult(TRI_ERROR_CLUSTER_SHARD_LEADER_REFUSES_REPLICATION);
} }
// fetch followers // fetch followers
followers = followerInfo->get(); followers = followerInfo->get();
if (followers->size() > 0) { if (followers->size() > 0) {
@ -2724,14 +2724,14 @@ OperationResult transaction::Methods::count(std::string const& collectionName,
/// @brief count the number of documents in a collection /// @brief count the number of documents in a collection
OperationResult transaction::Methods::countCoordinator( OperationResult transaction::Methods::countCoordinator(
std::string const& collectionName, transaction::CountType type) { std::string const& collectionName, transaction::CountType type) {
ClusterInfo* ci = ClusterInfo::instance(); ClusterInfo* ci = ClusterInfo::instance();
auto cc = ClusterComm::instance(); auto cc = ClusterComm::instance();
if (cc == nullptr) { if (cc == nullptr) {
// nullptr happens only during controlled shutdown // nullptr happens only during controlled shutdown
return OperationResult(TRI_ERROR_SHUTTING_DOWN); return OperationResult(TRI_ERROR_SHUTTING_DOWN);
} }
// First determine the collection ID from the name: // First determine the collection ID from the name:
auto collinfo = ci->getCollectionNT(vocbase().name(), collectionName); auto collinfo = ci->getCollectionNT(vocbase().name(), collectionName);
if (collinfo == nullptr) { if (collinfo == nullptr) {
@ -2744,7 +2744,7 @@ OperationResult transaction::Methods::countCoordinator(
#endif #endif
OperationResult transaction::Methods::countCoordinatorHelper( OperationResult transaction::Methods::countCoordinatorHelper(
std::shared_ptr<LogicalCollection> const& collinfo, std::string const& collectionName, transaction::CountType type) { std::shared_ptr<LogicalCollection> const& collinfo, std::string const& collectionName, transaction::CountType type) {
TRI_ASSERT(collinfo != nullptr); TRI_ASSERT(collinfo != nullptr);
auto& cache = collinfo->countCache(); auto& cache = collinfo->countCache();
@ -2760,24 +2760,24 @@ OperationResult transaction::Methods::countCoordinatorHelper(
// no cache hit, or detailed results requested // no cache hit, or detailed results requested
std::vector<std::pair<std::string, uint64_t>> count; std::vector<std::pair<std::string, uint64_t>> count;
auto res = arangodb::countOnCoordinator( auto res = arangodb::countOnCoordinator(
vocbase().name(), collectionName, *this, count vocbase().name(), collectionName, *this, count
); );
if (res != TRI_ERROR_NO_ERROR) { if (res != TRI_ERROR_NO_ERROR) {
return OperationResult(res); return OperationResult(res);
} }
int64_t total = 0; int64_t total = 0;
OperationResult opRes = buildCountResult(count, type, total); OperationResult opRes = buildCountResult(count, type, total);
cache.store(total); cache.store(total);
return opRes; return opRes;
} }
// cache hit! // cache hit!
TRI_ASSERT(documents >= 0); TRI_ASSERT(documents >= 0);
TRI_ASSERT(type != transaction::CountType::Detailed); TRI_ASSERT(type != transaction::CountType::Detailed);
// return number from cache // return number from cache
VPackBuilder resultBuilder; VPackBuilder resultBuilder;
resultBuilder.add(VPackValue(documents)); resultBuilder.add(VPackValue(documents));
return OperationResult(Result(), resultBuilder.buffer(), nullptr); return OperationResult(Result(), resultBuilder.buffer(), nullptr);
@ -3279,7 +3279,7 @@ transaction::Methods::indexesForCollectionCoordinator(
std::string const& name) const { std::string const& name) const {
auto clusterInfo = arangodb::ClusterInfo::instance(); auto clusterInfo = arangodb::ClusterInfo::instance();
auto collection = clusterInfo->getCollection(vocbase().name(), name); auto collection = clusterInfo->getCollection(vocbase().name(), name);
// update selectivity estimates if they were expired // update selectivity estimates if they were expired
collection->clusterIndexEstimates(true); collection->clusterIndexEstimates(true);
return collection->getIndexes(); return collection->getIndexes();
@ -3472,17 +3472,23 @@ Result Methods::replicateOperations(
double const timeout = chooseTimeout(count, body->size() * followers->size()); double const timeout = chooseTimeout(count, body->size() * followers->size());
size_t nrDone = 0; size_t nrDone = 0;
cc->performRequests(requests, timeout, nrDone, Logger::REPLICATION, false);
// If any would-be-follower refused to follow there must be a
// new leader in the meantime, in this case we must not allow
// this operation to succeed, we simply return with a refusal
// error (note that we use the follower version, since we have
// lost leadership):
if (findRefusal(requests)) {
return Result{TRI_ERROR_CLUSTER_SHARD_LEADER_RESIGNED};
}
// Otherwise we drop all followers that were not successful: cc->performRequests(requests,
timeout,
nrDone, Logger::REPLICATION, false);
// If any would-be-follower refused to follow there are two possiblities:
// (1) there is a new leader in the meantime, or
// (2) the follower was restarted and forgot that it is a follower.
// Unfortunately, we cannot know which is the case.
// In case (1) case we must not allow
// this operation to succeed, since the new leader is now responsible.
// In case (2) we at least have to drop the follower such that it
// resyncs and we can be sure that it is in sync again.
// Therefore, we drop the follower here (just in case), and refuse to
// return with a refusal error (note that we use the follower version,
// since we have lost leadership):
// We drop all followers that were not successful:
for (size_t i = 0; i < followers->size(); ++i) { for (size_t i = 0; i < followers->size(); ++i) {
bool replicationWorked = bool replicationWorked =
requests[i].done && requests[i].done &&
@ -3510,6 +3516,9 @@ Result Methods::replicateOperations(
} }
} }
// we return "ok" here still. if (findRefusal(requests)) {
return Result{TRI_ERROR_CLUSTER_SHARD_LEADER_RESIGNED};
}
return Result{}; return Result{};
} }