diff --git a/CHANGELOG b/CHANGELOG index 8d2634211a..85bfb5a1b4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,11 @@ devel * Fixed agency bug with election lock step. +* Fixed some error reporting and logging in Maintenance. + +* Fixed an error condition in which an ex-leader for a shard still believed + to be the leader and wrongly reported to Current. + v3.5.0-rc.4 (2019-06-15) ------------------------ diff --git a/arangod/Cluster/DBServerAgencySync.cpp b/arangod/Cluster/DBServerAgencySync.cpp index 5146a70afb..6bdc09e467 100644 --- a/arangod/Cluster/DBServerAgencySync.cpp +++ b/arangod/Cluster/DBServerAgencySync.cpp @@ -183,11 +183,6 @@ DBServerAgencySyncResult DBServerAgencySync::execute() { VPackBuilder local; Result glc = getLocalCollections(local); if (!glc.ok()) { - // FIXMEMAINTENANCE: if this fails here, then result is empty, is this - // intended? I also notice that there is another Result object "tmp" - // that is going to eat bad results in few lines later. Again, is - // that the correct action? If so, how about supporting comments in - // the code for both. result.errorMessage = "Could not do getLocalCollections for phase 1: '"; result.errorMessage.append(glc.errorMessage()).append("'"); return result; @@ -315,6 +310,30 @@ DBServerAgencySyncResult DBServerAgencySync::execute() { result = DBServerAgencySyncResult(false, "Error in phase 2: " + tmp.errorMessage(), 0, 0); } + } else { + // This code should never run, it is only there to debug problems if + // we mess up in other places. 
+ result.errorMessage = "Report from phase 1 and 2 was no object."; + try { + std::string json = report.toJson(); + LOG_TOPIC("65fde", WARN, Logger::MAINTENANCE) << "Report from phase 1 and 2 was: " << json; + } catch(std::exception const& exc) { + LOG_TOPIC("54de2", WARN, Logger::MAINTENANCE) + << "Report from phase 1 and 2 could not be dumped to JSON, error: " + << exc.what() << ", head byte:" << report.head(); + uint64_t l = 0; + try { + l = report.byteSize(); + LOG_TOPIC("54dda", WARN, Logger::MAINTENANCE) + << "Report from phase 1 and 2, byte size: " << l; + LOG_TOPIC("67421", WARN, Logger::MAINTENANCE) + << "Bytes: " + << arangodb::basics::StringUtils::encodeHex((char const*) report.start(), l); + } catch(...) { + LOG_TOPIC("76124", WARN, Logger::MAINTENANCE) + << "Report from phase 1 and 2, byte size throws."; + } + } } } else { result.errorMessage = "Report from phase 1 and 2 was not closed."; diff --git a/arangod/Cluster/DBServerAgencySync.h b/arangod/Cluster/DBServerAgencySync.h index d332271e10..4bea7bdc00 100644 --- a/arangod/Cluster/DBServerAgencySync.h +++ b/arangod/Cluster/DBServerAgencySync.h @@ -29,6 +29,7 @@ #include "Basics/VelocyPackHelper.h" namespace arangodb { + class HeartbeatThread; struct DBServerAgencySyncResult { @@ -45,9 +46,6 @@ struct DBServerAgencySyncResult { DBServerAgencySyncResult(bool s, std::string const& e, uint64_t p, uint64_t c) : success(s), errorMessage(e), planVersion(p), currentVersion(c) {} - - DBServerAgencySyncResult(DBServerAgencySyncResult const& other) = default; - DBServerAgencySyncResult& operator=(DBServerAgencySyncResult const& other) = default; }; class DBServerAgencySync { diff --git a/arangod/Cluster/FollowerInfo.cpp b/arangod/Cluster/FollowerInfo.cpp index fa44234b23..b12bcb743e 100644 --- a/arangod/Cluster/FollowerInfo.cpp +++ b/arangod/Cluster/FollowerInfo.cpp @@ -116,24 +116,31 @@ Result FollowerInfo::add(ServerID const& sid) { } #endif } + std::string planId = std::to_string(_docColl->planId()); // Now 
tell the agency, path is // Current/Collections/// - std::string path = "Current/Collections/"; - path += _docColl->vocbase().name(); - path += "/"; - path += std::to_string(_docColl->planId()); - path += "/"; - path += _docColl->name(); + std::string path = _docColl->vocbase().name() + "/" + planId + "/"; + std::string curPath = "Current/Collections/" + path + _docColl->name(); + // We also need the corresponding Plan entry, path is + // Plan/Collections///shards/ + std::string planPath = "Plan/Collections/" + path + "shards/" + + _docColl->name(); AgencyComm ac; double startTime = TRI_microtime(); do { - AgencyCommResult res = ac.getValues(path); + AgencyReadTransaction trx(std::vector( + {AgencyCommManager::path(planPath), AgencyCommManager::path(curPath)})); + AgencyCommResult res = ac.sendTransactionWithFailover(trx); if (res.successful()) { + // Let's look at the results, note that both can be None! + velocypack::Slice planEntry = res.slice()[0].get( + std::vector( + {AgencyCommManager::path(), "Plan", "Collections", + _docColl->vocbase().name(), planId, "shards", _docColl->name()})); velocypack::Slice currentEntry = res.slice()[0].get(std::vector( {AgencyCommManager::path(), "Current", "Collections", - _docColl->vocbase().name(), std::to_string(_docColl->planId()), - _docColl->name()})); + _docColl->vocbase().name(), planId, _docColl->name()})); if (!currentEntry.isObject()) { LOG_TOPIC("b753d", ERR, Logger::CLUSTER) @@ -142,31 +149,47 @@ Result FollowerInfo::add(ServerID const& sid) { LOG_TOPIC("568de", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson(); } } else { - auto newValue = newShardEntry(currentEntry, sid, true); - std::string key = "Current/Collections/" + _docColl->vocbase().name() + - "/" + std::to_string(_docColl->planId()) + "/" + - _docColl->name(); - AgencyWriteTransaction trx; - trx.preconditions.push_back( - AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry)); - trx.operations.push_back( - AgencyOperation(key, 
AgencyValueOperationType::SET, newValue.slice())); - trx.operations.push_back( - AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP)); - AgencyCommResult res2 = ac.sendTransactionWithFailover(trx); - if (res2.successful()) { - // we are finished! - return {TRI_ERROR_NO_ERROR}; + if (!planEntry.isArray() || planEntry.length() == 0 || + !planEntry[0].isString() || + !planEntry[0].isEqualString(ServerState::instance()->getId())) { + LOG_TOPIC("54555", INFO, Logger::CLUSTER) + << "FollowerInfo::add, did not find myself in Plan: " << path + << " (can happen when the leader changed recently)."; + if (!planEntry.isNone()) { + LOG_TOPIC("66762", INFO, Logger::CLUSTER) << "Found: " << planEntry.toJson(); + } + return {TRI_ERROR_CLUSTER_NOT_LEADER}; + } else { + auto newValue = newShardEntry(currentEntry, sid, true); + AgencyWriteTransaction trx; + trx.preconditions.push_back( + AgencyPrecondition(curPath, AgencyPrecondition::Type::VALUE, currentEntry)); + trx.preconditions.push_back( + AgencyPrecondition(planPath, AgencyPrecondition::Type::VALUE, planEntry)); + trx.operations.push_back( + AgencyOperation(curPath, AgencyValueOperationType::SET, newValue.slice())); + trx.operations.push_back( + AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP)); + AgencyCommResult res2 = ac.sendTransactionWithFailover(trx); + if (res2.successful()) { + return {TRI_ERROR_NO_ERROR}; + } else { + LOG_TOPIC("daeda", WARN, Logger::CLUSTER) + << "FollowerInfo::add, could not cas key " << path; + } } } } else { - LOG_TOPIC("dcf54", ERR, Logger::CLUSTER) - << "FollowerInfo::add, could not read " << path << " in agency"; + LOG_TOPIC("dcf54", WARN, Logger::CLUSTER) + << "FollowerInfo::add, could not read " << planPath + << " and " << curPath << " in agency."; } std::this_thread::sleep_for(std::chrono::microseconds(500000)); - } while (TRI_microtime() < startTime + 30 && + } while (TRI_microtime() < startTime + 3600 && 
application_features::ApplicationServer::isRetryOK()); - + // This is important, give it 1h if needed. We really do not want to get + // into the position to not accept a shard getting-in-sync just because + // we cannot talk to the agency temporarily. int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN; std::string errorMessage = "unable to add follower in agency, timeout in agency CAS operation for key " + path + ": " + TRI_errno_string(errorCode); LOG_TOPIC("6295b", ERR, Logger::CLUSTER) << errorMessage; @@ -177,7 +200,9 @@ Result FollowerInfo::add(ServerID const& sid) { //////////////////////////////////////////////////////////////////////////////// /// @brief remove a follower from a shard, this is only done by the /// server if a synchronous replication request fails. This reports to -/// the agency under `/Current` but in asynchronous "fire-and-forget" +/// the agency under `/Current`. This method can fail, which is critical, +/// because we cannot drop a follower ourselves and not report this to the +/// agency, since then a failover to a not-in-sync follower might happen. /// way. The method fails silently, if the follower information has /// since been dropped (see `dropFollowerInfo` below). 
//////////////////////////////////////////////////////////////////////////////// @@ -229,19 +254,28 @@ Result FollowerInfo::remove(ServerID const& sid) { return {TRI_ERROR_NO_ERROR}; } #endif + + std::string planId = std::to_string(_docColl->planId()); // Now tell the agency, path is // Current/Collections/// - std::string path = "Current/Collections/"; - path += _docColl->vocbase().name(); - path += "/"; - path += std::to_string(_docColl->planId()); - path += "/"; - path += _docColl->name(); + std::string path = _docColl->vocbase().name() + "/" + planId + "/"; + std::string curPath = "Current/Collections/" + path + _docColl->name(); + // We also need the corresponding Plan entry, path is + // Plan/Collections///shards/ + std::string planPath = "Plan/Collections/" + path + "shards/" + + _docColl->name(); AgencyComm ac; double startTime = TRI_microtime(); do { - AgencyCommResult res = ac.getValues(path); + AgencyReadTransaction trx(std::vector( + {AgencyCommManager::path(planPath), AgencyCommManager::path(curPath)})); + AgencyCommResult res = ac.sendTransactionWithFailover(trx); if (res.successful()) { + // Let's look at the results, note that both can be None! 
+ velocypack::Slice planEntry = res.slice()[0].get( + std::vector( + {AgencyCommManager::path(), "Plan", "Collections", + _docColl->vocbase().name(), planId, "shards", _docColl->name()})); velocypack::Slice currentEntry = res.slice()[0].get(std::vector( {AgencyCommManager::path(), "Current", "Collections", _docColl->vocbase().name(), std::to_string(_docColl->planId()), @@ -254,34 +288,61 @@ Result FollowerInfo::remove(ServerID const& sid) { LOG_TOPIC("57c84", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson(); } } else { - auto newValue = newShardEntry(currentEntry, sid, false); - std::string key = "Current/Collections/" + _docColl->vocbase().name() + - "/" + std::to_string(_docColl->planId()) + "/" + - _docColl->name(); - AgencyWriteTransaction trx; - trx.preconditions.push_back( - AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry)); - trx.operations.push_back( - AgencyOperation(key, AgencyValueOperationType::SET, newValue.slice())); - trx.operations.push_back( - AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP)); - AgencyCommResult res2 = ac.sendTransactionWithFailover(trx); - if (res2.successful()) { - // we are finished - LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from " + if (!planEntry.isArray() || planEntry.length() == 0 || + !planEntry[0].isString() || + !planEntry[0].isEqualString(ServerState::instance()->getId())) { + LOG_TOPIC("42231", INFO, Logger::CLUSTER) + << "FollowerInfo::remove, did not find myself in Plan: " << path + << " (can happen when the leader changed recently)."; + if (!planEntry.isNone()) { + LOG_TOPIC("ffede", INFO, Logger::CLUSTER) << "Found: " << planEntry.toJson(); + } + return {TRI_ERROR_CLUSTER_NOT_LEADER}; + } else { + auto newValue = newShardEntry(currentEntry, sid, false); + std::string key = "Current/Collections/" + _docColl->vocbase().name() + + "/" + std::to_string(_docColl->planId()) + "/" + + _docColl->name(); + AgencyWriteTransaction 
trx; + trx.preconditions.push_back( + AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry)); + trx.preconditions.push_back( + AgencyPrecondition(planPath, AgencyPrecondition::Type::VALUE, planEntry)); + trx.operations.push_back( + AgencyOperation(key, AgencyValueOperationType::SET, newValue.slice())); + trx.operations.push_back( + AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP)); + AgencyCommResult res2 = ac.sendTransactionWithFailover(trx); + if (res2.successful()) { + // we are finished + LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from " << _docColl->name() << "succeeded"; - return {TRI_ERROR_NO_ERROR}; + return {TRI_ERROR_NO_ERROR}; + } else { + LOG_TOPIC("67778", WARN, Logger::CLUSTER) + << "FollowerInfo::remove, could not cas key " << path + << ". status code: " << res2._statusCode + << ", incriminating body: " << res2.bodyRef(); + } } } } else { - LOG_TOPIC("b7333", ERR, Logger::CLUSTER) - << "FollowerInfo::remove, could not read " << path << " in agency."; + LOG_TOPIC("b7333", WARN, Logger::CLUSTER) + << "FollowerInfo::remove, could not read " << planPath + << " and " << curPath << " in agency."; } std::this_thread::sleep_for(std::chrono::microseconds(500000)); - } while (TRI_microtime() < startTime + 30 && + } while (TRI_microtime() < startTime + 7200 && application_features::ApplicationServer::isRetryOK()); - - // rollback + // This is important, give it 2h if needed. We really do not want to get + // into the position to fail to drop a follower, just because we cannot + // talk to the agency temporarily. The worst would be to drop the follower + // locally but not report the fact to the agency. The second worst is to + // not be able to drop the follower, despite the fact that a replication + // was not successful. All else is less dramatic. Therefore we try for + // a long time. 
+ + // rollback: _followers = _oldFollowers; int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN; diff --git a/arangod/Cluster/Maintenance.cpp b/arangod/Cluster/Maintenance.cpp index db3f8092cb..2574d030ef 100644 --- a/arangod/Cluster/Maintenance.cpp +++ b/arangod/Cluster/Maintenance.cpp @@ -527,6 +527,7 @@ arangodb::Result arangodb::maintenance::executePlan(VPackSlice const& plan, std::vector actions; report.add(VPackValue(AGENCY)); { + // TODO: Just putting an empty array does not make any sense here! VPackArrayBuilder a(&report); diffPlanLocal(plan, local, serverId, errors, feature, actions); } @@ -745,6 +746,7 @@ static VPackBuilder assembleLocalCollectionInfo( errorMsg += database; errorMsg += ", exception: "; errorMsg += e.what(); + errorMsg += " (this is expected if the database was recently deleted)."; LOG_TOPIC("7fe5d", WARN, Logger::MAINTENANCE) << errorMsg; { VPackObjectBuilder o(&ret); } return ret; @@ -790,7 +792,7 @@ static VPackBuilder assembleLocalDatabaseInfo(std::string const& database, return ret; } catch (std::exception const& e) { - ret.clear(); + ret.clear(); // In case the above has mid air collision. 
std::string errorMsg( "Maintenance::assembleLocalDatabaseInfo: Failed to lookup database "); errorMsg += database; @@ -840,6 +842,30 @@ arangodb::Result arangodb::maintenance::reportInCurrent( VPackBuilder error; if (shSlice.get(THE_LEADER).copyString().empty()) { // Leader + // Check that we are the leader of this shard in the Plan, together + // with the precondition below that the Plan is unchanged, this ensures + // that we only ever modify Current if we are the leader in the Plan: + auto const planPath = std::vector{dbName, colName, "shards", shName}; + if (!pdbs.hasKey(planPath)) { + LOG_TOPIC("43242", DEBUG, Logger::MAINTENANCE) + << "Ooops, we have a shard for which we believe to be the leader," + " but the Plan does not have it any more, we do not report in " + "Current about this, database: " << dbName + << ", shard: " << shName; + continue; + } + + VPackSlice thePlanList = pdbs.get(planPath); + if (!thePlanList.isArray() || thePlanList.length() == 0 || + !thePlanList[0].isString() || + !thePlanList[0].isEqualStringUnchecked(serverId)) { + LOG_TOPIC("87776", DEBUG, Logger::MAINTENANCE) + << "Ooops, we have a shard for which we believe to be the leader," + " but the Plan says otherwise, we do not report in Current " + "about this, database: " << dbName << ", shard: " << shName; + continue; + } + auto const localCollectionInfo = assembleLocalCollectionInfo(shSlice, shardMap.slice().get(shName), dbName, shName, serverId, allErrors); @@ -854,31 +880,21 @@ arangodb::Result arangodb::maintenance::reportInCurrent( auto inCurrent = cur.hasKey(cp); if (!inCurrent || !equivalent(localCollectionInfo.slice(), cur.get(cp))) { report.add(VPackValue(CURRENT_COLLECTIONS + dbName + "/" + colName + "/" + shName)); + { VPackObjectBuilder o(&report); report.add(OP, VP_SET); // Report new current entry ... report.add("payload", localCollectionInfo.slice()); // ... 
if and only if plan for this shard has changed in the meantime - try { - // Try to add a precondition, just in case we catch the exception. - // Even if the Plan entry is gone by now, it is still OK to - // report the Current value, it will be deleted in due course. - // It is the case that the Plan value is there but has changed - // that we want to protect against. - VPackSlice oldValue = pdbs.get(std::vector{dbName, colName, "shards", shName}); - if (!oldValue.isNone()) { - report.add(VPackValue("precondition")); - { - VPackObjectBuilder p(&report); - report.add( - PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName, - oldValue); - } - } - } catch(...) { + // Add a precondition: + report.add(VPackValue("precondition")); + { + VPackObjectBuilder p(&report); + report.add( + PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName, + thePlanList); } - } } } else { // Follower @@ -894,7 +910,31 @@ arangodb::Result arangodb::maintenance::reportInCurrent( // know it yet, do nothing here. if (shSlice.get("theLeaderTouched").isTrue()) { // we were previously leader and we are done resigning. 
- // update current and let supervision handle the rest + // update current and let supervision handle the rest, however + // check that we are in the Plan a leader which is supposed to + // resign and add a precondition that this is still the case: + + auto const planPath = std::vector{dbName, colName, "shards", shName}; + if (!pdbs.hasKey(planPath)) { + LOG_TOPIC("65432", DEBUG, Logger::MAINTENANCE) + << "Ooops, we have a shard for which we believe that we " + "just resigned, but the Plan does not have it any more," + " we do not report in Current about this, database: " + << dbName << ", shard: " << shName; + continue; + } + + VPackSlice thePlanList = pdbs.get(planPath); + if (!thePlanList.isArray() || thePlanList.length() == 0 || + !thePlanList[0].isString() || + !thePlanList[0].isEqualStringUnchecked(UNDERSCORE + serverId)) { + LOG_TOPIC("99987", DEBUG, Logger::MAINTENANCE) + << "Ooops, we have a shard for which we believe that we " + "have just resigned, but the Plan says otherwise, we " + "do not report in Current about this, database: " + << dbName << ", shard: " << shName; + continue; + } VPackBuilder ns; { VPackArrayBuilder a(&ns); @@ -914,6 +954,12 @@ arangodb::Result arangodb::maintenance::reportInCurrent( VPackObjectBuilder o(&report); report.add(OP, VP_SET); report.add("payload", ns.slice()); + { + VPackObjectBuilder p(&report, "precondition"); + report.add( + PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName, + thePlanList); + } } } } diff --git a/arangod/Transaction/Methods.cpp b/arangod/Transaction/Methods.cpp index 27d29657f3..b2419bfe8c 100644 --- a/arangod/Transaction/Methods.cpp +++ b/arangod/Transaction/Methods.cpp @@ -2694,7 +2694,7 @@ OperationResult transaction::Methods::truncateLocal(std::string const& collectio << "truncateLocal: dropping follower " << (*followers)[i] << " for shard " << collectionName; } else { - LOG_TOPIC("359bc", ERR, Logger::REPLICATION) + LOG_TOPIC("359bc", WARN, Logger::REPLICATION) << "truncateLocal: 
could not drop follower " << (*followers)[i] << " for shard " << collectionName << ": " << res.errorMessage(); THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER); diff --git a/tests/Agency/FailedLeaderTest.cpp b/tests/Agency/FailedLeaderTest.cpp index 0b93d16efa..3f6993166c 100644 --- a/tests/Agency/FailedLeaderTest.cpp +++ b/tests/Agency/FailedLeaderTest.cpp @@ -405,7 +405,6 @@ TEST_F(FailedLeaderTest, job_must_not_be_started_if_distributeshardslike_shard_i } else if (path == "/arango/Target/ToDo") { builder->add("1", createBuilder(todo).slice()); } - builder->close(); } else { builder->add(s); } @@ -932,7 +931,6 @@ TEST_F(FailedLeaderTest, when_everything_is_finished_there_should_be_cleanup) { } builder->add("1", jobBuilder.slice()); } - builder->close(); } else { if (path == "/arango/Current/Collections/" + DATABASE + "/" + COLLECTION + "/" + SHARD + "/servers") { diff --git a/tests/Agency/FailedServerTest.cpp b/tests/Agency/FailedServerTest.cpp index 72c08e0a14..dc3e45ea66 100644 --- a/tests/Agency/FailedServerTest.cpp +++ b/tests/Agency/FailedServerTest.cpp @@ -302,7 +302,6 @@ TEST_F(FailedServerTest, the_state_is_still_bad_and_faileservers_is_still_in_sna "timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})="; builder->add(jobId, createBuilder(todo).slice()); } - builder->close(); } else { builder->add(s); } @@ -363,7 +362,6 @@ TEST_F(FailedServerTest, the_state_is_still_bad_and_faileservers_is_still_in_sna "timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})="; builder->add(jobId, createBuilder(todo).slice()); } - builder->close(); } else { if (path == "/arango/Supervision/Health/leader/Status") { builder->add("/arango/Supervision/Health/leader/Status", diff --git a/tests/Agency/RemoveFollowerTest.cpp b/tests/Agency/RemoveFollowerTest.cpp index 224e1d8cd6..07be2f8a59 100644 --- a/tests/Agency/RemoveFollowerTest.cpp +++ b/tests/Agency/RemoveFollowerTest.cpp @@ -183,7 +183,6 @@ TEST_F(RemoveFollowerTest, 
collection_still_exists_if_missing_job_is_finished_mo if (path == "/arango/Target/ToDo") { builder->add(jobId, createBuilder(todo).slice()); } - builder->close(); } else { builder->add(s); }