mirror of https://gitee.com/bigwinds/arangodb
Leader updates Current precondition fixes. (#9410)
* Better logging and error reporting. * Preconditions for FollowerInfo. * Preconditions when updating Current as leader. * Change a log level. * Fix unit tests. * CHANGELOG. * LOG_TOPIC ids. * Fix a log id. * Fix Windows compilation.
This commit is contained in:
parent
595728ec1b
commit
7aa0c19026
|
@ -16,6 +16,11 @@ devel
|
||||||
|
|
||||||
* Fixed agency bug with election lock step.
|
* Fixed agency bug with election lock step.
|
||||||
|
|
||||||
|
* Fixed some error reporting and logging in Maintenance.
|
||||||
|
|
||||||
|
* Fixed an error condition in which an ex-leader for a short still believed
|
||||||
|
to be the leader and wrongly reported to Current.
|
||||||
|
|
||||||
|
|
||||||
v3.5.0-rc.4 (2019-06-15)
|
v3.5.0-rc.4 (2019-06-15)
|
||||||
------------------------
|
------------------------
|
||||||
|
|
|
@ -183,11 +183,6 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
|
||||||
VPackBuilder local;
|
VPackBuilder local;
|
||||||
Result glc = getLocalCollections(local);
|
Result glc = getLocalCollections(local);
|
||||||
if (!glc.ok()) {
|
if (!glc.ok()) {
|
||||||
// FIXMEMAINTENANCE: if this fails here, then result is empty, is this
|
|
||||||
// intended? I also notice that there is another Result object "tmp"
|
|
||||||
// that is going to eat bad results in few lines later. Again, is
|
|
||||||
// that the correct action? If so, how about supporting comments in
|
|
||||||
// the code for both.
|
|
||||||
result.errorMessage = "Could not do getLocalCollections for phase 1: '";
|
result.errorMessage = "Could not do getLocalCollections for phase 1: '";
|
||||||
result.errorMessage.append(glc.errorMessage()).append("'");
|
result.errorMessage.append(glc.errorMessage()).append("'");
|
||||||
return result;
|
return result;
|
||||||
|
@ -315,6 +310,30 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
|
||||||
result = DBServerAgencySyncResult(false, "Error in phase 2: " + tmp.errorMessage(),
|
result = DBServerAgencySyncResult(false, "Error in phase 2: " + tmp.errorMessage(),
|
||||||
0, 0);
|
0, 0);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// This code should never run, it is only there to debug problems if
|
||||||
|
// we mess up in other places.
|
||||||
|
result.errorMessage = "Report from phase 1 and 2 was no object.";
|
||||||
|
try {
|
||||||
|
std::string json = report.toJson();
|
||||||
|
LOG_TOPIC("65fde", WARN, Logger::MAINTENANCE) << "Report from phase 1 and 2 was: " << json;
|
||||||
|
} catch(std::exception const& exc) {
|
||||||
|
LOG_TOPIC("54de2", WARN, Logger::MAINTENANCE)
|
||||||
|
<< "Report from phase 1 and 2 could not be dumped to JSON, error: "
|
||||||
|
<< exc.what() << ", head byte:" << report.head();
|
||||||
|
uint64_t l = 0;
|
||||||
|
try {
|
||||||
|
l = report.byteSize();
|
||||||
|
LOG_TOPIC("54dda", WARN, Logger::MAINTENANCE)
|
||||||
|
<< "Report from phase 1 and 2, byte size: " << l;
|
||||||
|
LOG_TOPIC("67421", WARN, Logger::MAINTENANCE)
|
||||||
|
<< "Bytes: "
|
||||||
|
<< arangodb::basics::StringUtils::encodeHex((char const*) report.start(), l);
|
||||||
|
} catch(...) {
|
||||||
|
LOG_TOPIC("76124", WARN, Logger::MAINTENANCE)
|
||||||
|
<< "Report from phase 1 and 2, byte size throws.";
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
result.errorMessage = "Report from phase 1 and 2 was not closed.";
|
result.errorMessage = "Report from phase 1 and 2 was not closed.";
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
#include "Basics/VelocyPackHelper.h"
|
#include "Basics/VelocyPackHelper.h"
|
||||||
|
|
||||||
namespace arangodb {
|
namespace arangodb {
|
||||||
|
|
||||||
class HeartbeatThread;
|
class HeartbeatThread;
|
||||||
|
|
||||||
struct DBServerAgencySyncResult {
|
struct DBServerAgencySyncResult {
|
||||||
|
@ -45,9 +46,6 @@ struct DBServerAgencySyncResult {
|
||||||
|
|
||||||
DBServerAgencySyncResult(bool s, std::string const& e, uint64_t p, uint64_t c)
|
DBServerAgencySyncResult(bool s, std::string const& e, uint64_t p, uint64_t c)
|
||||||
: success(s), errorMessage(e), planVersion(p), currentVersion(c) {}
|
: success(s), errorMessage(e), planVersion(p), currentVersion(c) {}
|
||||||
|
|
||||||
DBServerAgencySyncResult(DBServerAgencySyncResult const& other) = default;
|
|
||||||
DBServerAgencySyncResult& operator=(DBServerAgencySyncResult const& other) = default;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class DBServerAgencySync {
|
class DBServerAgencySync {
|
||||||
|
|
|
@ -116,24 +116,31 @@ Result FollowerInfo::add(ServerID const& sid) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
std::string planId = std::to_string(_docColl->planId());
|
||||||
// Now tell the agency, path is
|
// Now tell the agency, path is
|
||||||
// Current/Collections/<dbName>/<collectionID>/<shardID>
|
// Current/Collections/<dbName>/<collectionID>/<shardID>
|
||||||
std::string path = "Current/Collections/";
|
std::string path = _docColl->vocbase().name() + "/" + planId + "/";
|
||||||
path += _docColl->vocbase().name();
|
std::string curPath = "Current/Collections/" + path + _docColl->name();
|
||||||
path += "/";
|
// We also need the corresponding Plan entry, path is
|
||||||
path += std::to_string(_docColl->planId());
|
// Plan/Collections/<dbName>/<collectionID>/shards/<shardID>
|
||||||
path += "/";
|
std::string planPath = "Plan/Collections/" + path + "shards/" +
|
||||||
path += _docColl->name();
|
_docColl->name();
|
||||||
AgencyComm ac;
|
AgencyComm ac;
|
||||||
double startTime = TRI_microtime();
|
double startTime = TRI_microtime();
|
||||||
do {
|
do {
|
||||||
AgencyCommResult res = ac.getValues(path);
|
AgencyReadTransaction trx(std::vector<std::string>(
|
||||||
|
{AgencyCommManager::path(planPath), AgencyCommManager::path(curPath)}));
|
||||||
|
AgencyCommResult res = ac.sendTransactionWithFailover(trx);
|
||||||
|
|
||||||
if (res.successful()) {
|
if (res.successful()) {
|
||||||
|
// Let's look at the results, note that both can be None!
|
||||||
|
velocypack::Slice planEntry = res.slice()[0].get(
|
||||||
|
std::vector<std::string>(
|
||||||
|
{AgencyCommManager::path(), "Plan", "Collections",
|
||||||
|
_docColl->vocbase().name(), planId, "shards", _docColl->name()}));
|
||||||
velocypack::Slice currentEntry = res.slice()[0].get(std::vector<std::string>(
|
velocypack::Slice currentEntry = res.slice()[0].get(std::vector<std::string>(
|
||||||
{AgencyCommManager::path(), "Current", "Collections",
|
{AgencyCommManager::path(), "Current", "Collections",
|
||||||
_docColl->vocbase().name(), std::to_string(_docColl->planId()),
|
_docColl->vocbase().name(), planId, _docColl->name()}));
|
||||||
_docColl->name()}));
|
|
||||||
|
|
||||||
if (!currentEntry.isObject()) {
|
if (!currentEntry.isObject()) {
|
||||||
LOG_TOPIC("b753d", ERR, Logger::CLUSTER)
|
LOG_TOPIC("b753d", ERR, Logger::CLUSTER)
|
||||||
|
@ -142,31 +149,47 @@ Result FollowerInfo::add(ServerID const& sid) {
|
||||||
LOG_TOPIC("568de", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson();
|
LOG_TOPIC("568de", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto newValue = newShardEntry(currentEntry, sid, true);
|
if (!planEntry.isArray() || planEntry.length() == 0 ||
|
||||||
std::string key = "Current/Collections/" + _docColl->vocbase().name() +
|
!planEntry[0].isString() ||
|
||||||
"/" + std::to_string(_docColl->planId()) + "/" +
|
!planEntry[0].isEqualString(ServerState::instance()->getId())) {
|
||||||
_docColl->name();
|
LOG_TOPIC("54555", INFO, Logger::CLUSTER)
|
||||||
AgencyWriteTransaction trx;
|
<< "FollowerInfo::add, did not find myself in Plan: " << path
|
||||||
trx.preconditions.push_back(
|
<< " (can happen when the leader changed recently).";
|
||||||
AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry));
|
if (!planEntry.isNone()) {
|
||||||
trx.operations.push_back(
|
LOG_TOPIC("66762", INFO, Logger::CLUSTER) << "Found: " << planEntry.toJson();
|
||||||
AgencyOperation(key, AgencyValueOperationType::SET, newValue.slice()));
|
}
|
||||||
trx.operations.push_back(
|
return {TRI_ERROR_CLUSTER_NOT_LEADER};
|
||||||
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
|
} else {
|
||||||
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
|
auto newValue = newShardEntry(currentEntry, sid, true);
|
||||||
if (res2.successful()) {
|
AgencyWriteTransaction trx;
|
||||||
// we are finished!
|
trx.preconditions.push_back(
|
||||||
return {TRI_ERROR_NO_ERROR};
|
AgencyPrecondition(curPath, AgencyPrecondition::Type::VALUE, currentEntry));
|
||||||
|
trx.preconditions.push_back(
|
||||||
|
AgencyPrecondition(planPath, AgencyPrecondition::Type::VALUE, planEntry));
|
||||||
|
trx.operations.push_back(
|
||||||
|
AgencyOperation(curPath, AgencyValueOperationType::SET, newValue.slice()));
|
||||||
|
trx.operations.push_back(
|
||||||
|
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
|
||||||
|
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
|
||||||
|
if (res2.successful()) {
|
||||||
|
return {TRI_ERROR_NO_ERROR};
|
||||||
|
} else {
|
||||||
|
LOG_TOPIC("daeda", WARN, Logger::CLUSTER)
|
||||||
|
<< "FollowerInfo::add, could not cas key " << path;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOG_TOPIC("dcf54", ERR, Logger::CLUSTER)
|
LOG_TOPIC("dcf54", WARN, Logger::CLUSTER)
|
||||||
<< "FollowerInfo::add, could not read " << path << " in agency";
|
<< "FollowerInfo::add, could not read " << planPath
|
||||||
|
<< " and " << curPath << " in agency.";
|
||||||
}
|
}
|
||||||
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
||||||
} while (TRI_microtime() < startTime + 30 &&
|
} while (TRI_microtime() < startTime + 3600 &&
|
||||||
application_features::ApplicationServer::isRetryOK());
|
application_features::ApplicationServer::isRetryOK());
|
||||||
|
// This is important, give it 1h if needed. We really do not want to get
|
||||||
|
// into the position to not accept a shard getting-in-sync just because
|
||||||
|
// we cannot talk to the agency temporarily.
|
||||||
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
|
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
|
||||||
std::string errorMessage = "unable to add follower in agency, timeout in agency CAS operation for key " + path + ": " + TRI_errno_string(errorCode);
|
std::string errorMessage = "unable to add follower in agency, timeout in agency CAS operation for key " + path + ": " + TRI_errno_string(errorCode);
|
||||||
LOG_TOPIC("6295b", ERR, Logger::CLUSTER) << errorMessage;
|
LOG_TOPIC("6295b", ERR, Logger::CLUSTER) << errorMessage;
|
||||||
|
@ -177,7 +200,9 @@ Result FollowerInfo::add(ServerID const& sid) {
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief remove a follower from a shard, this is only done by the
|
/// @brief remove a follower from a shard, this is only done by the
|
||||||
/// server if a synchronous replication request fails. This reports to
|
/// server if a synchronous replication request fails. This reports to
|
||||||
/// the agency under `/Current` but in asynchronous "fire-and-forget"
|
/// the agency under `/Current`. This method can fail, which is critical,
|
||||||
|
/// because we cannot drop a follower ourselves and not report this to the
|
||||||
|
/// agency, since then a failover to a not-in-sync follower might happen.
|
||||||
/// way. The method fails silently, if the follower information has
|
/// way. The method fails silently, if the follower information has
|
||||||
/// since been dropped (see `dropFollowerInfo` below).
|
/// since been dropped (see `dropFollowerInfo` below).
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -229,19 +254,28 @@ Result FollowerInfo::remove(ServerID const& sid) {
|
||||||
return {TRI_ERROR_NO_ERROR};
|
return {TRI_ERROR_NO_ERROR};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
std::string planId = std::to_string(_docColl->planId());
|
||||||
// Now tell the agency, path is
|
// Now tell the agency, path is
|
||||||
// Current/Collections/<dbName>/<collectionID>/<shardID>
|
// Current/Collections/<dbName>/<collectionID>/<shardID>
|
||||||
std::string path = "Current/Collections/";
|
std::string path = _docColl->vocbase().name() + "/" + planId + "/";
|
||||||
path += _docColl->vocbase().name();
|
std::string curPath = "Current/Collections/" + path + _docColl->name();
|
||||||
path += "/";
|
// We also need the corresponding Plan entry, path is
|
||||||
path += std::to_string(_docColl->planId());
|
// Plan/Collections/<dbName>/<collectionID>/shards/<shardID>
|
||||||
path += "/";
|
std::string planPath = "Plan/Collections/" + path + "shards/" +
|
||||||
path += _docColl->name();
|
_docColl->name();
|
||||||
AgencyComm ac;
|
AgencyComm ac;
|
||||||
double startTime = TRI_microtime();
|
double startTime = TRI_microtime();
|
||||||
do {
|
do {
|
||||||
AgencyCommResult res = ac.getValues(path);
|
AgencyReadTransaction trx(std::vector<std::string>(
|
||||||
|
{AgencyCommManager::path(planPath), AgencyCommManager::path(curPath)}));
|
||||||
|
AgencyCommResult res = ac.sendTransactionWithFailover(trx);
|
||||||
if (res.successful()) {
|
if (res.successful()) {
|
||||||
|
// Let's look at the results, note that both can be None!
|
||||||
|
velocypack::Slice planEntry = res.slice()[0].get(
|
||||||
|
std::vector<std::string>(
|
||||||
|
{AgencyCommManager::path(), "Plan", "Collections",
|
||||||
|
_docColl->vocbase().name(), planId, "shards", _docColl->name()}));
|
||||||
velocypack::Slice currentEntry = res.slice()[0].get(std::vector<std::string>(
|
velocypack::Slice currentEntry = res.slice()[0].get(std::vector<std::string>(
|
||||||
{AgencyCommManager::path(), "Current", "Collections",
|
{AgencyCommManager::path(), "Current", "Collections",
|
||||||
_docColl->vocbase().name(), std::to_string(_docColl->planId()),
|
_docColl->vocbase().name(), std::to_string(_docColl->planId()),
|
||||||
|
@ -254,34 +288,61 @@ Result FollowerInfo::remove(ServerID const& sid) {
|
||||||
LOG_TOPIC("57c84", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson();
|
LOG_TOPIC("57c84", ERR, Logger::CLUSTER) << "Found: " << currentEntry.toJson();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto newValue = newShardEntry(currentEntry, sid, false);
|
if (!planEntry.isArray() || planEntry.length() == 0 ||
|
||||||
std::string key = "Current/Collections/" + _docColl->vocbase().name() +
|
!planEntry[0].isString() ||
|
||||||
"/" + std::to_string(_docColl->planId()) + "/" +
|
!planEntry[0].isEqualString(ServerState::instance()->getId())) {
|
||||||
_docColl->name();
|
LOG_TOPIC("42231", INFO, Logger::CLUSTER)
|
||||||
AgencyWriteTransaction trx;
|
<< "FollowerInfo::remove, did not find myself in Plan: " << path
|
||||||
trx.preconditions.push_back(
|
<< " (can happen when the leader changed recently).";
|
||||||
AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry));
|
if (!planEntry.isNone()) {
|
||||||
trx.operations.push_back(
|
LOG_TOPIC("ffede", INFO, Logger::CLUSTER) << "Found: " << planEntry.toJson();
|
||||||
AgencyOperation(key, AgencyValueOperationType::SET, newValue.slice()));
|
}
|
||||||
trx.operations.push_back(
|
return {TRI_ERROR_CLUSTER_NOT_LEADER};
|
||||||
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
|
} else {
|
||||||
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
|
auto newValue = newShardEntry(currentEntry, sid, false);
|
||||||
if (res2.successful()) {
|
std::string key = "Current/Collections/" + _docColl->vocbase().name() +
|
||||||
// we are finished
|
"/" + std::to_string(_docColl->planId()) + "/" +
|
||||||
LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from "
|
_docColl->name();
|
||||||
|
AgencyWriteTransaction trx;
|
||||||
|
trx.preconditions.push_back(
|
||||||
|
AgencyPrecondition(key, AgencyPrecondition::Type::VALUE, currentEntry));
|
||||||
|
trx.preconditions.push_back(
|
||||||
|
AgencyPrecondition(planPath, AgencyPrecondition::Type::VALUE, planEntry));
|
||||||
|
trx.operations.push_back(
|
||||||
|
AgencyOperation(key, AgencyValueOperationType::SET, newValue.slice()));
|
||||||
|
trx.operations.push_back(
|
||||||
|
AgencyOperation("Current/Version", AgencySimpleOperationType::INCREMENT_OP));
|
||||||
|
AgencyCommResult res2 = ac.sendTransactionWithFailover(trx);
|
||||||
|
if (res2.successful()) {
|
||||||
|
// we are finished
|
||||||
|
LOG_TOPIC("be0cb", DEBUG, Logger::CLUSTER) << "Removing follower " << sid << " from "
|
||||||
<< _docColl->name() << "succeeded";
|
<< _docColl->name() << "succeeded";
|
||||||
return {TRI_ERROR_NO_ERROR};
|
return {TRI_ERROR_NO_ERROR};
|
||||||
|
} else {
|
||||||
|
LOG_TOPIC("67778", WARN, Logger::CLUSTER)
|
||||||
|
<< "FollowerInfo::remove, could not cas key " << path
|
||||||
|
<< ". status code: " << res2._statusCode
|
||||||
|
<< ", incriminating body: " << res2.bodyRef();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOG_TOPIC("b7333", ERR, Logger::CLUSTER)
|
LOG_TOPIC("b7333", WARN, Logger::CLUSTER)
|
||||||
<< "FollowerInfo::remove, could not read " << path << " in agency.";
|
<< "FollowerInfo::remove, could not read " << planPath
|
||||||
|
<< " and " << curPath << " in agency.";
|
||||||
}
|
}
|
||||||
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
||||||
} while (TRI_microtime() < startTime + 30 &&
|
} while (TRI_microtime() < startTime + 7200 &&
|
||||||
application_features::ApplicationServer::isRetryOK());
|
application_features::ApplicationServer::isRetryOK());
|
||||||
|
// This is important, give it 2h if needed. We really do not want to get
|
||||||
// rollback
|
// into the position to fail to drop a follower, just because we cannot
|
||||||
|
// talk to the agency temporarily. The worst would be to drop the follower
|
||||||
|
// locally but not report the fact to the agency. The second worst is to
|
||||||
|
// not be able to drop the follower, despite the fact that a replication
|
||||||
|
// was not successful. All else is less dramatic. Therefore we try for
|
||||||
|
// a long time.
|
||||||
|
|
||||||
|
// rollback:
|
||||||
_followers = _oldFollowers;
|
_followers = _oldFollowers;
|
||||||
|
|
||||||
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
|
int errorCode = (application_features::ApplicationServer::isRetryOK()) ? TRI_ERROR_CLUSTER_AGENCY_COMMUNICATION_FAILED : TRI_ERROR_SHUTTING_DOWN;
|
||||||
|
|
|
@ -527,6 +527,7 @@ arangodb::Result arangodb::maintenance::executePlan(VPackSlice const& plan,
|
||||||
std::vector<ActionDescription> actions;
|
std::vector<ActionDescription> actions;
|
||||||
report.add(VPackValue(AGENCY));
|
report.add(VPackValue(AGENCY));
|
||||||
{
|
{
|
||||||
|
// TODO: Just putting an empty array does not make any sense here!
|
||||||
VPackArrayBuilder a(&report);
|
VPackArrayBuilder a(&report);
|
||||||
diffPlanLocal(plan, local, serverId, errors, feature, actions);
|
diffPlanLocal(plan, local, serverId, errors, feature, actions);
|
||||||
}
|
}
|
||||||
|
@ -745,6 +746,7 @@ static VPackBuilder assembleLocalCollectionInfo(
|
||||||
errorMsg += database;
|
errorMsg += database;
|
||||||
errorMsg += ", exception: ";
|
errorMsg += ", exception: ";
|
||||||
errorMsg += e.what();
|
errorMsg += e.what();
|
||||||
|
errorMsg += " (this is expected if the database was recently deleted).";
|
||||||
LOG_TOPIC("7fe5d", WARN, Logger::MAINTENANCE) << errorMsg;
|
LOG_TOPIC("7fe5d", WARN, Logger::MAINTENANCE) << errorMsg;
|
||||||
{ VPackObjectBuilder o(&ret); }
|
{ VPackObjectBuilder o(&ret); }
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -790,7 +792,7 @@ static VPackBuilder assembleLocalDatabaseInfo(std::string const& database,
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
} catch (std::exception const& e) {
|
} catch (std::exception const& e) {
|
||||||
ret.clear();
|
ret.clear(); // In case the above has mid air collision.
|
||||||
std::string errorMsg(
|
std::string errorMsg(
|
||||||
"Maintenance::assembleLocalDatabaseInfo: Failed to lookup database ");
|
"Maintenance::assembleLocalDatabaseInfo: Failed to lookup database ");
|
||||||
errorMsg += database;
|
errorMsg += database;
|
||||||
|
@ -840,6 +842,30 @@ arangodb::Result arangodb::maintenance::reportInCurrent(
|
||||||
VPackBuilder error;
|
VPackBuilder error;
|
||||||
if (shSlice.get(THE_LEADER).copyString().empty()) { // Leader
|
if (shSlice.get(THE_LEADER).copyString().empty()) { // Leader
|
||||||
|
|
||||||
|
// Check that we are the leader of this shard in the Plan, together
|
||||||
|
// with the precondition below that the Plan is unchanged, this ensures
|
||||||
|
// that we only ever modify Current if we are the leader in the Plan:
|
||||||
|
auto const planPath = std::vector<std::string>{dbName, colName, "shards", shName};
|
||||||
|
if (!pdbs.hasKey(planPath)) {
|
||||||
|
LOG_TOPIC("43242", DEBUG, Logger::MAINTENANCE)
|
||||||
|
<< "Ooops, we have a shard for which we believe to be the leader,"
|
||||||
|
" but the Plan does not have it any more, we do not report in "
|
||||||
|
"Current about this, database: " << dbName
|
||||||
|
<< ", shard: " << shName;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
VPackSlice thePlanList = pdbs.get(planPath);
|
||||||
|
if (!thePlanList.isArray() || thePlanList.length() == 0 ||
|
||||||
|
!thePlanList[0].isString() ||
|
||||||
|
!thePlanList[0].isEqualStringUnchecked(serverId)) {
|
||||||
|
LOG_TOPIC("87776", DEBUG, Logger::MAINTENANCE)
|
||||||
|
<< "Ooops, we have a shard for which we believe to be the leader,"
|
||||||
|
" but the Plan says otherwise, we do not report in Current "
|
||||||
|
"about this, database: " << dbName << ", shard: " << shName;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
auto const localCollectionInfo =
|
auto const localCollectionInfo =
|
||||||
assembleLocalCollectionInfo(shSlice, shardMap.slice().get(shName),
|
assembleLocalCollectionInfo(shSlice, shardMap.slice().get(shName),
|
||||||
dbName, shName, serverId, allErrors);
|
dbName, shName, serverId, allErrors);
|
||||||
|
@ -854,31 +880,21 @@ arangodb::Result arangodb::maintenance::reportInCurrent(
|
||||||
auto inCurrent = cur.hasKey(cp);
|
auto inCurrent = cur.hasKey(cp);
|
||||||
if (!inCurrent || !equivalent(localCollectionInfo.slice(), cur.get(cp))) {
|
if (!inCurrent || !equivalent(localCollectionInfo.slice(), cur.get(cp))) {
|
||||||
report.add(VPackValue(CURRENT_COLLECTIONS + dbName + "/" + colName + "/" + shName));
|
report.add(VPackValue(CURRENT_COLLECTIONS + dbName + "/" + colName + "/" + shName));
|
||||||
|
|
||||||
{
|
{
|
||||||
VPackObjectBuilder o(&report);
|
VPackObjectBuilder o(&report);
|
||||||
report.add(OP, VP_SET);
|
report.add(OP, VP_SET);
|
||||||
// Report new current entry ...
|
// Report new current entry ...
|
||||||
report.add("payload", localCollectionInfo.slice());
|
report.add("payload", localCollectionInfo.slice());
|
||||||
// ... if and only if plan for this shard has changed in the meantime
|
// ... if and only if plan for this shard has changed in the meantime
|
||||||
try {
|
// Add a precondition:
|
||||||
// Try to add a precondition, just in case we catch the exception.
|
report.add(VPackValue("precondition"));
|
||||||
// Even if the Plan entry is gone by now, it is still OK to
|
{
|
||||||
// report the Current value, it will be deleted in due course.
|
VPackObjectBuilder p(&report);
|
||||||
// It is the case that the Plan value is there but has changed
|
report.add(
|
||||||
// that we want to protect against.
|
PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName,
|
||||||
VPackSlice oldValue = pdbs.get(std::vector<std::string>{dbName, colName, "shards", shName});
|
thePlanList);
|
||||||
if (!oldValue.isNone()) {
|
|
||||||
report.add(VPackValue("precondition"));
|
|
||||||
{
|
|
||||||
VPackObjectBuilder p(&report);
|
|
||||||
report.add(
|
|
||||||
PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName,
|
|
||||||
oldValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch(...) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else { // Follower
|
} else { // Follower
|
||||||
|
@ -894,7 +910,31 @@ arangodb::Result arangodb::maintenance::reportInCurrent(
|
||||||
// know it yet, do nothing here.
|
// know it yet, do nothing here.
|
||||||
if (shSlice.get("theLeaderTouched").isTrue()) {
|
if (shSlice.get("theLeaderTouched").isTrue()) {
|
||||||
// we were previously leader and we are done resigning.
|
// we were previously leader and we are done resigning.
|
||||||
// update current and let supervision handle the rest
|
// update current and let supervision handle the rest, however
|
||||||
|
// check that we are in the Plan a leader which is supposed to
|
||||||
|
// resign and add a precondition that this is still the case:
|
||||||
|
|
||||||
|
auto const planPath = std::vector<std::string>{dbName, colName, "shards", shName};
|
||||||
|
if (!pdbs.hasKey(planPath)) {
|
||||||
|
LOG_TOPIC("65432", DEBUG, Logger::MAINTENANCE)
|
||||||
|
<< "Ooops, we have a shard for which we believe that we "
|
||||||
|
"just resigned, but the Plan does not have it any more,"
|
||||||
|
" we do not report in Current about this, database: "
|
||||||
|
<< dbName << ", shard: " << shName;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
VPackSlice thePlanList = pdbs.get(planPath);
|
||||||
|
if (!thePlanList.isArray() || thePlanList.length() == 0 ||
|
||||||
|
!thePlanList[0].isString() ||
|
||||||
|
!thePlanList[0].isEqualStringUnchecked(UNDERSCORE + serverId)) {
|
||||||
|
LOG_TOPIC("99987", DEBUG, Logger::MAINTENANCE)
|
||||||
|
<< "Ooops, we have a shard for which we believe that we "
|
||||||
|
"have just resigned, but the Plan says otherwise, we "
|
||||||
|
"do not report in Current about this, database: "
|
||||||
|
<< dbName << ", shard: " << shName;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
VPackBuilder ns;
|
VPackBuilder ns;
|
||||||
{
|
{
|
||||||
VPackArrayBuilder a(&ns);
|
VPackArrayBuilder a(&ns);
|
||||||
|
@ -914,6 +954,12 @@ arangodb::Result arangodb::maintenance::reportInCurrent(
|
||||||
VPackObjectBuilder o(&report);
|
VPackObjectBuilder o(&report);
|
||||||
report.add(OP, VP_SET);
|
report.add(OP, VP_SET);
|
||||||
report.add("payload", ns.slice());
|
report.add("payload", ns.slice());
|
||||||
|
{
|
||||||
|
VPackObjectBuilder p(&report, "precondition");
|
||||||
|
report.add(
|
||||||
|
PLAN_COLLECTIONS + dbName + "/" + colName + "/shards/" + shName,
|
||||||
|
thePlanList);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2694,7 +2694,7 @@ OperationResult transaction::Methods::truncateLocal(std::string const& collectio
|
||||||
<< "truncateLocal: dropping follower " << (*followers)[i]
|
<< "truncateLocal: dropping follower " << (*followers)[i]
|
||||||
<< " for shard " << collectionName;
|
<< " for shard " << collectionName;
|
||||||
} else {
|
} else {
|
||||||
LOG_TOPIC("359bc", ERR, Logger::REPLICATION)
|
LOG_TOPIC("359bc", WARN, Logger::REPLICATION)
|
||||||
<< "truncateLocal: could not drop follower " << (*followers)[i]
|
<< "truncateLocal: could not drop follower " << (*followers)[i]
|
||||||
<< " for shard " << collectionName << ": " << res.errorMessage();
|
<< " for shard " << collectionName << ": " << res.errorMessage();
|
||||||
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
|
THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER);
|
||||||
|
|
|
@ -405,7 +405,6 @@ TEST_F(FailedLeaderTest, job_must_not_be_started_if_distributeshardslike_shard_i
|
||||||
} else if (path == "/arango/Target/ToDo") {
|
} else if (path == "/arango/Target/ToDo") {
|
||||||
builder->add("1", createBuilder(todo).slice());
|
builder->add("1", createBuilder(todo).slice());
|
||||||
}
|
}
|
||||||
builder->close();
|
|
||||||
} else {
|
} else {
|
||||||
builder->add(s);
|
builder->add(s);
|
||||||
}
|
}
|
||||||
|
@ -932,7 +931,6 @@ TEST_F(FailedLeaderTest, when_everything_is_finished_there_should_be_cleanup) {
|
||||||
}
|
}
|
||||||
builder->add("1", jobBuilder.slice());
|
builder->add("1", jobBuilder.slice());
|
||||||
}
|
}
|
||||||
builder->close();
|
|
||||||
} else {
|
} else {
|
||||||
if (path == "/arango/Current/Collections/" + DATABASE + "/" + COLLECTION +
|
if (path == "/arango/Current/Collections/" + DATABASE + "/" + COLLECTION +
|
||||||
"/" + SHARD + "/servers") {
|
"/" + SHARD + "/servers") {
|
||||||
|
|
|
@ -302,7 +302,6 @@ TEST_F(FailedServerTest, the_state_is_still_bad_and_faileservers_is_still_in_sna
|
||||||
"timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})=";
|
"timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})=";
|
||||||
builder->add(jobId, createBuilder(todo).slice());
|
builder->add(jobId, createBuilder(todo).slice());
|
||||||
}
|
}
|
||||||
builder->close();
|
|
||||||
} else {
|
} else {
|
||||||
builder->add(s);
|
builder->add(s);
|
||||||
}
|
}
|
||||||
|
@ -363,7 +362,6 @@ TEST_F(FailedServerTest, the_state_is_still_bad_and_faileservers_is_still_in_sna
|
||||||
"timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})=";
|
"timeCreated":"2017-04-10T11:40:09Z","type":"failedServer"})=";
|
||||||
builder->add(jobId, createBuilder(todo).slice());
|
builder->add(jobId, createBuilder(todo).slice());
|
||||||
}
|
}
|
||||||
builder->close();
|
|
||||||
} else {
|
} else {
|
||||||
if (path == "/arango/Supervision/Health/leader/Status") {
|
if (path == "/arango/Supervision/Health/leader/Status") {
|
||||||
builder->add("/arango/Supervision/Health/leader/Status",
|
builder->add("/arango/Supervision/Health/leader/Status",
|
||||||
|
|
|
@ -183,7 +183,6 @@ TEST_F(RemoveFollowerTest, collection_still_exists_if_missing_job_is_finished_mo
|
||||||
if (path == "/arango/Target/ToDo") {
|
if (path == "/arango/Target/ToDo") {
|
||||||
builder->add(jobId, createBuilder(todo).slice());
|
builder->add(jobId, createBuilder(todo).slice());
|
||||||
}
|
}
|
||||||
builder->close();
|
|
||||||
} else {
|
} else {
|
||||||
builder->add(s);
|
builder->add(s);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue