
[devel] supervision bug fix backports (#8314)

* backports for supervision fixes from 3.4, part 1

* backports for supervision fixes from 3.4, part 2
Kaveh Vahedipour 2019-03-04 19:27:24 +01:00 committed by Max Neunhöffer
parent 0d2056550c
commit 68178ba165
51 changed files with 1156 additions and 382 deletions

View File

@ -5,6 +5,36 @@ devel
* fixed overflow in Windows NowNanos in RocksDB
* Allow MoveShard from leader to a follower, thus swapping the two
* Supervision fix: Satellite collections, various fixes
* Add coordinator route for agency dump
* Supervision fix: abort MoveShard job does not leave a lock behind,
* Supervision fix: abort MoveShard (leader) job moves forward once the point
  of no return has been reached,
* Supervision fix: abort CleanOutServer job does not leave server in
ToBeCleanedServers,
* Supervision fix: move shard with data stopped too early due to wrong usage
of compare function
* Supervision fix: AddFollower only counts good followers, fixing a
  situation in which a FailedLeader job could not find a new working
  follower
* Supervision fix: FailedLeader now also considers temporarily BAD
servers as replacement followers and does not block servers which
currently receive a new shard
* Supervision fix: Servers in ToBeCleanedServers are no longer considered
as replacement servers
* Maintenance fix: added precondition of unchanged Plan in phase2
* add "PRUNE <condition>" to AQL Traversals. This allows to early abort searching of
unnecessary branches within a traversal.
PRUNE is only allowed in the Traversal statement and only between the graph

View File

@ -59,7 +59,7 @@ ActiveFailoverJob::ActiveFailoverJob(Node const& snapshot, AgentInterface* agent
ActiveFailoverJob::~ActiveFailoverJob() {}
void ActiveFailoverJob::run() { runHelper(_server, ""); }
void ActiveFailoverJob::run(bool& aborts) { runHelper(_server, "", aborts); }
bool ActiveFailoverJob::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "Todo: Handle failover for leader " + _server;
@ -130,7 +130,7 @@ bool ActiveFailoverJob::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool ActiveFailoverJob::start() {
bool ActiveFailoverJob::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.

View File

@ -39,9 +39,9 @@ struct ActiveFailoverJob final : public Job {
virtual ~ActiveFailoverJob();
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
private:

View File

@ -64,7 +64,7 @@ AddFollower::AddFollower(Node const& snapshot, AgentInterface* agent,
AddFollower::~AddFollower() {}
void AddFollower::run() { runHelper("", _shard); }
void AddFollower::run(bool& aborts) { runHelper("", _shard, aborts); }
bool AddFollower::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(INFO, Logger::SUPERVISION)
@ -119,7 +119,7 @@ bool AddFollower::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool AddFollower::start() {
bool AddFollower::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.
@ -146,8 +146,33 @@ bool AddFollower::start() {
// First check that we still have too few followers for the current
// `replicationFactor`:
size_t desiredReplFactor = collection.hasAsUInt("replicationFactor").first;
size_t actualReplFactor = planned.length();
size_t desiredReplFactor = 1;
auto replFact = collection.hasAsUInt("replicationFactor");
if (replFact.second) {
desiredReplFactor = replFact.first;
} else {
auto replFact2 = collection.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
}
}
VPackBuilder onlyFollowers;
{
VPackArrayBuilder guard(&onlyFollowers);
bool first = true;
for (auto const& pp : VPackArrayIterator(planned)) {
if (!first) {
onlyFollowers.add(pp);
}
first = false;
}
}
size_t actualReplFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
// Leader plus good followers in plan
if (actualReplFactor >= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have enough replicas");
return true;
@ -245,7 +270,7 @@ bool AddFollower::start() {
// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &chosen](Slice plan, Slice current, std::string& planPath) {
[&trx, &chosen](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
trx.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&trx);
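
The core of the change above is that AddFollower now measures replication by the leader plus GOOD followers rather than by the raw length of the planned server list. A minimal standalone sketch of that counting step, with a plain std::map standing in for the /Supervision/Health subtree (not the actual agency Node API used in this diff):

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Count how many of the given servers are reported as "GOOD" in a health map.
std::size_t countGoodServers(std::vector<std::string> const& servers,
                             std::map<std::string, std::string> const& health) {
  std::size_t count = 0;
  for (auto const& server : servers) {
    auto it = health.find(server);
    if (it != health.end() && it->second == "GOOD") {
      ++count;
    }
  }
  return count;
}

// AddFollower::start then compares 1 + countGoodServers(followers, health)
// (leader plus good followers) against the desired replicationFactor.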

View File

@ -42,8 +42,8 @@ struct AddFollower : public Job {
virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
std::string _database;

View File

@ -903,6 +903,37 @@ AgencyCommResult AgencyComm::getValues(std::string const& key) {
return result;
}
AgencyCommResult AgencyComm::dump() {
std::string url = AgencyComm::AGENCY_URL_PREFIX + "/state";
AgencyCommResult result =
sendWithFailover(
arangodb::rest::RequestType::GET,
AgencyCommManager::CONNECTION_OPTIONS._requestTimeout,
url, VPackSlice::noneSlice());
if (!result.successful()) {
return result;
}
try {
result.setVPack(VPackParser::fromJson(result.bodyRef()));
result._body.clear();
result._statusCode = 200;
} catch (std::exception const& e) {
LOG_TOPIC(ERR, Logger::AGENCYCOMM) << "Error transforming result: " << e.what();
result.clear();
} catch (...) {
LOG_TOPIC(ERR, Logger::AGENCYCOMM)
<< "Error transforming result: out of memory";
result.clear();
}
return result;
}
AgencyCommResult AgencyComm::removeValues(std::string const& key, bool recursive) {
AgencyWriteTransaction transaction(AgencyOperation(key, AgencySimpleOperationType::DELETE_OP));

View File

@ -631,6 +631,8 @@ class AgencyComm {
std::string version();
AgencyCommResult dump();
bool increaseVersion(std::string const& key) {
AgencyCommResult result = increment(key);
return result.successful();

View File

@ -252,6 +252,18 @@ bool Agent::isCommitted(index_t index) {
}
}
index_t Agent::index() {
if (challengeLeadership()) {
resign();
return 0;
}
MUTEX_LOCKER(tiLocker, _tiLock);
return _confirmed[id()];
}
// AgentCallback reports id of follower and its highest processed index
void Agent::reportIn(std::string const& peerId, index_t index, size_t toLog) {
auto startTime = steady_clock::now();
@ -1584,6 +1596,29 @@ arangodb::consensus::index_t Agent::readDB(Node& node) const {
return _commitIndex;
}
/// Get readdb
arangodb::consensus::index_t Agent::readDB(VPackBuilder& builder) const {
TRI_ASSERT(builder.isOpenObject());
uint64_t commitIndex = 0;
{ READ_LOCKER(oLocker, _outputLock);
commitIndex = _commitIndex;
// commit index
builder.add("index", VPackValue(commitIndex));
builder.add("term", VPackValue(term()));
// key-value store {}
builder.add(VPackValue("agency"));
_readDB.get().toBuilder(builder, true); }
// replicated log []
_state.toVelocyPack(commitIndex, builder);
return commitIndex;
}
void Agent::executeLockedRead(std::function<void()> const& cb) {
_tiLock.assertNotLockedByCurrentThread();
MUTEX_LOCKER(ioLocker, _ioLock);

View File

@ -177,6 +177,9 @@ class Agent final : public arangodb::Thread, public AgentInterface {
/// @brief Gossip in
bool activeAgency();
/// @brief Get the index at which the leader is
index_t index();
/// @brief Start orderly shutdown of threads
void beginShutdown() override final;
@ -218,6 +221,9 @@ class Agent final : public arangodb::Thread, public AgentInterface {
/// @brief Get read store and compaction index
index_t readDB(Node&) const;
/// @brief Get read store and compaction index
index_t readDB(VPackBuilder&) const;
/// @brief Get read store
/// WARNING: this assumes caller holds appropriate
/// locks or will use executeLockedRead() or

View File

@ -58,7 +58,7 @@ CleanOutServer::CleanOutServer(Node const& snapshot, AgentInterface* agent,
CleanOutServer::~CleanOutServer() {}
void CleanOutServer::run() { runHelper(_server, ""); }
void CleanOutServer::run(bool& aborts) { runHelper(_server, "", aborts); }
JOB_STATUS CleanOutServer::status() {
if (_status != PENDING) {
@ -121,7 +121,7 @@ JOB_STATUS CleanOutServer::status() {
reportTrx.add("op", VPackValue("push"));
reportTrx.add("new", VPackValue(_server));
}
reportTrx.add(VPackValue("/Target/ToBeCleanedServers"));
reportTrx.add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder guard4(&reportTrx);
reportTrx.add("op", VPackValue("erase"));
@ -196,7 +196,7 @@ bool CleanOutServer::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool CleanOutServer::start() {
bool CleanOutServer::start(bool& aborts) {
// If anything throws here, the run() method catches it and finishes
// the job.
@ -318,7 +318,7 @@ bool CleanOutServer::start() {
addBlockServer(*pending, _server, _jobId);
// Put ourselves in list of servers to be cleaned:
pending->add(VPackValue("/Target/ToBeCleanedServers"));
pending->add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder guard4(pending.get());
pending->add("op", VPackValue("push"));
@ -388,34 +388,58 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
continue;
}
decltype(servers) serversCopy(servers); // a copy
auto replicationFactor = collection.hasAsString("replicationFactor");
bool isSatellite = replicationFactor.second && replicationFactor.first == "satellite";
// Only destinations, which are not already holding this shard
for (auto const& dbserver : VPackArrayIterator(shard.second->slice())) {
serversCopy.erase(std::remove(serversCopy.begin(), serversCopy.end(),
dbserver.copyString()),
serversCopy.end());
}
bool isLeader = (found == 0);
// Among those a random destination:
std::string toServer;
if (serversCopy.empty()) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "No servers remain as target for MoveShard";
return false;
if (isSatellite) {
if (isLeader) {
std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
_snapshot, database.first, collptr.first, shard.first);
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
} else {
// Intentionally do nothing. RemoveServer will remove the failed follower
LOG_TOPIC(DEBUG, Logger::SUPERVISION) <<
"Do nothing for cleanout of follower of the satellite collection " << collection.hasAsString("id").first;
continue ;
}
} else {
decltype(servers) serversCopy(servers); // a copy
// Only destinations, which are not already holding this shard
for (auto const& dbserver : VPackArrayIterator(shard.second->slice())) {
serversCopy.erase(std::remove(serversCopy.begin(), serversCopy.end(),
dbserver.copyString()),
serversCopy.end());
}
// Among those a random destination:
std::string toServer;
if (serversCopy.empty()) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "No servers remain as target for MoveShard";
return false;
}
toServer = serversCopy.at(
arangodb::RandomGenerator::interval(static_cast<int64_t>(0),
serversCopy.size() - 1));
// Schedule move into trx:
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
}
toServer = serversCopy.at(
arangodb::RandomGenerator::interval(static_cast<int64_t>(0),
serversCopy.size() - 1));
// Schedule move into trx:
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
}
}
}
@ -499,17 +523,28 @@ arangodb::Result CleanOutServer::abort() {
Node::Children const pends = _snapshot.hasAsChildren(pendingPrefix).first;
for (auto const& subJob : todos) {
if (!subJob.first.compare(0, _jobId.size() + 1, _jobId + "-")) {
if (subJob.first.compare(0, _jobId.size() + 1, _jobId + "-") == 0) {
JobContext(TODO, subJob.first, _snapshot, _agent).abort();
}
}
for (auto const& subJob : pends) {
if (!subJob.first.compare(0, _jobId.size() + 1, _jobId + "-")) {
if (subJob.first.compare(0, _jobId.size() + 1, _jobId + "-") == 0) {
JobContext(PENDING, subJob.first, _snapshot, _agent).abort();
}
}
finish(_server, "", false, "job aborted");
auto payload = std::make_shared<VPackBuilder>();
{
VPackObjectBuilder p(payload.get());
payload->add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder pp(payload.get());
payload->add("op", VPackValue("erase"));
payload->add("val", VPackValue(_server));
}
}
finish(_server, "", false, "job aborted", payload);
return result;
}

View File

@ -42,8 +42,8 @@ struct CleanOutServer : public Job {
virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
// Check if all shards' replication factors can be satisfied after clean out.

View File

@ -78,7 +78,7 @@ FailedFollower::FailedFollower(Node const& snapshot, AgentInterface* agent,
FailedFollower::~FailedFollower() {}
void FailedFollower::run() { runHelper("", _shard); }
void FailedFollower::run(bool& aborts) { runHelper("", _shard, aborts); }
bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
using namespace std::chrono;
@ -86,7 +86,7 @@ bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
<< "Create failedFollower for " + _shard + " from " + _from;
_created = system_clock::now();
if (envelope == nullptr) {
_jb = std::make_shared<Builder>();
_jb->openArray();
@ -117,10 +117,10 @@ bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
}
return true;
}
bool FailedFollower::start() {
bool FailedFollower::start(bool& aborts) {
using namespace std::chrono;
std::vector<std::string> existing =
@ -141,15 +141,33 @@ bool FailedFollower::start() {
// Planned servers vector
std::string planPath =
planColPrefix + _database + "/" + _collection + "/shards/" + _shard;
auto plannedPair = _snapshot.hasAsSlice(planPath); // if missing, what?
auto plannedPair = _snapshot.hasAsSlice(planPath);
Slice const& planned = plannedPair.first;
if (!plannedPair.second) {
// not clear what servers should or should not get failover ... retry later
finish("", _shard, true,
"Plan entry for collection " + _collection + " gone");
return false;
}
// Now check if _server is still in this plan, note that it could have
// been removed by RemoveFollower already, in which case we simply stop:
bool found = false;
if (planned.isArray()) {
for (auto const& s : VPackArrayIterator(planned)) {
if (s.isString() && _from == s.copyString()) {
found = true;
break;
}
}
}
if (!found) {
finish("", _shard, true, "Server no longer found in Plan for collection " +
_collection + ", our job is done.");
return false;
}
// Get proper replacement
_to = randomIdleGoodAvailableServer(_snapshot, planned);
_to = randomIdleAvailableServer(_snapshot, planned);
if (_to.empty()) {
// retry later
return false;
@ -255,8 +273,9 @@ bool FailedFollower::start() {
return false;
} else if (jobId.second) {
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
} // if
}
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "FailedFollower start transaction: " << job.toJson();

View File

@ -44,8 +44,8 @@ struct FailedFollower : public Job {
virtual ~FailedFollower();
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual JOB_STATUS status() override final;
virtual Result abort() override final;

View File

@ -81,7 +81,7 @@ FailedLeader::FailedLeader(Node const& snapshot, AgentInterface* agent,
FailedLeader::~FailedLeader() {}
void FailedLeader::run() { runHelper("", _shard); }
void FailedLeader::run(bool& aborts) { runHelper("", _shard, aborts); }
void FailedLeader::rollback() {
// Create new plan servers (exchange _to and _from)
@ -161,10 +161,10 @@ bool FailedLeader::create(std::shared_ptr<VPackBuilder> b) {
}
return true;
}
bool FailedLeader::start() {
bool FailedLeader::start(bool& aborts) {
std::vector<std::string> existing =
_snapshot.exists(planColPrefix + _database + "/" + _collection + "/" +
"distributeShardsLike");
@ -232,7 +232,7 @@ bool FailedLeader::start() {
}
// Additional follower, if applicable
auto additionalFollower = randomIdleGoodAvailableServer(_snapshot, planned);
auto additionalFollower = randomIdleAvailableServer(_snapshot, planned);
if (!additionalFollower.empty()) {
planv.push_back(additionalFollower);
}
@ -309,7 +309,9 @@ bool FailedLeader::start() {
if (jobId.second && !abortable(_snapshot, jobId.first)) {
return false;
} else if (jobId.second) {
aborts = true;
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
}

View File

@ -44,9 +44,9 @@ struct FailedLeader : public Job {
virtual ~FailedLeader();
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual Result abort() override final;
void rollback();

View File

@ -58,9 +58,9 @@ FailedServer::FailedServer(Node const& snapshot, AgentInterface* agent,
FailedServer::~FailedServer() {}
void FailedServer::run() { runHelper(_server, ""); }
void FailedServer::run(bool& aborts) { runHelper(_server, "", aborts); }
bool FailedServer::start() {
bool FailedServer::start(bool& aborts) {
using namespace std::chrono;
// Fail job, if Health back to not FAILED
@ -71,6 +71,12 @@ bool FailedServer::start() {
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason.str();
finish(_server, "", false, reason.str());
return false;
} else if(!status.second) {
std::stringstream reason;
reason << "Server " << _server << " no longer in health. Already removed. Abort.";
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason.str();
finish(_server, "", false, reason.str()); // Finish or abort?
return false;
}
// Abort job blocking server if abortable
@ -78,7 +84,9 @@ bool FailedServer::start() {
if (jobId.second && !abortable(_snapshot, jobId.first)) {
return false;
} else if (jobId.second) {
aborts = true;
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
// Todo entry
@ -90,8 +98,8 @@ bool FailedServer::start() {
if (toDoJob.second) {
toDoJob.first.toBuilder(todo);
} else {
LOG_TOPIC(INFO, Logger::SUPERVISION)
<< "Failed to get key " + toDoPrefix + _jobId + " from agency snapshot";
LOG_TOPIC(INFO, Logger::SUPERVISION) << "Failed to get key " + toDoPrefix + _jobId +
" from agency snapshot";
return false;
}
} else {
@ -99,7 +107,6 @@ bool FailedServer::start() {
}
} // Todo entry
// Pending entry
auto transactions = std::make_shared<VPackBuilder>();
{
VPackArrayBuilder a(transactions.get());
@ -108,10 +115,10 @@ bool FailedServer::start() {
{
VPackObjectBuilder oper(transactions.get());
// Add pending
auto const& databases = _snapshot.hasAsChildren("/Plan/Collections").first;
// auto const& current = _snapshot.hasAsChildren("/Current/Collections").first;
size_t sub = 0;
// FIXME: looks OK, but only the non-clone shards are put into the job
@ -124,19 +131,27 @@ bool FailedServer::start() {
auto const& replicationFactorPair =
collection.hasAsNode("replicationFactor");
if (replicationFactorPair.second) {
VPackSlice const replicationFactor = replicationFactorPair.first.slice();
if (!replicationFactor.isNumber()) {
continue; // no point to try salvaging unreplicated data
}
uint64_t number = 1;
try {
number = replicationFactor.getNumber<uint64_t>();
} catch(...) {
}
if (number == 1) {
continue;
bool isSatellite = false;
if (replicationFactor.isString() && replicationFactor.compareString("satellite") == 0) {
isSatellite = true; // do nothing - number = Job::availableServers(_snapshot).size();
} else if (replicationFactor.isNumber()) {
try {
number = replicationFactor.getNumber<uint64_t>();
} catch(...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) << "Failed to read replicationFactor. job: "
<< _jobId << " " << collection.hasAsString("id").first;
continue ;
}
if (number == 1) {
continue ;
}
} else {
continue; // no point to try salvaging unreplicated data
}
if (collection.has("distributeShardsLike")) {
@ -152,14 +167,19 @@ bool FailedServer::start() {
if (dbs == _server) {
if (pos == 0) {
FailedLeader(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
} else {
FailedFollower(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
if (!isSatellite) {
FailedFollower(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "Do intentionally nothing for failed follower of satellite collection. job: "
<< _jobId;
}
}
}
pos++;
@ -172,7 +192,8 @@ bool FailedServer::start() {
transactions->add(VPackValue(pendingPrefix + _jobId));
{
VPackObjectBuilder ts(transactions.get());
transactions->add("timeStarted", VPackValue(timepointToString(system_clock::now())));
transactions->add("timeStarted",
VPackValue(timepointToString(system_clock::now())));
for (auto const& obj : VPackObjectIterator(todo.slice()[0])) {
transactions->add(obj.key.copyString(), obj.value);
}

View File

@ -40,10 +40,10 @@ struct FailedServer : public Job {
virtual ~FailedServer();
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual Result abort() override final;
std::string _server;

View File

@ -38,6 +38,7 @@ std::string const failedPrefix = "/Target/Failed/";
std::string const finishedPrefix = "/Target/Finished/";
std::string const toDoPrefix = "/Target/ToDo/";
std::string const cleanedPrefix = "/Target/CleanedServers";
std::string const toBeCleanedPrefix = "/Target/ToBeCleanedServers";
std::string const failedServersPrefix = "/Target/FailedServers";
std::string const planColPrefix = "/Plan/Collections/";
std::string const curColPrefix = "/Current/Collections/";
@ -96,37 +97,68 @@ bool Job::finish(std::string const& server, std::string const& shard,
LOG_TOPIC(WARN, Logger::AGENCY) << "Failed to obtain type of job " << _jobId;
}
// Prepare pending entry, block toserver
{
VPackArrayBuilder guard(&finished);
VPackObjectBuilder guard2(&finished);
addPutJobIntoSomewhere(finished, success ? "Finished" : "Failed",
pending.slice()[0], reason);
addRemoveJobFromSomewhere(finished, "ToDo", _jobId);
addRemoveJobFromSomewhere(finished, "Pending", _jobId);
// Additional payload, which is to be executed in the finish transaction
if (payload != nullptr) {
Slice slice = payload->slice();
TRI_ASSERT(slice.isObject());
// Additional payload, which is to be executed in the finish transaction
Slice operations = Slice::emptyObjectSlice();
Slice preconditions = Slice::emptyObjectSlice();
if (payload != nullptr) {
Slice slice = payload->slice();
TRI_ASSERT(slice.isObject() || slice.isArray());
if (slice.isObject()) { // opers only
operations = slice;
TRI_ASSERT(operations.isObject());
} else {
TRI_ASSERT(slice.length() < 3); // opers + precs only
if (slice.length() > 0) {
for (auto const& oper : VPackObjectIterator(slice)) {
finished.add(oper.key.copyString(), oper.value);
operations = slice[0];
TRI_ASSERT(operations.isObject());
if (slice.length() > 1) {
preconditions = slice[1];
TRI_ASSERT(preconditions.isObject());
}
}
}
}
// --- Remove blocks if specified:
if (started && !server.empty()) {
addReleaseServer(finished, server);
}
if (started && !shard.empty()) {
addReleaseShard(finished, shard);
}
// Prepare pending entry, block toserver
{
VPackArrayBuilder guard(&finished);
} // close object and array
{ // operations --
VPackObjectBuilder operguard(&finished);
addPutJobIntoSomewhere(finished, success ? "Finished" : "Failed",
pending.slice()[0], reason);
addRemoveJobFromSomewhere(finished, "ToDo", _jobId);
addRemoveJobFromSomewhere(finished, "Pending", _jobId);
if (operations.length() > 0) {
for (auto const& oper : VPackObjectIterator(operations)) {
finished.add(oper.key.copyString(), oper.value);
}
}
// --- Remove blocks if specified:
if (started && !server.empty()) {
addReleaseServer(finished, server);
}
if (started && !shard.empty()) {
addReleaseShard(finished, shard);
}
} // -- operations
if (preconditions != Slice::emptyObjectSlice()) { // preconditions --
VPackObjectBuilder precguard(&finished);
if (preconditions.length() > 0) {
for (auto const& prec : VPackObjectIterator(preconditions)) {
finished.add(prec.key.copyString(), prec.value);
}
}
} // -- preconditions
}
write_ret_t res = singleWriteTransaction(_agent, finished);
if (res.accepted && res.indices.size() == 1 && res.indices[0]) {
@ -139,53 +171,50 @@ bool Job::finish(std::string const& server, std::string const& shard,
return false;
}
std::string Job::randomIdleGoodAvailableServer(Node const& snap,
std::string Job::randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude) {
std::vector<std::string> as = availableServers(snap);
std::string ret;
auto ex(exclude);
// ungood;
// Prefer good servers over bad servers
std::vector<std::string> good;
// Only take good servers as valid server.
try {
for (auto const& srv : snap.hasAsChildren(healthPrefix).first) {
if ((*srv.second).hasAsString("Status").first != "GOOD") {
ex.push_back(srv.first);
// ignore excluded servers
if (std::find(std::begin(exclude), std::end(exclude), srv.first) != std::end(exclude)) {
continue ;
}
// ignore servers not in availableServers above:
if (std::find(std::begin(as), std::end(as), srv.first) == std::end(as)) {
continue ;
}
std::string const& status = (*srv.second).hasAsString("Status").first;
if (status == "GOOD") {
good.push_back(srv.first);
}
}
} catch (...) {
}
// blocked;
try {
for (auto const& srv : snap.hasAsChildren(blockedServersPrefix).first) {
ex.push_back(srv.first);
}
} catch (...) {
if (good.empty()) {
return ret;
}
// Remove excluded servers
std::sort(std::begin(ex), std::end(ex));
as.erase(std::remove_if(std::begin(as), std::end(as),
[&](std::string const& s) {
return std::binary_search(std::begin(ex), std::end(ex), s);
}),
std::end(as));
// Choose random server from rest
if (!as.empty()) {
if (as.size() == 1) {
ret = as[0];
} else {
uint16_t interval = static_cast<uint16_t>(as.size() - 1);
uint16_t random = RandomGenerator::interval(interval);
ret = as.at(random);
}
if (good.size() == 1) {
ret = good[0];
return ret;
}
uint16_t interval = static_cast<uint16_t>(good.size() - 1);
uint16_t random = RandomGenerator::interval(interval);
ret = good.at(random);
return ret;
}
std::string Job::randomIdleGoodAvailableServer(Node const& snap, Slice const& exclude) {
std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclude) {
std::vector<std::string> ev;
if (exclude.isArray()) {
for (const auto& s : VPackArrayIterator(exclude)) {
@ -194,10 +223,96 @@ std::string Job::randomIdleGoodAvailableServer(Node const& snap, Slice const& ex
}
}
}
return randomIdleGoodAvailableServer(snap, ev);
return randomIdleAvailableServer(snap, ev);
}
/// @brief Get servers from plan, which are not failed or cleaned out
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) {
size_t count = 0;
if (!serverList.isArray()) {
// No array, strange, return 0
return count;
}
auto health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure?
if (health.second) {
Node::Children& healthData = health.first; // List of servers in Health
for (VPackSlice const serverName : VPackArrayIterator(serverList)) {
if (serverName.isString()) {
// serverName not a string? Then don't count
std::string serverStr = serverName.copyString();
// Now look up this server:
auto it = healthData.find(serverStr);
if (it != healthData.end()) {
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
++count;
}
}
}
}
}
return count;
}
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList) {
size_t count = 0;
auto health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure?
if (health.second) {
Node::Children& healthData = health.first; // List of servers in Health
for (auto& serverStr : serverList) {
// Now look up this server:
auto it = healthData.find(serverStr);
if (it != healthData.end()) {
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
++count;
}
}
}
}
return count;
}
/// @brief Check if a server is cleaned or to be cleaned out:
bool Job::isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray) {
VPackSlice slice;
bool has;
bool found = false;
if (isArray) {
std::tie(slice, has) = snap.hasAsSlice(prefix);
if (has && slice.isArray()) {
for (auto const& srv : VPackArrayIterator(slice)) {
if (srv.copyString() == server) {
found = true;
break;
}
}
}
} else { // an object
Node::Children children;
std::tie(children, has) = snap.hasAsChildren(prefix);
if (has) {
for (auto const& srv : children) {
if (srv.first == server) {
found = true;
break;
}
}
}
}
return found;
}
/// @brief Get servers from plan, which are not failed or (to be) cleaned out
std::vector<std::string> Job::availableServers(Node const& snapshot) {
std::vector<std::string> ret;
@ -207,22 +322,31 @@ std::vector<std::string> Job::availableServers(Node const& snapshot) {
ret.push_back(srv.first);
}
// Remove cleaned servers from list (test first to avoid warning log
if (snapshot.has(cleanedPrefix)) try {
for (auto const& srv :
VPackArrayIterator(snapshot.hasAsSlice(cleanedPrefix).first)) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.copyString()), ret.end());
}
} catch (...) {
}
auto excludePrefix = [&ret, &snapshot](std::string const& prefix, bool isArray) {
// Remove failed servers from list (test first to avoid warning log)
if (snapshot.has(failedServersPrefix)) try {
for (auto const& srv : snapshot.hasAsChildren(failedServersPrefix).first) {
bool has;
VPackSlice slice;
Node::Children children;
if (isArray) {
std::tie(slice, has) = snapshot.hasAsSlice(prefix);
if (has) {
for (auto const& srv : VPackArrayIterator(slice)) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.copyString()), ret.end());
}
}
} else {
std::tie(children, has) = snapshot.hasAsChildren(prefix);
for (auto const& srv : children) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.first), ret.end());
}
} catch (...) {
}
};
// Remove (to be) cleaned and failed servers from the list
excludePrefix(cleanedPrefix, true);
excludePrefix(failedServersPrefix, false);
excludePrefix(toBeCleanedPrefix, true);
return ret;
}
@ -419,7 +543,7 @@ bool Job::abortable(Node const& snapshot, std::string const& jobId) {
void Job::doForAllShards(Node const& snapshot, std::string& database,
std::vector<shard_t>& shards,
std::function<void(Slice plan, Slice current, std::string& planPath)> worker) {
std::function<void(Slice plan, Slice current, std::string& planPath, std::string& curPath)> worker) {
for (auto const& collShard : shards) {
std::string shard = collShard.shard;
std::string collection = collShard.collection;
@ -432,7 +556,7 @@ void Job::doForAllShards(Node const& snapshot, std::string& database,
Slice plan = snapshot.hasAsSlice(planPath).first;
Slice current = snapshot.hasAsSlice(curPath).first;
worker(plan, current, planPath);
worker(plan, current, planPath, curPath);
}
}
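
The reworked Job::finish above accepts an optional payload that is folded into the finish transaction: a single object is treated as additional operations, while an array of at most two objects is treated as [operations, preconditions]. A sketch of building the two-element form with the velocypack alias types used throughout this diff; the concrete keys are illustrative, mirroring the CleanOutServer and MoveShard abort paths:

#include <memory>
#include <string>
#include <velocypack/Builder.h>
#include <velocypack/velocypack-aliases.h>

std::shared_ptr<VPackBuilder> makeAbortPayload(std::string const& server,
                                               std::string const& jobId) {
  auto payload = std::make_shared<VPackBuilder>();
  {
    VPackArrayBuilder both(payload.get());            // [operations, preconditions]
    {
      VPackObjectBuilder operations(payload.get());
      payload->add(VPackValue("/Target/ToBeCleanedServers"));
      {
        VPackObjectBuilder op(payload.get());         // erase the server from the list
        payload->add("op", VPackValue("erase"));
        payload->add("val", VPackValue(server));
      }
    }
    {
      VPackObjectBuilder preconditions(payload.get());
      payload->add(VPackValue("/Target/ToDo/" + jobId));
      {
        VPackObjectBuilder prec(payload.get());       // job must still sit in ToDo
        payload->add("oldEmpty", VPackValue(false));
      }
    }
  }
  return payload;                                     // pass as last argument to finish()
}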

View File

@ -48,6 +48,7 @@ extern std::string const failedPrefix;
extern std::string const finishedPrefix;
extern std::string const toDoPrefix;
extern std::string const cleanedPrefix;
extern std::string const toBeCleanedPrefix;
extern std::string const failedServersPrefix;
extern std::string const planColPrefix;
extern std::string const curColPrefix;
@ -72,9 +73,9 @@ struct Job {
virtual ~Job();
virtual void run() = 0;
virtual void run(bool& aborts) = 0;
void runHelper(std::string const& server, std::string const& shard) {
void runHelper(std::string const& server, std::string const& shard, bool& aborts) {
if (_status == FAILED) { // happens when the constructor did not work
return;
}
@ -87,10 +88,10 @@ struct Job {
}
try {
if (_status == TODO) {
start();
start(aborts);
} else if (_status == NOTFOUND) {
if (create(nullptr)) {
start();
start(aborts);
}
}
} catch (std::exception const& e) {
@ -112,7 +113,7 @@ struct Job {
virtual bool create(std::shared_ptr<VPackBuilder> b) = 0;
// Returns if job was actually started (i.e. false if directly failed!)
virtual bool start() = 0;
virtual bool start(bool& aborts) = 0;
static bool abortable(Node const& snapshot, std::string const& jobId);
@ -121,9 +122,12 @@ struct Job {
/// @brief Get a random server, which is not blocked, in good condition and
/// excluding "exclude" vector
static std::string randomIdleGoodAvailableServer(Node const& snap,
static std::string randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude);
static std::string randomIdleGoodAvailableServer(Node const& snap, VPackSlice const& exclude);
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList);
static size_t countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList);
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
/// @brief Get servers from plan, which are not failed or cleaned out
static std::vector<std::string> availableServers(const arangodb::consensus::Node&);
@ -151,7 +155,7 @@ struct Job {
static void doForAllShards(
Node const& snapshot, std::string& database, std::vector<shard_t>& shards,
std::function<void(Slice plan, Slice current, std::string& planPath)> worker);
std::function<void(Slice plan, Slice current, std::string& planPath, std::string& curPath)> worker);
// The following methods adds an operation to a transaction object or
// a condition to a precondition object. In all cases, the builder trx

View File

@ -73,15 +73,15 @@ void JobContext::create(std::shared_ptr<VPackBuilder> b) {
}
}
void JobContext::start() {
void JobContext::start(bool& aborts) {
if (_job != nullptr) {
_job->start();
_job->start(aborts);
}
}
void JobContext::run() {
void JobContext::run(bool& aborts) {
if (_job != nullptr) {
_job->run();
_job->run(aborts);
}
}

View File

@ -44,10 +44,10 @@ class JobContext {
void create(std::shared_ptr<VPackBuilder> b = nullptr);
/// @brief Start job
void start();
void start(bool& aborts);
/// @brief Run job
void run();
void run(bool& aborts);
/// @brief Abort job
void abort();

View File

@ -93,7 +93,7 @@ MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
MoveShard::~MoveShard() {}
void MoveShard::run() { runHelper(_to, _shard); }
void MoveShard::run(bool& aborts) { runHelper(_to, _shard, aborts); }
bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
@ -167,7 +167,7 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool MoveShard::start() {
bool MoveShard::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.
@ -265,11 +265,16 @@ bool MoveShard::start() {
int found = -1;
int count = 0;
_toServerIsFollower = false;
for (auto const& srv : VPackArrayIterator(planned)) {
TRI_ASSERT(srv.isString());
if (srv.copyString() == _to) {
finish("", "", false, "toServer must not yet be planned for shard");
return false;
if (!_isLeader) {
finish("", "", false, "toServer must not be planned for a following shard");
return false;
} else {
_toServerIsFollower = true;
}
}
if (srv.copyString() == _from) {
found = count;
@ -340,17 +345,18 @@ bool MoveShard::start() {
// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &pending](Slice plan, Slice current, std::string& planPath) {
[this, &pending](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
pending.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&pending);
if (_isLeader) {
TRI_ASSERT(plan[0].copyString() != _to);
pending.add(plan[0]);
pending.add(VPackValue(_to));
if (!_toServerIsFollower) {
pending.add(VPackValue(_to));
}
for (size_t i = 1; i < plan.length(); ++i) {
pending.add(plan[i]);
TRI_ASSERT(plan[i].copyString() != _to);
}
} else {
for (auto const& srv : VPackArrayIterator(plan)) {
@ -445,7 +451,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// Still the old leader, let's check that the toServer is insync:
size_t done = 0; // count the number of shards for which _to is in sync:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
for (auto const& s : VPackArrayIterator(current)) {
if (s.copyString() == _to) {
++done;
@ -469,7 +475,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Replace _from by "_" + _from
trx.add(VPackValue(planPath));
{
@ -500,7 +506,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// Retired old leader, let's check that the fromServer has retired:
size_t done = 0; // count the number of shards for which leader has retired
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (current.length() > 0 && current[0].copyString() == "_" + _from) {
++done;
}
@ -521,7 +527,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Replace "_" + _from by _to and leave _from out:
trx.add(VPackValue(planPath));
{
@ -556,7 +562,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// all but except the old leader are in sync:
size_t done = 0;
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (current.length() > 0 && current[0].copyString() == _to) {
if (plan.length() < 3) {
// This only happens for replicationFactor == 1, in
@ -571,7 +577,7 @@ JOB_STATUS MoveShard::pendingLeader() {
for (size_t i = 1; i < plan.length() - 1; ++i) {
VPackSlice p = plan[i];
for (auto const& c : VPackArrayIterator(current)) {
if (arangodb::basics::VelocyPackHelper::compare(p, c, true)) {
if (arangodb::basics::VelocyPackHelper::compare(p, c, true) == 0) {
++found;
break;
}
@ -599,7 +605,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &pre, this](Slice plan, Slice current, std::string& planPath) {
[&trx, &pre, this](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (!_remainsFollower) {
// Remove _from from the list of follower
trx.add(VPackValue(planPath));
@ -663,7 +669,7 @@ JOB_STATUS MoveShard::pendingFollower() {
size_t done = 0; // count the number of shards done
doForAllShards(_snapshot, _database, shardsLikeMe,
[&done](Slice plan, Slice current, std::string& planPath) {
[&done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (ClusterHelpers::compareServerLists(plan, current)) {
++done;
}
@ -695,7 +701,7 @@ JOB_STATUS MoveShard::pendingFollower() {
// All changes to Plan for all shards, with precondition:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &precondition](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &precondition](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Remove fromServer from Plan:
trx.add(VPackValue(planPath));
{
@ -748,15 +754,51 @@ arangodb::Result MoveShard::abort() {
return result;
}
// Can now only be TODO or PENDING
// Can now only be TODO or PENDING.
if (_status == TODO) {
finish("", "", true, "job aborted");
return result;
// Do NOT remove, just cause it seems obvious!
// We're working off a snapshot.
// Make sure ToDo is still actually to be done
auto todoPrec = std::make_shared<Builder>();
{ VPackArrayBuilder b(todoPrec.get());
{ VPackObjectBuilder o(todoPrec.get()); } // nothing to declare
{ VPackObjectBuilder path(todoPrec.get()); // expect jobs still to be sitting in ToDo
todoPrec->add(VPackValue(toDoPrefix + _jobId));
{ VPackObjectBuilder guard(todoPrec.get());
todoPrec->add("oldEmpty", VPackValue(false));
}
}
}
if (finish("", "", true, "job aborted", todoPrec)) {
return result;
}
_status = PENDING;
// If the above finish failed, then we must be in PENDING
}
// Can now only be PENDING
// Find the other shards in the same distributeShardsLike group:
std::vector<Job::shard_t> shardsLikeMe =
clones(_snapshot, _database, _collection, _shard);
clones(_snapshot, _database, _collection, _shard);
// We can no longer abort by reverting to where we started, if any of the
// shards of the distributeShardsLike group has already gone to new leader
if (_isLeader) {
for (auto const& i : shardsLikeMe) {
auto const& cur = _snapshot.hasAsArray(
curColPrefix + _database + "/" + i.collection + "/" + i.shard + "/" + "servers");
if (cur.second && cur.first[0].copyString() == _to) {
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
"MoveShard can no longer abort through reversion to where it started. Flight forward";
finish(_to, _shard, true, "job aborted - new leader already in place");
return result;
}
}
}
Builder trx; // to build the transaction
// Now look after a PENDING job:
@ -767,7 +809,7 @@ arangodb::Result MoveShard::abort() {
if (_isLeader) {
// All changes to Plan for all shards:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx](Slice plan, Slice current, std::string& planPath) {
[this, &trx](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Restore leader to be _from:
trx.add(VPackValue(planPath));
{
@ -784,7 +826,7 @@ arangodb::Result MoveShard::abort() {
} else {
// All changes to Plan for all shards:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx](Slice plan, Slice current, std::string& planPath) {
[this, &trx](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Remove toServer from Plan:
trx.add(VPackValue(planPath));
{
@ -805,8 +847,18 @@ arangodb::Result MoveShard::abort() {
addReleaseServer(trx, _to);
addIncreasePlanVersion(trx);
}
if (_isLeader) { // Precondition, that current is still as in snapshot
VPackObjectBuilder preconditionObj(&trx);
// Current preconditions for all shards
doForAllShards(
_snapshot, _database, shardsLikeMe,
[this, &trx](
Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Current still as is
trx.add(curPath, current);
});
}
}
write_ret_t res = singleWriteTransaction(_agent, trx);
if (!res.accepted) {
@ -814,10 +866,16 @@ arangodb::Result MoveShard::abort() {
std::string("Lost leadership"));
return result;
} else if (res.indices[0] == 0) {
if (_isLeader) {
// Tough luck. Things have changed. We'll move on
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
"MoveShard can no longer abort through reversion to where it started. Flight forward";
finish(_to, _shard, true, "job aborted - new leader already in place");
return result;
}
result = Result(
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
std::string("Precondition failed while aborting moveShard job ") + _jobId);
return result;
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
std::string("Precondition failed while aborting moveShard job ") + _jobId);
}
return result;

View File

@ -47,9 +47,9 @@ struct MoveShard : public Job {
virtual ~MoveShard();
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual Result abort() override;
JOB_STATUS pendingLeader();
JOB_STATUS pendingFollower();
@ -61,6 +61,8 @@ struct MoveShard : public Job {
std::string _to;
bool _isLeader;
bool _remainsFollower;
bool _toServerIsFollower;
};
} // namespace consensus
} // namespace arangodb

View File

@ -65,7 +65,7 @@ RemoveFollower::RemoveFollower(Node const& snapshot, AgentInterface* agent,
RemoveFollower::~RemoveFollower() {}
void RemoveFollower::run() { runHelper("", _shard); }
void RemoveFollower::run(bool& aborts) { runHelper("", _shard, aborts); }
bool RemoveFollower::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
@ -122,7 +122,7 @@ bool RemoveFollower::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool RemoveFollower::start() {
bool RemoveFollower::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.
@ -149,7 +149,19 @@ bool RemoveFollower::start() {
// First check that we still have too many followers for the current
// `replicationFactor`:
size_t desiredReplFactor = collection.hasAsUInt("replicationFactor").first;
size_t desiredReplFactor = 1;
auto replFact = collection.hasAsUInt("replicationFactor");
if (replFact.second) {
desiredReplFactor = replFact.first;
} else {
auto replFact2 = collection.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
}
}
size_t actualReplFactor = planned.length();
if (actualReplFactor <= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have few enough replicas");
@ -187,7 +199,7 @@ bool RemoveFollower::start() {
}
doForAllShards(_snapshot, _database, shardsLikeMe,
[&planned, &overview, &leaderBad](Slice plan, Slice current,
std::string& planPath) {
std::string& planPath, std::string& curPath) {
if (current.length() > 0) {
if (current[0].copyString() != planned[0].copyString()) {
leaderBad = true;
@ -286,13 +298,35 @@ bool RemoveFollower::start() {
if (pair.second >= 0 &&
static_cast<size_t>(pair.second) >= shardsLikeMe.size() &&
pair.first != planned[0].copyString()) {
chosenToRemove.insert(pair.first);
--currentReplFactor;
if (Job::isInServerList(_snapshot, toBeCleanedPrefix, pair.first, true) ||
Job::isInServerList(_snapshot, cleanedPrefix, pair.first, true)) {
// Prefer those cleaned or to be cleaned servers
chosenToRemove.insert(pair.first);
--currentReplFactor;
}
}
if (currentReplFactor == desiredReplFactor) {
break;
}
}
if (currentReplFactor > desiredReplFactor) {
// Now allow those which are perfectly good as well:
for (auto const& it : reversedPlannedServers) {
auto const pair = *overview.find(it);
if (pair.second >= 0 &&
static_cast<size_t>(pair.second) >= shardsLikeMe.size() &&
pair.first != planned[0].copyString()) {
if (!Job::isInServerList(_snapshot, toBeCleanedPrefix, pair.first, true) &&
!Job::isInServerList(_snapshot, cleanedPrefix, pair.first, true)) {
chosenToRemove.insert(pair.first);
--currentReplFactor;
}
}
if (currentReplFactor == desiredReplFactor) {
break;
}
}
}
}
}
}
@ -347,7 +381,7 @@ bool RemoveFollower::start() {
// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &chosenToRemove](Slice plan, Slice current, std::string& planPath) {
[&trx, &chosenToRemove](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
trx.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&trx);

View File

@ -42,8 +42,8 @@ struct RemoveFollower : public Job {
virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool& aborts) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
std::string _database;

View File

@ -560,7 +560,8 @@ RestStatus RestAgencyHandler::handleConfig() {
}
RestStatus RestAgencyHandler::handleState() {
Builder body;
VPackBuilder body;
body.add(VPackValue(VPackValueType::Array));
for (auto const& i : _agent->state().get()) {
body.add(VPackValue(VPackValueType::Object));

View File

@ -32,10 +32,6 @@
#include <sstream>
#include <thread>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "Agency/Agent.h"
#include "Aql/Query.h"
#include "Aql/QueryRegistry.h"
@ -1526,3 +1522,78 @@ std::shared_ptr<VPackBuilder> State::latestAgencyState(TRI_vocbase_t& vocbase,
store.dumpToBuilder(*builder);
return builder;
}
/// @brief Dump the replicated log up to (and including) `lastIndex` and the
/// latest compaction snapshot preceding it into `builder` (under the keys
/// "log" and "compaction"); returns the number of log entries dumped.
uint64_t State::toVelocyPack(index_t lastIndex, VPackBuilder& builder) const {
TRI_ASSERT(builder.isOpenObject());
auto bindVars = std::make_shared<VPackBuilder>();
{ VPackObjectBuilder b(bindVars.get()); }
std::string const querystr
= "FOR l IN log FILTER l._key <= 'buf" + stringify(lastIndex) +
"' SORT l._key RETURN {'_key': l._key, 'timestamp': l.timestamp,"
"'clientId': l.clientId, 'request': l.request}";
TRI_ASSERT(nullptr != _vocbase); // this check was previously in the Query constructor
arangodb::aql::Query logQuery(false, *_vocbase, aql::QueryString(querystr), bindVars,
nullptr, arangodb::aql::PART_MAIN);
aql::QueryResult logQueryResult = logQuery.executeSync(_queryRegistry);
if (logQueryResult.code != TRI_ERROR_NO_ERROR) {
THROW_ARANGO_EXCEPTION_MESSAGE(logQueryResult.code, logQueryResult.details);
}
VPackSlice result = logQueryResult.result->slice();
std::string firstIndex;
uint64_t n = 0;
builder.add(VPackValue("log"));
if (result.isArray()) {
try {
builder.add(result.resolveExternals());
n = result.length();
if (n > 0) {
firstIndex = result[0].get("_key").copyString();
}
} catch (...) {
VPackArrayBuilder a(&builder);
}
}
if (n > 0) {
std::string const compstr
= "FOR c in compact FILTER c._key >= '" + firstIndex +
"' SORT c._key LIMIT 1 RETURN c";
arangodb::aql::Query compQuery(false, *_vocbase, aql::QueryString(compstr),
bindVars, nullptr, arangodb::aql::PART_MAIN);
aql::QueryResult compQueryResult = compQuery.executeSync(_queryRegistry);
if (compQueryResult.code != TRI_ERROR_NO_ERROR) {
THROW_ARANGO_EXCEPTION_MESSAGE(compQueryResult.code, compQueryResult.details);
}
result = compQueryResult.result->slice();
if (result.isArray()) {
if (result.length() > 0) {
builder.add(VPackValue("compaction"));
try {
builder.add(result[0].resolveExternals());
} catch (...) {
VPackObjectBuilder a(&builder);
}
}
}
}
return n;
}
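
Combined with Agent::readDB(VPackBuilder&) earlier in this diff, this produces the agency dump object served by the new route. A sketch of reading it back, assuming `dump` is the slice built by readDB; the field names come from the code above, everything else is illustrative:

#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>

void inspectAgencyDump(VPackSlice dump) {
  uint64_t index = dump.get("index").getNumber<uint64_t>();  // raft commit index
  uint64_t term = dump.get("term").getNumber<uint64_t>();    // raft term
  VPackSlice agency = dump.get("agency");                    // key-value store contents
  VPackSlice log = dump.get("log");                          // replicated log up to index
  if (dump.hasKey("compaction")) {    // only present if a snapshot precedes the log window
    VPackSlice snapshot = dump.get("compaction");
    (void)snapshot;
  }
  (void)index; (void)term; (void)agency; (void)log;
}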

View File

@ -81,6 +81,9 @@ class State {
/// Default: [first, last]
std::vector<log_t> get(index_t = 0, index_t = (std::numeric_limits<uint64_t>::max)()) const;
uint64_t toVelocyPack(index_t lastIndex, VPackBuilder& builder) const;
private:
/// @brief Get complete log entries bound by lower and upper bounds.
/// Default: [first, last]

View File

@ -331,6 +331,7 @@ void handleOnStatusDBServer(Agent* agent, Node const& snapshot,
void handleOnStatusCoordinator(Agent* agent, Node const& snapshot, HealthRecord& persisted,
HealthRecord& transisted, std::string const& serverID) {
if (transisted.status == Supervision::HEALTH_STATUS_FAILED) {
// if the current foxxmaster server failed => reset the value to ""
if (snapshot.hasAsString(foxxmaster).first == serverID) {
@ -382,6 +383,7 @@ void handleOnStatusSingle(Agent* agent, Node const& snapshot, HealthRecord& pers
}
}
void handleOnStatus(Agent* agent, Node const& snapshot, HealthRecord& persisted,
HealthRecord& transisted, std::string const& serverID,
uint64_t const& jobId, std::shared_ptr<VPackBuilder>& envelope) {
@ -397,6 +399,7 @@ void handleOnStatus(Agent* agent, Node const& snapshot, HealthRecord& persisted,
}
}
// Build transaction for removing unattended servers from health monitoring
query_t arangodb::consensus::removeTransactionBuilder(std::vector<std::string> const& todelete) {
query_t del = std::make_shared<Builder>();
@ -785,6 +788,9 @@ void Supervision::run() {
TRI_ASSERT(_agent != nullptr);
while (!this->isStopping()) {
auto lapStart = std::chrono::steady_clock::now();
{
MUTEX_LOCKER(locker, _lock);
@ -813,6 +819,8 @@ void Supervision::run() {
upgradeAgency();
}
_haveAborts = false;
if (_agent->leaderFor() > 55 || earlyBird()) {
// 55 seconds is less than a minute, which fits to the
// 60 seconds timeout in /_admin/cluster/health
@ -840,7 +848,30 @@ void Supervision::run() {
}
}
}
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));
// If anything was rafted, we need to wait for it to be committed before the next round
index_t leaderIndex = _agent->index();
if (leaderIndex != 0) {
while (true) { // No point in progressing, if indexes cannot be advanced
auto result = _agent->waitFor(leaderIndex);
if (result == Agent::raft_commit_t::UNKNOWN ||
result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
continue;
} else { // Good we can continue
break;
}
}
}
auto lapTime = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::steady_clock::now() - lapStart).count();
if (lapTime < 1000000) {
_cv.wait(static_cast<uint64_t>((1000000 - lapTime) * _frequency));
}
}
}
@ -1069,6 +1100,7 @@ bool Supervision::handleJobs() {
enforceReplication();
cleanupLostCollections(_snapshot, _agent, std::to_string(_jobId++));
readyOrphanedIndexCreations();
workJobs();
return true;
@ -1078,15 +1110,38 @@ bool Supervision::handleJobs() {
void Supervision::workJobs() {
_lock.assertLockedByCurrentThread();
for (auto const& todoEnt : _snapshot.hasAsChildren(toDoPrefix).first) {
JobContext(TODO, (*todoEnt.second).hasAsString("jobId").first, _snapshot, _agent)
.run();
bool dummy = false;
auto todos = _snapshot.hasAsChildren(toDoPrefix).first;
auto it = todos.begin();
static std::string const FAILED = "failed";
while (it != todos.end()) {
auto jobNode = *(it->second);
if (jobNode.hasAsString("type").first.compare(0, FAILED.length(), FAILED) == 0) {
JobContext(TODO, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(_haveAborts);
it = todos.erase(it);
} else {
++it;
}
}
// Do not start other jobs, if above resilience jobs aborted stuff
if (!_haveAborts) {
for (auto const& todoEnt : todos) {
auto jobNode = *(todoEnt.second);
JobContext(TODO, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(dummy);
}
}
for (auto const& pendEnt : _snapshot.hasAsChildren(pendingPrefix).first) {
JobContext(PENDING, (*pendEnt.second).hasAsString("jobId").first, _snapshot, _agent)
.run();
auto pends = _snapshot.hasAsChildren(pendingPrefix).first;
for (auto const& pendEnt : pends) {
auto jobNode = *(pendEnt.second);
JobContext(PENDING, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(dummy);
}
}
void Supervision::readyOrphanedIndexCreations() {
@ -1205,18 +1260,20 @@ void Supervision::enforceReplication() {
auto const& col = *(col_.second);
size_t replicationFactor;
if (col.hasAsUInt("replicationFactor").second) {
replicationFactor = col.hasAsUInt("replicationFactor").first;
auto replFact = col.hasAsUInt("replicationFactor");
if (replFact.second) {
replicationFactor = replFact.first;
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson();
continue;
}
// mop: satellites => distribute to every server
if (replicationFactor == 0) {
auto available = Job::availableServers(_snapshot);
replicationFactor = available.size();
auto replFact2 = col.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
replicationFactor = Job::countGoodServersInList(_snapshot, available);
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson();
continue;
}
}
bool clone = col.has("distributeShardsLike");
@ -1224,9 +1281,24 @@ void Supervision::enforceReplication() {
if (!clone) {
for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards
auto const& shard = *(shard_.second);
VPackBuilder onlyFollowers;
{
VPackArrayBuilder guard(&onlyFollowers);
bool first = true;
for (auto const& pp : VPackArrayIterator(shard.slice())) {
if (!first) {
onlyFollowers.add(pp);
}
first = false;
}
}
size_t actualReplicationFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
// leader plus GOOD followers
size_t apparentReplicationFactor = shard.slice().length();
size_t actualReplicationFactor = shard.slice().length();
if (actualReplicationFactor != replicationFactor) {
if (actualReplicationFactor != replicationFactor ||
apparentReplicationFactor != replicationFactor) {
// Check that there is not yet an addFollower or removeFollower
// or moveShard job in ToDo for this shard:
auto const& todo = _snapshot.hasAsChildren(toDoPrefix).first;
@ -1256,11 +1328,12 @@ void Supervision::enforceReplication() {
if (actualReplicationFactor < replicationFactor) {
AddFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.run();
} else {
.create();
} else if (apparentReplicationFactor > replicationFactor &&
actualReplicationFactor >= replicationFactor) {
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.run();
.create();
}
}
}
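
Editor's note: the follower accounting from the two hunks above, condensed into a sketch. A planned shard entry lists the leader first and its followers after it; only followers in GOOD health count towards the actual replication factor, while the apparent factor is simply the length of the planned list. isGood() stands in for the health lookup done by Job::countGoodServersInList and is an assumption of this sketch.

#include <functional>
#include <string>
#include <vector>

enum class Repair { None, AddFollower, RemoveFollower };

// Illustrative sketch only: decide which follower job to create for one shard.
Repair decideRepair(std::vector<std::string> const& planned,  // leader first, then followers
                    std::function<bool(std::string const&)> isGood,
                    size_t replicationFactor) {
  size_t goodFollowers = 0;
  for (size_t i = 1; i < planned.size(); ++i) {  // skip the leader
    if (isGood(planned[i])) {
      ++goodFollowers;
    }
  }
  size_t actual = 1 + goodFollowers;   // leader plus GOOD followers
  size_t apparent = planned.size();    // everything that is planned
  if (actual < replicationFactor) {
    return Repair::AddFollower;
  }
  if (apparent > replicationFactor && actual >= replicationFactor) {
    return Repair::RemoveFollower;
  }
  return Repair::None;
}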
@ -1397,9 +1470,10 @@ void Supervision::shrinkCluster() {
std::sort(availServers.begin(), availServers.end());
// Schedule last server for cleanout
bool dummy;
CleanOutServer(_snapshot, _agent, std::to_string(_jobId++),
"supervision", availServers.back())
.run();
.run(dummy);
}
}
}

View File

@ -200,6 +200,7 @@ class Supervision : public arangodb::CriticalThread {
double _okThreshold;
uint64_t _jobId;
uint64_t _jobIdMax;
bool _haveAborts; /**< @brief We have accumulated pending aborts in a round */
// mop: this feels very hacky...we have a hen and egg problem here
// we are using /Shutdown in the agency to determine that the cluster should

View File

@ -3770,6 +3770,15 @@ arangodb::Result ClusterInfo::getShardServers(ShardID const& shardId,
return arangodb::Result(TRI_ERROR_FAILED);
}
arangodb::Result ClusterInfo::agencyDump(std::shared_ptr<VPackBuilder> body) {
AgencyCommResult dump = _agency.dump();
body->add(dump.slice());
return Result();
}
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------

View File

@ -275,6 +275,8 @@ class ClusterInfo {
uint64_t uniqid(uint64_t = 1);
arangodb::Result agencyDump(std::shared_ptr<VPackBuilder> body);
//////////////////////////////////////////////////////////////////////////////
/// @brief flush the caches (used for testing only)
//////////////////////////////////////////////////////////////////////////////

View File

@ -254,10 +254,15 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
}
operations.push_back(AgencyOperation("Current/Version",
AgencySimpleOperationType::INCREMENT_OP));
AgencyWriteTransaction currentTransaction(operations);
AgencyPrecondition precondition("Plan/Version",
AgencyPrecondition::Type::VALUE, plan->slice().get("Version"));
AgencyWriteTransaction currentTransaction(operations, precondition);
AgencyCommResult r = comm.sendTransactionWithFailover(currentTransaction);
if (!r.successful()) {
LOG_TOPIC(ERR, Logger::MAINTENANCE) << "Error reporting to agency";
LOG_TOPIC(INFO, Logger::MAINTENANCE)
<< "Error reporting to agency: _statusCode: " << r.errorCode()
<< " message: " << r.errorMessage()
<< ". This can be ignored, since it will be retried automaticlly.";
} else {
LOG_TOPIC(DEBUG, Logger::MAINTENANCE)
<< "Invalidating current in ClusterInfo";

View File

@ -18,6 +18,7 @@
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
/// @author Kaveh Vahedipour
////////////////////////////////////////////////////////////////////////////////
#include "RestClusterHandler.h"
@ -25,6 +26,7 @@
#include "Agency/Supervision.h"
#include "Cluster/ClusterInfo.h"
#include "Cluster/ServerState.h"
#include "GeneralServer/AuthenticationFeature.h"
#include "Replication/ReplicationFeature.h"
#include "Rest/HttpRequest.h"
#include "Rest/Version.h"
@ -47,16 +49,46 @@ RestStatus RestClusterHandler::execute() {
}
std::vector<std::string> const& suffixes = _request->suffixes();
if (!suffixes.empty() && suffixes[0] == "endpoints") {
handleCommandEndpoints();
if (!suffixes.empty()) {
if (suffixes[0] == "endpoints") {
handleCommandEndpoints();
} else if (suffixes[0] == "agency-dump") {
handleAgencyDump();
} else {
generateError(
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/[endpoints,agency-dump]"));
}
} else {
generateError(
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/endpoints"));
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/[endpoints,agency-dump]"));
}
return RestStatus::DONE;
}
void RestClusterHandler::handleAgencyDump() {
AuthenticationFeature* af = AuthenticationFeature::instance();
if (af->isActive() && !_request->user().empty()) {
auth::Level lvl = auth::Level::NONE;
if (af->userManager() != nullptr) {
lvl = af->userManager()->databaseAuthLevel(_request->user(), "_system", true);
} else {
lvl = auth::Level::RW;
}
if (lvl < auth::Level::RW) {
generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN,
"you need admin rights to trigger shutdown");
return;
}
}
std::shared_ptr<VPackBuilder> body = std::make_shared<VPackBuilder>();
ClusterInfo::instance()->agencyDump(body);
generateResult(rest::ResponseCode::OK, body->slice());
}
/// @brief returns information about all coordinator endpoints
void RestClusterHandler::handleCommandEndpoints() {
ClusterInfo* ci = ClusterInfo::instance();

View File

@ -41,6 +41,9 @@ class RestClusterHandler : public arangodb::RestBaseHandler {
/// _api/cluster/serverInfo
void handleCommandServerInfo();
/// _api/cluster/agency-dump
void handleAgencyDump();
};
} // namespace arangodb

View File

@ -80,11 +80,14 @@ actions.defineHttp({
return;
}
let preconditions = {};
preconditions['/arango/Supervision/Health/' + serverId + '/Status'] = {'old': 'FAILED'};
// need to make sure it is not responsible for anything
if (node.Role === 'DBServer') {
let used = [];
let count = 0; // Try for 60s if server still in use or not failed
let msg = "";
let used = [];
while (++count <= 60) {
let preconditions = {};
preconditions['/arango/Supervision/Health/' + serverId + '/Status'] = {'old': 'FAILED'};
// need to make sure it is not responsible for anything
used = [];
preconditions = reducePlanServers(function (data, agencyKey, servers) {
data[agencyKey] = {'old': servers};
if (servers.indexOf(serverId) !== -1) {
@ -100,36 +103,41 @@ actions.defineHttp({
return data;
}, preconditions);
if (used.length > 0) {
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'the server is still in use at the following locations: ' + JSON.stringify(used));
return;
preconditions["/arango/Supervision/DBServers/" + serverId]
= { "oldEmpty": true };
if (!checkServerLocked(serverId) && used.length === 0) {
let operations = {};
operations['/arango/Plan/Coordinators/' + serverId] = {'op': 'delete'};
operations['/arango/Plan/DBServers/' + serverId] = {'op': 'delete'};
operations['/arango/Current/ServersRegistered/' + serverId] = {'op': 'delete'};
operations['/arango/Supervision/Health/' + serverId] = {'op': 'delete'};
operations['/arango/Target/MapUniqueToShortID/' + serverId] = {'op': 'delete'};
try {
global.ArangoAgency.write([[operations, preconditions]]);
actions.resultOk(req, res, actions.HTTP_OK, true);
return;
} catch (e) {
if (e.code === 412) {
console.log("removeServer: got precondition failed, retrying...");
} else {
console.warn("removeServer: could not talk to agency, retrying...");
}
}
} else {
if (used.length > 0) {
console.log("removeServer: server", serverId, "still in use in",
used.length, "locations.");
} else {
console.log("removeServer: server", serverId, "locked in agency.");
}
}
}
let operations = {};
operations['/arango/Plan/Coordinators/' + serverId] = {'op': 'delete'};
operations['/arango/Plan/DBServers/' + serverId] = {'op': 'delete'};
operations['/arango/Current/ServersRegistered/' + serverId] = {'op': 'delete'};
operations['/arango/Supervision/Health/' + serverId] = {'op': 'delete'};
operations['/arango/Target/MapUniqueToShortID/' + serverId] = {'op': 'delete'};
try {
global.ArangoAgency.write([[operations, preconditions]]);
} catch (e) {
if (e.code === 412) {
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'you can only remove failed servers');
return;
}
throw e;
}
actions.resultOk(req, res, actions.HTTP_OK, true);
/* DBOnly:
Current/Databases/YYY/XXX
*/
wait(1.0);
} // while count
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'the server is not failed, is locked, or is still in use at the following '
+ 'locations: ' + JSON.stringify(used));
}
});
@ -635,6 +643,17 @@ function reduceCurrentServers (reducer, data) {
}, data);
}
function checkServerLocked (server) {
var locks = ArangoAgency.get('Supervision/DBServers');
try {
if (locks.arango.Supervision.DBServers.hasOwnProperty(server)) {
return true;
}
} catch (e) {
}
return false;
}
// //////////////////////////////////////////////////////////////////////////////
// / @start Docu Block JSF_getNumberOfServers
// / (intentionally not in manual)

View File

@ -66,7 +66,7 @@ std::vector<std::string> split(std::string const& source,
/// @brief joins a string
template <typename C>
std::string join(C const& source, std::string const& delim = ",") {
std::string join(C const& source, std::string const& delim) {
std::string result;
bool first = true;

View File

@ -418,6 +418,7 @@ rest::ResponseCode GeneralResponse::responseCode(int code) {
case TRI_ERROR_CLUSTER_LEADERSHIP_CHALLENGE_ONGOING:
case TRI_ERROR_CLUSTER_NOT_LEADER:
case TRI_ERROR_SHUTTING_DOWN:
case TRI_ERROR_CLUSTER_CONNECTION_LOST:
return ResponseCode::SERVICE_UNAVAILABLE;
case TRI_ERROR_CLUSTER_UNSUPPORTED:

View File

@ -54,6 +54,7 @@ const std::string FOLLOWER3 = "SNGL-follower23"; // tick 9, STATE GOOD
const std::string FOLLOWER4 = "SNGL-follower4"; // tick 100, STATE BAD
const std::string FOLLOWER5 = "SNGL-follower5"; // tick 1000, STATE GOOD wrong leader
bool aborts = false;
const char *agency =
#include "ActiveFailoverTest.json"
@ -229,7 +230,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});
REQUIRE(job.start());
REQUIRE(job.start(aborts));
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);
@ -280,7 +281,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});
REQUIRE(job.start());
REQUIRE(job.start(aborts));
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);
@ -331,7 +332,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});
REQUIRE_FALSE(job.start());
REQUIRE_FALSE(job.start(aborts));
// job status stays on TODO and can retry later
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,transient)).Exactly(Once);
@ -402,7 +403,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});
REQUIRE(job.start());
REQUIRE(job.start(aborts));
// job status stays on TODO and can retry later
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,transient)).Exactly(1);

View File

@ -56,6 +56,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";
bool aborts = false;
const char *agency =
#include "AddFollowerTest.json"
;
@ -209,7 +211,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -266,7 +268,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -331,7 +333,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -384,7 +386,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -439,7 +441,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -494,7 +496,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}

View File

@ -47,7 +47,7 @@ R"=(
"Status": "GOOD"
},
"leader": {
"Status": "FAILED"
"Status": "GOOD"
},
"free": {
"Status": "GOOD"

View File

@ -50,6 +50,8 @@ const std::string PREFIX = "arango";
const std::string SERVER = "leader";
const std::string JOBID = "1";
bool aborts = false;
typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;
@ -157,7 +159,7 @@ VPackBuilder createJob(std::string const& server) {
TEST_CASE("CleanOutServer", "[agency][supervision]") {
RandomGenerator::initialize(RandomGenerator::RandomType::MERSENNE);
auto baseStructure = createRootNode();
write_ret_t fakeWriteResult {true, "", std::vector<apply_ret_t> {APPLIED}, std::vector<index_t> {1}};
auto transBuilder = std::make_shared<Builder>();
{ VPackArrayBuilder a(transBuilder.get());
@ -220,7 +222,7 @@ SECTION("cleanout server should fail if the server does not exist") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -262,7 +264,7 @@ SECTION("cleanout server should wait if the server is currently blocked") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
REQUIRE(true);
}
@ -307,7 +309,7 @@ SECTION("cleanout server should wait if the server is not healthy right now") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
REQUIRE(true);
}
@ -357,7 +359,7 @@ SECTION("cleanout server should fail if the server is already cleaned") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -404,7 +406,7 @@ SECTION("cleanout server should fail if the server is failed") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -453,7 +455,7 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -503,7 +505,57 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
SECTION("cleanout server should fail if the replicationFactor is too big for any shard after counting in tobecleanedoutservers") {
TestStructureType createTestStructure = [&](VPackSlice const& s, std::string const& path) {
std::unique_ptr<VPackBuilder> builder;
builder.reset(new VPackBuilder());
if (s.isObject()) {
builder->add(VPackValue(VPackValueType::Object));
for (auto const& it: VPackObjectIterator(s)) {
auto childBuilder = createTestStructure(it.value, path + "/" + it.key.copyString());
if (childBuilder) {
builder->add(it.key.copyString(), childBuilder->slice());
}
}
if (path == "/arango/Target/ToDo") {
builder->add(JOBID, createJob(SERVER).slice());
}
builder->close();
} else {
if (path == "/arango/Target/ToBeCleanedServers") {
builder->add(VPackValue(VPackValueType::Array));
builder->add(VPackValue("free"));
builder->close();
}
builder->add(s);
}
return builder;
};
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
checkFailed(JOB_STATUS::TODO, q);
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
Node agency = createAgency(createTestStructure);
INFO("AGENCY: " << agency.toJson());
// should not throw
auto cleanOutServer = CleanOutServer(
agency,
&agent,
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -549,6 +601,8 @@ SECTION("a cleanout server job should move into pending when everything is ok")
CHECK(std::string(writes.get("/arango/Target/Pending/1").get("timeStarted").typeName()) == "string");
REQUIRE(std::string(writes.get("/arango/Supervision/DBServers/" + SERVER).typeName()) == "string");
REQUIRE(writes.get("/arango/Supervision/DBServers/" + SERVER).copyString() == JOBID);
REQUIRE(writes.get("/arango/Target/ToBeCleanedServers").get("op").copyString() == "push");
REQUIRE(writes.get("/arango/Target/ToBeCleanedServers").get("new").copyString() == SERVER);
REQUIRE(writes.get("/arango/Target/ToDo/1-0").get("toServer").copyString() == "free");
auto preconditions = q->slice()[0][1];
@ -570,7 +624,7 @@ SECTION("a cleanout server job should move into pending when everything is ok")
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -621,7 +675,7 @@ SECTION("a cleanout server job should abort after a long timeout") {
REQUIRE(std::string(q->slice().typeName()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // precondition that still in ToDo
REQUIRE(std::string(q->slice()[0][0].typeName()) == "object");
auto writes = q->slice()[0][0];
@ -630,6 +684,8 @@ SECTION("a cleanout server job should abort after a long timeout") {
CHECK(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete");
// a not yet started job will be moved to finished
CHECK(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == "object");
auto preconds = q->slice()[0][1];
CHECK(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse());
} else {
// finally cleanout should be failed
checkFailed(JOB_STATUS::PENDING, q);
@ -648,7 +704,7 @@ SECTION("a cleanout server job should abort after a long timeout") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}
@ -689,7 +745,7 @@ SECTION("when there are still subjobs to be done it should wait") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
};
@ -748,7 +804,7 @@ SECTION("once all subjobs were successful then the job should be finished") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
}
@ -793,7 +849,7 @@ SECTION("if there was a failed subjob then the job should also fail") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
}
@ -832,7 +888,7 @@ SECTION("when the cleanout server job is aborted all subjobs should be aborted t
REQUIRE(std::string(q->slice().typeName()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // precondition that still in ToDo
REQUIRE(std::string(q->slice()[0][0].typeName()) == "object");
auto writes = q->slice()[0][0];
@ -841,6 +897,8 @@ SECTION("when the cleanout server job is aborted all subjobs should be aborted t
CHECK(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete");
// a not yet started job will be moved to finished
CHECK(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == "object");
auto preconds = q->slice()[0][1];
CHECK(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse());
} else {
checkFailed(JOB_STATUS::PENDING, q);
}

View File

@ -58,6 +58,7 @@ R"=(
},
"Target": {
"CleanedServers": [],
"ToBeCleanedServers": [],
"FailedServers": {},
"MapUniqueToShortID": {
"follower1": {

View File

@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";
bool aborts = false;
typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;
@ -234,7 +236,7 @@ SECTION("if we want to start and the collection went missing from plan (our trut
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
}
SECTION("if we are supposed to fail a distributeShardsLike job we immediately fail because this should be done by a job running on the master shard") {
@ -294,7 +296,7 @@ SECTION("if we are supposed to fail a distributeShardsLike job we immediately fa
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
}
SECTION("if the follower is healthy again we fail the job") {
@ -356,7 +358,7 @@ SECTION("if the follower is healthy again we fail the job") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedFollower.start());
REQUIRE_FALSE(failedFollower.start(aborts));
Verify(Method(mockAgent, transact));
Verify(Method(mockAgent, write));
}
@ -406,7 +408,7 @@ SECTION("if there is no healthy free server when trying to start just wait") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedFollower.start());
REQUIRE_FALSE(failedFollower.start(aborts));
}
SECTION("abort any moveShard job blocking the shard and start") {
@ -480,13 +482,6 @@ SECTION("abort any moveShard job blocking the shard and start") {
return fakeWriteResult;
});
When(Method(mockAgent, transact)).Do([&](query_t const& q) -> trans_ret_t {
// check that the job is now pending
INFO("Transaction: " << q->slice().toJson());
auto writes = q->slice()[0][0];
REQUIRE(std::string(writes.get("/arango/Target/Finished/1").typeName()) == "object");
return fakeTransResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
auto failedFollower = FailedFollower(
@ -495,8 +490,7 @@ SECTION("abort any moveShard job blocking the shard and start") {
JOB_STATUS::TODO,
jobId
);
REQUIRE(failedFollower.start());
Verify(Method(mockAgent, transact));
REQUIRE_FALSE(failedFollower.start(aborts));
Verify(Method(mockAgent, write));
}
@ -574,7 +568,7 @@ SECTION("a successfully started job should finish immediately and set everything
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, transact));
}
@ -679,7 +673,7 @@ SECTION("the job should handle distributeShardsLike") {
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, transact));
}
@ -746,7 +740,7 @@ SECTION("the job should timeout after a while") {
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, write));
}

View File

@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";
bool aborts = false;
const char *agency =
#include "FailedLeaderTest.json"
;
@ -224,7 +226,7 @@ SECTION("if we want to start and the collection went missing from plan (our trut
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
SECTION("if we are supposed to fail a distributeShardsLike job we immediately fail because this should be done by a job running on the master shard") {
@ -280,7 +282,7 @@ SECTION("if we are supposed to fail a distributeShardsLike job we immediately fa
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
SECTION("if the leader is healthy again we fail the job") {
@ -337,7 +339,7 @@ SECTION("if the leader is healthy again we fail the job") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedLeader.start());
REQUIRE_FALSE(failedLeader.start(aborts));
Verify(Method(mockAgent, transact));
Verify(Method(mockAgent, write)).Exactly(Once);
}
@ -383,7 +385,7 @@ SECTION("the job must not be started if there is no server that is in sync for e
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedLeader.start());
REQUIRE_FALSE(failedLeader.start(aborts));
}
SECTION("the job must not be started if there if one of the linked shards (distributeShardsLike) is not in sync") {
@ -444,10 +446,10 @@ SECTION("the job must not be started if there if one of the linked shards (distr
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
SECTION("abort any moveShard job blocking the shard and start") {
SECTION("abort any moveShard job blocking the shard and stay in ToDo") {
Mock<AgentInterface> moveShardMockAgent;
Builder moveShardBuilder;
@ -518,13 +520,6 @@ SECTION("abort any moveShard job blocking the shard and start") {
return fakeWriteResult;
});
When(Method(mockAgent, transact)).Do([&](query_t const& q) -> trans_ret_t {
// check that the job is now pending
INFO("Transaction: " << q->slice().toJson());
auto writes = q->slice()[0][0];
REQUIRE(std::string(writes.get("/arango/Target/Pending/1").typeName()) == "object");
return fakeTransResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
auto failedLeader = FailedLeader(
@ -533,8 +528,7 @@ SECTION("abort any moveShard job blocking the shard and start") {
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
Verify(Method(mockAgent, transact));
REQUIRE_FALSE(failedLeader.start(aborts));
Verify(Method(mockAgent, write));
}
@ -637,7 +631,7 @@ SECTION("if everything is fine than the job should be written to pending, adding
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
SECTION("if we want are working and our collection went missing from plan the job should just finish") {
@ -707,7 +701,7 @@ SECTION("if we want are working and our collection went missing from plan the jo
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
}
SECTION("if the newly supposed leader didn't catch up yet we wait") {
@ -767,7 +761,7 @@ SECTION("if the newly supposed leader didn't catch up yet we wait") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
}
SECTION("in case of a timeout the job should be aborted") {
@ -845,7 +839,7 @@ SECTION("in case of a timeout the job should be aborted") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
Verify(Method(mockAgent, write));
}
@ -924,7 +918,7 @@ SECTION("when everything is finished there should be proper cleanup") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
Verify(Method(mockAgent, write));
}
@ -984,7 +978,7 @@ SECTION("a failedleader must not take a follower into account that is in sync bu
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
}
}

View File

@ -41,6 +41,12 @@ R"=(
"Supervision": {
"DBServers": {},
"Health": {
"free2": {
"Status": "BAD"
},
"free": {
"Status": "GOOD"
},
"follower1": {
"Status": "GOOD"
},

View File

@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";
bool aborts = false;
typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;
@ -337,7 +339,7 @@ TEST_CASE("FailedServer", "[agency][supervision]") {
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
Verify(Method(mockAgent,write));
@ -404,7 +406,7 @@ TEST_CASE("FailedServer", "[agency][supervision]") {
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
Verify(Method(mockAgent,write));

View File

@ -54,6 +54,8 @@ using namespace arangodb::basics;
using namespace arangodb::consensus;
using namespace fakeit;
bool aborts = false;
namespace arangodb {
namespace tests {
namespace move_shard_test {
@ -165,12 +167,13 @@ SECTION("the job should fail if toServer does not exist") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
SECTION("the job should fail to start if toServer is already in plan") {
std::function<std::unique_ptr<VPackBuilder>(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) {
SECTION("the job should fail to start if fromServer and toServer are planned followers") {
std::function<std::unique_ptr<VPackBuilder>(VPackSlice const&, std::string const&)> createTestStructure =
[&](VPackSlice const& s, std::string const& path) {
std::unique_ptr<VPackBuilder> builder;
builder.reset(new VPackBuilder());
if (s.isObject()) {
@ -183,7 +186,7 @@ SECTION("the job should fail to start if toServer is already in plan") {
}
if (path == "/arango/Target/ToDo") {
builder->add(jobId, createJob(COLLECTION, SHARD_LEADER, SHARD_FOLLOWER1).slice());
builder->add(jobId, createJob(COLLECTION, SHARD_FOLLOWER1, SHARD_LEADER).slice());
}
builder->close();
} else {
@ -206,7 +209,7 @@ SECTION("the job should fail to start if toServer is already in plan") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -245,7 +248,7 @@ SECTION("the job should fail if fromServer does not exist") {
Fake(Method(spy, finish));
Job& spyMoveShard = spy.get();
spyMoveShard.start();
spyMoveShard.start(aborts);
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {return !success;}));
}
@ -287,7 +290,7 @@ SECTION("the job should fail if fromServer is not in plan of the shard") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -338,7 +341,7 @@ SECTION("the job should fail if fromServer does not exist") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -377,7 +380,7 @@ SECTION("the job should remain in todo if the shard is currently locked") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
}
SECTION("the job should remain in todo if the target server is currently locked") {
@ -415,7 +418,7 @@ SECTION("the job should remain in todo if the target server is currently locked"
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
}
SECTION("the job should fail if the target server was cleaned out") {
@ -462,7 +465,7 @@ SECTION("the job should fail if the target server was cleaned out") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -508,7 +511,7 @@ SECTION("the job should fail if the target server is failed") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -548,7 +551,7 @@ SECTION("the job should wait until the target server is good") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
}
SECTION("the job should fail if the shard distributes its shards like some other") {
@ -591,7 +594,7 @@ SECTION("the job should fail if the shard distributes its shards like some other
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -671,7 +674,7 @@ SECTION("the job should be moved to pending when everything is ok") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -727,7 +730,7 @@ SECTION("moving from a follower should be possible") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -855,7 +858,7 @@ SECTION("when moving a shard that is a distributeShardsLike leader move the rest
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
moveShard.start();
moveShard.start(aborts);
Verify(Method(mockAgent,write));
}
@ -910,7 +913,7 @@ SECTION("if the job is too old it should be aborted to prevent a deadloop") {
Fake(Method(spy, abort));
Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);
Verify(Method(spy, abort));
}
@ -966,7 +969,7 @@ SECTION("if the job is too old (leader case) it should be aborted to prevent a d
Fake(Method(spy, abort));
Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);
Verify(Method(spy, abort));
}
@ -1015,7 +1018,7 @@ SECTION("if the collection was dropped while moving finish the job") {
Fake(Method(spy, finish));
Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {
return success;
@ -1066,7 +1069,7 @@ SECTION("if the collection was dropped before the job could be started just fini
Fake(Method(spy, finish));
Job& spyMoveShard = spy.get();
spyMoveShard.start();
spyMoveShard.start(aborts);
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {return success;}));
@ -1121,7 +1124,7 @@ SECTION("the job should wait until the planned shard situation has been created
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}
SECTION("if the job is done it should properly finish itself") {
@ -1193,7 +1196,7 @@ SECTION("if the job is done it should properly finish itself") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@ -1318,7 +1321,7 @@ SECTION("the job should not finish itself when only parts of distributeShardsLik
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}
SECTION("the job should finish when all distributeShardsLike shards have adapted") {
@ -1496,7 +1499,7 @@ SECTION("the job should finish when all distributeShardsLike shards have adapted
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@ -1527,10 +1530,12 @@ SECTION("a moveshard job that just made it to ToDo can simply be aborted") {
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // we always simply override! no preconditions...
auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Target/Finished/1").typeName()) == "object");
auto precond = q->slice()[0][1];
CHECK(precond.get("/arango/Target/ToDo/1").get("oldEmpty").isFalse());
return fakeWriteResult;
});
@ -1591,9 +1596,10 @@ SECTION("a pending moveshard job should also put the original server back into p
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
LOG_DEVEL << q->slice().toJson() << " " << __LINE__;
auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // Precondition: to Server not leader yet
CHECK(writes.get("/arango/Supervision/DBServers/" + FREE_SERVER).get("op").copyString() == "delete");
CHECK(writes.get("/arango/Supervision/Shards/" + SHARD).get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Plan/Collections/" + DATABASE + "/" + COLLECTION + "/shards/" + SHARD).typeName()) == "array");
@ -1698,7 +1704,7 @@ SECTION("after the new leader has synchronized the new leader should resign") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@ -1762,7 +1768,7 @@ SECTION("when the old leader is not yet ready for resign nothing should happen")
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}
SECTION("aborting the job while a leader transition is in progress (for example when job is timing out) should make the old leader leader again") {
@ -1819,9 +1825,11 @@ SECTION("aborting the job while a leader transition is in progress (for example
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
LOG_DEVEL << q->slice().toJson() << " " << __LINE__;
auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // Precondition: to Server not leader yet
CHECK(writes.get("/arango/Supervision/DBServers/" + FREE_SERVER).get("op").copyString() == "delete");
CHECK(writes.get("/arango/Supervision/Shards/" + SHARD).get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Plan/Collections/" + DATABASE + "/" + COLLECTION + "/shards/" + SHARD).typeName()) == "array");
@ -1924,7 +1932,7 @@ SECTION("if we are ready to resign the old server then finally move to the new l
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@ -2005,7 +2013,7 @@ SECTION("if the new leader took over finish the job") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@ -2379,7 +2387,7 @@ SECTION("if the job fails while trying to switch over leadership it should be ab
Fake(Method(spy, abort));
Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);
Verify(Method(spy, abort));
}
@ -2434,7 +2442,7 @@ SECTION("if the job timeouts while the new leader is trying to take over the job
Fake(Method(spy, abort));
Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);
Verify(Method(spy, abort));
}
@ -2520,7 +2528,7 @@ SECTION("when promoting the new leader, the old one should become a resigned fol
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent, write));
}

View File

@ -59,6 +59,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";
bool aborts = false;
const char *agency =
#include "RemoveFollowerTest.json"
;
@ -224,7 +226,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -283,7 +285,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -349,7 +351,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -412,7 +414,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
}
@ -473,7 +475,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
REQUIRE_NOTHROW(Verify(Method(mockAgent, write)));
}
@ -609,7 +611,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);
REQUIRE_NOTHROW(Verify(Method(mockAgent, write)));
}

View File

@ -125,7 +125,7 @@ function MovingShardsSuite ({useData}) {
res = request(envelope);
} catch (err) {
console.error(
"Exception for POST /_admin/cluster/cleanOutServer:", err.stack);
"Exception for GET /_admin/cluster/cleanOutServer:", err.stack);
return {cleanedServers:[]};
}
if (res.statusCode !== 200) {
@ -145,6 +145,32 @@ function MovingShardsSuite ({useData}) {
return body;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief display agency information in case of a bad outcome
////////////////////////////////////////////////////////////////////////////////
function displayAgencyInformation() {
var coordEndpoint =
global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
var request = require("@arangodb/request");
var endpointToURL = require("@arangodb/cluster").endpointToURL;
var url = endpointToURL(coordEndpoint);
var res;
try {
var envelope = { method: "GET", url: url + "/_api/cluster/agency-dump" };
res = request(envelope);
} catch (err) {
console.error(
"Exception for GET /_api/cluster/agency-dump:", err.stack);
return;
}
if (res.statusCode !== 200) {
return;
}
var body = res.body;
console.error("Agency state after disaster:", body);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean
@ -181,6 +207,7 @@ function MovingShardsSuite ({useData}) {
console.info(
"Failed: Server " + id + " was not cleaned out. List of cleaned servers: ["
+ obj.cleanedServers + "]");
displayAgencyInformation();
}
} else {
@ -213,6 +240,7 @@ function MovingShardsSuite ({useData}) {
}
}
if (!ok) {
displayAgencyInformation();
return false;
}

View File

@ -194,6 +194,33 @@ function MovingShardsWithViewSuite (options) {
return body;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief display agency information in case of a bad outcome
////////////////////////////////////////////////////////////////////////////////
function displayAgencyInformation() {
var coordEndpoint =
global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
var request = require("@arangodb/request");
var endpointToURL = require("@arangodb/cluster").endpointToURL;
var url = endpointToURL(coordEndpoint);
var res;
try {
var envelope = { method: "GET", url: url + "/_api/cluster/agency-dump" };
res = request(envelope);
} catch (err) {
console.error(
"Exception for GET /_api/cluster/agency-dump:", err.stack);
return;
}
if (res.statusCode !== 200) {
return;
}
var body = res.body;
console.error("Agency state after disaster:", body);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean
@ -230,6 +257,7 @@ function MovingShardsWithViewSuite (options) {
console.info(
"Failed: Server " + id + " was not cleaned out. List of cleaned servers: ["
+ obj.cleanedServers + "]");
displayAgencyInformation();
}
} else {
@ -262,6 +290,7 @@ function MovingShardsWithViewSuite (options) {
}
}
if (!ok) {
displayAgencyInformation();
return false;
}