mirror of https://gitee.com/bigwinds/arangodb
[devel] supervision bug fix backports (#8314)
* backports for supervision fixes from 3.4, part 1
* backports for supervision fixes from 3.4, part 2
parent 0d2056550c
commit 68178ba165

CHANGELOG (30 changes)
@@ -5,6 +5,36 @@ devel

* fixed overflow in Windows NowNanos in RocksDB

* Allow MoveShard from leader to a follower, thus swapping the two

* Supervision fix: Satellite collections, various fixes

* Add coordinator route for agency dump

* Supervision fix: abort MoveShard job does not leave a lock behind,

* Supervision fix: abort MoveShard (leader) job moves forwards when point
  of no return has been reached,

* Supervision fix: abort CleanOutServer job does not leave server in
  ToBeCleanedServers,

* Supervision fix: move shard with data stopped too early due to wrong usage
  of compare function

* Supervision fix: AddFollower only counts good followers, fixing a
  situation after a FailedLeader job could not find a new working
  follower

* Supervision fix: FailedLeader now also considers temporarily BAD
  servers as replacement followers and does not block servers which
  currently receive a new shard

* Supervision fix: Servers in ToBeCleanedServers are no longer considered
  as replacement servers

* Maintenance fix: added precondition of unchanged Plan in phase2

* add "PRUNE <condition>" to AQL Traversals. This allows aborting the search
  of unnecessary branches within a traversal early.
  PRUNE is only allowed in the Traversal statement and only between the graph
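Illustration only (not part of this commit's diff): a minimal AQL traversal using the PRUNE clause described above. The graph name, start vertex, and prune condition are made up for the example.

    FOR v, e, p IN 1..5 OUTBOUND "airports/LAX" GRAPH "flights"
      PRUNE v.closed == true
      /* vertices matching the PRUNE condition are still returned,
         but the traversal does not descend below them */
      RETURN p.vertices[*]._key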
@@ -59,7 +59,7 @@ ActiveFailoverJob::ActiveFailoverJob(Node const& snapshot, AgentInterface* agent
ActiveFailoverJob::~ActiveFailoverJob() {}
void ActiveFailoverJob::run() { runHelper(_server, ""); }
void ActiveFailoverJob::run(bool& aborts) { runHelper(_server, "", aborts); }
bool ActiveFailoverJob::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "Todo: Handle failover for leader " + _server;

@@ -130,7 +130,7 @@ bool ActiveFailoverJob::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool ActiveFailoverJob::start() {
bool ActiveFailoverJob::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.

@@ -39,9 +39,9 @@ struct ActiveFailoverJob final : public Job {
virtual ~ActiveFailoverJob();
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
private:

@@ -64,7 +64,7 @@ AddFollower::AddFollower(Node const& snapshot, AgentInterface* agent,
AddFollower::~AddFollower() {}
void AddFollower::run() { runHelper("", _shard); }
void AddFollower::run(bool& aborts) { runHelper("", _shard, aborts); }
bool AddFollower::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(INFO, Logger::SUPERVISION)

@@ -119,7 +119,7 @@ bool AddFollower::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool AddFollower::start() {
bool AddFollower::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.

@@ -146,8 +146,33 @@ bool AddFollower::start() {
// First check that we still have too few followers for the current
// `replicationFactor`:
size_t desiredReplFactor = collection.hasAsUInt("replicationFactor").first;
size_t actualReplFactor = planned.length();
size_t desiredReplFactor = 1;
auto replFact = collection.hasAsUInt("replicationFactor");
if (replFact.second) {
desiredReplFactor = replFact.first;
} else {
auto replFact2 = collection.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
}
}
VPackBuilder onlyFollowers;
{
VPackArrayBuilder guard(&onlyFollowers);
bool first = true;
for (auto const& pp : VPackArrayIterator(planned)) {
if (!first) {
onlyFollowers.add(pp);
}
first = false;
}
}
size_t actualReplFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
// Leader plus good followers in plan
if (actualReplFactor >= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have enough replicas");
return true;

@@ -245,7 +270,7 @@ bool AddFollower::start() {
// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &chosen](Slice plan, Slice current, std::string& planPath) {
[&trx, &chosen](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
trx.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&trx);

@@ -42,8 +42,8 @@ struct AddFollower : public Job {
virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
std::string _database;
@@ -903,6 +903,37 @@ AgencyCommResult AgencyComm::getValues(std::string const& key) {
return result;
}
AgencyCommResult AgencyComm::dump() {
std::string url = AgencyComm::AGENCY_URL_PREFIX + "/state";
AgencyCommResult result =
sendWithFailover(
arangodb::rest::RequestType::GET,
AgencyCommManager::CONNECTION_OPTIONS._requestTimeout,
url, VPackSlice::noneSlice());
if (!result.successful()) {
return result;
}
try {
result.setVPack(VPackParser::fromJson(result.bodyRef()));
result._body.clear();
result._statusCode = 200;
} catch (std::exception const& e) {
LOG_TOPIC(ERR, Logger::AGENCYCOMM) << "Error transforming result: " << e.what();
result.clear();
} catch (...) {
LOG_TOPIC(ERR, Logger::AGENCYCOMM)
<< "Error transforming result: out of memory";
result.clear();
}
return result;
}
AgencyCommResult AgencyComm::removeValues(std::string const& key, bool recursive) {
AgencyWriteTransaction transaction(AgencyOperation(key, AgencySimpleOperationType::DELETE_OP));

@@ -631,6 +631,8 @@ class AgencyComm {
std::string version();
AgencyCommResult dump();
bool increaseVersion(std::string const& key) {
AgencyCommResult result = increment(key);
return result.successful();
@@ -252,6 +252,18 @@ bool Agent::isCommitted(index_t index) {
}
}
index_t Agent::index() {
if (challengeLeadership()) {
resign();
return 0;
}
MUTEX_LOCKER(tiLocker, _tiLock);
return _confirmed[id()];
}
// AgentCallback reports id of follower and its highest processed index
void Agent::reportIn(std::string const& peerId, index_t index, size_t toLog) {
auto startTime = steady_clock::now();

@@ -1584,6 +1596,29 @@ arangodb::consensus::index_t Agent::readDB(Node& node) const {
return _commitIndex;
}
/// Get readdb
arangodb::consensus::index_t Agent::readDB(VPackBuilder& builder) const {
TRI_ASSERT(builder.isOpenObject());
uint64_t commitIndex = 0;
{ READ_LOCKER(oLocker, _outputLock);
commitIndex = _commitIndex;
// commit index
builder.add("index", VPackValue(commitIndex));
builder.add("term", VPackValue(term()));
// key-value store {}
builder.add(VPackValue("agency"));
_readDB.get().toBuilder(builder, true); }
// replicated log []
_state.toVelocyPack(commitIndex, builder);
return commitIndex;
}
void Agent::executeLockedRead(std::function<void()> const& cb) {
_tiLock.assertNotLockedByCurrentThread();
MUTEX_LOCKER(ioLocker, _ioLock);

@@ -177,6 +177,9 @@ class Agent final : public arangodb::Thread, public AgentInterface {
/// @brief Gossip in
bool activeAgency();
/// @brief Get the index at which the leader is
index_t index();
/// @brief Start orderly shutdown of threads
void beginShutdown() override final;

@@ -218,6 +221,9 @@ class Agent final : public arangodb::Thread, public AgentInterface {
/// @brief Get read store and compaction index
index_t readDB(Node&) const;
/// @brief Get read store and compaction index
index_t readDB(VPackBuilder&) const;
/// @brief Get read store
/// WARNING: this assumes caller holds appropriate
/// locks or will use executeLockedRead() or
@@ -58,7 +58,7 @@ CleanOutServer::CleanOutServer(Node const& snapshot, AgentInterface* agent,
CleanOutServer::~CleanOutServer() {}
void CleanOutServer::run() { runHelper(_server, ""); }
void CleanOutServer::run(bool& aborts) { runHelper(_server, "", aborts); }
JOB_STATUS CleanOutServer::status() {
if (_status != PENDING) {

@@ -121,7 +121,7 @@ JOB_STATUS CleanOutServer::status() {
reportTrx.add("op", VPackValue("push"));
reportTrx.add("new", VPackValue(_server));
}
reportTrx.add(VPackValue("/Target/ToBeCleanedServers"));
reportTrx.add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder guard4(&reportTrx);
reportTrx.add("op", VPackValue("erase"));

@@ -196,7 +196,7 @@ bool CleanOutServer::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool CleanOutServer::start() {
bool CleanOutServer::start(bool& aborts) {
// If anything throws here, the run() method catches it and finishes
// the job.

@@ -318,7 +318,7 @@ bool CleanOutServer::start() {
addBlockServer(*pending, _server, _jobId);
// Put ourselves in list of servers to be cleaned:
pending->add(VPackValue("/Target/ToBeCleanedServers"));
pending->add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder guard4(pending.get());
pending->add("op", VPackValue("push"));

@@ -388,34 +388,58 @@ bool CleanOutServer::scheduleMoveShards(std::shared_ptr<Builder>& trx) {
continue;
}
decltype(servers) serversCopy(servers); // a copy
auto replicationFactor = collection.hasAsString("replicationFactor");
bool isSatellite = replicationFactor.second && replicationFactor.first == "satellite";
// Only destinations, which are not already holding this shard
for (auto const& dbserver : VPackArrayIterator(shard.second->slice())) {
serversCopy.erase(std::remove(serversCopy.begin(), serversCopy.end(),
dbserver.copyString()),
serversCopy.end());
}
bool isLeader = (found == 0);
// Among those a random destination:
std::string toServer;
if (serversCopy.empty()) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "No servers remain as target for MoveShard";
return false;
if (isSatellite) {
if (isLeader) {
std::string toServer = Job::findNonblockedCommonHealthyInSyncFollower(
_snapshot, database.first, collptr.first, shard.first);
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
} else {
// Intentionally do nothing. RemoveServer will remove the failed follower
LOG_TOPIC(DEBUG, Logger::SUPERVISION) <<
"Do nothing for cleanout of follower of the satellite collection " << collection.hasAsString("id").first;
continue ;
}
} else {
decltype(servers) serversCopy(servers); // a copy
// Only destinations, which are not already holding this shard
for (auto const& dbserver : VPackArrayIterator(shard.second->slice())) {
serversCopy.erase(std::remove(serversCopy.begin(), serversCopy.end(),
dbserver.copyString()),
serversCopy.end());
}
// Among those a random destination:
std::string toServer;
if (serversCopy.empty()) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "No servers remain as target for MoveShard";
return false;
}
toServer = serversCopy.at(
arangodb::RandomGenerator::interval(static_cast<int64_t>(0),
serversCopy.size() - 1));
// Schedule move into trx:
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
}
toServer = serversCopy.at(
arangodb::RandomGenerator::interval(static_cast<int64_t>(0),
serversCopy.size() - 1));
// Schedule move into trx:
MoveShard(_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server,
toServer, isLeader, false)
.create(trx);
}
}
}

@@ -499,17 +523,28 @@ arangodb::Result CleanOutServer::abort() {
Node::Children const pends = _snapshot.hasAsChildren(pendingPrefix).first;
for (auto const& subJob : todos) {
if (!subJob.first.compare(0, _jobId.size() + 1, _jobId + "-")) {
if (subJob.first.compare(0, _jobId.size() + 1, _jobId + "-") == 0) {
JobContext(TODO, subJob.first, _snapshot, _agent).abort();
}
}
for (auto const& subJob : pends) {
if (!subJob.first.compare(0, _jobId.size() + 1, _jobId + "-")) {
if (subJob.first.compare(0, _jobId.size() + 1, _jobId + "-") == 0) {
JobContext(PENDING, subJob.first, _snapshot, _agent).abort();
}
}
finish(_server, "", false, "job aborted");
auto payload = std::make_shared<VPackBuilder>();
{
VPackObjectBuilder p(payload.get());
payload->add(VPackValue(toBeCleanedPrefix));
{
VPackObjectBuilder pp(payload.get());
payload->add("op", VPackValue("erase"));
payload->add("val", VPackValue(_server));
}
}
finish(_server, "", false, "job aborted", payload);
return result;
}

@@ -42,8 +42,8 @@ struct CleanOutServer : public Job {
virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;
// Check if all shards' replication factors can be satisfied after clean out.
@@ -78,7 +78,7 @@ FailedFollower::FailedFollower(Node const& snapshot, AgentInterface* agent,
FailedFollower::~FailedFollower() {}
void FailedFollower::run() { runHelper("", _shard); }
void FailedFollower::run(bool& aborts) { runHelper("", _shard, aborts); }
bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
using namespace std::chrono;

@@ -86,7 +86,7 @@ bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
<< "Create failedFollower for " + _shard + " from " + _from;
_created = system_clock::now();
if (envelope == nullptr) {
_jb = std::make_shared<Builder>();
_jb->openArray();

@@ -117,10 +117,10 @@ bool FailedFollower::create(std::shared_ptr<VPackBuilder> envelope) {
}
return true;
}
bool FailedFollower::start() {
bool FailedFollower::start(bool& aborts) {
using namespace std::chrono;
std::vector<std::string> existing =

@@ -141,15 +141,33 @@ bool FailedFollower::start() {
// Planned servers vector
std::string planPath =
planColPrefix + _database + "/" + _collection + "/shards/" + _shard;
auto plannedPair = _snapshot.hasAsSlice(planPath); // if missing, what?
auto plannedPair = _snapshot.hasAsSlice(planPath);
Slice const& planned = plannedPair.first;
if (!plannedPair.second) {
// not clear what servers should or should not get failover ... retry later
finish("", _shard, true,
"Plan entry for collection " + _collection + " gone");
return false;
}
// Now check if _server is still in this plan, note that it could have
// been removed by RemoveFollower already, in which case we simply stop:
bool found = false;
if (planned.isArray()) {
for (auto const& s : VPackArrayIterator(planned)) {
if (s.isString() && _from == s.copyString()) {
found = true;
break;
}
}
}
if (!found) {
finish("", _shard, true, "Server no longer found in Plan for collection " +
_collection + ", our job is done.");
return false;
}
// Get proper replacement
_to = randomIdleGoodAvailableServer(_snapshot, planned);
_to = randomIdleAvailableServer(_snapshot, planned);
if (_to.empty()) {
// retry later
return false;

@@ -255,8 +273,9 @@ bool FailedFollower::start() {
return false;
} else if (jobId.second) {
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
} // if
}
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "FailedFollower start transaction: " << job.toJson();

@@ -44,8 +44,8 @@ struct FailedFollower : public Job {
virtual ~FailedFollower();
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool&) override final;
virtual bool start(bool&) override final;
virtual JOB_STATUS status() override final;
virtual Result abort() override final;
@@ -81,7 +81,7 @@ FailedLeader::FailedLeader(Node const& snapshot, AgentInterface* agent,
FailedLeader::~FailedLeader() {}
void FailedLeader::run() { runHelper("", _shard); }
void FailedLeader::run(bool& aborts) { runHelper("", _shard, aborts); }
void FailedLeader::rollback() {
// Create new plan servers (exchange _to and _from)

@@ -161,10 +161,10 @@ bool FailedLeader::create(std::shared_ptr<VPackBuilder> b) {
}
return true;
}
bool FailedLeader::start() {
bool FailedLeader::start(bool& aborts) {
std::vector<std::string> existing =
_snapshot.exists(planColPrefix + _database + "/" + _collection + "/" +
"distributeShardsLike");

@@ -232,7 +232,7 @@ bool FailedLeader::start() {
}
// Additional follower, if applicable
auto additionalFollower = randomIdleGoodAvailableServer(_snapshot, planned);
auto additionalFollower = randomIdleAvailableServer(_snapshot, planned);
if (!additionalFollower.empty()) {
planv.push_back(additionalFollower);
}

@@ -309,7 +309,9 @@ bool FailedLeader::start() {
if (jobId.second && !abortable(_snapshot, jobId.first)) {
return false;
} else if (jobId.second) {
aborts = true;
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
}

@@ -44,9 +44,9 @@ struct FailedLeader : public Job {
virtual ~FailedLeader();
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual Result abort() override final;
void rollback();
@@ -58,9 +58,9 @@ FailedServer::FailedServer(Node const& snapshot, AgentInterface* agent,
FailedServer::~FailedServer() {}
void FailedServer::run() { runHelper(_server, ""); }
void FailedServer::run(bool& aborts) { runHelper(_server, "", aborts); }
bool FailedServer::start() {
bool FailedServer::start(bool& aborts) {
using namespace std::chrono;
// Fail job, if Health back to not FAILED

@@ -71,6 +71,12 @@ bool FailedServer::start() {
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason.str();
finish(_server, "", false, reason.str());
return false;
} else if(!status.second) {
std::stringstream reason;
reason << "Server " << _server << " no longer in health. Already removed. Abort.";
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason.str();
finish(_server, "", false, reason.str()); // Finish or abort?
return false;
}
// Abort job blocking server if abortable

@@ -78,7 +84,9 @@ bool FailedServer::start() {
if (jobId.second && !abortable(_snapshot, jobId.first)) {
return false;
} else if (jobId.second) {
aborts = true;
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
return false;
}
// Todo entry

@@ -90,8 +98,8 @@ bool FailedServer::start() {
if (toDoJob.second) {
toDoJob.first.toBuilder(todo);
} else {
LOG_TOPIC(INFO, Logger::SUPERVISION)
<< "Failed to get key " + toDoPrefix + _jobId + " from agency snapshot";
LOG_TOPIC(INFO, Logger::SUPERVISION) << "Failed to get key " + toDoPrefix + _jobId +
" from agency snapshot";
return false;
}
} else {

@@ -99,7 +107,6 @@ bool FailedServer::start() {
}
} // Todo entry
// Pending entry
auto transactions = std::make_shared<VPackBuilder>();
{
VPackArrayBuilder a(transactions.get());

@@ -108,10 +115,10 @@ bool FailedServer::start() {
{
VPackObjectBuilder oper(transactions.get());
// Add pending
auto const& databases = _snapshot.hasAsChildren("/Plan/Collections").first;
// auto const& current = _snapshot.hasAsChildren("/Current/Collections").first;
size_t sub = 0;
// FIXME: looks OK, but only the non-clone shards are put into the job

@@ -124,19 +131,27 @@ bool FailedServer::start() {
auto const& replicationFactorPair =
collection.hasAsNode("replicationFactor");
if (replicationFactorPair.second) {
VPackSlice const replicationFactor = replicationFactorPair.first.slice();
if (!replicationFactor.isNumber()) {
continue; // no point to try salvaging unreplicated data
}
uint64_t number = 1;
try {
number = replicationFactor.getNumber<uint64_t>();
} catch(...) {
}
if (number == 1) {
continue;
bool isSatellite = false;
if (replicationFactor.isString() && replicationFactor.compareString("satellite") == 0) {
isSatellite = true; // do nothing - number = Job::availableServers(_snapshot).size();
} else if (replicationFactor.isNumber()) {
try {
number = replicationFactor.getNumber<uint64_t>();
} catch(...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) << "Failed to read replicationFactor. job: "
<< _jobId << " " << collection.hasAsString("id").first;
continue ;
}
if (number == 1) {
continue ;
}
} else {
continue; // no point to try salvaging unreplicated data
}
if (collection.has("distributeShardsLike")) {

@@ -152,14 +167,19 @@ bool FailedServer::start() {
if (dbs == _server) {
if (pos == 0) {
FailedLeader(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
} else {
FailedFollower(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
if (!isSatellite) {
FailedFollower(
_snapshot, _agent, _jobId + "-" + std::to_string(sub++),
_jobId, database.first, collptr.first, shard.first, _server)
.create(transactions);
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION) << "Do intentionally nothing for failed follower of satellite collection. job: "
<< _jobId;
}
}
}
pos++;

@@ -172,7 +192,8 @@ bool FailedServer::start() {
transactions->add(VPackValue(pendingPrefix + _jobId));
{
VPackObjectBuilder ts(transactions.get());
transactions->add("timeStarted", VPackValue(timepointToString(system_clock::now())));
transactions->add("timeStarted",
VPackValue(timepointToString(system_clock::now())));
for (auto const& obj : VPackObjectIterator(todo.slice()[0])) {
transactions->add(obj.key.copyString(), obj.value);
}

@@ -40,10 +40,10 @@ struct FailedServer : public Job {
virtual ~FailedServer();
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> b = nullptr) override final;
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual Result abort() override final;
std::string _server;
@@ -38,6 +38,7 @@ std::string const failedPrefix = "/Target/Failed/";
std::string const finishedPrefix = "/Target/Finished/";
std::string const toDoPrefix = "/Target/ToDo/";
std::string const cleanedPrefix = "/Target/CleanedServers";
std::string const toBeCleanedPrefix = "/Target/ToBeCleanedServers";
std::string const failedServersPrefix = "/Target/FailedServers";
std::string const planColPrefix = "/Plan/Collections/";
std::string const curColPrefix = "/Current/Collections/";

@@ -96,37 +97,68 @@ bool Job::finish(std::string const& server, std::string const& shard,
LOG_TOPIC(WARN, Logger::AGENCY) << "Failed to obtain type of job " << _jobId;
}
// Prepare pending entry, block toserver
{
VPackArrayBuilder guard(&finished);
VPackObjectBuilder guard2(&finished);
addPutJobIntoSomewhere(finished, success ? "Finished" : "Failed",
pending.slice()[0], reason);
addRemoveJobFromSomewhere(finished, "ToDo", _jobId);
addRemoveJobFromSomewhere(finished, "Pending", _jobId);
// Additional payload, which is to be executed in the finish transaction
if (payload != nullptr) {
Slice slice = payload->slice();
TRI_ASSERT(slice.isObject());
// Additional payload, which is to be executed in the finish transaction
Slice operations = Slice::emptyObjectSlice();
Slice preconditions = Slice::emptyObjectSlice();
if (payload != nullptr) {
Slice slice = payload->slice();
TRI_ASSERT(slice.isObject() || slice.isArray());
if (slice.isObject()) { // opers only
operations = slice;
TRI_ASSERT(operations.isObject());
} else {
TRI_ASSERT(slice.length() < 3); // opers + precs only
if (slice.length() > 0) {
for (auto const& oper : VPackObjectIterator(slice)) {
finished.add(oper.key.copyString(), oper.value);
operations = slice[0];
TRI_ASSERT(operations.isObject());
if (slice.length() > 1) {
preconditions = slice[1];
TRI_ASSERT(preconditions.isObject());
}
}
}
}
// --- Remove blocks if specified:
if (started && !server.empty()) {
addReleaseServer(finished, server);
}
if (started && !shard.empty()) {
addReleaseShard(finished, shard);
}
// Prepare pending entry, block toserver
{
VPackArrayBuilder guard(&finished);
} // close object and array
{ // operations --
VPackObjectBuilder operguard(&finished);
addPutJobIntoSomewhere(finished, success ? "Finished" : "Failed",
pending.slice()[0], reason);
addRemoveJobFromSomewhere(finished, "ToDo", _jobId);
addRemoveJobFromSomewhere(finished, "Pending", _jobId);
if (operations.length() > 0) {
for (auto const& oper : VPackObjectIterator(operations)) {
finished.add(oper.key.copyString(), oper.value);
}
}
// --- Remove blocks if specified:
if (started && !server.empty()) {
addReleaseServer(finished, server);
}
if (started && !shard.empty()) {
addReleaseShard(finished, shard);
}
} // -- operations
if (preconditions != Slice::emptyObjectSlice()) { // preconditions --
VPackObjectBuilder precguard(&finished);
if (preconditions.length() > 0) {
for (auto const& prec : VPackObjectIterator(preconditions)) {
finished.add(prec.key.copyString(), prec.value);
}
}
} // -- preconditions
}
write_ret_t res = singleWriteTransaction(_agent, finished);
if (res.accepted && res.indices.size() == 1 && res.indices[0]) {
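Sketch only (not part of the commit): how a supervision job can hand both operations and preconditions to Job::finish() through the optional payload builder, mirroring the object-or-array handling added above. The agency key and value below are hypothetical.

    auto payload = std::make_shared<VPackBuilder>();
    {
      VPackArrayBuilder both(payload.get());      // [ operations, preconditions ]
      {
        VPackObjectBuilder ops(payload.get());    // slice[0]: merged into the finish transaction
        payload->add("/Target/SomeKey", VPackValue("someValue"));  // hypothetical operation
      }
      {
        VPackObjectBuilder precs(payload.get());  // slice[1]: added as preconditions
        payload->add(VPackValue(toDoPrefix + "1234-0"));           // hypothetical job id
        {
          VPackObjectBuilder guard(payload.get());
          payload->add("oldEmpty", VPackValue(false));
        }
      }
    }
    // finish(_server, "", false, "job aborted", payload);  // as CleanOutServer::abort / MoveShard::abort do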
@@ -139,53 +171,50 @@ bool Job::finish(std::string const& server, std::string const& shard,
return false;
}
std::string Job::randomIdleGoodAvailableServer(Node const& snap,
std::string Job::randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude) {
std::vector<std::string> as = availableServers(snap);
std::string ret;
auto ex(exclude);
// ungood;
// Prefer good servers over bad servers
std::vector<std::string> good;
// Only take good servers as valid server.
try {
for (auto const& srv : snap.hasAsChildren(healthPrefix).first) {
if ((*srv.second).hasAsString("Status").first != "GOOD") {
ex.push_back(srv.first);
// ignore excluded servers
if (std::find(std::begin(exclude), std::end(exclude), srv.first) != std::end(exclude)) {
continue ;
}
// ignore servers not in availableServers above:
if (std::find(std::begin(as), std::end(as), srv.first) == std::end(as)) {
continue ;
}
std::string const& status = (*srv.second).hasAsString("Status").first;
if (status == "GOOD") {
good.push_back(srv.first);
}
}
} catch (...) {
}
// blocked;
try {
for (auto const& srv : snap.hasAsChildren(blockedServersPrefix).first) {
ex.push_back(srv.first);
}
} catch (...) {
if (good.empty()) {
return ret;
}
// Remove excluded servers
std::sort(std::begin(ex), std::end(ex));
as.erase(std::remove_if(std::begin(as), std::end(as),
[&](std::string const& s) {
return std::binary_search(std::begin(ex), std::end(ex), s);
}),
std::end(as));
// Choose random server from rest
if (!as.empty()) {
if (as.size() == 1) {
ret = as[0];
} else {
uint16_t interval = static_cast<uint16_t>(as.size() - 1);
uint16_t random = RandomGenerator::interval(interval);
ret = as.at(random);
}
if (good.size() == 1) {
ret = good[0];
return ret;
}
uint16_t interval = static_cast<uint16_t>(good.size() - 1);
uint16_t random = RandomGenerator::interval(interval);
ret = good.at(random);
return ret;
}
std::string Job::randomIdleGoodAvailableServer(Node const& snap, Slice const& exclude) {
std::string Job::randomIdleAvailableServer(Node const& snap, Slice const& exclude) {
std::vector<std::string> ev;
if (exclude.isArray()) {
for (const auto& s : VPackArrayIterator(exclude)) {

@@ -194,10 +223,96 @@ std::string Job::randomIdleGoodAvailableServer(Node const& snap, Slice const& ex
}
}
}
return randomIdleGoodAvailableServer(snap, ev);
return randomIdleAvailableServer(snap, ev);
}
/// @brief Get servers from plan, which are not failed or cleaned out
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, VPackSlice const& serverList) {
size_t count = 0;
if (!serverList.isArray()) {
// No array, strange, return 0
return count;
}
auto health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure?
if (health.second) {
Node::Children& healthData = health.first; // List of servers in Health
for (VPackSlice const serverName : VPackArrayIterator(serverList)) {
if (serverName.isString()) {
// serverName not a string? Then don't count
std::string serverStr = serverName.copyString();
// Now look up this server:
auto it = healthData.find(serverStr);
if (it != healthData.end()) {
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
++count;
}
}
}
}
}
return count;
}
// The following counts in a given server list how many of the servers are
// in Status "GOOD".
size_t Job::countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList) {
size_t count = 0;
auto health = snap.hasAsChildren(healthPrefix);
// Do we have a Health substructure?
if (health.second) {
Node::Children& healthData = health.first; // List of servers in Health
for (auto& serverStr : serverList) {
// Now look up this server:
auto it = healthData.find(serverStr);
if (it != healthData.end()) {
// Only check if found
std::shared_ptr<Node> healthNode = it->second;
// Check its status:
if (healthNode->hasAsString("Status").first == "GOOD") {
++count;
}
}
}
}
return count;
}
/// @brief Check if a server is cleaned or to be cleaned out:
bool Job::isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray) {
VPackSlice slice;
bool has;
bool found = false;
if (isArray) {
std::tie(slice, has) = snap.hasAsSlice(prefix);
if (has && slice.isArray()) {
for (auto const& srv : VPackArrayIterator(slice)) {
if (srv.copyString() == server) {
found = true;
break;
}
}
}
} else { // an object
Node::Children children;
std::tie(children, has) = snap.hasAsChildren(prefix);
if (has) {
for (auto const& srv : children) {
if (srv.first == server) {
found = true;
break;
}
}
}
}
return found;
}
/// @brief Get servers from plan, which are not failed or (to be) cleaned out
std::vector<std::string> Job::availableServers(Node const& snapshot) {
std::vector<std::string> ret;

@@ -207,22 +322,31 @@ std::vector<std::string> Job::availableServers(Node const& snapshot) {
ret.push_back(srv.first);
}
// Remove cleaned servers from list (test first to avoid warning log
if (snapshot.has(cleanedPrefix)) try {
for (auto const& srv :
VPackArrayIterator(snapshot.hasAsSlice(cleanedPrefix).first)) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.copyString()), ret.end());
}
} catch (...) {
}
auto excludePrefix = [&ret, &snapshot](std::string const& prefix, bool isArray) {
// Remove failed servers from list (test first to avoid warning log)
if (snapshot.has(failedServersPrefix)) try {
for (auto const& srv : snapshot.hasAsChildren(failedServersPrefix).first) {
bool has;
VPackSlice slice;
Node::Children children;
if (isArray) {
std::tie(slice, has) = snapshot.hasAsSlice(prefix);
if (has) {
for (auto const& srv : VPackArrayIterator(slice)) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.copyString()), ret.end());
}
}
} else {
std::tie(children, has) = snapshot.hasAsChildren(prefix);
for (auto const& srv : children) {
ret.erase(std::remove(ret.begin(), ret.end(), srv.first), ret.end());
}
} catch (...) {
}
};
// Remove (to be) cleaned and failed servers from the list
excludePrefix(cleanedPrefix, true);
excludePrefix(failedServersPrefix, false);
excludePrefix(toBeCleanedPrefix, true);
return ret;
}

@@ -419,7 +543,7 @@ bool Job::abortable(Node const& snapshot, std::string const& jobId) {
void Job::doForAllShards(Node const& snapshot, std::string& database,
std::vector<shard_t>& shards,
std::function<void(Slice plan, Slice current, std::string& planPath)> worker) {
std::function<void(Slice plan, Slice current, std::string& planPath, std::string& curPath)> worker) {
for (auto const& collShard : shards) {
std::string shard = collShard.shard;
std::string collection = collShard.collection;

@@ -432,7 +556,7 @@ void Job::doForAllShards(Node const& snapshot, std::string& database,
Slice plan = snapshot.hasAsSlice(planPath).first;
Slice current = snapshot.hasAsSlice(curPath).first;
worker(plan, current, planPath);
worker(plan, current, planPath, curPath);
}
}
@@ -48,6 +48,7 @@ extern std::string const failedPrefix;
extern std::string const finishedPrefix;
extern std::string const toDoPrefix;
extern std::string const cleanedPrefix;
extern std::string const toBeCleanedPrefix;
extern std::string const failedServersPrefix;
extern std::string const planColPrefix;
extern std::string const curColPrefix;

@@ -72,9 +73,9 @@ struct Job {
virtual ~Job();
virtual void run() = 0;
virtual void run(bool& aborts) = 0;
void runHelper(std::string const& server, std::string const& shard) {
void runHelper(std::string const& server, std::string const& shard, bool& aborts) {
if (_status == FAILED) { // happens when the constructor did not work
return;
}

@@ -87,10 +88,10 @@ struct Job {
}
try {
if (_status == TODO) {
start();
start(aborts);
} else if (_status == NOTFOUND) {
if (create(nullptr)) {
start();
start(aborts);
}
}
} catch (std::exception const& e) {

@@ -112,7 +113,7 @@ struct Job {
virtual bool create(std::shared_ptr<VPackBuilder> b) = 0;
// Returns if job was actually started (i.e. false if directly failed!)
virtual bool start() = 0;
virtual bool start(bool& aborts) = 0;
static bool abortable(Node const& snapshot, std::string const& jobId);

@@ -121,9 +122,12 @@ struct Job {
/// @brief Get a random server, which is not blocked, in good condition and
/// excluding "exclude" vector
static std::string randomIdleGoodAvailableServer(Node const& snap,
static std::string randomIdleAvailableServer(Node const& snap,
std::vector<std::string> const& exclude);
static std::string randomIdleGoodAvailableServer(Node const& snap, VPackSlice const& exclude);
static std::string randomIdleAvailableServer(Node const& snap, VPackSlice const& exclude);
static size_t countGoodServersInList(Node const& snap, VPackSlice const& serverList);
static size_t countGoodServersInList(Node const& snap, std::vector<std::string> const& serverList);
static bool isInServerList(Node const& snap, std::string const& prefix, std::string const& server, bool isArray);
/// @brief Get servers from plan, which are not failed or cleaned out
static std::vector<std::string> availableServers(const arangodb::consensus::Node&);

@@ -151,7 +155,7 @@ struct Job {
static void doForAllShards(
Node const& snapshot, std::string& database, std::vector<shard_t>& shards,
std::function<void(Slice plan, Slice current, std::string& planPath)> worker);
std::function<void(Slice plan, Slice current, std::string& planPath, std::string& curPath)> worker);
// The following methods adds an operation to a transaction object or
// a condition to a precondition object. In all cases, the builder trx
@@ -73,15 +73,15 @@ void JobContext::create(std::shared_ptr<VPackBuilder> b) {
}
}
void JobContext::start() {
void JobContext::start(bool& aborts) {
if (_job != nullptr) {
_job->start();
_job->start(aborts);
}
}
void JobContext::run() {
void JobContext::run(bool& aborts) {
if (_job != nullptr) {
_job->run();
_job->run(aborts);
}
}

@@ -44,10 +44,10 @@ class JobContext {
void create(std::shared_ptr<VPackBuilder> b = nullptr);
/// @brief Start job
void start();
void start(bool& aborts);
/// @brief Run job
void run();
void run(bool& aborts);
/// @brief Abort job
void abort();
@@ -93,7 +93,7 @@ MoveShard::MoveShard(Node const& snapshot, AgentInterface* agent,
MoveShard::~MoveShard() {}
void MoveShard::run() { runHelper(_to, _shard); }
void MoveShard::run(bool& aborts) { runHelper(_to, _shard, aborts); }
bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)

@@ -167,7 +167,7 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}
bool MoveShard::start() {
bool MoveShard::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.

@@ -265,11 +265,16 @@ bool MoveShard::start() {
int found = -1;
int count = 0;
_toServerIsFollower = false;
for (auto const& srv : VPackArrayIterator(planned)) {
TRI_ASSERT(srv.isString());
if (srv.copyString() == _to) {
finish("", "", false, "toServer must not yet be planned for shard");
return false;
if (!_isLeader) {
finish("", "", false, "toServer must not be planned for a following shard");
return false;
} else {
_toServerIsFollower = true;
}
}
if (srv.copyString() == _from) {
found = count;

@@ -340,17 +345,18 @@ bool MoveShard::start() {
// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &pending](Slice plan, Slice current, std::string& planPath) {
[this, &pending](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
pending.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&pending);
if (_isLeader) {
TRI_ASSERT(plan[0].copyString() != _to);
pending.add(plan[0]);
pending.add(VPackValue(_to));
if (!_toServerIsFollower) {
pending.add(VPackValue(_to));
}
for (size_t i = 1; i < plan.length(); ++i) {
pending.add(plan[i]);
TRI_ASSERT(plan[i].copyString() != _to);
}
} else {
for (auto const& srv : VPackArrayIterator(plan)) {

@@ -445,7 +451,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// Still the old leader, let's check that the toServer is insync:
size_t done = 0; // count the number of shards for which _to is in sync:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
for (auto const& s : VPackArrayIterator(current)) {
if (s.copyString() == _to) {
++done;

@@ -469,7 +475,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Replace _from by "_" + _from
trx.add(VPackValue(planPath));
{

@@ -500,7 +506,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// Retired old leader, let's check that the fromServer has retired:
size_t done = 0; // count the number of shards for which leader has retired
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (current.length() > 0 && current[0].copyString() == "_" + _from) {
++done;
}

@@ -521,7 +527,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &pre](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Replace "_" + _from by _to and leave _from out:
trx.add(VPackValue(planPath));
{

@@ -556,7 +562,7 @@ JOB_STATUS MoveShard::pendingLeader() {
// all but except the old leader are in sync:
size_t done = 0;
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &done](Slice plan, Slice current, std::string& planPath) {
[this, &done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (current.length() > 0 && current[0].copyString() == _to) {
if (plan.length() < 3) {
// This only happens for replicationFactor == 1, in

@@ -571,7 +577,7 @@ JOB_STATUS MoveShard::pendingLeader() {
for (size_t i = 1; i < plan.length() - 1; ++i) {
VPackSlice p = plan[i];
for (auto const& c : VPackArrayIterator(current)) {
if (arangodb::basics::VelocyPackHelper::compare(p, c, true)) {
if (arangodb::basics::VelocyPackHelper::compare(p, c, true) == 0) {
++found;
break;
}

@@ -599,7 +605,7 @@ JOB_STATUS MoveShard::pendingLeader() {
VPackObjectBuilder trxObject(&trx);
VPackObjectBuilder preObject(&pre);
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &pre, this](Slice plan, Slice current, std::string& planPath) {
[&trx, &pre, this](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (!_remainsFollower) {
// Remove _from from the list of follower
trx.add(VPackValue(planPath));

@@ -663,7 +669,7 @@ JOB_STATUS MoveShard::pendingFollower() {
size_t done = 0; // count the number of shards done
doForAllShards(_snapshot, _database, shardsLikeMe,
[&done](Slice plan, Slice current, std::string& planPath) {
[&done](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
if (ClusterHelpers::compareServerLists(plan, current)) {
++done;
}

@@ -695,7 +701,7 @@ JOB_STATUS MoveShard::pendingFollower() {
// All changes to Plan for all shards, with precondition:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx, &precondition](Slice plan, Slice current, std::string& planPath) {
[this, &trx, &precondition](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Remove fromServer from Plan:
trx.add(VPackValue(planPath));
{
@@ -748,15 +754,51 @@ arangodb::Result MoveShard::abort() {
return result;
}
// Can now only be TODO or PENDING
// Can now only be TODO or PENDING.
if (_status == TODO) {
finish("", "", true, "job aborted");
return result;
// Do NOT remove, just cause it seems obvious!
// We're working off a snapshot.
// Make sure ToDo is still actually to be done
auto todoPrec = std::make_shared<Builder>();
{ VPackArrayBuilder b(todoPrec.get());
{ VPackObjectBuilder o(todoPrec.get()); } // nothing to declare
{ VPackObjectBuilder path(todoPrec.get()); // expect jobs still to be sitting in ToDo
todoPrec->add(VPackValue(toDoPrefix + _jobId));
{ VPackObjectBuilder guard(todoPrec.get());
todoPrec->add("oldEmpty", VPackValue(false));
}
}
}
if (finish("", "", true, "job aborted", todoPrec)) {
return result;
}
_status = PENDING;
// If the above finish failed, then we must be in PENDING
}
// Can now only be PENDING
// Find the other shards in the same distributeShardsLike group:
std::vector<Job::shard_t> shardsLikeMe =
clones(_snapshot, _database, _collection, _shard);
clones(_snapshot, _database, _collection, _shard);
// We can no longer abort by reverting to where we started, if any of the
// shards of the distributeShardsLike group has already gone to new leader
if (_isLeader) {
for (auto const& i : shardsLikeMe) {
auto const& cur = _snapshot.hasAsArray(
curColPrefix + _database + "/" + i.collection + "/" + i.shard + "/" + "servers");
if (cur.second && cur.first[0].copyString() == _to) {
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
"MoveShard can no longer abort through reversion to where it started. Flight forward";
finish(_to, _shard, true, "job aborted - new leader already in place");
return result;
}
}
}
Builder trx; // to build the transaction
// Now look after a PENDING job:

@@ -767,7 +809,7 @@ arangodb::Result MoveShard::abort() {
if (_isLeader) {
// All changes to Plan for all shards:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx](Slice plan, Slice current, std::string& planPath) {
[this, &trx](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Restore leader to be _from:
trx.add(VPackValue(planPath));
{

@@ -784,7 +826,7 @@ arangodb::Result MoveShard::abort() {
} else {
// All changes to Plan for all shards:
doForAllShards(_snapshot, _database, shardsLikeMe,
[this, &trx](Slice plan, Slice current, std::string& planPath) {
[this, &trx](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Remove toServer from Plan:
trx.add(VPackValue(planPath));
{

@@ -805,8 +847,18 @@ arangodb::Result MoveShard::abort() {
addReleaseServer(trx, _to);
addIncreasePlanVersion(trx);
}
if (_isLeader) { // Precondition, that current is still as in snapshot
VPackObjectBuilder preconditionObj(&trx);
// Current preconditions for all shards
doForAllShards(
_snapshot, _database, shardsLikeMe,
[this, &trx](
Slice plan, Slice current, std::string& planPath, std::string& curPath) {
// Current still as is
trx.add(curPath, current);
});
}
}
write_ret_t res = singleWriteTransaction(_agent, trx);
if (!res.accepted) {

@@ -814,10 +866,16 @@ arangodb::Result MoveShard::abort() {
std::string("Lost leadership"));
return result;
} else if (res.indices[0] == 0) {
if (_isLeader) {
// Tough luck. Things have changed. We'll move on
LOG_TOPIC(INFO, Logger::SUPERVISION) <<
"MoveShard can no longer abort through reversion to where it started. Flight forward";
finish(_to, _shard, true, "job aborted - new leader already in place");
return result;
}
result = Result(
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
std::string("Precondition failed while aborting moveShard job ") + _jobId);
return result;
TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
std::string("Precondition failed while aborting moveShard job ") + _jobId);
}
return result;

@@ -47,9 +47,9 @@ struct MoveShard : public Job {
virtual ~MoveShard();
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual void run(bool&) override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual bool start() override final;
virtual bool start(bool&) override final;
virtual Result abort() override;
JOB_STATUS pendingLeader();
JOB_STATUS pendingFollower();

@@ -61,6 +61,8 @@ struct MoveShard : public Job {
std::string _to;
bool _isLeader;
bool _remainsFollower;
bool _toServerIsFollower;
};
} // namespace consensus
} // namespace arangodb
@@ -65,7 +65,7 @@ RemoveFollower::RemoveFollower(Node const& snapshot, AgentInterface* agent,

RemoveFollower::~RemoveFollower() {}

void RemoveFollower::run() { runHelper("", _shard); }
void RemoveFollower::run(bool& aborts) { runHelper("", _shard, aborts); }

bool RemoveFollower::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)

@@ -122,7 +122,7 @@ bool RemoveFollower::create(std::shared_ptr<VPackBuilder> envelope) {
return false;
}

bool RemoveFollower::start() {
bool RemoveFollower::start(bool&) {
// If anything throws here, the run() method catches it and finishes
// the job.

@@ -149,7 +149,19 @@ bool RemoveFollower::start() {

// First check that we still have too many followers for the current
// `replicationFactor`:
size_t desiredReplFactor = collection.hasAsUInt("replicationFactor").first;
size_t desiredReplFactor = 1;
auto replFact = collection.hasAsUInt("replicationFactor");
if (replFact.second) {
desiredReplFactor = replFact.first;
} else {
auto replFact2 = collection.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
desiredReplFactor = Job::countGoodServersInList(_snapshot, available);
}
}

size_t actualReplFactor = planned.length();
if (actualReplFactor <= desiredReplFactor) {
finish("", "", true, "job no longer necessary, have few enough replicas");

@@ -187,7 +199,7 @@ bool RemoveFollower::start() {
}
doForAllShards(_snapshot, _database, shardsLikeMe,
[&planned, &overview, &leaderBad](Slice plan, Slice current,
std::string& planPath) {
std::string& planPath, std::string& curPath) {
if (current.length() > 0) {
if (current[0].copyString() != planned[0].copyString()) {
leaderBad = true;

@@ -286,13 +298,35 @@ bool RemoveFollower::start() {
if (pair.second >= 0 &&
static_cast<size_t>(pair.second) >= shardsLikeMe.size() &&
pair.first != planned[0].copyString()) {
chosenToRemove.insert(pair.first);
--currentReplFactor;
if (Job::isInServerList(_snapshot, toBeCleanedPrefix, pair.first, true) ||
Job::isInServerList(_snapshot, cleanedPrefix, pair.first, true)) {
// Prefer those cleaned or to be cleaned servers
chosenToRemove.insert(pair.first);
--currentReplFactor;
}
}
if (currentReplFactor == desiredReplFactor) {
break;
}
}
if (currentReplFactor > desiredReplFactor) {
// Now allow those which are perfectly good as well:
for (auto const& it : reversedPlannedServers) {
auto const pair = *overview.find(it);
if (pair.second >= 0 &&
static_cast<size_t>(pair.second) >= shardsLikeMe.size() &&
pair.first != planned[0].copyString()) {
if (!Job::isInServerList(_snapshot, toBeCleanedPrefix, pair.first, true) &&
!Job::isInServerList(_snapshot, cleanedPrefix, pair.first, true)) {
chosenToRemove.insert(pair.first);
--currentReplFactor;
}
}
if (currentReplFactor == desiredReplFactor) {
break;
}
}
}
}
}
}

@@ -347,7 +381,7 @@ bool RemoveFollower::start() {

// --- Plan changes
doForAllShards(_snapshot, _database, shardsLikeMe,
[&trx, &chosenToRemove](Slice plan, Slice current, std::string& planPath) {
[&trx, &chosenToRemove](Slice plan, Slice current, std::string& planPath, std::string& curPath) {
trx.add(VPackValue(planPath));
{
VPackArrayBuilder serverList(&trx);
@@ -42,8 +42,8 @@ struct RemoveFollower : public Job {

virtual JOB_STATUS status() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr) override final;
virtual void run() override final;
virtual bool start() override final;
virtual void run(bool& aborts) override final;
virtual bool start(bool&) override final;
virtual Result abort() override final;

std::string _database;
@@ -560,7 +560,8 @@ RestStatus RestAgencyHandler::handleConfig() {
}

RestStatus RestAgencyHandler::handleState() {
Builder body;

VPackBuilder body;
body.add(VPackValue(VPackValueType::Array));
for (auto const& i : _agent->state().get()) {
body.add(VPackValue(VPackValueType::Object));
@@ -32,10 +32,6 @@
#include <sstream>
#include <thread>

#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>

#include "Agency/Agent.h"
#include "Aql/Query.h"
#include "Aql/QueryRegistry.h"

@@ -1526,3 +1522,78 @@ std::shared_ptr<VPackBuilder> State::latestAgencyState(TRI_vocbase_t& vocbase,
store.dumpToBuilder(*builder);
return builder;
}

/// @brief load a compacted snapshot, returns true if successfull and false
/// otherwise. In case of success store and index are modified. The store
/// is reset to the state after log index `index` has been applied. Sets
/// `index` to 0 if there is no compacted snapshot.
uint64_t State::toVelocyPack(index_t lastIndex, VPackBuilder& builder) const {

TRI_ASSERT(builder.isOpenObject());

auto bindVars = std::make_shared<VPackBuilder>();
{ VPackObjectBuilder b(bindVars.get()); }

std::string const querystr
= "FOR l IN log FILTER l._key <= 'buf" + stringify(lastIndex) +
"' SORT l._key RETURN {'_key': l._key, 'timestamp': l.timestamp,"
"'clientId': l.clientId, 'request': l.request}";

TRI_ASSERT(nullptr != _vocbase); // this check was previously in the Query constructor
arangodb::aql::Query logQuery(false, *_vocbase, aql::QueryString(querystr), bindVars,
nullptr, arangodb::aql::PART_MAIN);

aql::QueryResult logQueryResult = logQuery.executeSync(_queryRegistry);

if (logQueryResult.code != TRI_ERROR_NO_ERROR) {
THROW_ARANGO_EXCEPTION_MESSAGE(logQueryResult.code, logQueryResult.details);
}

VPackSlice result = logQueryResult.result->slice();
std::string firstIndex;
uint64_t n = 0;

builder.add(VPackValue("log"));
if (result.isArray()) {
try {
builder.add(result.resolveExternals());
n = result.length();
if (n > 0) {
firstIndex = result[0].get("_key").copyString();
}
} catch (...) {
VPackArrayBuilder a(&builder);
}
}

if (n > 0) {
std::string const compstr
= "FOR c in compact FILTER c._key >= '" + firstIndex +
"' SORT c._key LIMIT 1 RETURN c";

arangodb::aql::Query compQuery(false, *_vocbase, aql::QueryString(compstr),
bindVars, nullptr, arangodb::aql::PART_MAIN);

aql::QueryResult compQueryResult = compQuery.executeSync(_queryRegistry);

if (compQueryResult.code != TRI_ERROR_NO_ERROR) {
THROW_ARANGO_EXCEPTION_MESSAGE(compQueryResult.code, compQueryResult.details);
}

result = compQueryResult.result->slice();

if (result.isArray()) {
if (result.length() > 0) {
builder.add(VPackValue("compaction"));
try {
builder.add(result[0].resolveExternals());
} catch (...) {
VPackObjectBuilder a(&builder);
}
}
}
}

return n;
}
@@ -81,6 +81,9 @@ class State {
/// Default: [first, last]
std::vector<log_t> get(index_t = 0, index_t = (std::numeric_limits<uint64_t>::max)()) const;

uint64_t toVelocyPack(index_t lastIndex, VPackBuilder& builder) const;

private:
/// @brief Get complete log entries bound by lower and upper bounds.
/// Default: [first, last]
@@ -331,6 +331,7 @@ void handleOnStatusDBServer(Agent* agent, Node const& snapshot,

void handleOnStatusCoordinator(Agent* agent, Node const& snapshot, HealthRecord& persisted,
HealthRecord& transisted, std::string const& serverID) {

if (transisted.status == Supervision::HEALTH_STATUS_FAILED) {
// if the current foxxmaster server failed => reset the value to ""
if (snapshot.hasAsString(foxxmaster).first == serverID) {

@@ -382,6 +383,7 @@ void handleOnStatusSingle(Agent* agent, Node const& snapshot, HealthRecord& pers
}
}

void handleOnStatus(Agent* agent, Node const& snapshot, HealthRecord& persisted,
HealthRecord& transisted, std::string const& serverID,
uint64_t const& jobId, std::shared_ptr<VPackBuilder>& envelope) {

@@ -397,6 +399,7 @@ void handleOnStatus(Agent* agent, Node const& snapshot, HealthRecord& persisted,
}
}

// Build transaction for removing unattended servers from health monitoring
query_t arangodb::consensus::removeTransactionBuilder(std::vector<std::string> const& todelete) {
query_t del = std::make_shared<Builder>();

@@ -785,6 +788,9 @@ void Supervision::run() {
TRI_ASSERT(_agent != nullptr);

while (!this->isStopping()) {

auto lapStart = std::chrono::steady_clock::now();

{
MUTEX_LOCKER(locker, _lock);

@@ -813,6 +819,8 @@ void Supervision::run() {
upgradeAgency();
}

_haveAborts = false;

if (_agent->leaderFor() > 55 || earlyBird()) {
// 55 seconds is less than a minute, which fits to the
// 60 seconds timeout in /_admin/cluster/health

@@ -840,7 +848,30 @@ void Supervision::run() {
}
}
}
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));

// If anything was rafted, we need to
index_t leaderIndex = _agent->index();

if (leaderIndex != 0) {
while (true) { // No point in progressing, if indexes cannot be advanced
auto result = _agent->waitFor(leaderIndex);
if (result == Agent::raft_commit_t::UNKNOWN ||
result == Agent::raft_commit_t::TIMEOUT) { // Oh snap
LOG_TOPIC(WARN, Logger::SUPERVISION) << "Waiting for commits to be done ... ";
continue;
} else { // Good we can continue
break;
}
}
}

auto lapTime = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::steady_clock::now() - lapStart).count();

if (lapTime < 1000000) {
_cv.wait(static_cast<uint64_t>((1000000 - lapTime) * _frequency));
}
}
}

@@ -1069,6 +1100,7 @@ bool Supervision::handleJobs() {
enforceReplication();
cleanupLostCollections(_snapshot, _agent, std::to_string(_jobId++));
readyOrphanedIndexCreations();

workJobs();

return true;

@@ -1078,15 +1110,38 @@ bool Supervision::handleJobs() {
void Supervision::workJobs() {
_lock.assertLockedByCurrentThread();

for (auto const& todoEnt : _snapshot.hasAsChildren(toDoPrefix).first) {
JobContext(TODO, (*todoEnt.second).hasAsString("jobId").first, _snapshot, _agent)
.run();
bool dummy = false;
auto todos = _snapshot.hasAsChildren(toDoPrefix).first;
auto it = todos.begin();
static std::string const FAILED = "failed";

while (it != todos.end()) {
auto jobNode = *(it->second);
if (jobNode.hasAsString("type").first.compare(0, FAILED.length(), FAILED) == 0) {
JobContext(TODO, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(_haveAborts);
it = todos.erase(it);
} else {
++it;
}
}

// Do not start other jobs, if above resilience jobs aborted stuff
if (!_haveAborts) {
for (auto const& todoEnt : todos) {
auto jobNode = *(todoEnt.second);
JobContext(TODO, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(dummy);
}
}

for (auto const& pendEnt : _snapshot.hasAsChildren(pendingPrefix).first) {
JobContext(PENDING, (*pendEnt.second).hasAsString("jobId").first, _snapshot, _agent)
.run();
auto pends = _snapshot.hasAsChildren(pendingPrefix).first;
for (auto const& pendEnt : pends) {
auto jobNode = *(pendEnt.second);
JobContext(PENDING, jobNode.hasAsString("jobId").first, _snapshot, _agent)
.run(dummy);
}

}

void Supervision::readyOrphanedIndexCreations() {

@@ -1205,18 +1260,20 @@ void Supervision::enforceReplication() {
auto const& col = *(col_.second);

size_t replicationFactor;
if (col.hasAsUInt("replicationFactor").second) {
replicationFactor = col.hasAsUInt("replicationFactor").first;
auto replFact = col.hasAsUInt("replicationFactor");
if (replFact.second) {
replicationFactor = replFact.first;
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson();
continue;
}

// mop: satellites => distribute to every server
if (replicationFactor == 0) {
auto available = Job::availableServers(_snapshot);
replicationFactor = available.size();
auto replFact2 = col.hasAsString("replicationFactor");
if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server
auto available = Job::availableServers(_snapshot);
replicationFactor = Job::countGoodServersInList(_snapshot, available);
} else {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "no replicationFactor entry in " << col.toJson();
continue;
}
}

bool clone = col.has("distributeShardsLike");

@@ -1224,9 +1281,24 @@ void Supervision::enforceReplication() {
if (!clone) {
for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards
auto const& shard = *(shard_.second);
VPackBuilder onlyFollowers;
{
VPackArrayBuilder guard(&onlyFollowers);
bool first = true;
for (auto const& pp : VPackArrayIterator(shard.slice())) {
if (!first) {
onlyFollowers.add(pp);
}
first = false;
}
}
size_t actualReplicationFactor
= 1 + Job::countGoodServersInList(_snapshot, onlyFollowers.slice());
// leader plus GOOD followers
size_t apparentReplicationFactor = shard.slice().length();

size_t actualReplicationFactor = shard.slice().length();
if (actualReplicationFactor != replicationFactor) {
if (actualReplicationFactor != replicationFactor ||
apparentReplicationFactor != replicationFactor) {
// Check that there is not yet an addFollower or removeFollower
// or moveShard job in ToDo for this shard:
auto const& todo = _snapshot.hasAsChildren(toDoPrefix).first;

@@ -1256,11 +1328,12 @@ void Supervision::enforceReplication() {
if (actualReplicationFactor < replicationFactor) {
AddFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.run();
} else {
.create();
} else if (apparentReplicationFactor > replicationFactor &&
actualReplicationFactor >= replicationFactor) {
RemoveFollower(_snapshot, _agent, std::to_string(_jobId++),
"supervision", db_.first, col_.first, shard_.first)
.run();
.create();
}
}
}

@@ -1397,9 +1470,10 @@ void Supervision::shrinkCluster() {
std::sort(availServers.begin(), availServers.end());

// Schedule last server for cleanout
bool dummy;
CleanOutServer(_snapshot, _agent, std::to_string(_jobId++),
"supervision", availServers.back())
.run();
.run(dummy);
}
}
}
@@ -200,6 +200,7 @@ class Supervision : public arangodb::CriticalThread {
double _okThreshold;
uint64_t _jobId;
uint64_t _jobIdMax;
bool _haveAborts; /**< @brief We have accumulated pending aborts in a round */

// mop: this feels very hacky...we have a hen and egg problem here
// we are using /Shutdown in the agency to determine that the cluster should
@@ -3770,6 +3770,15 @@ arangodb::Result ClusterInfo::getShardServers(ShardID const& shardId,
return arangodb::Result(TRI_ERROR_FAILED);
}

arangodb::Result ClusterInfo::agencyDump(std::shared_ptr<VPackBuilder> body) {

AgencyCommResult dump = _agency.dump();
body->add(dump.slice());
return Result();

}

// -----------------------------------------------------------------------------
// --SECTION--                                                       END-OF-FILE
// -----------------------------------------------------------------------------
@@ -275,6 +275,8 @@ class ClusterInfo {

uint64_t uniqid(uint64_t = 1);

arangodb::Result agencyDump(std::shared_ptr<VPackBuilder> body);

//////////////////////////////////////////////////////////////////////////////
/// @brief flush the caches (used for testing only)
//////////////////////////////////////////////////////////////////////////////
@@ -254,10 +254,15 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
}
operations.push_back(AgencyOperation("Current/Version",
AgencySimpleOperationType::INCREMENT_OP));
AgencyWriteTransaction currentTransaction(operations);
AgencyPrecondition precondition("Plan/Version",
AgencyPrecondition::Type::VALUE, plan->slice().get("Version"));
AgencyWriteTransaction currentTransaction(operations, precondition);
AgencyCommResult r = comm.sendTransactionWithFailover(currentTransaction);
if (!r.successful()) {
LOG_TOPIC(ERR, Logger::MAINTENANCE) << "Error reporting to agency";
LOG_TOPIC(INFO, Logger::MAINTENANCE)
<< "Error reporting to agency: _statusCode: " << r.errorCode()
<< " message: " << r.errorMessage()
<< ". This can be ignored, since it will be retried automaticlly.";
} else {
LOG_TOPIC(DEBUG, Logger::MAINTENANCE)
<< "Invalidating current in ClusterInfo";
@@ -18,6 +18,7 @@
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
/// @author Kaveh Vahedipour
////////////////////////////////////////////////////////////////////////////////

#include "RestClusterHandler.h"

@@ -25,6 +26,7 @@
#include "Agency/Supervision.h"
#include "Cluster/ClusterInfo.h"
#include "Cluster/ServerState.h"
#include "GeneralServer/AuthenticationFeature.h"
#include "Replication/ReplicationFeature.h"
#include "Rest/HttpRequest.h"
#include "Rest/Version.h"

@@ -47,16 +49,46 @@ RestStatus RestClusterHandler::execute() {
}

std::vector<std::string> const& suffixes = _request->suffixes();
if (!suffixes.empty() && suffixes[0] == "endpoints") {
handleCommandEndpoints();
if (!suffixes.empty()) {
if (suffixes[0] == "endpoints") {
handleCommandEndpoints();
} else if (suffixes[0] == "agency-dump") {
handleAgencyDump();
} else {
generateError(
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/[endpoints,agency-dump]"));
}
} else {
generateError(
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/endpoints"));
Result(TRI_ERROR_FORBIDDEN, "expecting _api/cluster/[endpoints,agency-dump]"));
}

return RestStatus::DONE;
}

void RestClusterHandler::handleAgencyDump() {

AuthenticationFeature* af = AuthenticationFeature::instance();
if (af->isActive() && !_request->user().empty()) {
auth::Level lvl = auth::Level::NONE;
if (af->userManager() != nullptr) {
lvl = af->userManager()->databaseAuthLevel(_request->user(), "_system", true);
} else {
lvl = auth::Level::RW;
}
if (lvl < auth::Level::RW) {
generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN,
"you need admin rights to trigger shutdown");
return;
}
}

std::shared_ptr<VPackBuilder> body = std::make_shared<VPackBuilder>();
ClusterInfo::instance()->agencyDump(body);
generateResult(rest::ResponseCode::OK, body->slice());

}

/// @brief returns information about all coordinator endpoints
void RestClusterHandler::handleCommandEndpoints() {
ClusterInfo* ci = ClusterInfo::instance();
@@ -41,6 +41,9 @@ class RestClusterHandler : public arangodb::RestBaseHandler {

/// _api/cluster/serverInfo
void handleCommandServerInfo();

/// _api/cluster/agency-dump
void handleAgencyDump();
};
} // namespace arangodb
@@ -80,11 +80,14 @@ actions.defineHttp({
return;
}

let preconditions = {};
preconditions['/arango/Supervision/Health/' + serverId + '/Status'] = {'old': 'FAILED'};
// need to make sure it is not responsible for anything
if (node.Role === 'DBServer') {
let used = [];
let count = 0; // Try for 60s if server still in use or not failed
let msg = "";
let used = [];
while (++count <= 60) {
let preconditions = {};
preconditions['/arango/Supervision/Health/' + serverId + '/Status'] = {'old': 'FAILED'};
// need to make sure it is not responsible for anything
used = [];
preconditions = reducePlanServers(function (data, agencyKey, servers) {
data[agencyKey] = {'old': servers};
if (servers.indexOf(serverId) !== -1) {

@@ -100,36 +103,41 @@ actions.defineHttp({
return data;
}, preconditions);

if (used.length > 0) {
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'the server is still in use at the following locations: ' + JSON.stringify(used));
return;
preconditions["/arango/Supervision/DBServers/" + serverId]
= { "oldEmpty": true };

if (!checkServerLocked(serverId) && used.length === 0) {
let operations = {};
operations['/arango/Plan/Coordinators/' + serverId] = {'op': 'delete'};
operations['/arango/Plan/DBServers/' + serverId] = {'op': 'delete'};
operations['/arango/Current/ServersRegistered/' + serverId] = {'op': 'delete'};
operations['/arango/Supervision/Health/' + serverId] = {'op': 'delete'};
operations['/arango/Target/MapUniqueToShortID/' + serverId] = {'op': 'delete'};

try {
global.ArangoAgency.write([[operations, preconditions]]);
actions.resultOk(req, res, actions.HTTP_OK, true);
return;
} catch (e) {
if (e.code === 412) {
console.log("removeServer: got precondition failed, retrying...");
} else {
console.warn("removeServer: could not talk to agency, retrying...");
}
}
} else {
if (used.length > 0) {
console.log("removeServer: server", serverId, "still in use in",
used.length, "locations.");
} else {
console.log("removeServer: server", serverId, "locked in agency.");
}
}
}

let operations = {};
operations['/arango/Plan/Coordinators/' + serverId] = {'op': 'delete'};
operations['/arango/Plan/DBServers/' + serverId] = {'op': 'delete'};
operations['/arango/Current/ServersRegistered/' + serverId] = {'op': 'delete'};
operations['/arango/Supervision/Health/' + serverId] = {'op': 'delete'};
operations['/arango/Target/MapUniqueToShortID/' + serverId] = {'op': 'delete'};

try {
global.ArangoAgency.write([[operations, preconditions]]);
} catch (e) {
if (e.code === 412) {
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'you can only remove failed servers');
return;
}
throw e;
}

actions.resultOk(req, res, actions.HTTP_OK, true);
/* DBOnly:

Current/Databases/YYY/XXX
*/
wait(1.0);
} // while count
actions.resultError(req, res, actions.HTTP_PRECONDITION_FAILED,
'the server not failed, locked or is still in use at the following '
+ 'locations: ' + JSON.stringify(used));
}
});

@@ -635,6 +643,17 @@ function reduceCurrentServers (reducer, data) {
}, data);
}

function checkServerLocked (server) {
var locks = ArangoAgency.get('Supervision/DBServers');
try {
if (locks.arango.Supervision.DBServers.hasOwnProperty(server)) {
return true;
}
} catch (e) {
}
return false;
}

// //////////////////////////////////////////////////////////////////////////////
// / @start Docu Block JSF_getNumberOfServers
// / (intentionally not in manual)
@@ -66,7 +66,7 @@ std::vector<std::string> split(std::string const& source,

/// @brief joins a string
template <typename C>
std::string join(C const& source, std::string const& delim = ",") {
std::string join(C const& source, std::string const& delim) {
std::string result;
bool first = true;
@@ -418,6 +418,7 @@ rest::ResponseCode GeneralResponse::responseCode(int code) {
case TRI_ERROR_CLUSTER_LEADERSHIP_CHALLENGE_ONGOING:
case TRI_ERROR_CLUSTER_NOT_LEADER:
case TRI_ERROR_SHUTTING_DOWN:
case TRI_ERROR_CLUSTER_CONNECTION_LOST:
return ResponseCode::SERVICE_UNAVAILABLE;

case TRI_ERROR_CLUSTER_UNSUPPORTED:
@@ -54,6 +54,7 @@ const std::string FOLLOWER3 = "SNGL-follower23"; // tick 9, STATE GOOD
const std::string FOLLOWER4 = "SNGL-follower4"; // tick 100, STATE BAD
const std::string FOLLOWER5 = "SNGL-follower5"; // tick 1000, STATE GOOD wrong leader

bool aborts = false;

const char *agency =
#include "ActiveFailoverTest.json"

@@ -229,7 +230,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});

REQUIRE(job.start());
REQUIRE(job.start(aborts));
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);

@@ -280,7 +281,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});

REQUIRE(job.start());
REQUIRE(job.start(aborts));
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);

@@ -331,7 +332,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});

REQUIRE_FALSE(job.start());
REQUIRE_FALSE(job.start(aborts));
// job status stays on TODO and can retry later
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,transient)).Exactly(Once);

@@ -402,7 +403,7 @@ TEST_CASE("ActiveFailover", "[agency][supervision]") {
return fakeWriteResult;
});

REQUIRE(job.start());
REQUIRE(job.start(aborts));
// job status stays on TODO and can retry later
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,transient)).Exactly(1);
@@ -56,6 +56,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";

bool aborts = false;

const char *agency =
#include "AddFollowerTest.json"
;

@@ -209,7 +211,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {

When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -266,7 +268,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -331,7 +333,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -384,7 +386,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -439,7 +441,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -494,7 +496,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
AddFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}
@ -47,7 +47,7 @@ R"=(
|
|||
"Status": "GOOD"
|
||||
},
|
||||
"leader": {
|
||||
"Status": "FAILED"
|
||||
"Status": "GOOD"
|
||||
},
|
||||
"free": {
|
||||
"Status": "GOOD"
|
||||
|
|
|
@@ -50,6 +50,8 @@ const std::string PREFIX = "arango";
const std::string SERVER = "leader";
const std::string JOBID = "1";

bool aborts = false;

typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;

@@ -157,7 +159,7 @@ VPackBuilder createJob(std::string const& server) {
TEST_CASE("CleanOutServer", "[agency][supervision]") {
RandomGenerator::initialize(RandomGenerator::RandomType::MERSENNE);
auto baseStructure = createRootNode();

write_ret_t fakeWriteResult {true, "", std::vector<apply_ret_t> {APPLIED}, std::vector<index_t> {1}};
auto transBuilder = std::make_shared<Builder>();
{ VPackArrayBuilder a(transBuilder.get());

@@ -220,7 +222,7 @@ SECTION("cleanout server should fail if the server does not exist") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -262,7 +264,7 @@ SECTION("cleanout server should wait if the server is currently blocked") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
REQUIRE(true);
}

@@ -307,7 +309,7 @@ SECTION("cleanout server should wait if the server is not healthy right now") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
REQUIRE(true);
}

@@ -357,7 +359,7 @@ SECTION("cleanout server should fail if the server is already cleaned") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -404,7 +406,7 @@ SECTION("cleanout server should fail if the server is failed") {
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -453,7 +455,7 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -503,7 +505,57 @@ SECTION("cleanout server should fail if the replicationFactor is too big for any
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

SECTION("cleanout server should fail if the replicationFactor is too big for any shard after counting in tobecleanedoutservers") {
TestStructureType createTestStructure = [&](VPackSlice const& s, std::string const& path) {
std::unique_ptr<VPackBuilder> builder;
builder.reset(new VPackBuilder());
if (s.isObject()) {
builder->add(VPackValue(VPackValueType::Object));
for (auto const& it: VPackObjectIterator(s)) {
auto childBuilder = createTestStructure(it.value, path + "/" + it.key.copyString());
if (childBuilder) {
builder->add(it.key.copyString(), childBuilder->slice());
}
}

if (path == "/arango/Target/ToDo") {
builder->add(JOBID, createJob(SERVER).slice());
}
builder->close();
} else {
if (path == "/arango/Target/ToBeCleanedServers") {
builder->add(VPackValue(VPackValueType::Array));
builder->add(VPackValue("free"));
builder->close();
}
builder->add(s);
}
return builder;
};

Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
checkFailed(JOB_STATUS::TODO, q);
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();

Node agency = createAgency(createTestStructure);
INFO("AGENCY: " << agency.toJson());
// should not throw
auto cleanOutServer = CleanOutServer(
agency,
&agent,
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -549,6 +601,8 @@ SECTION("a cleanout server job should move into pending when everything is ok")
CHECK(std::string(writes.get("/arango/Target/Pending/1").get("timeStarted").typeName()) == "string");
REQUIRE(std::string(writes.get("/arango/Supervision/DBServers/" + SERVER).typeName()) == "string");
REQUIRE(writes.get("/arango/Supervision/DBServers/" + SERVER).copyString() == JOBID);
REQUIRE(writes.get("/arango/Target/ToBeCleanedServers").get("op").copyString() == "push");
REQUIRE(writes.get("/arango/Target/ToBeCleanedServers").get("new").copyString() == SERVER);
REQUIRE(writes.get("/arango/Target/ToDo/1-0").get("toServer").copyString() == "free");

auto preconditions = q->slice()[0][1];

@@ -570,7 +624,7 @@ SECTION("a cleanout server job should move into pending when everything is ok")
JOB_STATUS::TODO,
JOBID
);
cleanOutServer.start();
cleanOutServer.start(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -621,7 +675,7 @@ SECTION("a cleanout server job should abort after a long timeout") {
REQUIRE(std::string(q->slice().typeName()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // precondition that still in ToDo
REQUIRE(std::string(q->slice()[0][0].typeName()) == "object");

auto writes = q->slice()[0][0];

@@ -630,6 +684,8 @@ SECTION("a cleanout server job should abort after a long timeout") {
CHECK(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete");
// a not yet started job will be moved to finished
CHECK(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == "object");
auto preconds = q->slice()[0][1];
CHECK(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse());
} else {
// finally cleanout should be failed
checkFailed(JOB_STATUS::PENDING, q);

@@ -648,7 +704,7 @@ SECTION("a cleanout server job should abort after a long timeout") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
Verify(Method(mockAgent, write));
Verify(Method(mockAgent, waitFor));
}

@@ -689,7 +745,7 @@ SECTION("when there are still subjobs to be done it should wait") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
};

@@ -748,7 +804,7 @@ SECTION("once all subjobs were successful then the job should be finished") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
}

@@ -793,7 +849,7 @@ SECTION("if there was a failed subjob then the job should also fail") {
JOB_STATUS::PENDING,
JOBID
);
cleanOutServer.run();
cleanOutServer.run(aborts);
REQUIRE(true);
}

@@ -832,7 +888,7 @@ SECTION("when the cleanout server job is aborted all subjobs should be aborted t
REQUIRE(std::string(q->slice().typeName()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(std::string(q->slice()[0].typeName()) == "array");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // precondition that still in ToDo
REQUIRE(std::string(q->slice()[0][0].typeName()) == "object");

auto writes = q->slice()[0][0];

@@ -841,6 +897,8 @@ SECTION("when the cleanout server job is aborted all subjobs should be aborted t
CHECK(writes.get("/arango/Target/ToDo/1-0").get("op").copyString() == "delete");
// a not yet started job will be moved to finished
CHECK(std::string(writes.get("/arango/Target/Finished/1-0").typeName()) == "object");
auto preconds = q->slice()[0][1];
CHECK(preconds.get("/arango/Target/ToDo/1-0").get("oldEmpty").isFalse());
} else {
checkFailed(JOB_STATUS::PENDING, q);
}
@ -58,6 +58,7 @@ R"=(
|
|||
},
|
||||
"Target": {
|
||||
"CleanedServers": [],
|
||||
"ToBeCleanedServers": [],
|
||||
"FailedServers": {},
|
||||
"MapUniqueToShortID": {
|
||||
"follower1": {
|
||||
|
|
|
@@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";

bool aborts = false;

typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;

@@ -234,7 +236,7 @@ SECTION("if we want to start and the collection went missing from plan (our trut
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
}

SECTION("if we are supposed to fail a distributeShardsLike job we immediately fail because this should be done by a job running on the master shard") {

@@ -294,7 +296,7 @@ SECTION("if we are supposed to fail a distributeShardsLike job we immediately fa
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
}

SECTION("if the follower is healthy again we fail the job") {

@@ -356,7 +358,7 @@ SECTION("if the follower is healthy again we fail the job") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedFollower.start());
REQUIRE_FALSE(failedFollower.start(aborts));
Verify(Method(mockAgent, transact));
Verify(Method(mockAgent, write));
}

@@ -406,7 +408,7 @@ SECTION("if there is no healthy free server when trying to start just wait") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedFollower.start());
REQUIRE_FALSE(failedFollower.start(aborts));
}

SECTION("abort any moveShard job blocking the shard and start") {

@@ -480,13 +482,6 @@ SECTION("abort any moveShard job blocking the shard and start") {
return fakeWriteResult;
});

When(Method(mockAgent, transact)).Do([&](query_t const& q) -> trans_ret_t {
// check that the job is now pending
INFO("Transaction: " << q->slice().toJson());
auto writes = q->slice()[0][0];
REQUIRE(std::string(writes.get("/arango/Target/Finished/1").typeName()) == "object");
return fakeTransResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
auto failedFollower = FailedFollower(

@@ -495,8 +490,7 @@ SECTION("abort any moveShard job blocking the shard and start") {
JOB_STATUS::TODO,
jobId
);
REQUIRE(failedFollower.start());
Verify(Method(mockAgent, transact));
REQUIRE_FALSE(failedFollower.start(aborts));
Verify(Method(mockAgent, write));
}

@@ -574,7 +568,7 @@ SECTION("a successfully started job should finish immediately and set everything
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, transact));
}

@@ -679,7 +673,7 @@ SECTION("the job should handle distributeShardsLike") {
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, transact));
}

@@ -746,7 +740,7 @@ SECTION("the job should timeout after a while") {
JOB_STATUS::TODO,
jobId
);
failedFollower.start();
failedFollower.start(aborts);
Verify(Method(mockAgent, write));
}
@@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";

bool aborts = false;

const char *agency =
#include "FailedLeaderTest.json"
;

@@ -224,7 +226,7 @@ SECTION("if we want to start and the collection went missing from plan (our trut
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}

SECTION("if we are supposed to fail a distributeShardsLike job we immediately fail because this should be done by a job running on the master shard") {

@@ -280,7 +282,7 @@ SECTION("if we are supposed to fail a distributeShardsLike job we immediately fa
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}

SECTION("if the leader is healthy again we fail the job") {

@@ -337,7 +339,7 @@ SECTION("if the leader is healthy again we fail the job") {
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedLeader.start());
REQUIRE_FALSE(failedLeader.start(aborts));
Verify(Method(mockAgent, transact));
Verify(Method(mockAgent, write)).Exactly(Once);
}

@@ -383,7 +385,7 @@ SECTION("the job must not be started if there is no server that is in sync for e
JOB_STATUS::TODO,
jobId
);
REQUIRE_FALSE(failedLeader.start());
REQUIRE_FALSE(failedLeader.start(aborts));
}

SECTION("the job must not be started if there if one of the linked shards (distributeShardsLike) is not in sync") {

@@ -444,10 +446,10 @@ SECTION("the job must not be started if there if one of the linked shards (distr
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}

SECTION("abort any moveShard job blocking the shard and start") {
SECTION("abort any moveShard job blocking the shard and stay in ToDo") {
Mock<AgentInterface> moveShardMockAgent;

Builder moveShardBuilder;

@@ -518,13 +520,6 @@ SECTION("abort any moveShard job blocking the shard and start") {
return fakeWriteResult;
});

When(Method(mockAgent, transact)).Do([&](query_t const& q) -> trans_ret_t {
// check that the job is now pending
INFO("Transaction: " << q->slice().toJson());
auto writes = q->slice()[0][0];
REQUIRE(std::string(writes.get("/arango/Target/Pending/1").typeName()) == "object");
return fakeTransResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
auto failedLeader = FailedLeader(

@@ -533,8 +528,7 @@ SECTION("abort any moveShard job blocking the shard and start") {
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
Verify(Method(mockAgent, transact));
REQUIRE_FALSE(failedLeader.start(aborts));
Verify(Method(mockAgent, write));
}

@@ -637,7 +631,7 @@ SECTION("if everything is fine than the job should be written to pending, adding
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}

SECTION("if we want are working and our collection went missing from plan the job should just finish") {

@@ -707,7 +701,7 @@ SECTION("if we want are working and our collection went missing from plan the jo
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
}

SECTION("if the newly supposed leader didn't catch up yet we wait") {

@@ -767,7 +761,7 @@ SECTION("if the newly supposed leader didn't catch up yet we wait") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
}

SECTION("in case of a timeout the job should be aborted") {

@@ -845,7 +839,7 @@ SECTION("in case of a timeout the job should be aborted") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
Verify(Method(mockAgent, write));
}

@@ -924,7 +918,7 @@ SECTION("when everything is finished there should be proper cleanup") {
JOB_STATUS::PENDING,
jobId
);
failedLeader.run();
failedLeader.run(aborts);
Verify(Method(mockAgent, write));
}

@@ -984,7 +978,7 @@ SECTION("a failedleader must not take a follower into account that is in sync bu
JOB_STATUS::TODO,
jobId
);
failedLeader.start();
failedLeader.start(aborts);
}
}
}
@ -41,6 +41,12 @@ R"=(
|
|||
"Supervision": {
|
||||
"DBServers": {},
|
||||
"Health": {
|
||||
"free2": {
|
||||
"Status": "BAD"
|
||||
},
|
||||
"free": {
|
||||
"Status": "GOOD"
|
||||
},
|
||||
"follower1": {
|
||||
"Status": "GOOD"
|
||||
},
|
||||
|
|
|
@@ -58,6 +58,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";

bool aborts = false;

typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)>TestStructureType;

@@ -337,7 +339,7 @@ TEST_CASE("FailedServer", "[agency][supervision]") {

When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

Verify(Method(mockAgent,write));

@@ -404,7 +406,7 @@ TEST_CASE("FailedServer", "[agency][supervision]") {

When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
FailedServer(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

Verify(Method(mockAgent,write));
@ -54,6 +54,8 @@ using namespace arangodb::basics;
|
|||
using namespace arangodb::consensus;
|
||||
using namespace fakeit;
|
||||
|
||||
bool aborts = false;
|
||||
|
||||
namespace arangodb {
|
||||
namespace tests {
|
||||
namespace move_shard_test {
|
||||
|
@ -165,12 +167,13 @@ SECTION("the job should fail if toServer does not exist") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
SECTION("the job should fail to start if toServer is already in plan") {
|
||||
std::function<std::unique_ptr<VPackBuilder>(VPackSlice const&, std::string const&)> createTestStructure = [&](VPackSlice const& s, std::string const& path) {
|
||||
SECTION("the job should fail to start if fromServer and toServer are planned followers") {
|
||||
std::function<std::unique_ptr<VPackBuilder>(VPackSlice const&, std::string const&)> createTestStructure =
|
||||
[&](VPackSlice const& s, std::string const& path) {
|
||||
std::unique_ptr<VPackBuilder> builder;
|
||||
builder.reset(new VPackBuilder());
|
||||
if (s.isObject()) {
|
||||
|
@ -183,7 +186,7 @@ SECTION("the job should fail to start if toServer is already in plan") {
|
|||
}
|
||||
|
||||
if (path == "/arango/Target/ToDo") {
|
||||
builder->add(jobId, createJob(COLLECTION, SHARD_LEADER, SHARD_FOLLOWER1).slice());
|
||||
builder->add(jobId, createJob(COLLECTION, SHARD_FOLLOWER1, SHARD_LEADER).slice());
|
||||
}
|
||||
builder->close();
|
||||
} else {
|
||||
|
@ -206,7 +209,7 @@ SECTION("the job should fail to start if toServer is already in plan") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -245,7 +248,7 @@ SECTION("the job should fail if fromServer does not exist") {
|
|||
Fake(Method(spy, finish));
|
||||
|
||||
Job& spyMoveShard = spy.get();
|
||||
spyMoveShard.start();
|
||||
spyMoveShard.start(aborts);
|
||||
|
||||
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {return !success;}));
|
||||
}
|
||||
|
@ -287,7 +290,7 @@ SECTION("the job should fail if fromServer is not in plan of the shard") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -338,7 +341,7 @@ SECTION("the job should fail if fromServer does not exist") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -377,7 +380,7 @@ SECTION("the job should remain in todo if the shard is currently locked") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
}
|
||||
|
||||
SECTION("the job should remain in todo if the target server is currently locked") {
|
||||
|
@ -415,7 +418,7 @@ SECTION("the job should remain in todo if the target server is currently locked"
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
}
|
||||
|
||||
SECTION("the job should fail if the target server was cleaned out") {
|
||||
|
@ -462,7 +465,7 @@ SECTION("the job should fail if the target server was cleaned out") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -508,7 +511,7 @@ SECTION("the job should fail if the target server is failed") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -548,7 +551,7 @@ SECTION("the job should wait until the target server is good") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
}
|
||||
|
||||
SECTION("the job should fail if the shard distributes its shards like some other") {
|
||||
|
@ -591,7 +594,7 @@ SECTION("the job should fail if the shard distributes its shards like some other
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -671,7 +674,7 @@ SECTION("the job should be moved to pending when everything is ok") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -727,7 +730,7 @@ SECTION("moving from a follower should be possible") {
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -855,7 +858,7 @@ SECTION("when moving a shard that is a distributeShardsLike leader move the rest
|
|||
|
||||
INFO("Agency: " << agency);
|
||||
auto moveShard = MoveShard(agency, &agent, TODO, jobId);
|
||||
moveShard.start();
|
||||
moveShard.start(aborts);
|
||||
Verify(Method(mockAgent,write));
|
||||
}
|
||||
|
||||
|
@ -910,7 +913,7 @@ SECTION("if the job is too old it should be aborted to prevent a deadloop") {
|
|||
Fake(Method(spy, abort));
|
||||
|
||||
Job& spyMoveShard = spy.get();
|
||||
spyMoveShard.run();
|
||||
spyMoveShard.run(aborts);
|
||||
|
||||
Verify(Method(spy, abort));
|
||||
}
|
||||
|
@ -966,7 +969,7 @@ SECTION("if the job is too old (leader case) it should be aborted to prevent a d
|
|||
Fake(Method(spy, abort));
|
||||
|
||||
Job& spyMoveShard = spy.get();
|
||||
spyMoveShard.run();
|
||||
spyMoveShard.run(aborts);
|
||||
|
||||
Verify(Method(spy, abort));
|
||||
}
|
||||
|
@ -1015,7 +1018,7 @@ SECTION("if the collection was dropped while moving finish the job") {
|
|||
Fake(Method(spy, finish));
|
||||
|
||||
Job& spyMoveShard = spy.get();
|
||||
spyMoveShard.run();
|
||||
spyMoveShard.run(aborts);
|
||||
|
||||
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {
|
||||
return success;
|
||||
|
@ -1066,7 +1069,7 @@ SECTION("if the collection was dropped before the job could be started just fini
|
|||
Fake(Method(spy, finish));
|
||||
|
||||
Job& spyMoveShard = spy.get();
|
||||
spyMoveShard.start();
|
||||
spyMoveShard.start(aborts);
|
||||
|
||||
Verify(Method(spy, finish).Matching([](std::string const& server, std::string const& shard, bool success, std::string const& reason, query_t const payload) -> bool {return success;}));
|
||||
|
||||
|
@@ -1121,7 +1124,7 @@ SECTION("the job should wait until the planned shard situation has been created
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}

SECTION("if the job is done it should properly finish itself") {

@@ -1193,7 +1196,7 @@ SECTION("if the job is done it should properly finish itself") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@@ -1318,7 +1321,7 @@ SECTION("the job should not finish itself when only parts of distributeShardsLik
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}

SECTION("the job should finish when all distributeShardsLike shards have adapted") {

@@ -1496,7 +1499,7 @@ SECTION("the job should finish when all distributeShardsLike shards have adapted
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@@ -1527,10 +1530,12 @@ SECTION("a moveshard job that just made it to ToDo can simply be aborted") {
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // we always simply override! no preconditions...
auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/ToDo/1").get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Target/Finished/1").typeName()) == "object");
auto precond = q->slice()[0][1];
CHECK(precond.get("/arango/Target/ToDo/1").get("oldEmpty").isFalse());

return fakeWriteResult;
});
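The two REQUIRE lines in this hunk capture the behavioural change: the abort transaction is no longer a bare write but an array of [writes, preconditions], so its length becomes 2. A minimal sketch of that shape check, reusing only the VelocyPack Slice calls already exercised by the test above; the helper name is illustrative and not part of the patch:

    #include <velocypack/Slice.h>

    using arangodb::velocypack::Slice;

    // Illustrative helper, not part of the commit. A transaction such as
    //   [[{"/arango/Target/ToDo/1": {"op": "delete"}, ...},
    //     {"/arango/Target/ToDo/1": {"oldEmpty": false}}]]
    // has two elements once a precondition is attached, which is exactly
    // what REQUIRE(q->slice()[0].length() == 2) asserts above.
    bool hasWritesAndPrecondition(Slice transactions) {
      if (!transactions.isArray() || transactions.length() == 0) {
        return false;
      }
      Slice trx = transactions[0];   // first transaction in the envelope
      if (!trx.isArray() || trx.length() != 2) {
        return false;                // writes only, no precondition attached
      }
      return trx[0].isObject() && trx[1].isObject();
    }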
@@ -1591,9 +1596,10 @@ SECTION("a pending moveshard job should also put the original server back into p
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
LOG_DEVEL << q->slice().toJson() << " " << __LINE__;
auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // Precondition: to Server not leader yet
CHECK(writes.get("/arango/Supervision/DBServers/" + FREE_SERVER).get("op").copyString() == "delete");
CHECK(writes.get("/arango/Supervision/Shards/" + SHARD).get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Plan/Collections/" + DATABASE + "/" + COLLECTION + "/shards/" + SHARD).typeName()) == "array");
@@ -1698,7 +1704,7 @@ SECTION("after the new leader has synchronized the new leader should resign") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}

@@ -1762,7 +1768,7 @@ SECTION("when the old leader is not yet ready for resign nothing should happen")
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
}

SECTION("aborting the job while a leader transition is in progress (for example when job is timing out) should make the old leader leader again") {
@@ -1819,9 +1825,11 @@ SECTION("aborting the job while a leader transition is in progress (for example
When(Method(mockAgent, waitFor)).AlwaysReturn();
When(Method(mockAgent, write)).Do([&](query_t const& q, consensus::AgentInterface::WriteMode w) -> write_ret_t {
INFO("WriteTransaction: " << q->slice().toJson());
LOG_DEVEL << q->slice().toJson() << " " << __LINE__;

auto writes = q->slice()[0][0];
CHECK(writes.get("/arango/Target/Pending/1").get("op").copyString() == "delete");
REQUIRE(q->slice()[0].length() == 1); // we always simply override! no preconditions...
REQUIRE(q->slice()[0].length() == 2); // Precondition: to Server not leader yet
CHECK(writes.get("/arango/Supervision/DBServers/" + FREE_SERVER).get("op").copyString() == "delete");
CHECK(writes.get("/arango/Supervision/Shards/" + SHARD).get("op").copyString() == "delete");
CHECK(std::string(writes.get("/arango/Plan/Collections/" + DATABASE + "/" + COLLECTION + "/shards/" + SHARD).typeName()) == "array");
@@ -1924,7 +1932,7 @@ SECTION("if we are ready to resign the old server then finally move to the new l
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}

@@ -2005,7 +2013,7 @@ SECTION("if the new leader took over finish the job") {
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent,write));
}
@@ -2379,7 +2387,7 @@ SECTION("if the job fails while trying to switch over leadership it should be ab
Fake(Method(spy, abort));

Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);

Verify(Method(spy, abort));
}

@@ -2434,7 +2442,7 @@ SECTION("if the job timeouts while the new leader is trying to take over the job
Fake(Method(spy, abort));

Job& spyMoveShard = spy.get();
spyMoveShard.run();
spyMoveShard.run(aborts);

Verify(Method(spy, abort));
}
@@ -2520,7 +2528,7 @@ SECTION("when promoting the new leader, the old one should become a resigned fol
INFO("Agency: " << agency);
auto moveShard = MoveShard(agency, &agent, PENDING, jobId);
moveShard.run();
moveShard.run(aborts);
Verify(Method(mockAgent, write));
}
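Every MoveShard test change above follows the same pattern: the fixtures declare a shared bool aborts = false; and pass it to start()/run(), which now take a bool& out-parameter. A minimal sketch of that calling convention, assuming only the signatures visible in this commit (the driveOnce helper is illustrative, not ArangoDB's supervision loop):

    // Sketch of the new run/start signatures exercised by the tests above.
    struct Job {
      virtual ~Job() = default;
      virtual bool start(bool& aborts) = 0;   // previously: bool start()
      virtual void run(bool& aborts) = 0;     // previously: void run()
    };

    void driveOnce(Job& job) {
      bool aborts = false;          // same flag the updated fixtures declare
      if (job.start(aborts)) {
        job.run(aborts);
      }
      // How a caller reacts to `aborts` being set is supervision policy;
      // the tests here only check that the flag is threaded through.
    }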
@@ -59,6 +59,8 @@ const std::string SHARD_FOLLOWER2 = "follower2";
const std::string FREE_SERVER = "free";
const std::string FREE_SERVER2 = "free2";

bool aborts = false;

const char *agency =
#include "RemoveFollowerTest.json"
;
@@ -224,7 +226,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -283,7 +285,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -349,7 +351,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}
@@ -412,7 +414,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
AgentInterface &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

}

@@ -473,7 +475,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

REQUIRE_NOTHROW(Verify(Method(mockAgent, write)));
}

@@ -609,7 +611,7 @@ TEST_CASE("RemoveFollower", "[agency][supervision]") {
);
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto &agent = mockAgent.get();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start();
RemoveFollower(agency("arango"), &agent, JOB_STATUS::TODO, jobId).start(aborts);

REQUIRE_NOTHROW(Verify(Method(mockAgent, write)));
}
@@ -125,7 +125,7 @@ function MovingShardsSuite ({useData}) {
res = request(envelope);
} catch (err) {
console.error(
"Exception for POST /_admin/cluster/cleanOutServer:", err.stack);
"Exception for GET /_admin/cluster/cleanOutServer:", err.stack);
return {cleanedServers:[]};
}
if (res.statusCode !== 200) {
@@ -145,6 +145,32 @@ function MovingShardsSuite ({useData}) {
return body;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief display agency information in case of a bad outcome
////////////////////////////////////////////////////////////////////////////////

function displayAgencyInformation() {
var coordEndpoint =
global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
var request = require("@arangodb/request");
var endpointToURL = require("@arangodb/cluster").endpointToURL;
var url = endpointToURL(coordEndpoint);

var res;
try {
var envelope = { method: "GET", url: url + "/_api/cluster/agency-dump" };
res = request(envelope);
} catch (err) {
console.error(
"Exception for GET /_api/cluster/agency-dump:", err.stack);
return;
}
if (res.statusCode !== 200) {
return;
}
var body = res.body;
console.error("Agency state after disaster:", body);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean
@@ -181,6 +207,7 @@ function MovingShardsSuite ({useData}) {
console.info(
"Failed: Server " + id + " was not cleaned out. List of cleaned servers: ["
+ obj.cleanedServers + "]");
displayAgencyInformation();
}

} else {

@@ -213,6 +240,7 @@ function MovingShardsSuite ({useData}) {
}
}
if (!ok) {
displayAgencyInformation();
return false;
}
@@ -194,6 +194,33 @@ function MovingShardsWithViewSuite (options) {
return body;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief display agency information in case of a bad outcome
////////////////////////////////////////////////////////////////////////////////

function displayAgencyInformation() {
var coordEndpoint =
global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
var request = require("@arangodb/request");
var endpointToURL = require("@arangodb/cluster").endpointToURL;
var url = endpointToURL(coordEndpoint);

var res;
try {
var envelope = { method: "GET", url: url + "/_api/cluster/agency-dump" };
res = request(envelope);
} catch (err) {
console.error(
"Exception for GET /_api/cluster/agency-dump:", err.stack);
return;
}
if (res.statusCode !== 200) {
return;
}
var body = res.body;
console.error("Agency state after disaster:", body);
}


////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean
@@ -230,6 +257,7 @@ function MovingShardsWithViewSuite (options) {
console.info(
"Failed: Server " + id + " was not cleaned out. List of cleaned servers: ["
+ obj.cleanedServers + "]");
displayAgencyInformation();
}

} else {

@@ -262,6 +290,7 @@ function MovingShardsWithViewSuite (options) {
}
}
if (!ok) {
displayAgencyInformation();
return false;
}