
Supervision Job for Active Failover (#5066)

Simon 2018-04-23 12:49:41 +02:00 committed by Jan
parent 646db8ca0a
commit 45fbed497b
36 changed files with 1605 additions and 179 deletions

View File

@ -0,0 +1,299 @@
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////
#include "ActiveFailoverJob.h"
#include "Agency/AgentInterface.h"
#include "Agency/Job.h"
#include "Agency/JobContext.h"
#include "Agency/Store.h"
#include "Cluster/ClusterHelpers.h"
#include "VocBase/voc-types.h"
using namespace arangodb;
using namespace arangodb::consensus;
ActiveFailoverJob::ActiveFailoverJob(Node const& snapshot, AgentInterface* agent,
std::string const& jobId, std::string const& creator,
std::string const& failed)
: Job(NOTFOUND, snapshot, agent, jobId, creator),
_server(failed) { }
ActiveFailoverJob::ActiveFailoverJob(Node const& snapshot, AgentInterface* agent,
JOB_STATUS status, std::string const& jobId)
: Job(status, snapshot, agent, jobId) {
// Get job details from agency:
std::string path = pos[status] + _jobId + "/";
auto tmp_server = _snapshot.hasAsString(path + "server");
auto tmp_creator = _snapshot.hasAsString(path + "creator");
if (tmp_server.second && tmp_creator.second) {
_server = tmp_server.first;
_creator = tmp_creator.first;
} else {
std::stringstream err;
err << "Failed to find job " << _jobId << " in agency.";
LOG_TOPIC(ERR, Logger::SUPERVISION) << err.str();
finish(tmp_server.first, "", false, err.str());
_status = FAILED;
}
}
ActiveFailoverJob::~ActiveFailoverJob() {}
void ActiveFailoverJob::run() {
runHelper(_server, "");
}
bool ActiveFailoverJob::create(std::shared_ptr<VPackBuilder> envelope) {
LOG_TOPIC(DEBUG, Logger::SUPERVISION)
<< "Todo: Handle failover for leader " + _server;
bool selfCreate = (envelope == nullptr); // Do we create ourselves?
if (selfCreate) {
_jb = std::make_shared<Builder>();
} else {
_jb = envelope;
}
auto now = timepointToString(std::chrono::system_clock::now());
{ VPackArrayBuilder transaction(_jb.get());
{ VPackObjectBuilder operations(_jb.get());
// Todo entry
_jb->add(VPackValue(toDoPrefix + _jobId));
{ VPackObjectBuilder todo(_jb.get());
_jb->add("creator", VPackValue(_creator));
_jb->add("type", VPackValue("activeFailover"));
_jb->add("server", VPackValue(_server));
_jb->add("jobId", VPackValue(_jobId));
_jb->add("timeCreated", VPackValue(now));
} // todo
// FailedServers entry []
_jb->add(VPackValue(failedServersPrefix + "/" + _server));
{ VPackArrayBuilder failedServers(_jb.get()); }
} // Operations
// Preconditions
{ VPackObjectBuilder health(_jb.get());
// Status should still be BAD
addPreconditionServerHealth(*_jb, _server, Supervision::HEALTH_STATUS_BAD);
// Target/FailedServers does not already include _server
_jb->add(VPackValue(failedServersPrefix + "/" + _server));
{ VPackObjectBuilder old(_jb.get());
_jb->add("oldEmpty", VPackValue(true)); }
// Target/FailedServers is still as in the snapshot
_jb->add(VPackValue(failedServersPrefix));
{ VPackObjectBuilder old(_jb.get());
_jb->add("old", _snapshot(failedServersPrefix).toBuilder().slice());}
} // Preconditions
} // transactions
_status = TODO;
if (!selfCreate) {
return true;
}
write_ret_t res = singleWriteTransaction(_agent, *_jb);
if (res.accepted && res.indices.size() == 1 && res.indices[0]) {
return true;
}
_status = NOTFOUND;
LOG_TOPIC(INFO, Logger::SUPERVISION) << "Failed to insert job " + _jobId;
return false;
}
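
For readability, the write transaction assembled by create() boils down to roughly the following agency payload (a sketch with illustrative values; paths are shown without the leading agency prefix that singleWriteTransaction adds):

[[
  { // operations
    "/Target/ToDo/1-234": {
      "creator": "supervision",
      "type": "activeFailover",
      "server": "SNGL-leader",
      "jobId": "1-234",
      "timeCreated": "2018-04-23T10:49:41Z"
    },
    "/Target/FailedServers/SNGL-leader": []
  },
  { // preconditions
    "/Supervision/Health/SNGL-leader/Status": { "old": "BAD" },
    "/Target/FailedServers/SNGL-leader": { "oldEmpty": true },
    "/Target/FailedServers": { "old": { /* value from the snapshot */ } }
  }
]]
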
bool ActiveFailoverJob::start() {
// If anything throws here, the run() method catches it and finishes
// the job.
// Finish the job early if the server's health is no longer FAILED
if (checkServerHealth(_snapshot, _server) != Supervision::HEALTH_STATUS_FAILED) {
std::string reason = "Server " + _server + " is no longer failed. " +
"Not starting ActiveFailoverJob";
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason;
return finish(_server, "", true, reason); // move to /Target/Finished
}
auto leader = _snapshot.hasAsSlice(asyncReplLeader);
if (!leader.second || leader.first.compareString(_server) != 0) {
std::string reason = "Server " + _server + " is not the current replication leader";
LOG_TOPIC(INFO, Logger::SUPERVISION) << reason;
return finish(_server, "", true, reason); // move to /Target/Finished
}
// Abort job blocking server if abortable
auto jobId = _snapshot.hasAsString(blockedServersPrefix + _server);
if (jobId.second && !abortable(_snapshot, jobId.first)) {
return false;
} else if (jobId.second) {
JobContext(PENDING, jobId.first, _snapshot, _agent).abort();
}
// Todo entry
Builder todo;
{ VPackArrayBuilder t(&todo);
if (_jb == nullptr) {
try {
_snapshot(toDoPrefix + _jobId).toBuilder(todo);
} catch (std::exception const&) {
LOG_TOPIC(INFO, Logger::SUPERVISION)
<< "Failed to get key " + toDoPrefix + _jobId + " from agency snapshot";
return false;
}
} else {
todo.add(_jb->slice()[0].get(toDoPrefix + _jobId));
}} // Todo entry
std::string newLeader = findBestFollower();
if (newLeader.empty() || _server == newLeader) {
LOG_TOPIC(INFO, Logger::SUPERVISION) << "No server available, will retry job later";
return false; // job will retry later
}
LOG_TOPIC(INFO, Logger::SUPERVISION) << "Selected '" << newLeader << "' as leader";
// Enter pending, remove todo
Builder pending;
{ VPackArrayBuilder listOfTransactions(&pending);
{ VPackObjectBuilder operations(&pending);
addPutJobIntoSomewhere(pending, "Finished", todo.slice()[0]);
addRemoveJobFromSomewhere(pending, "ToDo", _jobId);
pending.add(asyncReplLeader, VPackValue(newLeader));
} // mutation part of transaction done
// Preconditions
{ VPackObjectBuilder precondition(&pending);
// Failed condition persists
addPreconditionServerHealth(pending, _server, Supervision::HEALTH_STATUS_FAILED);
// Destination server still in good condition
addPreconditionServerHealth(pending, newLeader, Supervision::HEALTH_STATUS_GOOD);
// Destination server should not be blocked by another job
addPreconditionServerNotBlocked(pending, newLeader);
// AsyncReplication leader must be the failed server
addPreconditionUnchanged(pending, asyncReplLeader, leader.first);
} // precondition done
} // array for transaction done
// Transact to agency
write_ret_t res = singleWriteTransaction(_agent, pending);
if (res.accepted && res.indices.size() == 1 && res.indices[0]) {
_status = FINISHED;
LOG_TOPIC(INFO, Logger::SUPERVISION)
<< "Finished: ActiveFailoverJob server " << _server << " failover to " << newLeader;
return true;
}
LOG_TOPIC(INFO, Logger::SUPERVISION) << "Precondition failed for ActiveFailoverJob " + _jobId;
return false;
}
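
Again as a sketch (illustrative values, agency prefix omitted), the transaction above performs the actual failover in a single atomic write, guarded by the listed preconditions:

[[
  { // operations: finish the job and switch the leader in one shot
    "/Target/Finished/1-234": { /* the ToDo entry */ },
    "/Target/ToDo/1-234": { "op": "delete" },
    "/Plan/AsyncReplication/Leader": "SNGL-follower1"
  },
  { // preconditions: abort if anything changed in the meantime
    "/Supervision/Health/SNGL-leader/Status": { "old": "FAILED" },
    "/Supervision/Health/SNGL-follower1/Status": { "old": "GOOD" },
    "/Supervision/DBServers/SNGL-follower1": { "oldEmpty": true },
    "/Plan/AsyncReplication/Leader": { "old": "SNGL-leader" }
  }
]]
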
JOB_STATUS ActiveFailoverJob::status() {
if (_status != PENDING) {
return _status;
}
TRI_ASSERT(false); // PENDING is not an option for this job, since it
// travels directly from ToDo to Finished or Failed
return _status;
}
arangodb::Result ActiveFailoverJob::abort() {
// We can assume that the job is in ToDo or not there:
if (_status == NOTFOUND || _status == FINISHED || _status == FAILED) {
return Result(TRI_ERROR_SUPERVISION_GENERAL_FAILURE,
"Failed aborting addFollower job beyond pending stage");
}
Result result;
// Can now only be TODO or PENDING
if (_status == TODO) {
finish("", "", false, "job aborted");
return result;
}
TRI_ASSERT(false); // cannot happen, since job moves directly to FINISHED
return result;
}
typedef std::pair<std::string, TRI_voc_tick_t> ServerTick;
/// Try to select the follower most in-sync with the failed leader
std::string ActiveFailoverJob::findBestFollower() {
std::vector<std::string> as = healthyServers(_snapshot);
// remove servers blocked by other jobs (not sure if this can even happen)
try {
for (auto const& srv : _snapshot(blockedServersPrefix).children()) {
as.erase(std::remove(as.begin(), as.end(), srv.first), as.end());
}
} catch (...) {}
std::vector<ServerTick> ticks;
try { // collect tick values from transient state
query_t trx = std::make_unique<VPackBuilder>();
{
VPackArrayBuilder transactions(trx.get());
VPackArrayBuilder operations(trx.get());
trx->add(VPackValue("/" + Job::agencyPrefix + asyncReplTransientPrefix));
}
trans_ret_t res = _agent->transient(std::move(trx));
if (res.accepted) {
VPackSlice resp = res.result->slice();
if (!resp.isArray() || resp.length() == 0) {
return "";
}
VPackSlice obj = resp.at(0).get({ Job::agencyPrefix, "AsyncReplication"});
for (VPackObjectIterator::ObjectPair pair : VPackObjectIterator(obj)) {
std::string srvUUID = pair.key.copyString();
if (std::find(as.begin(), as.end(), srvUUID) == as.end()) {
continue; // skip inaccessible servers
}
VPackSlice leader = pair.value.get("leader"); // the leader this server replicates from
VPackSlice lastTick = pair.value.get("lastTick");
if (leader.isString() && leader.compareString(_server) == 0 &&
lastTick.isNumber()) {
ticks.emplace_back(std::move(srvUUID), lastTick.getUInt());
}
}
}
} catch (...) {}
std::sort(ticks.begin(), ticks.end(), [&](ServerTick const& a,
ServerTick const& b) {
return a.second > b.second;
});
if (!ticks.empty()) {
return ticks[0].first;
}
return ""; // fallback to any available server
}
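
Conceptually, the transient read in findBestFollower() asks the agency for [["/arango/AsyncReplication/"]] and receives per-server replication progress, e.g. (sketch with illustrative values):

[ { "arango": { "AsyncReplication": {
      "SNGL-follower1": { "leader": "SNGL-leader", "lastTick": 1105 },
      "SNGL-follower2": { "leader": "SNGL-leader", "lastTick": 1001 }
} } } ]

Entries are filtered down to healthy, unblocked servers that were replicating from the failed leader; the candidate with the highest lastTick wins.
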

View File

@ -0,0 +1,60 @@
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////
#ifndef ARANGOD_CONSENSUS_ACTIVE_FAILOVER_JOB_H
#define ARANGOD_CONSENSUS_ACTIVE_FAILOVER_JOB_H 1
#include "Job.h"
#include "Supervision.h"
namespace arangodb {
namespace consensus {
struct ActiveFailoverJob final : public Job {
ActiveFailoverJob(Node const& snapshot, AgentInterface* agent, std::string const& jobId,
std::string const& creator,
std::string const& failed);
ActiveFailoverJob(Node const& snapshot, AgentInterface* agent,
JOB_STATUS status, std::string const& jobId);
virtual ~ActiveFailoverJob();
virtual JOB_STATUS status() override final;
virtual void run() override final;
virtual bool create(std::shared_ptr<VPackBuilder> envelope = nullptr)
override final;
virtual bool start() override final;
virtual Result abort() override final;
private:
std::string findBestFollower();
private:
std::string _server;
};
}
}
#endif

View File

@ -43,10 +43,11 @@ std::string const planColPrefix = "/Plan/Collections/";
std::string const curColPrefix = "/Current/Collections/";
std::string const blockedServersPrefix = "/Supervision/DBServers/";
std::string const blockedShardsPrefix = "/Supervision/Shards/";
std::string const serverStatePrefix = "/Sync/ServerStates/";
std::string const planVersion = "/Plan/Version";
std::string const plannedServers = "/Plan/DBServers";
std::string const healthPrefix = "/Supervision/Health/";
std::string const asyncReplLeader = "/Plan/AsyncReplication/Leader";
std::string const asyncReplTransientPrefix = "/AsyncReplication/";
} // namespace arangodb::consensus
} // namespace arangodb
@ -239,6 +240,18 @@ std::vector<std::string> Job::availableServers(Node const& snapshot) {
}
/// @brief Get servers from Supervision with health status GOOD
std::vector<std::string> Job::healthyServers(arangodb::consensus::Node const& snapshot) {
std::vector<std::string> ret;
for (auto const& srv : snapshot(healthPrefix).children()) {
auto healthState = srv.second->hasAsString("Status");
if (healthState.second && healthState.first == Supervision::HEALTH_STATUS_GOOD) {
ret.emplace_back(srv.first);
}
}
return ret;
}
template<typename T> std::vector<size_t> idxsort (const std::vector<T> &v) {
std::vector<size_t> idx(v.size());
@ -413,8 +426,9 @@ bool Job::abortable(Node const& snapshot, std::string const& jobId) {
}
auto const& tmp_type = job.first.hasAsString("type");
std::string type = tmp_type.first;
if (!tmp_type.second || type == "failedServer" || type == "failedLeader") {
std::string const& type = tmp_type.first;
if (!tmp_type.second || type == "failedServer" || type == "failedLeader" ||
type == "activeFailover") {
return false;
} else if (type == "addFollower" || type == "moveShard" ||
type == "cleanOutServer") {
@ -549,12 +563,9 @@ void Job::addReleaseShard(Builder& trx, std::string const& shard) {
std::string Job::checkServerHealth(Node const& snapshot,
std::string const& server) {
auto status = snapshot.hasAsString(healthPrefix + server + "/Status");
if (!status.second) {
return "UNCLEAR";
}
if (status.first != "GOOD") {
return "UNHEALTHY";
}
return "GOOD";
return status.first;
}

View File

@ -53,10 +53,11 @@ extern std::string const planColPrefix;
extern std::string const curColPrefix;
extern std::string const blockedServersPrefix;
extern std::string const blockedShardsPrefix;
extern std::string const serverStatePrefix;
extern std::string const planVersion;
extern std::string const plannedServers;
extern std::string const healthPrefix;
extern std::string const asyncReplLeader;
extern std::string const asyncReplTransientPrefix;
struct Job {
@ -128,6 +129,9 @@ struct Job {
/// @brief Get servers from plan, which are not failed or cleaned out
static std::vector<std::string> availableServers(
const arangodb::consensus::Node&);
/// @brief Get servers from Supervision with health status GOOD
static std::vector<std::string> healthyServers(arangodb::consensus::Node const&);
static std::vector<shard_t> clones(
Node const& snap, std::string const& db, std::string const& col,
@ -169,12 +173,12 @@ struct Job {
static void addReleaseServer(Builder& trx, std::string const& server);
static void addReleaseShard(Builder& trx, std::string const& shard);
static void addPreconditionServerNotBlocked(Builder& pre, std::string const& server);
static void addPreconditionServerHealth(Builder& pre, std::string const& server, std::string const& health);
static void addPreconditionServerHealth(Builder& pre, std::string const& server,
std::string const& health);
static void addPreconditionShardNotBlocked(Builder& pre, std::string const& shard);
static void addPreconditionUnchanged(Builder& pre,
std::string const& key, Slice value);
static std::string checkServerHealth(Node const& snapshot, std::string const& server);
};
inline arangodb::consensus::write_ret_t singleWriteTransaction(
@ -263,8 +267,7 @@ inline arangodb::consensus::trans_ret_t generalTransaction(
}
inline arangodb::consensus::trans_ret_t transient(AgentInterface* _agent,
Builder const& transaction,
bool waitForCommit = true) {
Builder const& transaction) {
query_t envelope = std::make_shared<Builder>();
Slice trx = transaction.slice();
@ -288,7 +291,6 @@ inline arangodb::consensus::trans_ret_t transient(AgentInterface* _agent,
<< "Supervision failed to build transaction for transient: " << e.what();
}
return _agent->transient(envelope);
}

View File

@ -23,6 +23,7 @@
#include "JobContext.h"
#include "Agency/ActiveFailoverJob.h"
#include "Agency/AddFollower.h"
#include "Agency/CleanOutServer.h"
#include "Agency/FailedFollower.h"
@ -59,6 +60,8 @@ JobContext::JobContext (JOB_STATUS status, std::string id, Node const& snapshot,
_job = std::make_unique<AddFollower>(snapshot, agent, status, id);
} else if (type == "removeFollower") {
_job = std::make_unique<RemoveFollower>(snapshot, agent, status, id);
} else if (type == "activeFailover") {
_job = std::make_unique<ActiveFailoverJob>(snapshot, agent, status, id);
} else {
LOG_TOPIC(ERR, Logger::AGENCY) <<
"Failed to run supervision job " << type << " with id " << id;

View File

@ -98,6 +98,7 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
std::string now(timepointToString(std::chrono::system_clock::now()));
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
// DBservers
std::string planPath =
planColPrefix + _database + "/" + _collection + "/shards/" + _shard;
@ -105,6 +106,7 @@ bool MoveShard::create(std::shared_ptr<VPackBuilder> envelope) {
Slice plan = _snapshot.hasAsSlice(planPath).first;
TRI_ASSERT(plan.isArray());
TRI_ASSERT(plan[0].isString());
#endif
if (selfCreate) {
_jb->openArray();

View File

@ -1164,7 +1164,6 @@ Slice Node::getArray() const {
return Slice(_vecBuf.data());
}
void Node::clear() {
_children.clear();
_ttl = std::chrono::system_clock::time_point();

View File

@ -272,7 +272,6 @@ class Node {
/// @return second is true if url exists, first populated if second true
std::pair<Slice, bool> hasAsArray(std::string const &) const;
//
// These two operator() functions could be "protected" once
// unit tests updated.

View File

@ -25,6 +25,7 @@
#include <thread>
#include "Agency/ActiveFailoverJob.h"
#include "Agency/AddFollower.h"
#include "Agency/Agent.h"
#include "Agency/CleanOutServer.h"
@ -173,7 +174,6 @@ static std::string const targetShortID = "/Target/MapUniqueToShortID/";
static std::string const currentServersRegisteredPrefix =
"/Current/ServersRegistered";
static std::string const foxxmaster = "/Current/Foxxmaster";
static std::string const asyncReplLeader = "/Plan/AsyncReplication/Leader";
void Supervision::upgradeOne(Builder& builder) {
_lock.assertLockedByCurrentThread();
@ -282,7 +282,6 @@ void handleOnStatusDBServer(
uint64_t const& jobId, std::shared_ptr<VPackBuilder>& envelope) {
std::string failedServerPath = failedServersPrefix + "/" + serverID;
// New condition GOOD:
if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
if (snapshot.has(failedServerPath)) {
@ -306,7 +305,6 @@ void handleOnStatusDBServer(
"supervision", serverID).create(envelope);
}
}
}
@ -329,18 +327,32 @@ void handleOnStatusCoordinator(
void handleOnStatusSingle(
Agent* agent, Node const& snapshot, HealthRecord& persisted,
HealthRecord& transisted, std::string const& serverID) {
// if the current leader server failed => reset the value to ""
if (transisted.status == Supervision::HEALTH_STATUS_FAILED) {
if (snapshot.hasAsString(asyncReplLeader).first == serverID) {
VPackBuilder create;
{ VPackArrayBuilder tx(&create);
{ VPackObjectBuilder d(&create);
create.add(asyncReplLeader, VPackValue("")); }}
singleWriteTransaction(agent, create);
HealthRecord& transisted, std::string const& serverID,
uint64_t const& jobId, std::shared_ptr<VPackBuilder>& envelope) {
std::string failedServerPath = failedServersPrefix + "/" + serverID;
// New condition GOOD:
if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
if (snapshot.has(failedServerPath)) {
envelope = std::make_shared<VPackBuilder>();
{ VPackArrayBuilder a(envelope.get());
{ VPackObjectBuilder operations (envelope.get());
envelope->add(VPackValue(failedServerPath));
{ VPackObjectBuilder ccc(envelope.get());
envelope->add("op", VPackValue("delete")); }}}
}
} else if ( // New state: FAILED persisted: GOOD (-> BAD)
persisted.status == Supervision::HEALTH_STATUS_GOOD &&
transisted.status != Supervision::HEALTH_STATUS_GOOD) {
transisted.status = Supervision::HEALTH_STATUS_BAD;
} else if ( // New state: FAILED persisted: BAD (-> Job)
persisted.status == Supervision::HEALTH_STATUS_BAD &&
transisted.status == Supervision::HEALTH_STATUS_FAILED ) {
if (!snapshot.has(failedServerPath)) {
envelope = std::make_shared<VPackBuilder>();
ActiveFailoverJob(snapshot, agent, std::to_string(jobId),
"supervision", serverID).create(envelope);
}
}
}
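
In summary, the single-server handler above implements a small health state machine (a sketch of the three branches):

// transient GOOD  and server listed in Target/FailedServers  -> delete that entry
// persisted GOOD  and transient not GOOD                     -> demote to BAD first (grace period)
// persisted BAD   and transient FAILED                       -> create an activeFailover job,
//                                                               unless the server is already in
//                                                               Target/FailedServers
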
@ -356,7 +368,8 @@ void handleOnStatus(
handleOnStatusCoordinator(
agent, snapshot, persisted, transisted, serverID);
} else if (serverID.compare(0,4,"SNGL") == 0) {
handleOnStatusSingle(agent, snapshot, persisted, transisted, serverID);
handleOnStatusSingle(agent, snapshot, persisted, transisted,
serverID, jobId, envelope);
} else {
LOG_TOPIC(ERR, Logger::SUPERVISION)
<< "Unknown server type. No supervision action taken. " << serverID;

View File

@ -104,7 +104,7 @@ void auth::TokenCache::invalidateBasicCache() {
auth::TokenCache::Entry auth::TokenCache::checkAuthenticationBasic(
std::string const& secret) {
if (_userManager == nullptr) { // server does not support users
LOG_TOPIC(TRACE, Logger::AUTHENTICATION) << "Basic auth not supported";
LOG_TOPIC(WARN, Logger::AUTHENTICATION) << "Basic auth not supported";
return auth::TokenCache::Entry();
}
@ -185,7 +185,7 @@ auth::TokenCache::Entry auth::TokenCache::checkAuthenticationJWT(
std::vector<std::string> const parts = StringUtils::split(jwt, '.');
if (parts.size() != 3) {
LOG_TOPIC(TRACE, arangodb::Logger::FIXME)
LOG_TOPIC(TRACE, arangodb::Logger::AUTHENTICATION)
<< "Secret contains " << parts.size() << " parts";
return auth::TokenCache::Entry();
}

View File

@ -52,10 +52,10 @@ class TokenCache {
friend class auth::TokenCache;
public:
Entry() : _authenticated(false), _expiry(0) {}
explicit Entry() : _authenticated(false), _expiry(0) {}
explicit Entry(std::string const& username, bool a, double t)
: _username(username), _authenticated(a), _expiry(t) {}
: _username(username), _authenticated(a), _expiry(t) {}
std::string const& username() const { return _username; }
bool authenticated() const { return _authenticated; }
@ -81,7 +81,7 @@ class TokenCache {
/// set new jwt secret, regenerate _jwtToken
void setJwtSecret(std::string const&);
std::string jwtSecret() const;
/// Get the jwt token, which should be used for communicatin
/// Get the jwt token, which should be used for communication
std::string const& jwtToken() const noexcept {
TRI_ASSERT(!_jwtToken.empty());
return _jwtToken;

View File

@ -140,6 +140,7 @@ SET(ARANGOD_SOURCES
Actions/ActionFeature.cpp
Actions/RestActionHandler.cpp
Actions/actions.cpp
Agency/ActiveFailoverJob.cpp
Agency/AddFollower.cpp
Agency/AgencyComm.cpp
Agency/AgencyFeature.cpp

View File

@ -554,6 +554,8 @@ void HeartbeatThread::runSingleServer() {
LOG_TOPIC(TRACE, Logger::HEARTBEAT) << "Current leader: " << _myId;
if (applier->isActive()) {
applier->stopAndJoin();
// preemptively remove the transient entry from the agency
_agency.setTransient(transientPath, VPackSlice::emptyObjectSlice(), 0);
}
// ensure everyone has server access
@ -590,8 +592,22 @@ void HeartbeatThread::runSingleServer() {
// wait for everything to calm down for good measure
std::this_thread::sleep_for(std::chrono::seconds(10));
}
TRI_voc_tick_t lastTick = 0; // we always want to set lastTick
auto sendTransient = [&]() {
VPackBuilder builder;
builder.openObject();
builder.add("leader", leader);
builder.add("lastTick", VPackValue(lastTick));
builder.close();
double ttl = std::chrono::duration_cast<std::chrono::seconds>(_interval).count() * 5.0;
_agency.setTransient(transientPath, builder.slice(), ttl);
};
TRI_DEFER(sendTransient());
if (applier->endpoint() != endpoint) { // configure applier for new endpoint
if (applier->isActive() && applier->endpoint() == endpoint) {
lastTick = applier->lastTick();
} else if (applier->endpoint() != endpoint) { // configure applier for new endpoint
if (applier->isActive()) {
applier->stopAndJoin();
}
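
The transient entry maintained by sendTransient() is exactly what the supervision's findBestFollower() later reads; per server it looks roughly like this (a sketch; the TTL of five heartbeat intervals lets entries from dead servers expire on their own):

// /AsyncReplication/<my-server-uuid> (transient store):
{ "leader": "<uuid of the server this one replicates from>",
  "lastTick": 1105 }
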

View File

@ -358,7 +358,7 @@ rest::ResponseCode GeneralCommTask::canAccessPath(
!StringUtils::isPrefix(path, ApiUser)) {
events::NotAuthorized(request);
result = rest::ResponseCode::UNAUTHORIZED;
LOG_TOPIC(TRACE, Logger::AUTHENTICATION) << "Access forbidden to " << path;
LOG_TOPIC(TRACE, Logger::AUTHORIZATION) << "Access forbidden to " << path;
}
// mop: inside the authenticateRequest() request->user will be populated
@ -390,7 +390,7 @@ rest::ResponseCode GeneralCommTask::canAccessPath(
// simon: upgrade rights for Foxx apps. FIXME
result = rest::ResponseCode::OK;
vc->forceSuperuser();
LOG_TOPIC(TRACE, Logger::AUTHENTICATION) << "Upgrading rights for " << path;
LOG_TOPIC(TRACE, Logger::AUTHORIZATION) << "Upgrading rights for " << path;
}
}
}

View File

@ -89,9 +89,12 @@ RestStatus RestAuthHandler::execute() {
_username = usernameSlice.copyString();
std::string const password = passwordSlice.copyString();
AuthenticationFeature* af = AuthenticationFeature::instance();
TRI_ASSERT(af != nullptr);
if (af->userManager()->checkPassword(_username, password)) {
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
if (um == nullptr) {
std::string msg = "This server does not support users";
LOG_TOPIC(ERR, Logger::AUTHENTICATION) << msg;
generateError(rest::ResponseCode::UNAUTHORIZED, TRI_ERROR_HTTP_UNAUTHORIZED, msg);
} else if (um->checkPassword(_username, password)) {
VPackBuilder resultBuilder;
{
VPackObjectBuilder b(&resultBuilder);
@ -101,13 +104,12 @@ RestStatus RestAuthHandler::execute() {
_isValid = true;
generateDocument(resultBuilder.slice(), true, &VPackOptions::Defaults);
return RestStatus::DONE;
} else {
// mop: rfc 2616 10.4.2 (if credentials wrong 401)
generateError(rest::ResponseCode::UNAUTHORIZED,
TRI_ERROR_HTTP_UNAUTHORIZED, "Wrong credentials");
return RestStatus::DONE;
}
return RestStatus::DONE;
}
RestStatus RestAuthHandler::badRequest() {

View File

@ -953,13 +953,14 @@ Result RestReplicationHandler::processRestoreCollection(
ExecContext const* exe = ExecContext::CURRENT;
if (name[0] != '_' && exe != nullptr && !exe->isSuperuser() &&
ServerState::instance()->isSingleServer()) {
AuthenticationFeature* af = AuthenticationFeature::instance();
af->userManager()->updateUser(exe->user(), [&](auth::User& entry) {
entry.grantCollection(_vocbase.name(), col->name(), auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
TRI_ASSERT(um != nullptr); // should not get here
if (um != nullptr) {
um->updateUser(exe->user(), [&](auth::User& entry) {
entry.grantCollection(_vocbase.name(), col->name(), auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
}
}
return Result();
@ -1124,12 +1125,15 @@ Result RestReplicationHandler::processRestoreCollectionCoordinator(
ExecContext const* exe = ExecContext::CURRENT;
if (name[0] != '_' && exe != nullptr && !exe->isSuperuser()) {
AuthenticationFeature* af = AuthenticationFeature::instance();
af->userManager()->updateUser(ExecContext::CURRENT->user(),
[&](auth::User& entry) {
entry.grantCollection(dbName, col->name(), auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
TRI_ASSERT(um != nullptr); // should not get here
if (um != nullptr) {
um->updateUser(ExecContext::CURRENT->user(),
[&](auth::User& entry) {
entry.grantCollection(dbName, col->name(), auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
}
}
} catch (basics::Exception const& ex) {
// Error, report it.
@ -1324,8 +1328,12 @@ Result RestReplicationHandler::processRestoreUsersBatch(
auto queryResult = query.execute(queryRegistry);
// neither agency nor dbserver should get here
AuthenticationFeature* af = AuthenticationFeature::instance();
af->userManager()->outdate();
TRI_ASSERT(af->userManager() != nullptr);
if (af->userManager() != nullptr) {
af->userManager()->outdate();
}
af->tokenCache()->invalidateBasicCache();
return Result{queryResult.code};

View File

@ -46,8 +46,10 @@ RestUsersHandler::RestUsersHandler(GeneralRequest* request,
RestStatus RestUsersHandler::execute() {
RequestType const type = _request->requestType();
AuthenticationFeature* af = AuthenticationFeature::instance();
if (af == nullptr) { // nullptr happens only during shutdown
return RestStatus::FAIL;
if (af == nullptr || af->userManager() == nullptr) {
// nullptr may happen during shutdown, or on an agency node
generateError(ResponseCode::BAD, TRI_ERROR_NOT_IMPLEMENTED);
return RestStatus::DONE;
}
switch (type) {

View File

@ -50,58 +50,54 @@ VocbaseContext::~VocbaseContext() {
_vocbase.release();
}
/*static*/ VocbaseContext* VocbaseContext::create(
GeneralRequest* req, TRI_vocbase_t& vocbase
) {
VocbaseContext* VocbaseContext::create(GeneralRequest* req, TRI_vocbase_t& vocbase) {
// _vocbase has already been refcounted for us
TRI_ASSERT(!vocbase.isDangling());
AuthenticationFeature* auth = AuthenticationFeature::instance();
TRI_ASSERT(auth != nullptr);
if (auth == nullptr) {
return nullptr;
}
if (!auth->isActive()) {
} else if (!auth->isActive()) {
return new VocbaseContext(req, vocbase, /*isInternal*/ false,
/*sysLevel*/ auth::Level::RW,
/*dbLevel*/ auth::Level::RW);
}
if (req->authenticated()) {
// superusers will have an empty username. This MUST be invalid
// for users authenticating with name / password
if (req->user().empty()) {
if (req->authenticationMethod() != AuthenticationMethod::JWT) {
std::string msg = "only jwt can be used to authenticate as superuser";
LOG_TOPIC(WARN, Logger::AUTHENTICATION) << msg;
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER, msg);
}
return new VocbaseContext(req, vocbase, /*isInternal*/ true,
/*sysLevel*/ auth::Level::RW,
/*dbLevel*/ auth::Level::RW);
}
auth::UserManager* um = auth->userManager();
if (um == nullptr) {
LOG_TOPIC(ERR, Logger::AUTHENTICATION) << "Server does not support users";
return nullptr;
}
auth::Level dbLvl = um->databaseAuthLevel(req->user(), req->databaseName());
auth::Level sysLvl = dbLvl;
if (req->databaseName() != TRI_VOC_SYSTEM_DATABASE) {
sysLvl = um->databaseAuthLevel(req->user(), TRI_VOC_SYSTEM_DATABASE);
}
if (!req->authenticated()) {
return new VocbaseContext(req, vocbase, /*isInternal*/ false,
/*sysLevel*/ sysLvl,
/*dbLevel*/ dbLvl);
/*sysLevel*/ auth::Level::NONE,
/*dbLevel*/ auth::Level::NONE);
}
// superusers will have an empty username. This MUST be invalid
// for users authenticating with name / password
if (req->user().empty()) {
if (req->authenticationMethod() != AuthenticationMethod::JWT) {
std::string msg = "only jwt can be used to authenticate as superuser";
LOG_TOPIC(WARN, Logger::AUTHENTICATION) << msg;
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER, msg);
}
return new VocbaseContext(req, vocbase, /*isInternal*/ true,
/*sysLevel*/ auth::Level::RW,
/*dbLevel*/ auth::Level::RW);
}
auth::UserManager* um = auth->userManager();
if (um == nullptr) {
LOG_TOPIC(WARN, Logger::AUTHENTICATION) << "Server does not support users";
return nullptr;
}
auth::Level dbLvl = um->databaseAuthLevel(req->user(), req->databaseName());
auth::Level sysLvl = dbLvl;
if (req->databaseName() != TRI_VOC_SYSTEM_DATABASE) {
sysLvl = um->databaseAuthLevel(req->user(), TRI_VOC_SYSTEM_DATABASE);
}
return new VocbaseContext(req, vocbase, /*isInternal*/ false,
/*sysLevel*/ auth::Level::NONE,
/*dbLevel*/ auth::Level::NONE);
/*sysLevel*/ sysLvl,
/*dbLevel*/ dbLvl);
}
void VocbaseContext::forceSuperuser() {

View File

@ -198,13 +198,14 @@ Result Collections::create(TRI_vocbase_t* vocbase, std::string const& name,
// do not grant rights on system collections
// in case of success we grant the creating user RW access
if (name[0] != '_' && exe != nullptr && !exe->isSuperuser()) {
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
if (name[0] != '_' && um != nullptr && exe != nullptr && !exe->isSuperuser()) {
// this should not fail, we can not get here without database RW access
af->userManager()->updateUser(
ExecContext::CURRENT->user(), [&](auth::User& entry) {
entry.grantCollection(vocbase->name(), name, auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
um->updateUser(
ExecContext::CURRENT->user(), [&](auth::User& entry) {
entry.grantCollection(vocbase->name(), name, auth::Level::RW);
return TRI_ERROR_NO_ERROR;
});
}
// reload otherwise collection might not be in yet
@ -215,10 +216,10 @@ Result Collections::create(TRI_vocbase_t* vocbase, std::string const& name,
// do not grant rights on system collections
// in case of success we grant the creating user RW access
if (name[0] != '_' && exe != nullptr && !exe->isSuperuser() &&
ServerState::instance()->isSingleServerOrCoordinator()) {
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
if (name[0] != '_' && um != nullptr && exe != nullptr && !exe->isSuperuser()) {
// this should not fail, we can not get here without database RW access
af->userManager()->updateUser(
um->updateUser(
ExecContext::CURRENT->user(), [&](auth::User& u) {
u.grantCollection(vocbase->name(), name, auth::Level::RW);
return TRI_ERROR_NO_ERROR;
@ -480,8 +481,7 @@ Result Collections::drop(TRI_vocbase_t* vocbase, LogicalCollection* coll,
if (!exec->canUseDatabase(vocbase->name(), auth::Level::RW) ||
!exec->canUseCollection(coll->name(), auth::Level::RW)) {
return Result(TRI_ERROR_FORBIDDEN,
"Insufficient rights to drop "
"collection " +
"Insufficient rights to drop collection " +
coll->name());
} else if (!exec->isSuperuser() && !ServerState::writeOpsEnabled()) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_ARANGO_READ_ONLY,
@ -510,9 +510,9 @@ Result Collections::drop(TRI_vocbase_t* vocbase, LogicalCollection* coll,
}
}
if (res.ok() && ServerState::instance()->isSingleServerOrCoordinator()) {
AuthenticationFeature* af = AuthenticationFeature::instance();
af->userManager()->enumerateUsers([&](auth::User& entry) -> bool {
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
if (res.ok() && um != nullptr) {
um->enumerateUsers([&](auth::User& entry) -> bool {
return entry.removeCollection(dbname, collName);
});
}

View File

@ -80,12 +80,13 @@ std::vector<std::string> Databases::list(std::string const& user) {
if (ServerState::instance()->isCoordinator()) {
AuthenticationFeature* af = AuthenticationFeature::instance();
auth::UserManager* um = af->userManager();
std::vector<std::string> names;
std::vector<std::string> dbs =
databaseFeature->getDatabaseNamesCoordinator();
for (std::string const& db : dbs) {
if (!af->isActive() ||
af->userManager()->databaseAuthLevel(user, db) > auth::Level::NONE) {
if (!af->isActive() || (um != nullptr &&
um->databaseAuthLevel(user, db) > auth::Level::NONE)) {
names.push_back(db);
}
}
@ -141,7 +142,7 @@ arangodb::Result Databases::info(TRI_vocbase_t* vocbase, VPackBuilder& result) {
arangodb::Result Databases::create(std::string const& dbName,
VPackSlice const& inUsers,
VPackSlice const& inOptions) {
AuthenticationFeature* af = AuthenticationFeature::instance();
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
ExecContext const* exec = ExecContext::CURRENT;
if (exec != nullptr) {
if (!exec->isAdminUser()) {
@ -262,9 +263,9 @@ arangodb::Result Databases::create(std::string const& dbName,
TRI_ASSERT(vocbase->name() == dbName);
// we need to add the permissions before running the upgrade script
if (ExecContext::CURRENT != nullptr) {
if (ExecContext::CURRENT != nullptr && um != nullptr) {
// ignore errors here Result r =
af->userManager()->updateUser(
um->updateUser(
ExecContext::CURRENT->user(), [&](auth::User& entry) {
entry.grantDatabase(dbName, auth::Level::RW);
entry.grantCollection(dbName, "*", auth::Level::RW);
@ -295,10 +296,9 @@ arangodb::Result Databases::create(std::string const& dbName,
TRI_DEFER(vocbase->release());
// we need to add the permissions before running the upgrade script
if (ServerState::instance()->isSingleServer() &&
ExecContext::CURRENT != nullptr) {
if (ExecContext::CURRENT != nullptr && um != nullptr) {
// ignore errors here Result r =
af->userManager()->updateUser(
um->updateUser(
ExecContext::CURRENT->user(), [&](auth::User& entry) {
entry.grantDatabase(dbName, auth::Level::RW);
entry.grantCollection(dbName, "*", auth::Level::RW);
@ -410,10 +410,9 @@ arangodb::Result Databases::drop(TRI_vocbase_t* systemVocbase,
}
Result res;
AuthenticationFeature* af = AuthenticationFeature::instance();
if (ServerState::instance()->isCoordinator() ||
!ServerState::instance()->isRunningInCluster()) {
res = af->userManager()->enumerateUsers([&](auth::User& entry) -> bool {
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
if (um != nullptr) {
res = um->enumerateUsers([&](auth::User& entry) -> bool {
return entry.removeDatabase(dbName);
});
}

View File

@ -122,7 +122,10 @@ bool UpgradeTasks::addDefaultUserOther(TRI_vocbase_t* vocbase,
return false;
}
auth::UserManager* um = AuthenticationFeature::instance()->userManager();
TRI_ASSERT(um != nullptr);
if (um == nullptr) {
return true; // server does not support users
}
for (VPackSlice slice : VPackArrayIterator(users)) {
std::string user = VelocyPackHelper::getStringValue(slice, "username",
StaticStrings::Empty);

View File

@ -1053,11 +1053,12 @@ function startInstanceCluster (instanceInfo, protocol, options,
// we need to find the leading server
if (options.activefailover) {
internal.wait(5.0);
let opts = makeAuthorizationHeaders(authOpts);
opts['headers']['content-type'] = 'application/json';
opts['method'] = 'POST';
let reply = download(agencyUrl + '/_api/agency/read',
'[["/arango/Plan/AsyncReplication/Leader"]]', opts);
let opts = {
method: 'POST',
jwt: crypto.jwtEncode(authOpts['server.jwt-secret'], {'server_id': 'none', 'iss': 'arangodb'}, 'HS256'),
headers: {'content-type': 'application/json' }
};
let reply = download(agencyUrl + '/_api/agency/read', '[["/arango/Plan/AsyncReplication/Leader"]]', opts);
if (!reply.error && reply.code === 200) {
let res = JSON.parse(reply.body);
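// For reference (sketch), reply.body for the read above looks like:
//   [ { "arango": { "Plan": { "AsyncReplication": { "Leader": "<server-uuid>" } } } } ]
// so the leader UUID is res[0].arango.Plan.AsyncReplication.Leader.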

View File

@ -28,7 +28,8 @@
const functionsDocumentation = {
'resilience': 'resilience tests',
'client_resilience': 'client resilience tests',
'cluster_sync': 'cluster sync tests'
'cluster_sync': 'cluster sync tests',
'active_failover': 'active failover tests'
};
const optionsDocumentation = [
];
@ -85,10 +86,36 @@ function clusterSync (options) {
return tu.performTests(options, testCases, 'cluster_sync', tu.runThere);
}
// //////////////////////////////////////////////////////////////////////////////
// / @brief TEST: active failover
// //////////////////////////////////////////////////////////////////////////////
function activeFailover (options) {
if (options.cluster) {
return {
'active_failover': {
'status': true,
'message': 'skipped because of cluster',
'skipped': true
}
};
}
let testCases = tu.scanTestPath('js/client/tests/active-failover');
options.activefailover = true;
options.singles = 4;
return tu.performTests(options, testCases, 'client_resilience', tu.runInArangosh, {
'server.authentication': 'true',
'server.jwt-secret': 'haxxmann'
});
}
function setup (testFns, defaultFns, opts, fnDocs, optionsDoc) {
testFns['resilience'] = resilience;
testFns['client_resilience'] = clientResilience;
testFns['cluster_sync'] = clusterSync;
testFns['active_failover'] = activeFailover;
for (var attrname in functionsDocumentation) { fnDocs[attrname] = functionsDocumentation[attrname]; }
for (var i = 0; i < optionsDocumentation.length; i++) { optionsDoc.push(optionsDocumentation[i]); }
}

View File

@ -0,0 +1,486 @@
/*jshint strict: false, sub: true */
/*global print, assertTrue, assertEqual */
'use strict';
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2014 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Andreas Streichardt
////////////////////////////////////////////////////////////////////////////////
const jsunity = require('jsunity');
const internal = require('internal');
const fs = require('fs');
const arangosh = require('@arangodb/arangosh');
const crypto = require('@arangodb/crypto');
const request = require("@arangodb/request");
const tasks = require("@arangodb/tasks");
const arango = internal.arango;
const compareTicks = require("@arangodb/replication").compareTicks;
const wait = internal.wait;
const db = internal.db;
const suspendExternal = internal.suspendExternal;
const continueExternal = internal.continueExternal;
const jwtSecret = 'haxxmann';
const jwtSuperuser = crypto.jwtEncode(jwtSecret, {
"server_id": "test",
"iss": "arangodb",
"exp": Math.floor(Date.now() / 1000) + 3600
}, 'HS256');
const jwtRoot = crypto.jwtEncode(jwtSecret, {
"preferred_username": "root",
"iss": "arangodb",
"exp": Math.floor(Date.now() / 1000) + 3600
}, 'HS256');
if (!internal.env.hasOwnProperty('INSTANCEINFO')) {
throw new Error('env.INSTANCEINFO was not set by caller!');
}
const instanceinfo = JSON.parse(internal.env.INSTANCEINFO);
const cname = "UnitTestActiveFailover";
/*try {
let globals = JSON.parse(process.env.ARANGOSH_GLOBALS);
Object.keys(globals).forEach(g => {
global[g] = globals[g];
});
} catch (e) {
}*/
function getUrl(endpoint) {
return endpoint.replace(/^tcp:/, 'http:').replace(/^ssl:/, 'https:');
}
function baseUrl() {
return getUrl(arango.getEndpoint());
};
function connectToServer(leader) {
arango.reconnect(leader, "_system", "root", "");
db._flushCache();
};
// getEndpoints works with any server
function getClusterEndpoints() {
//let jwt = crypto.jwtEncode(options['server.jwt-secret'], {'server_id': 'none', 'iss': 'arangodb'}, 'HS256');
var res = request.get({
url: baseUrl() + "/_api/cluster/endpoints",
auth: {
bearer: jwtRoot,
}
});
assertTrue(res instanceof request.Response);
assertTrue(res.hasOwnProperty('statusCode'), JSON.stringify(res));
assertTrue(res.statusCode === 200, JSON.stringify(res));
assertTrue(res.hasOwnProperty('json'));
assertTrue(res.json.hasOwnProperty('endpoints'));
assertTrue(res.json.endpoints instanceof Array);
assertTrue(res.json.endpoints.length > 0);
return res.json.endpoints.map(e => e.endpoint);
}
function getLoggerState(endpoint) {
var res = request.get({
url: getUrl(endpoint) + "/_db/_system/_api/replication/logger-state",
auth: {
bearer: jwtRoot,
}
});
assertTrue(res instanceof request.Response);
assertTrue(res.hasOwnProperty('statusCode') && res.statusCode === 200);
assertTrue(res.hasOwnProperty('json'));
return arangosh.checkRequestResult(res.json);
}
function getApplierState(endpoint) {
var res = request.get({
url: getUrl(endpoint) + "/_db/_system/_api/replication/applier-state?global=true",
auth: {
bearer: jwtRoot,
}
});
assertTrue(res instanceof request.Response);
assertTrue(res.hasOwnProperty('statusCode') && res.statusCode === 200);
assertTrue(res.hasOwnProperty('json'));
return arangosh.checkRequestResult(res.json);
}
// check the servers are in sync with the leader
function checkInSync(leader, servers, ignore) {
print("Checking in-sync state with lead: ", leader);
let check = (endpoint) => {
if (endpoint === leader || endpoint === ignore) {
return true;
}
let applier = getApplierState(endpoint);
return applier.state.running && applier.endpoint === leader &&
(compareTicks(applier.state.lastAppliedContinuousTick, leaderTick) >= 0 ||
compareTicks(applier.state.lastProcessedContinuousTick, leaderTick) >= 0);
};
const leaderTick = getLoggerState(leader).state.lastLogTick;
let loop = 100;
while (loop-- > 0) {
if (servers.every(check)) {
print("All followers are in sync");
return true;
}
wait(1.0);
}
print("Timeout waiting for followers");
return false;
}
function checkData(server) {
print("Checking data of ", server);
let res = request.get({
url: getUrl(server) + "/_api/collection/" + cname + "/count",
auth: {
bearer: jwtRoot,
}
});
assertTrue(res instanceof request.Response);
//assertTrue(res.hasOwnProperty('statusCode'));
assertTrue(res.statusCode === 200);
return res.json.count;
}
function readAgencyValue(path) {
let agents = instanceinfo.arangods.filter(arangod => arangod.role === "agent");
assertTrue(agents.length > 0, "No agents present");
print("Querying agency... (", path, ")");
var res = request.post({
url: agents[0].url + "/_api/agency/read",
auth: {
bearer: jwtSuperuser,
},
body: JSON.stringify([[path]])
});
assertTrue(res instanceof request.Response);
assertTrue(res.hasOwnProperty('statusCode'), JSON.stringify(res));
assertEqual(res.statusCode, 200, JSON.stringify(res));
assertTrue(res.hasOwnProperty('json'));
//print("Agency response ", res.json);
return arangosh.checkRequestResult(res.json);
}
// resolve leader from agency
function leaderInAgency() {
let i = 10;
do {
let res = readAgencyValue("/arango/Plan/AsyncReplication/Leader");
let uuid = res[0].arango.Plan.AsyncReplication.Leader;
if (uuid && uuid.length > 0) {
res = readAgencyValue("/arango/Supervision/Health");
return res[0].arango.Supervision.Health[uuid].Endpoint;
}
internal.wait(1.0);
} while (i-- > 0);
throw "Unable to resole leader from agency";
}
function checkForFailover(leader) {
print("Waiting for failover of ", leader);
let oldLeaderUUID = "";
let i = 5; // 5 * 5s == 25s
do {
let res = readAgencyValue("/arango/Supervision/Health");
let srvHealth = res[0].arango.Supervision.Health;
Object.keys(srvHealth).forEach(key => {
let srv = srvHealth[key];
if (srv['Endpoint'] === leader && srv.Status === 'FAILED') {
print("Server ", key, "( ", leader, " ) is marked FAILED");
oldLeaderUUID = key;
}
});
if (oldLeaderUUID !== "") {
break;
}
internal.wait(5.0);
} while (i-- > 0);
// now wait for new leader to appear
let nextLeaderUUID = "";
do {
let res = readAgencyValue("/arango/Plan/AsyncReplication/Leader");
nextLeaderUUID = res[0].arango.Plan.AsyncReplication.Leader;
if (nextLeaderUUID !== oldLeaderUUID) {
res = readAgencyValue("/arango/Supervision/Health");
return res[0].arango.Supervision.Health[nextLeaderUUID].Endpoint;
}
internal.wait(5.0);
} while (i-- > 0);
print("Timing out, current leader value: ", nextLeaderUUID);
throw "No failover occured";
}
// Testsuite that quickly checks some of the basic premises of
// the active failover functionality. It is designed as a quicker
// variant of the node resilience tests (for active failover).
// Things like Foxx resilience are not tested
function ActiveFailoverSuite() {
let servers = getClusterEndpoints();
assertTrue(servers.length >= 4, "This test expects four single instances");
let firstLeader = servers[0];
let suspended = [];
let currentLead = leaderInAgency();
return {
setUp: function () {
let col = db._create(cname);
assertTrue(checkInSync(currentLead, servers));
for (let i = 0; i < 10000; i++) {
col.save({ attr: i});
}
},
tearDown: function () {
//db._collection(cname).drop();
//serverTeardown();
suspended.forEach(arangod => {
print("Resuming: ", arangod.endpoint);
assertTrue(continueExternal(arangod.pid));
});
currentLead = leaderInAgency();
print("connecting shell to leader ", currentLead)
connectToServer(currentLead);
if (db._collection(cname)) {
db._drop(cname);
}
assertTrue(checkInSync(currentLead, servers));
let endpoints = getClusterEndpoints();
assertTrue(endpoints.length === servers.length);
assertTrue(endpoints[0] === currentLead);
},
// Basic test if followers get in sync
testFollowerInSync: function () {
assertEqual(servers[0], currentLead);
let col = db._collection(cname);
assertEqual(col.count(), 10000);
assertTrue(checkInSync(currentLead, servers));
assertEqual(checkData(currentLead), 10000);
},
// Simple failover case: Leader is suspended, slave needs to
// take over within a reasonable amount of time
testFailover: function () {
assertTrue(checkInSync(currentLead, servers));
assertEqual(checkData(currentLead), 10000);
suspended = instanceinfo.arangods.filter(arangod => arangod.endpoint === currentLead);
suspended.forEach(arangod => {
print("Suspending Leader: ", arangod.endpoint);
assertTrue(suspendExternal(arangod.pid));
});
let oldLead = currentLead;
// await failover and check that the followers get in sync
currentLead = checkForFailover(currentLead);
assertTrue(currentLead !== oldLead);
print("Failover to new leader : ", currentLead);
internal.wait(2.5); // settle down, heartbeat interval is 1s
assertEqual(checkData(currentLead), 10000);
print("New leader has correct data");
// check the remaining followers get in sync
assertTrue(checkInSync(currentLead, servers, oldLead));
// restart the old leader
suspended.forEach(arangod => {
print("Resuming: ", arangod.endpoint);
assertTrue(continueExternal(arangod.pid));
});
suspended = [];
assertTrue(checkInSync(currentLead, servers));
},
// More complex case: We want to get the most up-to-date follower
// Insert a number of documents, suspend n-1 followers for a few seconds.
// We then suspend the leader and expect a specific follower to take over
testFollowerSelection: function () {
assertTrue(checkInSync(currentLead, servers));
assertEqual(checkData(currentLead), 10000);
// we assume the second leader is still the leader
let endpoints = getClusterEndpoints();
assertTrue(endpoints.length === servers.length);
assertTrue(endpoints[0] === currentLead);
print("Starting data creation task on ", currentLead, " (expect it to fail later)");
connectToServer(currentLead);
/// this task should stop once the server becomes a slave
var task = tasks.register({
name: "UnitTestsFailover",
command: `
const db = require('@arangodb').db;
let col = db._collection("UnitTestActiveFailover");
let cc = col.count();
for (let i = 0; i < 1000000; i++) {
col.save({attr: i + cc});
}`
});
internal.wait(2.5);
// pick a random follower
let nextLead = endpoints[2]; // could be any one of them
// suspend remaining followers
print("Suspending followers, except one");
suspended = instanceinfo.arangods.filter(arangod => arangod.role !== 'agent' &&
arangod.endpoint !== currentLead &&
arangod.endpoint !== nextLead);
suspended.forEach(arangod => {
print("Suspending: ", arangod.endpoint);
assertTrue(suspendExternal(arangod.pid));
});
// check our leader stays intact, while remaining followers fail
let i = 20;
//let expected = servers.length - suspended.length; // should be 2
do {
endpoints = getClusterEndpoints();
assertEqual(endpoints[0], currentLead, "Unwanted leadership failover");
internal.wait(1.0); // Health status may take some time to change
} while (endpoints.length !== 2 && i-- > 0);
assertTrue(i > 0, "timed out waiting for followers to fail");
assertEqual(endpoints.length, 2);
assertEqual(endpoints[1], nextLead); // this server must become new leader
// resume followers
print("Resuming followers");
suspended.forEach(arangod => {
print("Resuming: ", arangod.endpoint);
assertTrue(continueExternal(arangod.pid));
});
suspended = [];
let upper = checkData(currentLead);
print("Leader inserted ", upper, " documents so far");
print("Suspending leader ", currentLead);
instanceinfo.arangods.forEach(arangod => {
if (arangod.endpoint === currentLead) {
print("Suspending: ", arangod.endpoint);
suspended.push(arangod);
assertTrue(suspendExternal(arangod.pid));
}
});
// await failover and check that the followers get in sync
let oldLead = currentLead;
currentLead = checkForFailover(currentLead);
assertTrue(currentLead === nextLead, "Did not fail over to the best in-sync follower");
internal.wait(2.5); // settle down, heartbeat interval is 1s
let cc = checkData(currentLead);
// we expect to find documents within an acceptable range
assertTrue(10000 <= cc && cc <= upper + 500, "Leader has too few or too many documents");
print("Number of documents is in acceptable range");
assertTrue(checkInSync(currentLead, servers, oldLead));
print("Remaining followers are in sync");
// Resuming stopped second leader
print("Resuming server that still thinks it is leader (ArangoError 1004 is expected)");
suspended.forEach(arangod => {
print("Resuming: ", arangod.endpoint);
assertTrue(continueExternal(arangod.pid));
});
suspended = [];
assertTrue(checkInSync(currentLead, servers));
},
// try to failback to the original leader
testFailback: function() {
if (currentLead === firstLeader) {
return; // nevermind then
}
assertTrue(checkInSync(currentLead, servers));
assertEqual(checkData(currentLead), 10000);
print("Suspending followers, except original leader");
suspended = instanceinfo.arangods.filter(arangod => arangod.role !== 'agent' &&
arangod.endpoint !== firstLeader);
suspended.forEach(arangod => {
print("Suspending: ", arangod.endpoint);
assertTrue(suspendExternal(arangod.pid));
});
// await failover and check that the followers get in sync
let oldLead = currentLead;
currentLead = checkForFailover(currentLead);
assertTrue(currentLead === firstLeader, "Did not fail back to the original leader");
suspended.forEach(arangod => {
print("Resuming: ", arangod.endpoint);
assertTrue(continueExternal(arangod.pid));
});
suspended = [];
assertTrue(checkInSync(currentLead, servers));
assertEqual(checkData(currentLead), 10000);
}
// Try to cleanup everything that was created
/*testCleanup: function () {
let res = readAgencyValue("/arango/Plan/AsyncReplication/Leader");
assertTrue(res !== null);
let uuid = res[0].arango.Plan.AsyncReplication.Leader;
res = readAgencyValue("/arango/Supervision/Health");
let lead = res[0].arango.Supervision.Health[uuid].Endpoint;
connectToServer(lead);
db._drop(cname);
assertTrue(checkInSync(lead, servers));
}*/
};
}
////////////////////////////////////////////////////////////////////////////////
/// @brief executes the test suite
////////////////////////////////////////////////////////////////////////////////
jsunity.run(ActiveFailoverSuite);
return jsunity.done();

View File

@ -295,7 +295,7 @@ function RequestSuite () {
expect(obj.path).to.equal(path);
expect(obj).to.have.property('headers');
expect(obj.headers).to.have.property('authorization');
expect(obj.headers.authorization).to.equal('Bearer ' + auth.bearer);
expect(obj.headers.authorization).to.equal('bearer ' + auth.bearer);
},
////////////////////////////////////////////////////////////////////////////////

View File

@ -166,17 +166,6 @@ function request (req) {
});
}
if (req.auth) {
headers.authorization = (
req.auth.bearer ?
'Bearer ' + req.auth.bearer :
'Basic ' + new Buffer(
req.auth.username + ':' +
req.auth.password
).toString('base64')
);
}
let options = {
method: (req.method || 'get').toUpperCase(),
headers: headers,
@ -197,6 +186,16 @@ function request (req) {
if (req.sslProtocol) {
options.sslProtocol = req.sslProtocol;
}
if (is.existy(req.auth)) {
if (is.existy(req.auth.jwt)) {
options.jwt = req.auth.jwt;
} else if (is.existy(req.auth.bearer)) {
options.jwt = req.auth.bearer;
} else if (is.existy(req.auth.username)) {
options.username = req.auth.username;
options.password = req.auth.password || "";
}
}
let result = internal.download(path, body, options);
return new IncomingResponse(result, req.encoding, req.json);
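
With this change the auth option is translated into the download options instead of a hand-built Authorization header; callers keep the same interface (a sketch, token value hypothetical):

const request = require('@arangodb/request');
const token = '...';  // some JWT
// token auth, mapped to options.jwt:
request.get({ url: 'http://127.0.0.1:8529/_api/version', auth: { bearer: token } });
// equivalently: auth: { jwt: token }
// basic auth, mapped to options.username / options.password:
request.get({ url: 'http://127.0.0.1:8529/_api/version',
              auth: { username: 'root', password: '' } });
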

View File

@ -124,17 +124,6 @@ function clusterRequest(req) {
});
}
if (req.auth) {
headers['authorization'] = ( // eslint-disable-line dot-notation
req.auth.bearer ?
'Bearer ' + req.auth.bearer :
'Basic ' + new Buffer(
req.auth.username + ':' +
req.auth.password
).toString('base64')
);
}
let options = {
method: (req.method || 'get').toUpperCase(),
headers: headers,
@ -152,6 +141,16 @@ function clusterRequest(req) {
} else {
options.maxRedirects = 10;
}
if (is.existy(req.auth)) {
if (is.existy(req.auth.jwt)) {
options.jwt = req.auth.jwt;
} else if (is.existy(req.auth.bearer)) {
options.jwt = req.auth.bearer;
} else if (is.existy(req.auth.username)) {
options.username = req.auth.username;
options.password = req.auth.password || "";
}
}
let result = internal.clusterDownload(path, body, options);
return new Response(result, req.encoding, req.json);
}

View File

@ -31,6 +31,7 @@ var errors = arangodb.errors;
var db = arangodb.db;
var replication = require("@arangodb/replication");
let compareTicks = replication.compareTicks;
var console = require("console");
var internal = require("internal");
var masterEndpoint = arango.getEndpoint();
@ -58,28 +59,6 @@ const collectionCount = function(name) {
return db._collection(name).count();
};
const compareTicks = function(l, r) {
var i;
if (l === null) {
l = "0";
}
if (r === null) {
r = "0";
}
if (l.length !== r.length) {
return l.length - r.length < 0 ? -1 : 1;
}
// length is equal
for (i = 0; i < l.length; ++i) {
if (l[i] !== r[i]) {
return l[i] < r[i] ? -1 : 1;
}
}
return 0;
};
const compare = function(masterFunc, masterFunc2, slaveFuncOngoing, slaveFuncFinal, applierConfiguration) {
var state = {};

View File

@ -665,6 +665,7 @@ void JS_Download(v8::FunctionCallbackInfo<v8::Value> const& args) {
bool returnBodyOnError = false;
int maxRedirects = 5;
uint64_t sslProtocol = TLS_V12;
std::string jwtToken, username, password;
if (args.Length() > 2) {
if (!args[2]->IsObject()) {
@ -757,6 +758,15 @@ void JS_Download(v8::FunctionCallbackInfo<v8::Value> const& args) {
returnBodyOnError = TRI_ObjectToBoolean(
options->Get(TRI_V8_ASCII_STRING(isolate, "returnBodyOnError")));
}
if (options->Has(TRI_V8_ASCII_STRING(isolate, "jwt"))) {
jwtToken = TRI_ObjectToString(options->Get(TRI_V8_ASCII_STRING(isolate, "jwt")));
} else if (options->Has(TRI_V8_ASCII_STRING(isolate, "username"))) {
username = TRI_ObjectToString(options->Get(TRI_V8_ASCII_STRING(isolate, "username")));
if (options->Has(TRI_V8_ASCII_STRING(isolate, "password"))) {
password = TRI_ObjectToString(options->Get(TRI_V8_ASCII_STRING(isolate, "password")));
}
}
}
// outfile
@ -870,6 +880,11 @@ void JS_Download(v8::FunctionCallbackInfo<v8::Value> const& args) {
params.setSupportDeflate(false);
// security by obscurity won't work. Github requires a useragent nowadays.
params.setExposeArangoDB(true);
if (!jwtToken.empty()) {
params.setJwt(jwtToken);
} else if (!username.empty()) {
params.setUserNamePassword("/", username, password);
}
SimpleHttpClient client(connection.get(), params);
v8::Handle<v8::Object> result = v8::Object::New(isolate);

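JS_Download backs internal.download from JavaScript; after this hunk it also honors jwt, username and password entries in the options object, with jwt taking precedence over basic credentials. A hedged sketch of the resulting call shape (URL and token are placeholders):

const internal = require('internal');

// A JWT token disables basic auth below (see the option parsing above).
internal.download('http://127.0.0.1:8529/_api/version', undefined,
  {method: 'GET', jwt: '<token>'});

// Without a token, username/password configure basic authentication.
internal.download('http://127.0.0.1:8529/_api/version', undefined,
  {method: 'GET', username: 'root', password: ''});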
View File

@ -0,0 +1,415 @@
////////////////////////////////////////////////////////////////////////////////
/// @brief test case for ActiveFailover job
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////
#include "catch.hpp"
#include "fakeit.hpp"
#include "Agency/ActiveFailoverJob.h"
#include "Agency/AgentInterface.h"
#include "Agency/Node.h"
#include "lib/Basics/StringUtils.h"
#include "lib/Random/RandomGenerator.h"
#include <iostream>
#include <velocypack/Collection.h>
#include <velocypack/Parser.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>
using namespace arangodb;
using namespace arangodb::basics;
using namespace arangodb::consensus;
using namespace fakeit;
namespace arangodb {
namespace tests {
namespace active_failover_test {
const std::string PREFIX = "arango";
const std::string LEADER = "SNGL-leader";
const std::string FOLLOWER1 = "SNGL-follower1"; // tick 10, STATE GOOD
const std::string FOLLOWER2 = "SNGL-follower2"; // tick 1, STATE GOOD
const std::string FOLLOWER3 = "SNGL-follower3"; // tick 9, STATE GOOD
const std::string FOLLOWER4 = "SNGL-follower4"; // tick 100, STATE BAD
const std::string FOLLOWER5 = "SNGL-follower5"; // tick 1000, STATE GOOD wrong leader
const char *agency =
#include "ActiveFailoverTest.json"
;
const char *transient =
#include "ActiveFailoverTestTransient.json"
;
Node createNodeFromBuilder(Builder const& builder) {
Builder opBuilder;
{ VPackObjectBuilder a(&opBuilder);
opBuilder.add("new", builder.slice()); }
Node node("");
node.handle<SET>(opBuilder.slice());
return node;
}
Builder createBuilder(char const* c) {
Options options;
options.checkAttributeUniqueness = true;
VPackParser parser(&options);
parser.parse(c);
Builder builder;
builder.add(parser.steal()->slice());
return builder;
}
typedef std::function<std::unique_ptr<Builder>(
Slice const&, std::string const&)> TestStructType;
inline static std::string typeName (Slice const& slice) {
return std::string(slice.typeName());
}
TEST_CASE("ActiveFailover", "[agency][supervision]") {
arangodb::RandomGenerator::initialize(arangodb::RandomGenerator::RandomType::MERSENNE);
Builder base = createBuilder(agency);
std::string jobId = "1";
write_ret_t fakeWriteResult {true, "", std::vector<bool> {true}, std::vector<index_t> {1}};
SECTION("creating a job should create a job in todo") {
Mock<AgentInterface> mockAgent;
write_ret_t fakeWriteResult {true, "", std::vector<bool> {true}, std::vector<index_t> {1}};
When(Method(mockAgent, write)).AlwaysDo([&](query_t const& q, bool d) -> write_ret_t {
INFO(q->slice().toJson());
auto expectedJobKey = "/arango/Target/ToDo/" + jobId;
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(q->slice()[0][0].length() == 2); // should do an entry in todo and failedservers
REQUIRE(typeName(q->slice()[0][0].get(expectedJobKey)) == "object");
auto job = q->slice()[0][0].get(expectedJobKey);
REQUIRE(typeName(job.get("creator")) == "string");
REQUIRE(typeName(job.get("type")) == "string");
CHECK(job.get("type").copyString() == "activeFailover");
REQUIRE(typeName(job.get("server")) == "string");
CHECK(job.get("server").copyString() == LEADER);
CHECK(typeName(job.get("jobId")) == "string");
CHECK(job.get("jobId").copyString() == jobId);
CHECK(typeName(job.get("timeCreated")) == "string");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(base);
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "tests", LEADER);
REQUIRE(job.create());
Verify(Method(mockAgent, write));
}
SECTION("the server state is already 'GOOD' again and 'Target/FailedServers' is still as in the snapshot, so creating the job violates the preconditions") {
const char* tt = R"=({"arango":{"Supervision":{"Health":{"SNGL-leader":{"Status":"GOOD"}}}}})=";
VPackBuilder overw = createBuilder(tt);
VPackBuilder mod = VPackCollection::merge(base.slice(), overw.slice(), true);
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).AlwaysDo([&](query_t const& q, bool d) -> write_ret_t {
INFO(q->slice().toJson());
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1")) == "object");
REQUIRE(writes.get("/arango/Target/ToDo/1").get("server").copyString() == LEADER);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "BAD");
REQUIRE(typeName(precond.get("/arango/Target/FailedServers").get("old")) == "object");
return write_ret_t{false, "", std::vector<bool> {false}, std::vector<index_t> {0}};
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(mod);
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "unittest", LEADER);
REQUIRE_FALSE(job.create());
REQUIRE(job.status() == JOB_STATUS::NOTFOUND);
Verify(Method(mockAgent,write));
} // SECTION
SECTION("Server is healthy again, job finishes") {
const char* health = R"=({"arango":{"Supervision":{"Health":{"SNGL-leader":{"Status":"GOOD"}}},
"Target":{"ToDo":{"1":{"jobId":"1","type":"activeFailover"}}}}})=";
VPackBuilder mod = VPackCollection::merge(base.slice(), createBuilder(health).slice(), true);
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1")) == "object");
REQUIRE(writes.get("/arango/Target/ToDo/1").get("server").copyString() == LEADER);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "BAD");
REQUIRE(typeName(precond.get("/arango/Target/FailedServers").get("old")) == "object");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(mod); // snapshot contains GOOD leader
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "unittest", LEADER);
REQUIRE(job.create()); // we already put the TODO entry in the snapshot for finish
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,write)).Exactly(1);
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
// check that the job finishes now, without changing leader
VPackSlice writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
REQUIRE(typeName(writes.get("/arango/Target/Finished/1")) == "object");
REQUIRE_FALSE(writes.hasKey("/arango" + asyncReplLeader)); // no change to leader
return fakeWriteResult;
});
REQUIRE(job.start());
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);
} // SECTION
SECTION("Current leader is different from server in job, job finishes") {
const char* health = R"=({"arango":{"Plan":{"AsyncReplication":{"Leader":"SNGL-follower1"}},
"Target":{"ToDo":{"1":{"jobId":"1","type":"activeFailover"}}}}})=";
VPackBuilder mod = VPackCollection::merge(base.slice(), createBuilder(health).slice(), true);
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1")) == "object");
REQUIRE(writes.get("/arango/Target/ToDo/1").get("server").copyString() == LEADER);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "BAD");
REQUIRE(typeName(precond.get("/arango/Target/FailedServers").get("old")) == "object");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(mod); // snapshot contains a different leader
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "unittest", LEADER);
REQUIRE(job.create()); // we already put the TODO entry in the snapshot for finish
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,write)).Exactly(1);
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
// check that the job finishes now, without changing leader
VPackSlice writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
REQUIRE(typeName(writes.get("/arango/Target/Finished/1")) == "object");
REQUIRE_FALSE(writes.hasKey("/arango" + asyncReplLeader)); // no change to leader
return fakeWriteResult;
});
REQUIRE(job.start());
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,write)).Exactly(2);
} // SECTION
SECTION("no in-sync follower found, job retries") {
// follower follows wrong leader
const char* noInSync = R"=({"arango":{"AsyncReplication":{"SNGL-follower1":{"leader":"abc","lastTick":9}}}})=";
trans_ret_t fakeTransient {true, "", 1, 0, std::make_shared<Builder>(createBuilder(noInSync))};
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1")) == "object");
REQUIRE(writes.get("/arango/Target/ToDo/1").get("server").copyString() == LEADER);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "BAD");
REQUIRE(typeName(precond.get("/arango/Target/FailedServers").get("old")) == "object");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(base);
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "unittest", LEADER);
REQUIRE(job.create()); // we already put the TODO entry in the snapshot for finish
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,write)).Exactly(1);
When(Method(mockAgent, transient)).Return(fakeTransient);
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
// check that the job fails now
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
CHECK(typeName(writes.get("/arango/Target/Failed/1")) == "object");
return fakeWriteResult;
});
REQUIRE_FALSE(job.start());
// job status stays on TODO and can retry later
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,transient)).Exactly(Once);
Verify(Method(mockAgent,write)).Exactly(Once); // finish is not called
} // SECTION
SECTION("follower with best tick value used, job succeeds") {
// 3 in-sync followers; follower1 has the highest tick and should be used
trans_ret_t fakeTransient {true, "", 1, 0, std::make_shared<Builder>(createBuilder(transient))};
Mock<AgentInterface> mockAgent;
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1")) == "object");
REQUIRE(writes.get("/arango/Target/ToDo/1").get("server").copyString() == LEADER);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "BAD");
REQUIRE(typeName(precond.get("/arango/Target/FailedServers").get("old")) == "object");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn(AgentInterface::raft_commit_t::OK);
auto& agent = mockAgent.get();
Node snapshot = createNodeFromBuilder(base);
ActiveFailoverJob job(snapshot(PREFIX), &agent, jobId, "unittest", LEADER);
REQUIRE(job.create()); // we already put the TODO entry in the snapshot for finish
REQUIRE(job.status() == JOB_STATUS::TODO);
Verify(Method(mockAgent,write)).Exactly(1);
When(Method(mockAgent, transient)).Return(fakeTransient);
When(Method(mockAgent, write)).Do([&](query_t const& q, bool d) -> write_ret_t {
REQUIRE(typeName(q->slice()) == "array" );
REQUIRE(q->slice().length() == 1);
REQUIRE(typeName(q->slice()[0]) == "array");
// operations + preconditions
REQUIRE(q->slice()[0].length() == 2);
REQUIRE(typeName(q->slice()[0][0]) == "object");
REQUIRE(typeName(q->slice()[0][1]) == "object");
INFO(q->toString());
// check that the job succeeds now
auto writes = q->slice()[0][0];
REQUIRE(typeName(writes.get("/arango/Target/ToDo/1").get("op")) == "string");
REQUIRE(typeName(writes.get("/arango/Target/Finished/1")) == "object");
REQUIRE(typeName(writes.get("/arango/Plan/AsyncReplication/Leader")) == "string");
REQUIRE(writes.get("/arango/Plan/AsyncReplication/Leader").copyString() == FOLLOWER1);
auto precond = q->slice()[0][1];
REQUIRE(typeName(precond.get("/arango/Supervision/Health/SNGL-leader/Status")) == "object");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-leader/Status").get("old").copyString() == "FAILED");
REQUIRE(precond.get("/arango/Supervision/Health/SNGL-follower1/Status").get("old").copyString() == "GOOD");
REQUIRE(precond.get("/arango/Plan/AsyncReplication/Leader").get("old").copyString() == LEADER);
return fakeWriteResult;
});
REQUIRE(job.start());
// job finished and the new leader was written
REQUIRE(job.status() == JOB_STATUS::FINISHED);
Verify(Method(mockAgent,transient)).Exactly(1);
Verify(Method(mockAgent,write)).Exactly(2);
} // SECTION
};
}}}

View File

@ -0,0 +1,63 @@
R"=(
{
"arango": {
"Current": {
"Version": 1,
"DBServers": {},
"Singles": {
"SNGL-follower1": "none",
"SNGL-follower2": "none",
"SNGL-follower3": "none",
"SNGL-follower4": "none",
"SNGL-follower5": "none",
"SNGL-leader": "none"
}
},
"Plan": {
"Version": 1,
"DBServers": {},
"AsyncReplication": {
"Leader": "SNGL-leader"
},
"Singles": {
"SNGL-follower1": "none",
"SNGL-follower2": "none",
"SNGL-follower3": "none",
"SNGL-follower4": "none",
"SNGL-follower5": "none",
"SNGL-leader": "none"
}
},
"Supervision": {
"DBServers": {},
"Health": {
"SNGL-follower1": {
"Status": "GOOD"
},
"SNGL-follower2": {
"Status": "GOOD"
},
"SNGL-follower3": {
"Status": "GOOD"
},
"SNGL-follower4": {
"Status": "BAD"
},
"SNGL-follower5": {
"Status": "GOOD"
},
"SNGL-leader": {
"Status": "FAILED"
}
},
"Shards": {}
},
"Target": {
"FailedServers": {},
"Failed": {},
"Finished": {},
"ToDo": {}
}
}
}
)="

View File

@ -0,0 +1,28 @@
R"=(
[{
"arango": {
"AsyncReplication": {
"SNGL-follower1": {
"lastTick": 10,
"leader": "SNGL-leader"
},
"SNGL-follower2": {
"lastTick": 1,
"leader": "SNGL-leader"
},
"SNGL-follower3": {
"lastTick": 9,
"leader": "SNGL-leader"
},
"SNGL-follower4": {
"lastTick": 100,
"leader": "SNGL-leader"
},
"SNGL-follower5": {
"lastTick": 1000,
"leader": "SNGL-follower2"
}
}
}
}]
)="
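This transient fixture encodes the expected failover choice: among GOOD followers replicating from SNGL-leader, SNGL-follower1 holds the highest lastTick (10); SNGL-follower4 is skipped as BAD and SNGL-follower5 follows the wrong leader. A JavaScript sketch of that selection rule, mirroring only the test expectation (the actual logic lives in ActiveFailoverJob.cpp):

// Hypothetical helper: pick the healthy follower of `oldLeader` with the
// highest tick; returns null when no follower is eligible.
function pickBestFollower (health, replication, oldLeader) {
  let best = null;
  Object.keys(replication).forEach(function (server) {
    const info = replication[server];
    if (server === oldLeader) { return; }
    if (health[server].Status !== 'GOOD') { return; }  // e.g. SNGL-follower4
    if (info.leader !== oldLeader) { return; }         // e.g. SNGL-follower5
    if (best === null || info.lastTick > best.lastTick) {
      best = { server: server, lastTick: info.lastTick };
    }
  });
  return best; // { server: 'SNGL-follower1', lastTick: 10 } for this fixture
}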

View File

@ -157,7 +157,7 @@ TEST_CASE("AddFollower", "[agency][supervision]") {
SECTION("<collection> still exists, if missing, job is finished, move to "
"Target/Finished") {
TestStructType createTestStructure = [&](
Slice const& s, std::string const& path) {

View File

@ -27,7 +27,6 @@
#include "catch.hpp"
#include "fakeit.hpp"
#include "Agency/AddFollower.h"
#include "Agency/FailedFollower.h"
#include "Agency/MoveShard.h"
#include "Agency/AgentInterface.h"

View File

@ -326,13 +326,12 @@ SECTION("if the leader is healthy again we fail the job") {
auto writes = q->slice()[0][0]; \
REQUIRE(std::string(writes.get("/arango/Target/ToDo/1").get("op").typeName()) == "string"); \
CHECK(std::string(writes.get("/arango/Target/Failed/1").typeName()) == "object");
return fakeWriteResult;
});
When(Method(mockAgent, waitFor)).AlwaysReturn();
AgentInterface &agent = mockAgent.get();
auto failedLeader = FailedLeader(
agency("arango"),
agency(PREFIX),
&agent,
JOB_STATUS::TODO,
jobId

View File

@ -60,6 +60,7 @@ endif ()
add_executable(
arangodbtests
${CMAKE_SOURCE_DIR}/arangod/RestServer/FlushFeature.cpp
Agency/ActiveFailoverTest.cpp
Agency/AddFollowerTest.cpp
Agency/CleanOutServerTest.cpp
Agency/FailedFollowerTest.cpp