mirror of https://gitee.com/bigwinds/arangodb
2074 lines
68 KiB
C++
2074 lines
68 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2014-2018 ArangoDB GmbH, Cologne, Germany
|
|
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
|
///
|
|
/// @author Kaveh Vahedipour
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "Agent.h"
|
|
|
|
#include <velocypack/Iterator.h>
|
|
#include <velocypack/velocypack-aliases.h>
|
|
|
|
#include <chrono>
|
|
#include <thread>
|
|
|
|
#include "Agency/AgentCallback.h"
|
|
#include "Agency/AgencyFeature.h"
|
|
#include "Agency/GossipCallback.h"
|
|
#include "Basics/ConditionLocker.h"
|
|
#include "Basics/ReadLocker.h"
|
|
#include "Basics/WriteLocker.h"
|
|
#include "Basics/ScopeGuard.h"
|
|
#include "RestServer/QueryRegistryFeature.h"
|
|
#include "RestServer/SystemDatabaseFeature.h"
|
|
#include "Scheduler/Scheduler.h"
|
|
#include "VocBase/vocbase.h"
|
|
|
|
using namespace arangodb::application_features;
|
|
using namespace arangodb::velocypack;
|
|
using namespace std::chrono;
|
|
|
|
namespace arangodb {
|
|
namespace consensus {
|
|
|
|
// Instantiations of some declarations in AgencyCommon.h:
|
|
|
|
/// Path prefix of the public agency REST API.
std::string const pubApiPrefix = "/_api/agency/";
/// Path prefix of the private agency REST API (used e.g. for appendEntries).
std::string const privApiPrefix = "/_api/agency_priv/";
/// Leader id that stands for "no leader": the empty string.
std::string const NO_LEADER = "";
|
|
|
|
/// Agent configuration
|
|
/// @brief Construct an agent from the given configuration.
///
/// The stores, the compactor, the state and the constituent are all wired to
/// this agent. An Inception object is only created when the agency has more
/// than one member; otherwise _leaderSince is reset to 0.
Agent::Agent(config_t const& config)
    : Thread("Agent"),
      _config(config),
      _commitIndex(0),
      _spearhead(this),
      _readDB(this),
      _transient(this),
      _agentNeedsWakeup(false),
      _compactor(this),
      _ready(false),
      _preparing(0) {
  _state.configure(this);
  _constituent.configure(this);

  bool const singleHost = (size() <= 1);
  if (singleHost) {
    _leaderSince = 0;
  } else {
    _inception = std::make_unique<Inception>(this);
  }
}
|
|
|
|
/// This agent's id
|
|
/// @brief Return the id stored in this agent's configuration.
std::string Agent::id() const {
  return _config.id();
}
|
|
|
|
/// Agent's id is set once from state machine
|
|
bool Agent::id(std::string const& id) {
|
|
bool success;
|
|
if ((success = _config.setId(id))) {
|
|
LOG_TOPIC("32d95", DEBUG, Logger::AGENCY) << "My id is " << id;
|
|
} else {
|
|
LOG_TOPIC("37f6b", ERR, Logger::AGENCY) << "Cannot reassign id once set: My id is "
|
|
<< _config.id() << " reassignment to " << id;
|
|
}
|
|
return success;
|
|
}
|
|
|
|
/// Merge command line and persisted configurations
|
|
/// @brief Merge a persisted configuration into the current one.
/// @param persisted velocypack slice holding the persisted configuration
/// @return whatever config_t::merge reports
bool Agent::mergeConfiguration(VPackSlice const& persisted) {
  // No locking here: concurrency is handled inside merge itself.
  bool const merged = _config.merge(persisted);
  return merged;
}
|
|
|
|
/// Dtor shuts down thread
|
|
/// @brief Destructor: waits for the helper threads, then for the main thread.
Agent::~Agent() {
  // Normally AgencyFeature::unprepare has already waited for the helper
  // threads. Waiting a second time is harmless, and it covers the case of an
  // Agent object that was created but never really started. If the threads
  // do not stop in time, waitForThreadsStop exits with a fatal error.
  waitForThreadsStop();

  // Finally wait for the main Agent thread to terminate.
  shutdown();
}
|
|
|
|
/// Wait until threads are terminated:
|
|
void Agent::waitForThreadsStop() {
|
|
// It is allowed to call this multiple times, we do so from the constructor
|
|
// and from AgencyFeature::unprepare.
|
|
int counter = 0;
|
|
while (_constituent.isRunning() || _compactor.isRunning() ||
|
|
(_config.supervision() && _supervision.isRunning()) ||
|
|
(_inception != nullptr && _inception->isRunning())) {
|
|
std::this_thread::sleep_for(std::chrono::microseconds(100000));
|
|
|
|
// fail fatally after 5 mins:
|
|
if (++counter >= 10 * 60 * 5) {
|
|
LOG_TOPIC("b8ad5", FATAL, Logger::AGENCY) << "some agency thread did not finish";
|
|
FATAL_ERROR_EXIT();
|
|
}
|
|
}
|
|
// initiate shutdown of main Agent thread, but do not wait for it yet
|
|
// -> this happens in the destructor
|
|
beginShutdown();
|
|
}
|
|
|
|
/// State machine
|
|
/// @brief Read-only access to this agent's replicated log state.
State const& Agent::state() const {
  return _state;
}
|
|
|
|
/// Start the agent's main thread
|
|
/// @brief Launch the agent's own thread.
/// @return always true; the outcome of Thread::start() is not inspected here
bool Agent::start() {
  LOG_TOPIC("9a90e", DEBUG, Logger::AGENCY) << "Starting agency comm worker.";

  Thread::start();

  return true;
}
|
|
|
|
/// Get all logs from state machine
|
|
/// @brief Fetch all log entries as reported by the state machine.
query_t Agent::allLogs() const {
  return _state.allLogs();
}
|
|
|
|
/// This agent's term
|
|
/// @brief Current term as known by the constituent.
term_t Agent::term() const {
  return _constituent.term();
}
|
|
|
|
/// Agency size
|
|
/// @brief Agency size as stored in the configuration.
size_t Agent::size() const {
  return _config.size();
}
|
|
|
|
/// My endpoint
|
|
/// @brief This agent's endpoint as stored in the configuration.
std::string Agent::endpoint() const {
  return _config.endpoint();
}
|
|
|
|
/// Handle voting
|
|
/// @brief Handle a vote request from a peer.
/// @param termOfPeer   term of the requesting peer
/// @param id           id of the requesting peer
/// @param lastLogIndex last log index of the peer, forwarded to the constituent
/// @param lastLogTerm  last log term of the peer, forwarded to the constituent
/// @param query        not used by this method
/// @param timeoutMult  timeout multiplier sent by the peer; -1 leaves ours
///                     untouched
/// @return pair of (vote granted?, our current term)
priv_rpc_ret_t Agent::requestVote(term_t termOfPeer, std::string const& id,
                                  index_t lastLogIndex, index_t lastLogTerm,
                                  query_t const& query, int64_t timeoutMult) {
  // Adopt the peer's multiplier only if one was sent and it differs from ours.
  bool const adoptMultiplier =
      (timeoutMult != -1) && (timeoutMult != _config.timeoutMult());
  if (adoptMultiplier) {
    adjustTimeoutMult(timeoutMult);
    LOG_TOPIC("81f2a", WARN, Logger::AGENCY) << "Voter: setting timeout multiplier to "
                                             << timeoutMult << " for next term.";
  }

  bool const granted = _constituent.vote(termOfPeer, id, lastLogIndex, lastLogTerm);
  return priv_rpc_ret_t(granted, this->term());
}
|
|
|
|
/// Get copy of momentary configuration
|
|
/// @brief Return a copy of the current configuration.
config_t const Agent::config() const {
  return _config;
}
|
|
|
|
/// Adjust timeoutMult:
|
|
/// @brief Store a new timeout multiplier in the configuration.
/// @param timeoutMult the multiplier to set
void Agent::adjustTimeoutMult(int64_t timeoutMult) {
  _config.setTimeoutMult(timeoutMult);
}
|
|
|
|
/// Get timeoutMult:
|
|
/// @brief Read the timeout multiplier from the configuration.
int64_t Agent::getTimeoutMult() const {
  return _config.timeoutMult();
}
|
|
|
|
/// Leader's id
|
|
/// @brief Id of the leader as known by the constituent.
std::string Agent::leaderID() const {
  return _constituent.leaderID();
}
|
|
|
|
/// Are we leading?
|
|
bool Agent::leading() const {
|
|
// When we become leader, we are first entering a preparation phase.
|
|
// Note that this method returns true regardless of whether we
|
|
// are still preparing or not. The preparation phases 1 and 2 are
|
|
// indicated by the _preparing member in the Agent, the Constituent is
|
|
// already LEADER.
|
|
// The Constituent has to send out AppendEntriesRPC calls immediately, but
|
|
// only when we are properly leading (with initialized stores etc.)
|
|
// can we execute requests.
|
|
return _constituent.leading();
|
|
}
|
|
|
|
// Waits here for confirmation of log's commits up to index. Timeout in seconds.
|
|
/// @brief Wait until the log is committed up to the given index.
/// @param index   log index that has to be committed
/// @param timeout seconds to wait without any commit progress
/// @return OK once _commitIndex has reached index (immediately for a single
///         host agency), TIMEOUT if there was no progress for `timeout`
///         seconds, UNKNOWN if we are not leading or the agent is stopping
AgentInterface::raft_commit_t Agent::waitFor(index_t index, double timeout) {
  if (size() == 1) {
    // single host agency: nothing to wait for
    return Agent::raft_commit_t::OK;
  }

  auto progressSince = steady_clock::now();
  index_t seenCommitIndex = 0;

  // Sleep on the condition variable until woken up through AgentCallback
  for (;;) {
    // _waitForCV's mutex stops writes to _commitIndex
    CONDITION_LOCKER(guard, _waitForCV);

    if (!leading()) {
      return Agent::raft_commit_t::UNKNOWN;
    }

    if (seenCommitIndex != _commitIndex) {
      // Progress was made, so the timeout period starts over.
      progressSince = steady_clock::now();
    }
    seenCommitIndex = _commitIndex;
    if (seenCommitIndex >= index) {
      return Agent::raft_commit_t::OK;
    }

    duration<double> elapsed = steady_clock::now() - progressSince;

    LOG_TOPIC("37e05", DEBUG, Logger::AGENCY)
        << "waitFor: index: " << index << " _commitIndex: " << _commitIndex
        << " _lastCommitIndex: " << seenCommitIndex << " elapsedTime: " << elapsed.count();

    if (elapsed.count() >= timeout) {
      return Agent::raft_commit_t::TIMEOUT;
    }

    // Sleep for the remainder of the timeout (in microseconds):
    _waitForCV.wait(static_cast<uint64_t>(1.0e6 * (timeout - elapsed.count())));

    if (this->isStopping()) {
      // shutting down
      return Agent::raft_commit_t::UNKNOWN;
    }
  }

  // The loop above only ends via return, so this is never reached.
  TRI_ASSERT(false);

  return Agent::raft_commit_t::UNKNOWN;
}
|
|
|
|
// Check if log is committed up to index.
|
|
/// @brief Non-blocking check whether the log is committed up to `index`.
/// @param index log index to check
/// @return true for a single host agency; otherwise true only if we are
///         leading and _commitIndex has reached index
bool Agent::isCommitted(index_t index) {
  if (size() == 1) {
    // single host agency
    return true;
  }

  CONDITION_LOCKER(guard, _waitForCV);
  return leading() && _commitIndex >= index;
}
|
|
|
|
/// @brief Return the index confirmed for this agent itself.
///
/// If challengeLeadership() reports that leadership is lost, the agent
/// resigns and 0 is returned instead.
index_t Agent::index() {
  bool const leadershipLost = challengeLeadership();
  if (leadershipLost) {
    resign();
    return 0;
  }

  MUTEX_LOCKER(tiLocker, _tiLock);
  return _confirmed[id()];
}
|
|
|
|
// AgentCallback reports id of follower and its highest processed index
|
|
void Agent::reportIn(std::string const& peerId, index_t index, size_t toLog) {
|
|
auto startTime = steady_clock::now();
|
|
|
|
// only update the time stamps here:
|
|
{
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
|
|
// Update last acknowledged answer
|
|
auto t = steady_clock::now();
|
|
std::chrono::duration<double> d = t - _lastAcked[peerId];
|
|
auto secsSince = d.count();
|
|
if (secsSince < 1.5e9 && peerId != id() &&
|
|
secsSince > _config.minPing() * _config.timeoutMult()) {
|
|
LOG_TOPIC("6fe73", WARN, Logger::AGENCY)
|
|
<< "Last confirmation from peer " << peerId
|
|
<< " was received more than minPing ago: " << secsSince;
|
|
}
|
|
LOG_TOPIC("9ee0b", DEBUG, Logger::AGENCY)
|
|
<< "Setting _lastAcked[" << peerId << "] to time "
|
|
<< std::chrono::duration_cast<std::chrono::microseconds>(t.time_since_epoch())
|
|
.count();
|
|
_lastAcked[peerId] = t;
|
|
|
|
if (index > _confirmed[peerId]) { // progress this follower?
|
|
_confirmed[peerId] = index;
|
|
if (toLog > 0) { // We want to reset the wait time only if a package callback
|
|
LOG_TOPIC("ba4d2", DEBUG, Logger::AGENCY)
|
|
<< "Got call back of " << toLog
|
|
<< " logs, resetting _earliestPackage to now for id " << peerId;
|
|
_earliestPackage[peerId] = steady_clock::now();
|
|
}
|
|
wakeupMainLoop(); // only necessary for non-empty callbacks
|
|
}
|
|
}
|
|
|
|
duration<double> reportInTime = steady_clock::now() - startTime;
|
|
if (reportInTime.count() > 0.1) {
|
|
LOG_TOPIC("b4854", DEBUG, Logger::AGENCY)
|
|
<< "reportIn took longer than 0.1s: " << reportInTime.count();
|
|
}
|
|
}
|
|
|
|
/// @brief Report a failed append entry call from AgentCallback
|
|
void Agent::reportFailed(std::string const& slaveId, size_t toLog, bool sent) {
|
|
if (toLog > 0) {
|
|
// This is only used for non-empty appendEntriesRPC calls. If such calls
|
|
// fail, we have to set this earliestPackage time to now such that the
|
|
// main thread tries again immediately: and for that agent starting at 0
|
|
// which effectively will be _state.firstIndex().
|
|
MUTEX_LOCKER(guard, _tiLock);
|
|
LOG_TOPIC("9e856", DEBUG, Logger::AGENCY)
|
|
<< "Resetting _earliestPackage to now for id " << slaveId;
|
|
_earliestPackage[slaveId] = steady_clock::now() + seconds(1);
|
|
_confirmed[slaveId] = 0;
|
|
} else {
|
|
// answer to sendAppendEntries to empty request, when follower's highest
|
|
// log index is 0. This is necessary so that a possibly restarted agent
|
|
// without persistence immediately is brought up to date. We only want to do
|
|
// this, when the agent was able to answer and no or corrupt answer is
|
|
// handled
|
|
if (sent) {
|
|
MUTEX_LOCKER(guard, _tiLock);
|
|
_confirmed[slaveId] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Followers' append entries
|
|
/// @brief Follower side of log replication: handle an AppendEntriesRPC.
/// @param term              term of the sending leader
/// @param leaderId          id of the sending leader
/// @param prevIndex         index preceding the sent entries (ignored when a
///                          compacted state is received)
/// @param prevTerm          term of the entry at prevIndex (ditto)
/// @param leaderCommitIndex commit index of the leader
/// @param queries           velocypack array of log entries, possibly empty
/// @return pair of (success?, our term at entry of this method). Success is
///         false if we are not ready, the payload is malformed, the
///         leadership claim is not plausible, an empty request finds our log
///         at index 0, not all entries could be logged, or appending to the
///         log threw an exception.
priv_rpc_ret_t Agent::recvAppendEntriesRPC(term_t term, std::string const& leaderId,
                                           index_t prevIndex, term_t prevTerm,
                                           index_t leaderCommitIndex,
                                           query_t const& queries) {
  LOG_TOPIC("62f43", DEBUG, Logger::AGENCY)
      << "Got AppendEntriesRPC from " << leaderId << " with term " << term;

  term_t t(this->term());
  if (!ready()) {  // We have not been able to put together our configuration
    LOG_TOPIC("7e96c", DEBUG, Logger::AGENCY) << "Agent is not ready yet.";
    return priv_rpc_ret_t(false, t);
  }

  VPackSlice payload = queries->slice();

  // Only arrays of log entries are acceptable
  if (payload.type() != VPackValueType::Array) {
    LOG_TOPIC("449b2", DEBUG, Logger::AGENCY)
        << "Received malformed entries for appending. Discarding!";
    return priv_rpc_ret_t(false, t);
  }

  size_t nqs = payload.length();
  if (nqs > 0 && !payload[0].get("readDB").isNone()) {
    // We have received a compacted state.
    // Whatever we got in our own state is meaningless now. It is a new world.
    // checkLeader just does plausibility as if it were an empty request
    prevIndex = 0;
    prevTerm = 0;
  }

  // Leadership claim plausibility check
  if (!_constituent.checkLeader(term, leaderId, prevIndex, prevTerm)) {
    LOG_TOPIC("fc654", DEBUG, Logger::AGENCY) << "Not accepting appendEntries from " << leaderId;
    return priv_rpc_ret_t(false, t);
  }

  // Empty appendEntries:
  // We answer with success if and only if our highest index is greater 0.
  // Else we want to indicate to the leader that we are behind and need data:
  // a single false will go back and trigger _confirmed[thisfollower] = 0
  if (nqs == 0) {
    auto lastIndex = _state.lastIndex();
    if (lastIndex > 0) {
      LOG_TOPIC("b0b19", DEBUG, Logger::AGENCY)
          << "Finished empty AppendEntriesRPC from " << leaderId
          << " with term " << term;
      {
        WRITE_LOCKER(oLocker, _outputLock);
        _commitIndex = std::max(_commitIndex, std::min(leaderCommitIndex, lastIndex));
      }
      return priv_rpc_ret_t(true, t);
    } else {
      return priv_rpc_ret_t(false, t);
    }
  }

  bool ok = true;
  index_t lastIndex = 0;  // Index of last entry in our log
  try {
    lastIndex = _state.logFollower(queries);
    if (lastIndex < payload[nqs - 1].get("index").getNumber<index_t>()) {
      // We could not log all the entries in this query, we need to report
      // this to the leader!
      ok = false;
    }
  } catch (std::exception const& e) {
    LOG_TOPIC("bedb8", DEBUG, Logger::AGENCY) << "Exception during log append: " << __FILE__
                                              << __LINE__ << " " << e.what();
    // Nothing can be assumed about what was logged, so this must not be
    // reported as a success: the leader would otherwise treat the sent
    // entries as confirmed by this follower.
    ok = false;
  }

  {
    WRITE_LOCKER(oLocker, _outputLock);
    CONDITION_LOCKER(guard, _waitForCV);
    // Never go beyond what we actually have in our log (lastIndex stays 0 if
    // logging threw, which leaves _commitIndex untouched).
    _commitIndex = std::max(_commitIndex, std::min(leaderCommitIndex, lastIndex));
    _waitForCV.broadcast();
    if (leaderCommitIndex >= _state.nextCompactionAfter() &&
        payload[nqs - 1].get("index").getNumber<index_t>() >= _state.nextCompactionAfter()) {
      _compactor.wakeUp();
    }
  }

  LOG_TOPIC("83504", DEBUG, Logger::AGENCY)
      << "Finished AppendEntriesRPC from " << leaderId << " with term " << term;

  return priv_rpc_ret_t(ok, t);
}
|
|
|
|
/// Leader's append entries
|
|
void Agent::sendAppendEntriesRPC() {
|
|
auto cc = ClusterComm::instance();
|
|
if (cc == nullptr) {
|
|
// nullptr only happens during controlled shutdown
|
|
return;
|
|
}
|
|
|
|
// _lastSent only accessed in main thread
|
|
std::string const myid = id();
|
|
|
|
for (auto const& followerId : _config.active()) {
|
|
if (followerId != myid && leading()) {
|
|
term_t t(0);
|
|
|
|
index_t lastConfirmed;
|
|
auto startTime = steady_clock::now();
|
|
SteadyTimePoint earliestPackage;
|
|
SteadyTimePoint lastAcked;
|
|
|
|
{
|
|
t = this->term();
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
lastConfirmed = _confirmed[followerId];
|
|
lastAcked = _lastAcked[followerId];
|
|
earliestPackage = _earliestPackage[followerId];
|
|
}
|
|
|
|
// We essentially have to send some log entries from their lastConfirmed+1
|
|
// on. However, we have to take care of the case that their lastConfirmed
|
|
// is a value which is very outdated, such that we have in the meantime
|
|
// done a log compaction and do not actually have lastConfirmed+1 any
|
|
// more. In that case, we need to send our latest snapshot at index S
|
|
// (say), and then some log entries from (and including) S on. This is
|
|
// to ensure that the other side does not only have the snapshot, but
|
|
// also the log entry which produced that snapshot.
|
|
// Therefore, we will set lastConfirmed to one less than our latest
|
|
// snapshot in this special case, and we will always fetch enough
|
|
// entries from the log to fulfull our duties.
|
|
|
|
if ((steady_clock::now() - earliestPackage).count() < 0 ||
|
|
_state.lastIndex() <= lastConfirmed) {
|
|
LOG_TOPIC("cfeed", DEBUG, Logger::AGENCY) << "Nothing to append.";
|
|
continue;
|
|
}
|
|
|
|
duration<double> lockTime = steady_clock::now() - startTime;
|
|
if (lockTime.count() > 0.1) {
|
|
LOG_TOPIC("b8f60", WARN, Logger::AGENCY)
|
|
<< "Reading lastConfirmed took too long: " << lockTime.count();
|
|
}
|
|
|
|
index_t commitIndex;
|
|
{
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
commitIndex = _commitIndex;
|
|
}
|
|
|
|
// If the follower is behind our first log entry send last snapshot and
|
|
// following logs. Else try to have the follower catch up in regular order.
|
|
bool needSnapshot = lastConfirmed < _state.firstIndex();
|
|
if (needSnapshot) {
|
|
lastConfirmed = _state.lastCompactionAt() - 1;
|
|
}
|
|
|
|
LOG_TOPIC("7c578", TRACE, Logger::AGENCY) << "Getting unconfirmed from " << lastConfirmed
|
|
<< " to " << lastConfirmed + 99;
|
|
// If lastConfirmed is one minus the first log entry, then this is
|
|
// corrected in _state::get and we only get from the beginning of the
|
|
// log.
|
|
std::vector<log_t> unconfirmed = _state.get(lastConfirmed, lastConfirmed + 99);
|
|
|
|
lockTime = steady_clock::now() - startTime;
|
|
if (lockTime.count() > 0.2) {
|
|
LOG_TOPIC("03cb9", WARN, Logger::AGENCY)
|
|
<< "Finding unconfirmed entries took too long: " << lockTime.count();
|
|
}
|
|
|
|
// Note that despite compaction this vector can never be empty, since
|
|
// any compaction keeps at least one active log entry!
|
|
|
|
if (unconfirmed.empty()) {
|
|
LOG_TOPIC("0b993", ERR, Logger::AGENCY) << "Unexpected empty unconfirmed: "
|
|
<< "lastConfirmed=" << lastConfirmed
|
|
<< " commitIndex=" << commitIndex;
|
|
TRI_ASSERT(false);
|
|
}
|
|
|
|
// Note that if we get here we have at least two entries, otherwise
|
|
// we would have done continue earlier, since this can only happen
|
|
// if lastConfirmed is equal to the last index in our log, in which
|
|
// case there is nothing to replicate.
|
|
|
|
duration<double> m = steady_clock::now() - _lastSent[followerId];
|
|
|
|
if (m.count() > _config.minPing() &&
|
|
_lastSent[followerId].time_since_epoch().count() != 0) {
|
|
LOG_TOPIC("0ddbd", DEBUG, Logger::AGENCY)
|
|
<< "Note: sent out last AppendEntriesRPC "
|
|
<< "to follower " << followerId
|
|
<< " more than minPing ago: " << m.count() << " lastAcked: "
|
|
<< duration_cast<duration<double>>(lastAcked.time_since_epoch()).count();
|
|
}
|
|
index_t lowest = unconfirmed.front().index;
|
|
|
|
Store snapshot(this, "snapshot");
|
|
index_t snapshotIndex;
|
|
term_t snapshotTerm;
|
|
|
|
if (lowest > lastConfirmed || needSnapshot) {
|
|
// Ooops, compaction has thrown away so many log entries that
|
|
// we cannot actually update the follower. We need to send our
|
|
// latest snapshot instead:
|
|
bool success = false;
|
|
try {
|
|
success = _state.loadLastCompactedSnapshot(snapshot, snapshotIndex, snapshotTerm);
|
|
} catch (std::exception const& e) {
|
|
LOG_TOPIC("f2287", WARN, Logger::AGENCY)
|
|
<< "Exception thrown by loadLastCompactedSnapshot: " << e.what();
|
|
}
|
|
if (!success) {
|
|
LOG_TOPIC("6e2b8", WARN, Logger::AGENCY)
|
|
<< "Could not load last compacted snapshot, not sending "
|
|
"appendEntriesRPC!";
|
|
continue;
|
|
}
|
|
if (snapshotTerm == 0) {
|
|
// No shapshot yet
|
|
needSnapshot = false;
|
|
}
|
|
}
|
|
|
|
// RPC path
|
|
std::stringstream path;
|
|
index_t prevLogIndex = unconfirmed.front().index;
|
|
index_t prevLogTerm = unconfirmed.front().term;
|
|
if (needSnapshot) {
|
|
prevLogIndex = snapshotIndex;
|
|
prevLogTerm = snapshotTerm;
|
|
}
|
|
{
|
|
path << "/_api/agency_priv/appendEntries?term=" << t
|
|
<< "&leaderId=" << id() << "&prevLogIndex=" << prevLogIndex
|
|
<< "&prevLogTerm=" << prevLogTerm << "&leaderCommit=" << commitIndex
|
|
<< "&senderTimeStamp=" << std::llround(steadyClockToDouble() * 1000);
|
|
}
|
|
|
|
// Body
|
|
Builder builder;
|
|
builder.add(VPackValue(VPackValueType::Array));
|
|
|
|
if (needSnapshot) {
|
|
{
|
|
VPackObjectBuilder guard(&builder);
|
|
builder.add(VPackValue("readDB"));
|
|
{
|
|
VPackArrayBuilder guard2(&builder);
|
|
snapshot.dumpToBuilder(builder);
|
|
}
|
|
builder.add("term", VPackValue(snapshotTerm));
|
|
builder.add("index", VPackValue(snapshotIndex));
|
|
}
|
|
}
|
|
|
|
size_t toLog = 0;
|
|
index_t highest = 0;
|
|
for (size_t i = 0; i < unconfirmed.size(); ++i) {
|
|
auto const& entry = unconfirmed.at(i);
|
|
if (entry.index > lastConfirmed) {
|
|
// This condition is crucial, because usually we have one more
|
|
// entry than we need in unconfirmed, so we want to skip this. If,
|
|
// however, we have sent a snapshot, we need to send the log entry
|
|
// with the same index than the snapshot along to retain the
|
|
// invariant of our data structure that the _log in _state is
|
|
// non-empty.
|
|
builder.add(VPackValue(VPackValueType::Object));
|
|
builder.add("index", VPackValue(entry.index));
|
|
builder.add("term", VPackValue(entry.term));
|
|
builder.add("query", VPackSlice(entry.entry->data()));
|
|
builder.add("clientId", VPackValue(entry.clientId));
|
|
builder.add("timestamp", VPackValue(entry.timestamp.count()));
|
|
builder.close();
|
|
highest = entry.index;
|
|
++toLog;
|
|
}
|
|
}
|
|
builder.close();
|
|
|
|
// Really leading?
|
|
if (challengeLeadership()) {
|
|
resign();
|
|
return;
|
|
}
|
|
|
|
// Postpone sending the next message for 30 seconds or until an
|
|
// error or successful result occurs.
|
|
earliestPackage = steady_clock::now() + std::chrono::seconds(30);
|
|
{
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
_earliestPackage[followerId] = earliestPackage;
|
|
}
|
|
LOG_TOPIC("99061", DEBUG, Logger::AGENCY)
|
|
<< "Setting _earliestPackage to now + 30s for id " << followerId;
|
|
|
|
// Send request
|
|
std::unordered_map<std::string, std::string> headerFields;
|
|
cc->asyncRequest(1, _config.poolAt(followerId),
|
|
arangodb::rest::RequestType::POST, path.str(),
|
|
std::make_shared<std::string>(builder.toJson()), headerFields,
|
|
std::make_shared<AgentCallback>(this, followerId, highest, toLog),
|
|
150.0, true);
|
|
// Note the timeout is relatively long, but due to the 30 seconds
|
|
// above, we only ever have at most 5 messages in flight.
|
|
|
|
_lastSent[followerId] = steady_clock::now();
|
|
// _constituent.notifyHeartbeatSent(followerId);
|
|
// Do not notify constituent, because the AppendEntriesRPC here could
|
|
// take a very long time, so this must not disturb the empty ones
|
|
// being sent out.
|
|
|
|
LOG_TOPIC("2d80d", DEBUG, Logger::AGENCY)
|
|
<< "Appending (" << (uint64_t)(TRI_microtime() * 1000000000.0) << ") "
|
|
<< unconfirmed.size() - 1 << " entries up to index " << highest
|
|
<< (needSnapshot ? " and a snapshot" : "") << " to follower "
|
|
<< followerId << ". Next real log contact to " << followerId << " in: "
|
|
<< std::chrono::duration<double, std::milli>(earliestPackage - steady_clock::now())
|
|
.count()
|
|
<< "ms";
|
|
}
|
|
}
|
|
}
|
|
|
|
/// @brief Give up leadership: make the constituent follow in `otherTerm`
/// with no known leader and end the leadership preparation.
/// @param otherTerm the peer's term that caused the resignation
void Agent::resign(term_t otherTerm) {
  auto const ownTerm = _constituent.term();
  LOG_TOPIC("494a7", DEBUG, Logger::AGENCY) << "Resigning in term " << ownTerm
                                            << " because of peer's term " << otherTerm;

  _constituent.follow(otherTerm, NO_LEADER);
  endPrepareLeadership();
}
|
|
|
|
/// Leader's append entries, empty ones for heartbeat, triggered by Constituent
|
|
void Agent::sendEmptyAppendEntriesRPC(std::string followerId) {
|
|
auto cc = ClusterComm::instance();
|
|
if (cc == nullptr) {
|
|
// nullptr only happens during controlled shutdown
|
|
return;
|
|
}
|
|
|
|
if (!leading()) {
|
|
LOG_TOPIC("95220", DEBUG, Logger::AGENCY)
|
|
<< "Not sending empty appendEntriesRPC to follower " << followerId
|
|
<< " because we are no longer leading.";
|
|
return;
|
|
}
|
|
|
|
index_t commitIndex;
|
|
{
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
commitIndex = _commitIndex;
|
|
}
|
|
|
|
// RPC path
|
|
std::stringstream path;
|
|
{
|
|
path << "/_api/agency_priv/appendEntries?term=" << _constituent.term()
|
|
<< "&leaderId=" << id() << "&prevLogIndex=0"
|
|
<< "&prevLogTerm=0&leaderCommit=" << commitIndex
|
|
<< "&senderTimeStamp=" << std::llround(steadyClockToDouble() * 1000);
|
|
}
|
|
|
|
// Just check once more:
|
|
if (!leading()) {
|
|
LOG_TOPIC("99dc2", DEBUG, Logger::AGENCY)
|
|
<< "Not sending empty appendEntriesRPC to follower " << followerId
|
|
<< " because we are no longer leading.";
|
|
return;
|
|
}
|
|
|
|
// Send request
|
|
std::unordered_map<std::string, std::string> headerFields;
|
|
cc->asyncRequest(1, _config.poolAt(followerId), arangodb::rest::RequestType::POST,
|
|
path.str(), std::make_shared<std::string>("[]"), headerFields,
|
|
std::make_shared<AgentCallback>(this, followerId, 0, 0),
|
|
3 * _config.minPing() * _config.timeoutMult(), true);
|
|
_constituent.notifyHeartbeatSent(followerId);
|
|
|
|
double now = TRI_microtime();
|
|
LOG_TOPIC("54798", DEBUG, Logger::AGENCY)
|
|
<< "Sending empty appendEntriesRPC to follower " << followerId;
|
|
double diff = TRI_microtime() - now;
|
|
if (diff > 0.01) {
|
|
LOG_TOPIC("cfb7b", DEBUG, Logger::AGENCY)
|
|
<< "Logging of a line took more than 1/100 of a second, this is bad:" << diff;
|
|
}
|
|
}
|
|
|
|
void Agent::advanceCommitIndex() {
|
|
// Determine median _confirmed value of followers:
|
|
std::vector<index_t> temp;
|
|
{
|
|
MUTEX_LOCKER(_tiLocker, _tiLock);
|
|
for (auto const& id : config().active()) {
|
|
if (_confirmed.find(id) != _confirmed.end()) {
|
|
temp.push_back(_confirmed[id]);
|
|
}
|
|
}
|
|
}
|
|
|
|
index_t quorum = size() / 2 + 1;
|
|
if (temp.size() < quorum) {
|
|
LOG_TOPIC("47f8c", WARN, Logger::AGENCY)
|
|
<< "_confirmed not populated, quorum: " << quorum << ".";
|
|
return;
|
|
}
|
|
std::sort(temp.begin(), temp.end());
|
|
index_t index = temp[temp.size() - quorum];
|
|
|
|
term_t t = _constituent.term();
|
|
{
|
|
WRITE_LOCKER(oLocker, _outputLock);
|
|
if (index > _commitIndex) {
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
LOG_TOPIC("e24a9", TRACE, Logger::AGENCY)
|
|
<< "Critical mass for commiting " << _commitIndex + 1 << " through "
|
|
<< index << " to read db";
|
|
// Change _readDB and _commitIndex atomically together:
|
|
_readDB.applyLogEntries(_state.slices(/* inform others by callbacks */
|
|
_commitIndex + 1, index),
|
|
_commitIndex, t, true);
|
|
|
|
_commitIndex = index;
|
|
LOG_TOPIC("e24aa", DEBUG, Logger::AGENCY)
|
|
<< "Critical mass for commiting " << _commitIndex + 1 << " through "
|
|
<< index << " to read db, done";
|
|
// Wake up rest handlers:
|
|
_waitForCV.broadcast();
|
|
|
|
if (_commitIndex >= _state.nextCompactionAfter()) {
|
|
_compactor.wakeUp();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if I am member of active agency
|
|
bool Agent::active() const {
|
|
std::vector<std::string> active = _config.active();
|
|
return (find(active.begin(), active.end(), id()) != active.end());
|
|
}
|
|
|
|
/// @brief Activate agency (Inception thread for multi-host, main thread else)
|
|
void Agent::activateAgency() {
|
|
_config.activate();
|
|
try {
|
|
_state.persistActiveAgents(_config.activeToBuilder(), _config.poolToBuilder());
|
|
} catch (std::exception const& e) {
|
|
LOG_TOPIC("6578d", FATAL, Logger::AGENCY) << "Failed to persist active agency: " << e.what();
|
|
FATAL_ERROR_EXIT();
|
|
}
|
|
}
|
|
|
|
/// Load persistent state called once
|
|
void Agent::load() {
|
|
auto* sysDbFeature =
|
|
arangodb::application_features::ApplicationServer::lookupFeature<arangodb::SystemDatabaseFeature>();
|
|
arangodb::SystemDatabaseFeature::ptr vocbase =
|
|
sysDbFeature ? sysDbFeature->use() : nullptr;
|
|
auto queryRegistry = QueryRegistryFeature::registry();
|
|
|
|
if (vocbase == nullptr) {
|
|
LOG_TOPIC("63e36", FATAL, Logger::AGENCY) << "could not determine _system database";
|
|
FATAL_ERROR_EXIT();
|
|
}
|
|
|
|
{
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(guard, _ioLock); // need this for callback to set _spearhead
|
|
// Note that _state.loadCollections eventually does a callback to the
|
|
// setPersistedState method, which acquires _outputLock and _waitForCV.
|
|
|
|
LOG_TOPIC("c07e1", DEBUG, Logger::AGENCY) << "Loading persistent state.";
|
|
|
|
if (!_state.loadCollections(vocbase.get(), queryRegistry, _config.waitForSync())) {
|
|
LOG_TOPIC("9b680", FATAL, Logger::AGENCY)
|
|
<< "Failed to load persistent state on startup.";
|
|
FATAL_ERROR_EXIT();
|
|
}
|
|
}
|
|
|
|
// Note that the agent thread is terminated immediately when there is only
|
|
// one agent, since no AppendEntriesRPC have to be issued. Therefore,
|
|
// this thread is almost certainly terminated (and thus isStopping() returns
|
|
// true), when we get here.
|
|
if (size() > 1 && this->isStopping()) {
|
|
return;
|
|
}
|
|
|
|
wakeupMainLoop();
|
|
|
|
_compactor.start();
|
|
|
|
LOG_TOPIC("6e997", DEBUG, Logger::AGENCY) << "Starting spearhead worker.";
|
|
|
|
_constituent.start(vocbase.get(), queryRegistry);
|
|
persistConfiguration(term());
|
|
|
|
if (_config.supervision()) {
|
|
LOG_TOPIC("7658f", DEBUG, Logger::AGENCY) << "Starting cluster sanity facilities";
|
|
_supervision.start(this);
|
|
}
|
|
|
|
if (_inception != nullptr) { // resilient agency only
|
|
_inception->start();
|
|
} else {
|
|
MUTEX_LOCKER(guard, _ioLock); // need this for callback to set _spearhead
|
|
READ_LOCKER(guard2, _outputLock);
|
|
_spearhead = _readDB;
|
|
activateAgency();
|
|
}
|
|
}
|
|
|
|
/// Still leading? Under MUTEX from ::read or ::write
|
|
bool Agent::challengeLeadership() {
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
size_t good = 0;
|
|
|
|
std::string const myid = id();
|
|
|
|
for (auto const& i : _lastAcked) {
|
|
if (i.first != myid) { // do not count ourselves
|
|
duration<double> m = steady_clock::now() - i.second;
|
|
LOG_TOPIC("22f78", DEBUG, Logger::AGENCY)
|
|
<< "challengeLeadership: found "
|
|
"_lastAcked["
|
|
<< i.first << "] to be " << m.count() << " seconds in the past.";
|
|
|
|
// This is rather arbitrary here: We used to have 0.9 here to absolutely
|
|
// ensure that a leader resigns before another one even starts an
|
|
// election. However, the Raft paper does not mention this at all. Rather,
|
|
// in the paper it is written that the leader should resign immediately if
|
|
// it sees a higher term from another server. Currently we have not implemented
|
|
// to return the follower's term with a response to AppendEntriesRPC, so
|
|
// the leader cannot find out a higher term this way. The leader can,
|
|
// however, see a higher term in the incoming AppendEntriesRPC a new
|
|
// leader sends out, and it will immediately resign if it sees that. For
|
|
// the moment, this value here can stay. We should soon implement sending
|
|
// the follower's term back with each response and probably get rid of
|
|
// this method altogether, but this requires a bit more thought.
|
|
if (_config.maxPing() * _config.timeoutMult() > m.count()) {
|
|
++good;
|
|
}
|
|
}
|
|
}
|
|
LOG_TOPIC("0e75d", DEBUG, Logger::AGENCY) << "challengeLeadership: good=" << good;
|
|
|
|
return (good < size() / 2); // not counting myself
|
|
}
|
|
|
|
/// Get last acknowledged responses on leader
|
|
void Agent::lastAckedAgo(Builder& ret) const {
|
|
std::unordered_map<std::string, index_t> confirmed;
|
|
std::unordered_map<std::string, SteadyTimePoint> lastAcked;
|
|
std::unordered_map<std::string, SteadyTimePoint> lastSent;
|
|
index_t lastCompactionAt, nextCompactionAfter;
|
|
|
|
{
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
lastAcked = _lastAcked;
|
|
confirmed = _confirmed;
|
|
lastSent = _lastSent;
|
|
lastCompactionAt = _state.lastCompactionAt();
|
|
nextCompactionAfter = _state.nextCompactionAfter();
|
|
}
|
|
|
|
std::function<double(std::pair<std::string, SteadyTimePoint> const&)> dur2str =
|
|
[&](std::pair<std::string, SteadyTimePoint> const& i) {
|
|
return id() == i.first
|
|
? 0.0
|
|
: 1.0e-3 *
|
|
std::floor(
|
|
duration<double>(steady_clock::now() - i.second).count() * 1.0e3);
|
|
};
|
|
|
|
ret.add("lastCompactionAt", VPackValue(lastCompactionAt));
|
|
ret.add("nextCompactionAfter", VPackValue(nextCompactionAfter));
|
|
if (leading()) {
|
|
ret.add(VPackValue("lastAcked"));
|
|
VPackObjectBuilder b(&ret);
|
|
for (auto const& i : lastAcked) {
|
|
auto lsit = lastSent.find(i.first);
|
|
// Note that it is possible that a server is already in lastAcked
|
|
// but not yet in lastSent, since lastSent only has times of non-empty
|
|
// appendEntriesRPC calls, but we also get lastAcked entries for the
|
|
// empty ones.
|
|
ret.add(VPackValue(i.first));
|
|
{
|
|
VPackObjectBuilder o(&ret);
|
|
ret.add("lastAckedTime", VPackValue(dur2str(i)));
|
|
ret.add("lastAckedIndex", VPackValue(confirmed.at(i.first)));
|
|
if (i.first != id()) {
|
|
if (lsit != lastSent.end()) {
|
|
ret.add("lastAppend", VPackValue(dur2str(*lsit)));
|
|
} else {
|
|
ret.add("lastAppend", VPackValue(dur2str(i)));
|
|
// This is just for the above mentioned case, which will very
|
|
// soon be rectified.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Apply a batch of read and write queries to the spearhead on the leader.
///
/// Each element of `queries` whose first entry is an object is applied as a
/// write transaction and, on success, appended to the log; each element whose
/// first entry is a string is answered from the spearhead. The per-query
/// results are collected in an array.
///
/// @param queries array of queries
/// @return trans_ret_t(false, leader) when another agent is leader,
///         trans_ret_t(false, NO_LEADER) when leadership is lost on the way,
///         otherwise trans_ret_t(true, id(), maxind, failed, results) where
///         maxind is the last log index written and failed counts the write
///         transactions that were not applied.
trans_ret_t Agent::transact(query_t const& queries) {
  arangodb::consensus::index_t maxind = 0;  // maximum write index

  // Note that we are leading (_constituent.leading()) if and only
  // if _constituent.leaderId == our own ID. Therefore, we do not have
  // to use leading() or _constituent.leading() here, but can simply
  // look at the leaderID.
  auto leader = _constituent.leaderID();
  if (leader != id()) {
    return trans_ret_t(false, leader);
  }

  // Block until leadership preparation has finished.
  {
    CONDITION_LOCKER(guard, _waitForCV);
    while (getPrepareLeadership() != 0) {
      _waitForCV.wait(100);
    }
  }

  auto qs = queries->slice();
  size_t failed = 0;
  // Allocate the result builder BEFORE registering the transactions as
  // ongoing: nothing that can throw may sit between addTrxsOngoing() and the
  // TRI_DEFER that undoes it, otherwise the entries would stay registered.
  auto ret = std::make_shared<arangodb::velocypack::Builder>();
  {
    addTrxsOngoing(qs);  // remember that these are ongoing
    TRI_DEFER(removeTrxsOngoing(qs));
    // Note that once the transactions are in our log, we can remove them
    // from the list of ongoing ones, although they might not yet be committed.
    // This is because then, inquire will find them in the log and draw its
    // own conclusions. The map of ongoing trxs is only to cover the time
    // from when we receive the request until we have appended the trxs
    // ourselves.
    ret->openArray();

    // Only leader else redirect
    if (challengeLeadership()) {
      resign();
      return trans_ret_t(false, NO_LEADER);
    }

    term_t currentTerm = term();  // this is the term we will be working with

    // Check that we are actually still the leader:
    if (!leading()) {
      return trans_ret_t(false, NO_LEADER);
    }

    _tiLock.assertNotLockedByCurrentThread();
    MUTEX_LOCKER(ioLocker, _ioLock);

    for (const auto& query : VPackArrayIterator(qs)) {
      // Check that we are actually still the leader:
      if (!leading()) {
        return trans_ret_t(false, NO_LEADER);
      }
      if (query[0].isObject()) {
        check_ret_t res = _spearhead.applyTransaction(query);
        if (res.successful()) {
          // An optional third, string-typed element is passed on to the log.
          maxind = (query.length() == 3 && query[2].isString())
                       ? _state.logLeaderSingle(query[0], currentTerm, query[2].copyString())
                       : _state.logLeaderSingle(query[0], currentTerm);
          ret->add(VPackValue(maxind));
        } else {
          // Report what the spearhead holds for the failed part instead.
          _spearhead.read(res.failed->slice(), *ret);
          ++failed;
        }
      } else if (query[0].isString()) {
        _spearhead.read(query, *ret);
      }
    }
    ret->close();
  }

  // Report that leader has persisted
  reportIn(id(), maxind);

  // With a single agent there is nobody to wait for.
  if (size() == 1) {
    advanceCommitIndex();
  }

  return trans_ret_t(true, id(), maxind, failed, ret);
}
|
|
|
|
// Non-persistent write to non-persisted key-value store
|
|
trans_ret_t Agent::transient(query_t const& queries) {
|
|
// Note that we are leading (_constituent.leading()) if and only
|
|
// if _constituent.leaderId == our own ID. Therefore, we do not have
|
|
// to use leading() or _constituent.leading() here, but can simply
|
|
// look at the leaderID.
|
|
auto leader = _constituent.leaderID();
|
|
if (leader != id()) {
|
|
return trans_ret_t(false, leader);
|
|
}
|
|
|
|
{
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
while (getPrepareLeadership() != 0) {
|
|
_waitForCV.wait(100);
|
|
}
|
|
}
|
|
|
|
auto ret = std::make_shared<arangodb::velocypack::Builder>();
|
|
|
|
// Apply to spearhead and get indices for log entries
|
|
{
|
|
VPackArrayBuilder b(ret.get());
|
|
|
|
// Only leader else redirect
|
|
if (challengeLeadership()) {
|
|
resign();
|
|
return trans_ret_t(false, NO_LEADER);
|
|
}
|
|
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
|
|
// Read and writes
|
|
for (const auto& query : VPackArrayIterator(queries->slice())) {
|
|
if (query[0].isObject()) {
|
|
ret->add(VPackValue(_transient.applyTransaction(query).successful()));
|
|
} else if (query[0].isString()) {
|
|
_transient.read(query, *ret);
|
|
}
|
|
}
|
|
}
|
|
|
|
return trans_ret_t(true, id(), 0, 0, ret);
|
|
}
|
|
|
|
/// Look up the log indexes of previously submitted transactions.
///
/// Waits, polling every 0.1 seconds, while any of the string entries in
/// `query` is still registered as ongoing, then asks the log via
/// _state.inquire() under _ioLock.
///
/// @param query array of strings, each passed to isTrxOngoing()
/// @return write_ret_t(false, leader) if we are not (or no longer) the
///         leader, otherwise a result with accepted == true and the indexes
///         returned by _state.inquire().
write_ret_t Agent::inquire(query_t const& query) {
  // We are leading (_constituent.leading()) if and only if
  // _constituent.leaderId equals our own ID, so comparing the leader ID is
  // sufficient here.
  auto leader = _constituent.leaderID();
  if (leader != id()) {
    return write_ret_t(false, leader);
  }

  write_ret_t ret;

  // True while at least one of the requested transactions is still ongoing.
  auto const anyOngoing = [this, &query]() -> bool {
    for (auto const& entry : VPackArrayIterator(query->slice())) {
      std::string trxId = entry.copyString();
      if (isTrxOngoing(trxId)) {
        return true;
      }
    }
    return false;
  };

  while (anyOngoing()) {
    std::this_thread::sleep_for(std::chrono::duration<double>(0.1));
    // Leadership may have changed while we slept.
    leader = _constituent.leaderID();
    if (leader != id()) {
      return write_ret_t(false, leader);
    }
  }

  _tiLock.assertNotLockedByCurrentThread();
  MUTEX_LOCKER(ioLocker, _ioLock);

  ret.indices = _state.inquire(query);
  ret.accepted = true;

  return ret;
}
|
|
|
|
/// Write new entries to replicated state and store
|
|
write_ret_t Agent::write(query_t const& query, WriteMode const& wmode) {
|
|
std::vector<apply_ret_t> applied;
|
|
std::vector<index_t> indices;
|
|
auto multihost = size() > 1;
|
|
|
|
// Note that we are leading (_constituent.leading()) if and only
|
|
// if _constituent.leaderId == our own ID. Therefore, we do not have
|
|
// to use leading() or _constituent.leading() here, but can simply
|
|
// look at the leaderID.
|
|
auto leader = _constituent.leaderID();
|
|
if (multihost && leader != id()) {
|
|
return write_ret_t(false, leader);
|
|
}
|
|
|
|
if (!wmode.discardStartup()) {
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
while (getPrepareLeadership() != 0) {
|
|
_waitForCV.wait(100);
|
|
}
|
|
}
|
|
|
|
{
|
|
addTrxsOngoing(query->slice()); // remember that these are ongoing
|
|
TRI_DEFER(removeTrxsOngoing(query->slice()));
|
|
// Note that once the transactions are in our log, we can remove them
|
|
// from the list of ongoing ones, although they might not yet be committed.
|
|
// This is because then, inquire will find them in the log and draw its
|
|
// own conclusions. The map of ongoing trxs is only to cover the time
|
|
// from when we receive the request until we have appended the trxs
|
|
// ourselves.
|
|
|
|
auto slice = query->slice();
|
|
size_t ntrans = slice.length();
|
|
size_t npacks = ntrans / _config.maxAppendSize();
|
|
if (ntrans % _config.maxAppendSize() != 0) {
|
|
npacks++;
|
|
}
|
|
|
|
term_t currentTerm = term(); // this is the term we will be working with
|
|
|
|
// Check that we are actually still the leader:
|
|
if (!leading()) {
|
|
return write_ret_t(false, NO_LEADER);
|
|
}
|
|
|
|
// Apply to spearhead and get indices for log entries
|
|
// Avoid keeping lock indefinitely
|
|
for (size_t i = 0, l = 0; i < npacks; ++i) {
|
|
query_t chunk = std::make_shared<Builder>();
|
|
{
|
|
VPackArrayBuilder b(chunk.get());
|
|
for (size_t j = 0; j < _config.maxAppendSize() && l < ntrans; ++j, ++l) {
|
|
chunk->add(slice.at(l));
|
|
}
|
|
}
|
|
|
|
// Only leader else redirect
|
|
if (multihost && challengeLeadership()) {
|
|
resign();
|
|
return write_ret_t(false, NO_LEADER);
|
|
}
|
|
|
|
// Check that we are actually still the leader:
|
|
if (!leading()) {
|
|
return write_ret_t(false, NO_LEADER);
|
|
}
|
|
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
|
|
applied = _spearhead.applyTransactions(chunk, wmode);
|
|
auto tmp = _state.logLeaderMulti(chunk, applied, currentTerm);
|
|
indices.insert(indices.end(), tmp.begin(), tmp.end());
|
|
}
|
|
}
|
|
|
|
// Maximum log index
|
|
index_t maxind = 0;
|
|
if (!indices.empty()) {
|
|
maxind = *std::max_element(indices.begin(), indices.end());
|
|
}
|
|
|
|
// Report that leader has persisted
|
|
reportIn(id(), maxind);
|
|
|
|
if (size() == 1) {
|
|
advanceCommitIndex();
|
|
}
|
|
|
|
return write_ret_t(true, id(), applied, indices);
|
|
}
|
|
|
|
/// Read from store
|
|
read_ret_t Agent::read(query_t const& query) {
|
|
// Note that we are leading (_constituent.leading()) if and only
|
|
// if _constituent.leaderId == our own ID. Therefore, we do not have
|
|
// to use leading() or _constituent.leading() here, but can simply
|
|
// look at the leaderID.
|
|
auto leader = _constituent.leaderID();
|
|
if (leader != id()) {
|
|
return read_ret_t(false, leader);
|
|
}
|
|
|
|
{
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
while (getPrepareLeadership() != 0) {
|
|
_waitForCV.wait(100);
|
|
}
|
|
}
|
|
|
|
// Only leader else redirect
|
|
if (challengeLeadership()) {
|
|
resign();
|
|
return read_ret_t(false, NO_LEADER);
|
|
}
|
|
|
|
leader = _constituent.leaderID();
|
|
auto result = std::make_shared<arangodb::velocypack::Builder>();
|
|
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
|
|
// Retrieve data from readDB
|
|
std::vector<bool> success = _readDB.read(query, result);
|
|
|
|
return read_ret_t(true, leader, std::move(success), std::move(result));
|
|
}
|
|
|
|
/// Send out append entries to followers regularly or on event
|
|
void Agent::run() {
|
|
// Only run in case we are in multi-host mode
|
|
while (!this->isStopping() && size() > 1) {
|
|
{
|
|
// We set the variable to false here, if any change happens during
|
|
// or after the calls in this loop, this will be set to true to
|
|
// indicate no sleeping. Any change will happen under the mutex.
|
|
CONDITION_LOCKER(guard, _appendCV);
|
|
_agentNeedsWakeup = false;
|
|
}
|
|
|
|
if (leading() && getPrepareLeadership() == 1) {
|
|
// If we are officially leading but the _preparing flag is set, we
|
|
// are in the process of preparing for leadership. This flag is
|
|
// set when the Constituent celebrates an election victory. Here,
|
|
// in the main thread, we do the actual preparations:
|
|
|
|
if (!prepareLead()) {
|
|
_constituent.follow(0); // do not change _term or _votedFor
|
|
} else {
|
|
// we need to start work as leader
|
|
lead();
|
|
}
|
|
|
|
donePrepareLeadership(); // we are ready to roll, except that we
|
|
// have to wait for the _commitIndex to
|
|
// reach the end of our log
|
|
}
|
|
|
|
// Leader working only
|
|
if (leading()) {
|
|
if (1 == getPrepareLeadership()) {
|
|
// Skip the usual work and the waiting such that above preparation
|
|
// code runs immediately. We will return with value 2 such that
|
|
// replication and confirmation of it can happen. Service will
|
|
// continue once _commitIndex has reached the end of the log and then
|
|
// getPrepareLeadership() will finally return 0.
|
|
continue;
|
|
}
|
|
|
|
// Challenge leadership.
|
|
// Let's proactively know, that we no longer lead instead of finding out
|
|
// through read/write.
|
|
if (challengeLeadership()) {
|
|
resign();
|
|
continue;
|
|
}
|
|
|
|
// Append entries to followers
|
|
sendAppendEntriesRPC();
|
|
|
|
// Check whether we can advance _commitIndex
|
|
advanceCommitIndex();
|
|
|
|
// Empty store callback trash bin
|
|
emptyCbTrashBin();
|
|
|
|
bool commenceService = false;
|
|
{
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
if (leading() && getPrepareLeadership() == 2 && _commitIndex == _state.lastIndex()) {
|
|
commenceService = true;
|
|
}
|
|
}
|
|
|
|
if (commenceService) {
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
_spearhead = _readDB;
|
|
endPrepareLeadership(); // finally service can commence
|
|
}
|
|
|
|
// Go to sleep some:
|
|
{
|
|
CONDITION_LOCKER(guard, _appendCV);
|
|
if (!_agentNeedsWakeup) {
|
|
// wait up to minPing():
|
|
_appendCV.wait(static_cast<uint64_t>(1.0e6 * _config.minPing()));
|
|
// We leave minPing here without the multiplier to run this
|
|
// loop often enough in cases of high load.
|
|
}
|
|
}
|
|
} else {
|
|
CONDITION_LOCKER(guard, _appendCV);
|
|
if (!_agentNeedsWakeup) {
|
|
_appendCV.wait(1000000);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Write the current agency configuration through write().
///
/// Builds a single transaction
///   [[{RECONFIGURE: {"op": "set", "new": {...configuration...}}}]]
/// and submits it with WriteMode(true, true). The result of the write is
/// not inspected.
///
/// @param t term stored in the "term" attribute of the configuration
void Agent::persistConfiguration(term_t t) {
  auto agency = std::make_shared<Builder>();

  agency->openArray();   // list of transactions
  agency->openArray();   // the single transaction
  agency->openObject();  // its operations
  agency->add(VPackValue(RECONFIGURE));
  agency->openObject();  // the "set" operation
  agency->add("op", VPackValue("set"));
  agency->add(VPackValue("new"));
  agency->openObject();  // the new configuration value
  agency->add("term", VPackValue(t));
  agency->add(config_t::idStr, VPackValue(id()));
  agency->add(config_t::activeStr, _config.activeToBuilder()->slice());
  agency->add(config_t::poolStr, _config.poolToBuilder()->slice());
  agency->add("size", VPackValue(size()));
  agency->add(config_t::timeoutMultStr, VPackValue(_config.timeoutMult()));
  agency->close();  // configuration value
  agency->close();  // "set" operation
  agency->close();  // operations
  agency->close();  // transaction
  agency->close();  // list of transactions

  // In case we've lost leadership, no harm will arise as the failed write
  // prevents bogus agency configuration to be replicated among agents. ***
  write(agency, WriteMode(true, true));
}
|
|
|
|
/// Orderly shutdown
|
|
void Agent::beginShutdown() {
|
|
Thread::beginShutdown();
|
|
|
|
// Stop constituent and key value stores
|
|
_constituent.beginShutdown();
|
|
|
|
// Stop supervision
|
|
if (_config.supervision()) {
|
|
_supervision.beginShutdown();
|
|
}
|
|
|
|
// Stop inception process
|
|
if (_inception != nullptr) { // resilient agency only
|
|
_inception->beginShutdown();
|
|
}
|
|
|
|
// Compactor
|
|
_compactor.beginShutdown();
|
|
|
|
// Wake up all waiting rest handlers
|
|
{
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
_waitForCV.broadcast();
|
|
}
|
|
|
|
// Wake up run
|
|
wakeupMainLoop();
|
|
}
|
|
|
|
bool Agent::prepareLead() {
|
|
{
|
|
// Erase _earliestPackage, which allows for immediate sending of
|
|
// AppendEntriesRPC when we become a leader.
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
_earliestPackage.clear();
|
|
}
|
|
|
|
{
|
|
// Clear transient for supervision start
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
_transient.clear();
|
|
}
|
|
|
|
// Key value stores
|
|
try {
|
|
rebuildDBs();
|
|
} catch (std::exception const& e) {
|
|
LOG_TOPIC("aa3cd", ERR, Logger::AGENCY) << "Failed to rebuild key value stores." << e.what();
|
|
return false;
|
|
}
|
|
|
|
// Reset last acknowledged
|
|
{
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
for (auto const& i : _config.active()) {
|
|
_lastAcked[i] = steady_clock::now();
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Becoming leader
|
|
void Agent::lead() {
|
|
{
|
|
// We cannot start sendAppendentries before first log index.
|
|
// Any missing indices before _commitIndex were compacted.
|
|
// DO NOT EDIT without understanding the consequences for sendAppendEntries!
|
|
index_t commitIndex;
|
|
{
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
commitIndex = _commitIndex;
|
|
}
|
|
|
|
MUTEX_LOCKER(tiLocker, _tiLock);
|
|
for (auto& i : _confirmed) {
|
|
if (i.first != id()) {
|
|
i.second = commitIndex;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Agency configuration
|
|
term_t myterm;
|
|
myterm = _constituent.term();
|
|
|
|
persistConfiguration(myterm);
|
|
|
|
// This is all right now, in the main loop we will wait until a
|
|
// majority of all servers have replicated this configuration.
|
|
// Then we will copy the _readDB to the _spearhead and start service.
|
|
}
|
|
|
|
// How long back did I take over leadership, result in seconds
|
|
int64_t Agent::leaderFor() const {
|
|
return std::chrono::duration_cast<std::chrono::duration<int64_t>>(
|
|
std::chrono::steady_clock::now().time_since_epoch())
|
|
.count() -
|
|
_leaderSince;
|
|
}
|
|
|
|
/// Update a peer's endpoint from a message of the form {<uuid>: <endpoint>}.
///
/// Only the first key/value pair of the object is used.
///
/// @param message object whose first key is the peer's UUID and whose first
///                value is the new endpoint
/// @throws arangodb exception TRI_ERROR_AGENCY_INFORM_MUST_BE_OBJECT if the
///         message is not a non-empty object or if the UUID or the endpoint
///         cannot be extracted as strings
void Agent::updatePeerEndpoint(query_t const& message) {
  VPackSlice slice = message->slice();

  if (!slice.isObject() || slice.length() == 0) {
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_AGENCY_INFORM_MUST_BE_OBJECT,
                                   std::string("Improper greeting: ") + slice.toJson());
  }

  std::string uuid, endpoint;
  try {
    uuid = slice.keyAt(0).copyString();
  } catch (std::exception const& e) {
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_AGENCY_INFORM_MUST_BE_OBJECT,
                                   std::string("Cannot deal with UUID: ") + e.what());
  }

  try {
    endpoint = slice.valueAt(0).copyString();
  } catch (std::exception const& e) {
    // Name the endpoint here; this branch used to repeat the UUID wording,
    // which pointed at the wrong part of the message.
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_AGENCY_INFORM_MUST_BE_OBJECT,
                                   std::string("Cannot deal with endpoint: ") + e.what());
  }

  updatePeerEndpoint(uuid, endpoint);
}
|
|
|
|
bool Agent::addGossipPeer(std::string const& endpoint) {
|
|
return _config.addGossipPeer(endpoint);
|
|
}
|
|
|
|
void Agent::updatePeerEndpoint(std::string const& id, std::string const& ep) {
|
|
if (_config.updateEndpoint(id, ep)) {
|
|
if (!challengeLeadership()) {
|
|
persistConfiguration(term());
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Handle an inform message carrying a new agency configuration.
///
/// Validates the message, updates the constituent with the sender's id and
/// term, takes over the configuration and persists the active agents and
/// the pool. Note that the constituent is updated as soon as "id" and
/// "term" have been checked, i.e. before the remaining attributes are
/// validated.
///
/// @param message object with string "id", "term", array "active", object
///                "pool", numeric "min ping" and "max ping" and integer
///                "timeoutMult"
/// @throws arangodb exception with the matching TRI_ERROR_AGENCY_INFORM_*
///         code for the first attribute that is missing or of wrong type
void Agent::notify(query_t const& message) {
  VPackSlice slice = message->slice();

  if (!slice.isObject()) {
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_AGENCY_INFORM_MUST_BE_OBJECT,
        std::string("Inform message must be an object. Incoming type is ") +
            slice.typeName());
  }

  // A missing attribute yields a None slice, which fails every type test
  // below, so presence and type are checked in one go.
  VPackSlice const idSlice = slice.get("id");
  if (!idSlice.isString()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_ID);
  }
  VPackSlice const termSlice = slice.get("term");
  if (termSlice.isNone()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_TERM);
  }
  _constituent.update(idSlice.copyString(), termSlice.getUInt());

  if (!slice.get("active").isArray()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_ACTIVE);
  }
  if (!slice.get("pool").isObject()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_POOL);
  }
  if (!slice.get("min ping").isNumber()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_MIN_PING);
  }
  if (!slice.get("max ping").isNumber()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_MAX_PING);
  }
  if (!slice.get("timeoutMult").isInteger()) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_INFORM_MUST_CONTAIN_TIMEOUT_MULT);
  }

  _config.update(message);

  _state.persistActiveAgents(_config.activeToBuilder(), _config.poolToBuilder());
}
|
|
|
|
// Rebuild key value stores
|
|
void Agent::rebuildDBs() {
|
|
term_t term = _constituent.term();
|
|
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
WRITE_LOCKER(oLocker, _outputLock);
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
|
|
index_t lastCompactionIndex;
|
|
|
|
// We must go back to clean sheet
|
|
_readDB.clear();
|
|
_spearhead.clear();
|
|
|
|
if (!_state.loadLastCompactedSnapshot(_readDB, lastCompactionIndex, term)) {
|
|
THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_CANNOT_REBUILD_DBS);
|
|
}
|
|
|
|
_commitIndex = lastCompactionIndex;
|
|
_waitForCV.broadcast();
|
|
|
|
// Apply logs from last applied index to leader's commit index
|
|
LOG_TOPIC("b12cb", DEBUG, Logger::AGENCY)
|
|
<< "Rebuilding key-value stores from index " << lastCompactionIndex
|
|
<< " to " << _commitIndex << " " << _state;
|
|
|
|
{
|
|
auto logs = _state.slices(lastCompactionIndex + 1, _commitIndex);
|
|
_readDB.applyLogEntries(logs, _commitIndex, term, false /* do not send callbacks */);
|
|
}
|
|
_spearhead = _readDB;
|
|
|
|
LOG_TOPIC("a66dc", INFO, Logger::AGENCY) << id() << " rebuilt key-value stores - serving.";
|
|
}
|
|
|
|
/// Compact read db
|
|
void Agent::compact() {
|
|
// We do not allow the case _config.compactionKeepSize() == 0, since
|
|
// we need to keep a part of the recent log. Therefore we cannot use
|
|
// the _readDB ever, since we have to compute a state of the key/value
|
|
// space well before _commitIndex anyway. Apart from this, the compaction
|
|
// code runs on the followers as well where we do not have a _readDB
|
|
// anyway.
|
|
index_t commitIndex;
|
|
{
|
|
READ_LOCKER(guard, _outputLock);
|
|
commitIndex = _commitIndex;
|
|
}
|
|
|
|
if (commitIndex >= _state.nextCompactionAfter()) {
|
|
// This check needs to be here, because the compactor thread wakes us
|
|
// up every 5 seconds.
|
|
// Note that it is OK to compact anywhere before or at _commitIndex.
|
|
if (!_state.compact(commitIndex, _config.compactionKeepSize())) {
|
|
LOG_TOPIC("70234", WARN, Logger::AGENCY)
|
|
<< "Compaction for index " << commitIndex << " with keep size "
|
|
<< _config.compactionKeepSize() << " did not work.";
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Last commit index
|
|
arangodb::consensus::index_t Agent::lastCommitted() const {
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
return _commitIndex;
|
|
}
|
|
|
|
/// Last log entry
|
|
log_t Agent::lastLog() const { return _state.lastLog(); }
|
|
|
|
/// Get spearhead
|
|
Store const& Agent::spearhead() const { return _spearhead; }
|
|
|
|
/// Get _readDB reference with intentionally no lock acquired here.
|
|
/// Safe ONLY IF via executeLock() (see example Supervisor.cpp)
|
|
Store const& Agent::readDB() const { return _readDB; }
|
|
|
|
/// Get readdb
|
|
arangodb::consensus::index_t Agent::readDB(Node& node) const {
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
node = _readDB.get();
|
|
return _commitIndex;
|
|
}
|
|
|
|
/// Get readdb
|
|
arangodb::consensus::index_t Agent::readDB(VPackBuilder& builder) const {
|
|
TRI_ASSERT(builder.isOpenObject());
|
|
|
|
uint64_t commitIndex = 0;
|
|
|
|
{ READ_LOCKER(oLocker, _outputLock);
|
|
|
|
commitIndex = _commitIndex;
|
|
// commit index
|
|
builder.add("index", VPackValue(commitIndex));
|
|
builder.add("term", VPackValue(term()));
|
|
|
|
// key-value store {}
|
|
builder.add(VPackValue("agency"));
|
|
_readDB.get().toBuilder(builder, true); }
|
|
|
|
// replicated log []
|
|
_state.toVelocyPack(commitIndex, builder);
|
|
|
|
return commitIndex;
|
|
}
|
|
|
|
void Agent::executeLockedRead(std::function<void()> const& cb) {
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
READ_LOCKER(oLocker, _outputLock);
|
|
cb();
|
|
}
|
|
|
|
void Agent::executeLockedWrite(std::function<void()> const& cb) {
|
|
_tiLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(ioLocker, _ioLock);
|
|
WRITE_LOCKER(oLocker, _outputLock);
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
cb();
|
|
}
|
|
|
|
/// Get transient
|
|
/// intentionally no lock is acquired here, so we can return
|
|
/// a const reference
|
|
/// the caller has to make sure the lock is actually held
|
|
Store const& Agent::transient() const {
|
|
_ioLock.assertLockedByCurrentThread();
|
|
return _transient;
|
|
}
|
|
|
|
/// Rebuild from persisted state
|
|
void Agent::setPersistedState(VPackSlice const& compaction) {
|
|
// Catch up with compacted state, this is only called at startup
|
|
_spearhead = compaction;
|
|
|
|
// Catch up with commit
|
|
try {
|
|
WRITE_LOCKER(oLocker, _outputLock);
|
|
CONDITION_LOCKER(guard, _waitForCV);
|
|
_readDB = compaction;
|
|
_commitIndex =
|
|
arangodb::basics::StringUtils::uint64(compaction.get("_key").copyString());
|
|
_waitForCV.broadcast();
|
|
} catch (std::exception const& e) {
|
|
LOG_TOPIC("70844", ERR, Logger::AGENCY) << e.what() << " " << __FILE__ << __LINE__;
|
|
}
|
|
}
|
|
|
|
/// Are we still starting up?
|
|
bool Agent::booting() { return (!_config.poolComplete()); }
|
|
|
|
/// We expect an object as follows {id:<id>,endpoint:<endpoint>,pool:{...}}
|
|
/// key: uuid value: endpoint
|
|
/// Lock configuration and compare
|
|
/// Add whatever is missing in our list.
|
|
/// Compare whatever is in our list already. (ASSERT identity)
|
|
/// If I know more immediately contact peer with my list.
|
|
query_t Agent::gossip(query_t const& in, bool isCallback, size_t version) {
|
|
LOG_TOPIC("1ae7b", DEBUG, Logger::AGENCY) << "Incoming gossip: " << in->slice().toJson();
|
|
|
|
VPackSlice slice = in->slice();
|
|
if (!slice.isObject()) {
|
|
THROW_ARANGO_EXCEPTION_MESSAGE(
|
|
20001,
|
|
std::string("Gossip message must be an object. Incoming type is ") +
|
|
slice.typeName());
|
|
}
|
|
|
|
if (slice.hasKey(StaticStrings::Error)) {
|
|
if (slice.get(StaticStrings::Code).getNumber<int>() == 403) {
|
|
LOG_TOPIC("6591b", FATAL, Logger::AGENCY)
|
|
<< "Gossip peer does not have us in their pool " << slice.toJson();
|
|
FATAL_ERROR_EXIT(); /// We don't belong here
|
|
} else {
|
|
LOG_TOPIC("949bb", DEBUG, Logger::AGENCY)
|
|
<< "Received gossip error. We'll retry " << slice.toJson();
|
|
}
|
|
query_t out = std::make_shared<Builder>();
|
|
return out;
|
|
}
|
|
|
|
if (!slice.hasKey("id") || !slice.get("id").isString()) {
|
|
THROW_ARANGO_EXCEPTION_MESSAGE(
|
|
20002, "Gossip message must contain string parameter 'id'");
|
|
}
|
|
std::string id = slice.get("id").copyString();
|
|
|
|
// If pool is complete and id not in our pool reject under all circumstances
|
|
if (_config.poolComplete() && !_config.findInPool(id)) {
|
|
query_t ret = std::make_shared<VPackBuilder>();
|
|
{
|
|
VPackObjectBuilder o(ret.get());
|
|
ret->add(StaticStrings::Code, VPackValue(403));
|
|
ret->add(StaticStrings::Error, VPackValue(true));
|
|
ret->add(StaticStrings::ErrorMessage,
|
|
VPackValue("This agents is not member of this pool"));
|
|
ret->add(StaticStrings::ErrorNum, VPackValue(403));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
if (!slice.hasKey("endpoint") || !slice.get("endpoint").isString()) {
|
|
THROW_ARANGO_EXCEPTION_MESSAGE(
|
|
20003, "Gossip message must contain string parameter 'endpoint'");
|
|
}
|
|
std::string endpoint = slice.get("endpoint").copyString();
|
|
|
|
if (_inception != nullptr && isCallback) {
|
|
_inception->reportVersionForEp(endpoint, version);
|
|
}
|
|
|
|
LOG_TOPIC("9d2d9", TRACE, Logger::AGENCY)
|
|
<< "Gossip " << ((isCallback) ? "callback" : "call") << " from " << endpoint;
|
|
|
|
if (!slice.hasKey("pool") || !slice.get("pool").isObject()) {
|
|
THROW_ARANGO_EXCEPTION_MESSAGE(
|
|
20003, "Gossip message must contain object parameter 'pool'");
|
|
}
|
|
VPackSlice pslice = slice.get("pool");
|
|
|
|
LOG_TOPIC("65dd8", TRACE, Logger::AGENCY) << "Received gossip " << slice.toJson();
|
|
for (auto const& pair : VPackObjectIterator(pslice)) {
|
|
if (!pair.value.isString()) {
|
|
THROW_ARANGO_EXCEPTION_MESSAGE(
|
|
20004, "Gossip message pool must contain string parameters");
|
|
}
|
|
}
|
|
|
|
query_t out = std::make_shared<Builder>();
|
|
|
|
{
|
|
VPackObjectBuilder b(out.get());
|
|
|
|
std::unordered_set<std::string> gossipPeers = _config.gossipPeers();
|
|
if (!gossipPeers.empty() && !isCallback) {
|
|
try {
|
|
_config.eraseGossipPeer(endpoint);
|
|
} catch (std::exception const& e) {
|
|
LOG_TOPIC("58f08", ERR, Logger::AGENCY) << __FILE__ << ":" << __LINE__ << " " << e.what();
|
|
}
|
|
}
|
|
|
|
std::string err;
|
|
config_t::upsert_t upsert = config_t::UNCHANGED;
|
|
|
|
/// Pool incomplete or the other guy is in my pool: I'll gossip.
|
|
if (!_config.poolComplete() || _config.matchPeer(id, endpoint)) {
|
|
upsert = _config.upsertPool(pslice, id);
|
|
if (upsert == config_t::WRONG) {
|
|
LOG_TOPIC("32973", FATAL, Logger::AGENCY) << "Discrepancy in agent pool!";
|
|
FATAL_ERROR_EXIT(); /// disagreement over pool membership are fatal!
|
|
}
|
|
|
|
// Wrapped in envelope in RestAgencyPrivHandler
|
|
auto pool = _config.pool();
|
|
out->add(VPackValue("pool"));
|
|
{
|
|
VPackObjectBuilder bb(out.get());
|
|
for (auto const& i : pool) {
|
|
out->add(i.first, VPackValue(i.second));
|
|
}
|
|
}
|
|
|
|
} else { // Pool complete & id's endpoint not matching.
|
|
|
|
// Not leader: redirect / 503
|
|
if (challengeLeadership()) {
|
|
out->add("redirect", VPackValue(true));
|
|
out->add("id", VPackValue(leaderID()));
|
|
} else { // leader magic
|
|
auto tmp = _config;
|
|
tmp.upsertPool(pslice, id);
|
|
auto query = std::make_shared<VPackBuilder>();
|
|
{
|
|
VPackArrayBuilder trs(query.get());
|
|
{
|
|
VPackArrayBuilder tr(query.get());
|
|
{
|
|
VPackObjectBuilder o(query.get());
|
|
query->add(VPackValue(RECONFIGURE));
|
|
{
|
|
VPackObjectBuilder o(query.get());
|
|
query->add("op", VPackValue("set"));
|
|
query->add(VPackValue("new"));
|
|
{
|
|
VPackObjectBuilder c(query.get());
|
|
tmp.toBuilder(*query);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
LOG_TOPIC("e85f0", DEBUG, Logger::AGENCY)
|
|
<< "persisting new agency configuration via RAFT: " << query->toJson();
|
|
|
|
// Do write
|
|
write_ret_t ret;
|
|
try {
|
|
ret = write(query, WriteMode(false, true));
|
|
arangodb::consensus::index_t max_index = 0;
|
|
if (ret.indices.size() > 0) {
|
|
max_index = *std::max_element(ret.indices.begin(), ret.indices.end());
|
|
}
|
|
if (max_index > 0) { // We have a RAFT index. Wait for the RAFT commit.
|
|
auto result = waitFor(max_index);
|
|
if (result != Agent::raft_commit_t::OK) {
|
|
err =
|
|
"failed to retrieve RAFT index for updated agency endpoints";
|
|
} else {
|
|
auto pool = _config.pool();
|
|
out->add(VPackValue("pool"));
|
|
{
|
|
VPackObjectBuilder bb(out.get());
|
|
for (auto const& i : pool) {
|
|
out->add(i.first, VPackValue(i.second));
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
err = "failed to retrieve RAFT index for updated agency endpoints";
|
|
}
|
|
} catch (std::exception const& e) {
|
|
err = std::string("failed to write new agency to RAFT") + e.what();
|
|
LOG_TOPIC("17dc2", ERR, Logger::AGENCY) << err;
|
|
}
|
|
}
|
|
|
|
if (!err.empty()) {
|
|
out->add(StaticStrings::Code, VPackValue(500));
|
|
out->add(StaticStrings::Error, VPackValue(true));
|
|
out->add(StaticStrings::ErrorMessage, VPackValue(err));
|
|
out->add(StaticStrings::ErrorNum, VPackValue(500));
|
|
}
|
|
}
|
|
|
|
// let gossip loop know that it has new data
|
|
if (_inception != nullptr && upsert == config_t::CHANGED) {
|
|
_inception->signalConditionVar();
|
|
}
|
|
}
|
|
|
|
if (!isCallback) {
|
|
LOG_TOPIC("1e95f", TRACE, Logger::AGENCY) << "Answering with gossip " << out->slice().toJson();
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
/// Forward new minimum and maximum timeouts to _config.pingTimes().
void Agent::resetRAFTTimes(double min_timeout, double max_timeout) {
  _config.pingTimes(min_timeout, max_timeout);
}
|
|
|
|
/// Set the readiness flag. Called from the main thread of Inception.
void Agent::ready(bool b) {
  _ready = b;
}
|
|
|
|
bool Agent::ready() const {
|
|
if (size() == 1) {
|
|
return true;
|
|
}
|
|
|
|
return _ready;
|
|
}
|
|
|
|
|
|
|
|
void Agent::trashStoreCallback(std::string const& url, query_t const& body) {
|
|
|
|
auto const& slice = body->slice();
|
|
TRI_ASSERT(slice.isObject());
|
|
|
|
// body consists of object holding keys index, term and the observed keys
|
|
// we'll remove observation on every key and according observer url
|
|
for (auto const& i : VPackObjectIterator(slice)) {
|
|
if (!i.key.isEqualString("term") && !i.key.isEqualString("index")) {
|
|
MUTEX_LOCKER(lock, _cbtLock);
|
|
_callbackTrashBin[i.key.copyString()].emplace(url);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void Agent::emptyCbTrashBin() {
|
|
|
|
using clock = std::chrono::steady_clock;
|
|
|
|
auto envelope = std::make_shared<VPackBuilder>();
|
|
{
|
|
_cbtLock.assertNotLockedByCurrentThread();
|
|
MUTEX_LOCKER(lock, _cbtLock);
|
|
|
|
auto early =
|
|
std::chrono::duration_cast<std::chrono::seconds>(
|
|
clock::now() - _callbackLastPurged).count() < 10;
|
|
|
|
if (early || _callbackTrashBin.empty()) {
|
|
return;
|
|
}
|
|
|
|
{
|
|
VPackArrayBuilder trxs(envelope.get());
|
|
for (auto const& i : _callbackTrashBin) {
|
|
for (auto const& j : i.second) {
|
|
{
|
|
VPackArrayBuilder trx(envelope.get());
|
|
{
|
|
VPackObjectBuilder ak(envelope.get());
|
|
envelope->add(VPackValue(i.first));
|
|
{
|
|
VPackObjectBuilder oper(envelope.get());
|
|
envelope->add("op", VPackValue("unobserve"));
|
|
envelope->add("url", VPackValue(j));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
_callbackTrashBin.clear();
|
|
_callbackLastPurged = std::chrono::steady_clock::now();
|
|
}
|
|
|
|
LOG_TOPIC("12ad3", DEBUG, Logger::AGENCY) << "scheduling unobserve: " << envelope->toJson();
|
|
|
|
// This is a best effort attempt. If either the queueing or the write fail,
|
|
// while above _callbackTrashBin has been cleaned, entries will repopulate with
|
|
// future 404 errors, when they are triggered again. So either way these attempts
|
|
// are repeated until such time, when the callbacks are gone successfully through
|
|
// queue + write.
|
|
auto* scheduler = SchedulerFeature::SCHEDULER;
|
|
if (scheduler != nullptr) {
|
|
bool ok = scheduler->queue(RequestLane::INTERNAL_LOW, [envelope = std::move(envelope)] {
|
|
auto* agent = AgencyFeature::AGENT;
|
|
if (!application_features::ApplicationServer::isStopping() && agent) {
|
|
agent->write(envelope);
|
|
}
|
|
});
|
|
LOG_TOPIC_IF("52461", DEBUG, Logger::AGENCY, !ok) << "Could not schedule callback cleanup job.";
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/// @brief Rebuild the key-value store as of a given log index.
///
/// Loads the last compacted snapshot into a fresh store, clamps the requested
/// index between the snapshot index and the commit index, applies the log
/// entries in between (without performing callbacks) and dumps the store.
///
/// @param index  requested log index; clamped as described above
/// @return builder holding the dump of the rebuilt store
/// @throws TRI_ERROR_AGENCY_CANNOT_REBUILD_DBS if no compacted snapshot
///         could be loaded
query_t Agent::buildDB(arangodb::consensus::index_t index) {
  Store store(this);
  index_t snapshotIndex;
  term_t snapshotTerm;
  if (!_state.loadLastCompactedSnapshot(store, snapshotIndex, snapshotTerm)) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_AGENCY_CANNOT_REBUILD_DBS);
  }

  {
    // _commitIndex is read under the output lock
    READ_LOCKER(outputGuard, _outputLock);
    if (index > _commitIndex) {
      LOG_TOPIC("88754", INFO, Logger::AGENCY)
          << "Cannot snapshot beyond leaderCommitIndex: " << _commitIndex;
      index = _commitIndex;
    } else if (index < snapshotIndex) {
      LOG_TOPIC("cb67b", INFO, Logger::AGENCY)
          << "Cannot snapshot before last compaction index: " << snapshotIndex;
      index = snapshotIndex;
    }
  }

  if (index <= snapshotIndex) {
    // nothing to replay on top of the snapshot: apply an empty array
    VPackBuilder noEntries;
    noEntries.openArray();
    noEntries.close();
    store.applyLogEntries(noEntries, index, snapshotTerm, false /* do not perform callbacks */);
  } else {
    // replay everything after the snapshot up to and including index
    auto entries = _state.slices(snapshotIndex + 1, index);
    store.applyLogEntries(entries, index, snapshotTerm, false /* do not perform callbacks */);
  }

  auto dump = std::make_shared<VPackBuilder>();
  store.toBuilder(*dump);

  return dump;
}
|
|
|
|
/// @brief Register the ids of the given transactions as ongoing.
///
/// Only array entries of the shape [object, <any>, string] are considered;
/// the string at position 2 is the id that gets recorded.
///
/// @param trxs  array of transactions
void Agent::addTrxsOngoing(Slice trxs) {
  try {
    MUTEX_LOCKER(trxGuard, _trxsLock);
    for (auto const& candidate : VPackArrayIterator(trxs)) {
      // skip everything that does not carry an id at position 2
      if (!candidate.isArray() || candidate.length() != 3 ||
          !candidate[0].isObject() || !candidate[2].isString()) {
        continue;
      }
      _ongoingTrxs.insert(candidate[2].copyString());
    }
  } catch (...) {
    // intentionally ignored: any exception simply ends the registration
  }
}
|
|
|
|
/// @brief Remove the ids of the given transactions from the ongoing set.
///
/// Only array entries of the shape [object, <any>, string] are considered;
/// the string at position 2 is the id that gets erased.
///
/// @param trxs  array of transactions
void Agent::removeTrxsOngoing(Slice trxs) {
  try {
    MUTEX_LOCKER(trxGuard, _trxsLock);
    for (auto const& candidate : VPackArrayIterator(trxs)) {
      // skip everything that does not carry an id at position 2
      if (!candidate.isArray() || candidate.length() != 3 ||
          !candidate[0].isObject() || !candidate[2].isString()) {
        continue;
      }
      _ongoingTrxs.erase(candidate[2].copyString());
    }
  } catch (...) {
    // intentionally ignored: any exception simply ends the removal
  }
}
|
|
|
|
bool Agent::isTrxOngoing(std::string& id) {
|
|
try {
|
|
MUTEX_LOCKER(guard, _trxsLock);
|
|
auto it = _ongoingTrxs.find(id);
|
|
return it != _ongoingTrxs.end();
|
|
} catch (...) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/// @brief Non-owning access to the inception object held by this agent.
/// @return raw pointer obtained from `_inception.get()`
Inception const* Agent::inception() const {
  return _inception.get();
}
|
|
|
|
/// @brief Pass a configuration update on to the agent configuration.
/// @param slice  update, forwarded unchanged to `_config.updateConfiguration()`
void Agent::updateConfiguration(Slice const& slice) {
  _config.updateConfiguration(slice);
}
|
|
|
|
} // namespace consensus
|
|
} // namespace arangodb
|