//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Kaveh Vahedipour //////////////////////////////////////////////////////////////////////////////// #include "Constituent.h" #include #include #include #include #include #include "Agency/Agent.h" #include "Aql/Query.h" #include "Aql/QueryRegistry.h" #include "Basics/ConditionLocker.h" #include "Cluster/ClusterComm.h" #include "Logger/Logger.h" #include "Random/RandomGenerator.h" #include "Utils/OperationOptions.h" #include "Utils/OperationResult.h" #include "Utils/SingleCollectionTransaction.h" #include "Transaction/StandaloneContext.h" #include "VocBase/ticks.h" #include "VocBase/vocbase.h" using namespace arangodb::consensus; using namespace arangodb::rest; using namespace arangodb::velocypack; using namespace arangodb; // (std::numeric_limits::max)(); /// Raft role names for display purposes const std::vector roleStr({"Follower", "Candidate", "Leader"}); /// Configure with agent's configuration void Constituent::configure(Agent* agent) { MUTEX_LOCKER(guard, _castLock); _agent = agent; TRI_ASSERT(_agent != nullptr); if (size() == 1) { _role = LEADER; LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _role to LEADER in term " << _term; } } // Default ctor Constituent::Constituent() : Thread("Constituent"), _vocbase(nullptr), _queryRegistry(nullptr), _term(0), _cast(false), _leaderID(NO_LEADER), _lastHeartbeatSeen(0.0), _role(FOLLOWER), _agent(nullptr), _votedFor(NO_LEADER) {} /// Shutdown if not already Constituent::~Constituent() { if (!isStopping()) { shutdown(); } } /// Wait for sync bool Constituent::waitForSync() const { return _agent->config().waitForSync(); } /// Get my term term_t Constituent::term() const { MUTEX_LOCKER(guard, _castLock); return _term; } /// Update my term void Constituent::term(term_t t) { MUTEX_LOCKER(guard, _castLock); termNoLock(t); } void Constituent::termNoLock(term_t t) { // Only call this when you have the _castLock term_t tmp = _term; _term = t; if (tmp != t) { LOG_TOPIC(DEBUG, Logger::AGENCY) << _id << ": changing term, current role:" << roleStr[_role] << " new term " << t; _cast = false; if (!_votedFor.empty()) { Builder body; { VPackObjectBuilder b(&body); std::ostringstream i_str; i_str << std::setw(20) << std::setfill('0') << t; body.add("_key", Value(i_str.str())); body.add("term", Value(t)); body.add("voted_for", Value(_votedFor)); } TRI_ASSERT(_vocbase != nullptr); auto transactionContext = std::make_shared(_vocbase); SingleCollectionTransaction trx(transactionContext, "election", AccessMode::Type::WRITE); auto res = trx.begin(); if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); } OperationOptions options; options.waitForSync = _agent->config().waitForSync(); options.silent = true; OperationResult result; try { result = trx.insert("election", body.slice(), options); } catch (std::exception const& e) { LOG_TOPIC(FATAL, Logger::AGENCY) << "Failed to persist RAFT election ballot: " << e.what() << ". Bailing out." << __FILE__ << ":" << __LINE__; FATAL_ERROR_EXIT(); } res = trx.finish(result.code); } } } bool Constituent::logUpToDate( arangodb::consensus::index_t prevLogIndex, term_t prevLogTerm) const { log_t myLastLogEntry = _agent->state().lastLog(); return (prevLogTerm > myLastLogEntry.term || (prevLogTerm == myLastLogEntry.term && prevLogIndex >= myLastLogEntry.index)); } bool Constituent::logMatches( arangodb::consensus::index_t prevLogIndex, term_t prevLogTerm) const { int res = _agent->state().checkLog(prevLogIndex, prevLogTerm); if (res == 1) { return true; } else if (res == -1) { return false; } else { return true; // This is important: If we have compacted away this log // entry, then we know that this or a later entry was // already committed by a majority and is therefore // set in stone. Therefore the check must return true // here and this is correct behaviour. // The other case in which we do not have the log entry // is if it is so new that we have never heard about it // in this case we can safely return true here as well, // since we will replace our own log anyway in the very // near future. } } /// My role role_t Constituent::role() const { MUTEX_LOCKER(guard, _castLock); return _role; } /// Become follower in term void Constituent::follow(term_t t) { MUTEX_LOCKER(guard, _castLock); followNoLock(t); } void Constituent::followNoLock(term_t t) { _term = t; _role = FOLLOWER; LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _role to FOLLOWER in term " << _term; if (_leaderID == _id) { _leaderID = NO_LEADER; LOG_TOPIC(DEBUG, Logger::AGENCY) << "Setting _leaderID to NO_LEADER."; } else { LOG_TOPIC(INFO, Logger::AGENCY) << _id << ": following " << _leaderID << " in term " << t ; } CONDITION_LOCKER(guard, _cv); _cv.signal(); } /// Become leader void Constituent::lead(term_t term) { // we need to rebuild spear_head and read_db _agent->beginPrepareLeadership(); TRI_DEFER(_agent->endPrepareLeadership()); if (!_agent->prepareLead()) { { MUTEX_LOCKER(guard, _castLock); followNoLock(term); } return; } { MUTEX_LOCKER(guard, _castLock); // if we already have a higher term, ignore this request if (term < _term) { followNoLock(_term); return; } // if we already lead, ignore this request if (_role == LEADER) { TRI_ASSERT(_leaderID == _id); return; } // I'm the leader _role = LEADER; LOG_TOPIC(INFO, Logger::AGENCY) << _id << ": leading in term " << _term; _leaderID = _id; } // we need to start work as leader _agent->lead(); } /// Become candidate void Constituent::candidate() { MUTEX_LOCKER(guard, _castLock); if (_leaderID != NO_LEADER) { _leaderID = NO_LEADER; LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _leaderID to NO_LEADER"; } if (_role != CANDIDATE) { _role = CANDIDATE; LOG_TOPIC(INFO, Logger::AGENCY) << _id << ": candidating in term " << _term; } } /// Leading? bool Constituent::leading() const { MUTEX_LOCKER(guard, _castLock); return _role == LEADER; } /// Following? bool Constituent::following() const { MUTEX_LOCKER(guard, _castLock); return _role == FOLLOWER; } /// Running as candidate? bool Constituent::running() const { MUTEX_LOCKER(guard, _castLock); return _role == CANDIDATE; } /// Get current leader's id std::string Constituent::leaderID() const { MUTEX_LOCKER(guard, _castLock); return _leaderID; } /// Agency size size_t Constituent::size() const { return _agent->config().size(); } /// Get endpoint to an id std::string Constituent::endpoint(std::string id) const { return _agent->config().poolAt(id); } /// @brief Check leader bool Constituent::checkLeader( term_t term, std::string id, index_t prevLogIndex, term_t prevLogTerm) { TRI_ASSERT(_vocbase != nullptr); MUTEX_LOCKER(guard, _castLock); LOG_TOPIC(TRACE, Logger::AGENCY) << "checkLeader(term: " << term << ", leaderId: "<< id << ", prev-log-index: " << prevLogIndex << ", prev-log-term: " << prevLogTerm << ") in term " << _term; if (term < _term) { return false; } _lastHeartbeatSeen = TRI_microtime(); LOG_TOPIC(TRACE, Logger::AGENCY) << "setting last heartbeat: " << _lastHeartbeatSeen; if (term > _term) { termNoLock(term); } if (!logMatches(prevLogIndex, prevLogTerm)) { return false; } if (_leaderID != id) { LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _leaderID to " << id << " in term " << _term; _leaderID = id; TRI_ASSERT(_leaderID != _id); if (_role != FOLLOWER) { followNoLock(term); } } return true; } /// @brief Vote bool Constituent::vote(term_t termOfPeer, std::string id, index_t prevLogIndex, term_t prevLogTerm) { if (!_agent->ready()) { return false; } TRI_ASSERT(_vocbase != nullptr); LOG_TOPIC(TRACE, Logger::AGENCY) << "vote(termOfPeer: " << termOfPeer << ", leaderId: " << id << ", prev-log-index: " << prevLogIndex << ", prev-log-term: " << prevLogTerm << ") in (my) term " << _term; MUTEX_LOCKER(guard, _castLock); if (termOfPeer > _term) { termNoLock(termOfPeer); if (_role != FOLLOWER) { followNoLock(_term); } _cast = false; _votedFor = ""; } else if (termOfPeer < _term) { // termOfPeer < _term, simply ignore and do not vote: LOG_TOPIC(DEBUG, Logger::AGENCY) << "ignoring RequestVoteRPC with old term " << termOfPeer << ", we are already at term " << _term; return false; } if (_cast) { // already voted in this term if (_votedFor == id) { LOG_TOPIC(DEBUG, Logger::AGENCY) << "repeating vote for " << id; return true; } LOG_TOPIC(DEBUG, Logger::AGENCY) << "not voting for " << id << " since we have already voted for " << _votedFor << " in this term"; return false; } // Now decide whether or not we vote for this server, we have to // take into account paragraph 5.4.1 in the Raft paper, so we need // to check that his log is at least as up to date as ours: log_t myLastLogEntry = _agent->state().lastLog(); if (prevLogTerm > myLastLogEntry.term || (prevLogTerm == myLastLogEntry.term && prevLogIndex >= myLastLogEntry.index)) { LOG_TOPIC(DEBUG, Logger::AGENCY) << "voting for " << id << " in term " << _term; _cast = true; _votedFor = id; return true; } LOG_TOPIC(DEBUG, Logger::AGENCY) << "not voting for " << id << " since his log is not up to date: " << "my last log entry: (" << myLastLogEntry.term << ", " << myLastLogEntry.index << "), his last log entry: (" << prevLogTerm << ", " << prevLogIndex << ")"; return false; // do not vote for this uninformed guy! } /// @brief Call to election void Constituent::callElection() { using namespace std::chrono; auto timeout = steady_clock::now() + duration(_agent->config().minPing()); std::vector active = _agent->config().active(); CoordTransactionID coordinatorTransactionID = TRI_NewTickServer(); term_t savedTerm; { MUTEX_LOCKER(locker, _castLock); this->termNoLock(_term + 1); // raise my term _cast = true; _votedFor = _id; savedTerm = _term; LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _leaderID to NO_LEADER" << " in term " << _term; _leaderID = NO_LEADER; } std::string body; std::stringstream path; path << "/_api/agency_priv/requestVote?term=" << savedTerm << "&candidateId=" << _id << "&prevLogIndex=" << _agent->lastLog().index << "&prevLogTerm=" << _agent->lastLog().term; auto cc = ClusterComm::instance(); // Ask everyone for their vote for (auto const& i : active) { if (i != _id) { auto headerFields = std::make_unique>(); if (!isStopping() && cc != nullptr) { cc->asyncRequest( "", coordinatorTransactionID, _agent->config().poolAt(i), rest::RequestType::GET, path.str(), std::make_shared(body), headerFields, nullptr, 0.9 * _agent->config().minPing(), true); } } } // Collect ballots. I vote for myself. size_t yea = 1; size_t nay = 0; size_t majority = size() / 2 + 1; // We collect votes, we leave the following loop when one of the following // conditions is met: // (1) A majority of nay votes have been received // (2) A majority of yea votes (including ourselves) have been received // (3) At least yyy time has passed, in this case we give up without // a conclusive vote. while (true) { if (steady_clock::now() >= timeout) { // Timeout. follow(_term); break; } if (!isStopping() && cc != nullptr) { auto res = ClusterComm::instance()->wait( "", coordinatorTransactionID, 0, "", duration(timeout - steady_clock::now()).count()); if (res.status == CL_COMM_SENT) { auto body = res.result->getBodyVelocyPack(); VPackSlice slc = body->slice(); // Got ballot if (slc.isObject() && slc.hasKey("term") && slc.hasKey("voteGranted")) { // Follow right away? term_t t = slc.get("term").getUInt(); if (t > _term) { follow(t); break; } // Check result and counts if(slc.get("voteGranted").getBool()) { // majority in favour? if (++yea >= majority) { lead(savedTerm); break; } // Vote is counted as yea, continue loop continue; } } } } // Count the vote as a nay if (++nay >= majority) { // Network: majority against? follow(_term); break; } } LOG_TOPIC(DEBUG, Logger::AGENCY) << "Election: Have received " << yea << " yeas and " << nay << " nays, the " << (yea >= majority ? "yeas" : "nays") << " have it."; // Clean up if (!isStopping() && cc != nullptr) { ClusterComm::instance()->drop("", coordinatorTransactionID, 0, ""); } } void Constituent::update(std::string const& leaderID, term_t t) { MUTEX_LOCKER(guard, _castLock); _term = t; if (_leaderID != leaderID) { LOG_TOPIC(DEBUG, Logger::AGENCY) << "Constituent::update: setting _leaderID to " << leaderID << " in term " << _term; _leaderID = leaderID; _role = FOLLOWER; } } /// Start clean shutdown void Constituent::beginShutdown() { Thread::beginShutdown(); CONDITION_LOCKER(guard, _cv); guard.broadcast(); } /// Start operation bool Constituent::start(TRI_vocbase_t* vocbase, aql::QueryRegistry* queryRegistry) { TRI_ASSERT(vocbase != nullptr); _vocbase = vocbase; _queryRegistry = queryRegistry; return Thread::start(); } /// Get persisted information and run election process void Constituent::run() { // single instance _id = _agent->config().id(); TRI_ASSERT(_vocbase != nullptr); auto bindVars = std::make_shared(); bindVars->openObject(); bindVars->close(); // Most recent vote { std::string const aql("FOR l IN election SORT l._key DESC LIMIT 1 RETURN l"); arangodb::aql::Query query(false, _vocbase, arangodb::aql::QueryString(aql), bindVars, nullptr, arangodb::aql::PART_MAIN); auto queryResult = query.execute(_queryRegistry); if (queryResult.code != TRI_ERROR_NO_ERROR) { THROW_ARANGO_EXCEPTION_MESSAGE(queryResult.code, queryResult.details); } VPackSlice result = queryResult.result->slice(); if (result.isArray()) { for (auto const& i : VPackArrayIterator(result)) { auto ii = i.resolveExternals(); try { MUTEX_LOCKER(locker, _castLock); _term = ii.get("term").getUInt(); _votedFor = ii.get("voted_for").copyString(); } catch (std::exception const&) { LOG_TOPIC(ERR, Logger::AGENCY) << "Persisted election entries corrupt! Defaulting term,vote (0,0)"; } } } } std::vector act = _agent->config().active(); while ( !this->isStopping() // Obvious && (!_agent->ready() || find(act.begin(), act.end(), _id) == act.end())) { // Active agent CONDITION_LOCKER(guardv, _cv); _cv.wait(50000); act = _agent->config().active(); } if (size() == 1) { _leaderID = _agent->config().id(); LOG_TOPIC(DEBUG, Logger::AGENCY) << "Set _leaderID to " << _leaderID << " in term " << _term; } else { { MUTEX_LOCKER(guard, _castLock); _role = FOLLOWER; } while (!this->isStopping()) { if (_role == FOLLOWER) { static double const M = 1.0e6; int64_t a = static_cast(M * _agent->config().minPing()); int64_t b = static_cast(M * _agent->config().maxPing()); int64_t randTimeout = RandomGenerator::interval(a, b); int64_t randWait = randTimeout; { MUTEX_LOCKER(guard, _castLock); // in the beginning, pure random, after that, we might have to // wait for less than planned, since the last heartbeat we have // seen is already some time ago, note that this waiting time // can become negative: if (_lastHeartbeatSeen > 0.0) { double now = TRI_microtime(); randWait -= static_cast(M * (now-_lastHeartbeatSeen)); if (randWait < a) { randWait = a; } else if (randWait > b) { randWait = b; } } } LOG_TOPIC(TRACE, Logger::AGENCY) << "Random timeout: " << randTimeout << ", wait: " << randWait; if (randWait > 0.0) { CONDITION_LOCKER(guardv, _cv); _cv.wait(randWait); } bool isTimeout = false; { MUTEX_LOCKER(guard, _castLock); if (_lastHeartbeatSeen <= 0.0) { LOG_TOPIC(TRACE, Logger::AGENCY) << "no heartbeat seen"; isTimeout = true; } else { double diff = TRI_microtime() - _lastHeartbeatSeen; LOG_TOPIC(TRACE, Logger::AGENCY) << "last heartbeat: " << diff << "sec ago"; isTimeout = (static_cast(M * diff) > randTimeout); } } if (isTimeout) { LOG_TOPIC(TRACE, Logger::AGENCY) << "timeout, calling an election"; candidate(); } } else if (_role == CANDIDATE) { callElection(); // Run for office } else { int32_t left = static_cast(100000.0 * _agent->config().minPing()); long randTimeout = static_cast(left); { CONDITION_LOCKER(guardv, _cv); _cv.wait(randTimeout); } } } } }