mirror of https://gitee.com/bigwinds/arangodb
agency wouldn't restart from persisted state
This commit is contained in:
parent
fe72c4d87d
commit
2b93488e3f
|
@ -519,10 +519,11 @@ void Constituent::run() {
|
|||
|
||||
if (result.isArray()) {
|
||||
for (auto const& i : VPackArrayIterator(result)) {
|
||||
auto ii = i.resolveExternals();
|
||||
try {
|
||||
MUTEX_LOCKER(locker, _castLock);
|
||||
_term = i.get("term").getUInt();
|
||||
_votedFor = i.get("voted_for").copyString();
|
||||
_term = ii.get("term").getUInt();
|
||||
_votedFor = ii.get("voted_for").copyString();
|
||||
} catch (std::exception const&) {
|
||||
LOG_TOPIC(ERR, Logger::AGENCY)
|
||||
<< "Persisted election entries corrupt! Defaulting term,vote (0,0)";
|
||||
|
|
|
@ -37,8 +37,6 @@ class GossipCallback : public arangodb::ClusterCommCallback {
|
|||
|
||||
virtual bool operator()(arangodb::ClusterCommResult*) override final;
|
||||
|
||||
void shutdown();
|
||||
|
||||
private:
|
||||
Agent* _agent;
|
||||
};
|
||||
|
|
|
@ -25,10 +25,12 @@
|
|||
|
||||
#include "Agency/Agent.h"
|
||||
#include "Agency/GossipCallback.h"
|
||||
#include "Agency/MeasureCallback.h"
|
||||
#include "Basics/ConditionLocker.h"
|
||||
#include "Cluster/ClusterComm.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <numeric>
|
||||
#include <thread>
|
||||
|
||||
using namespace arangodb::consensus;
|
||||
|
@ -105,7 +107,7 @@ void Inception::gossip() {
|
|||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Stopping active gossipping!";
|
||||
} else {
|
||||
LOG_TOPIC(ERR, Logger::AGENCY)
|
||||
<< "Failed to find complete pool of agents. Giving up!";
|
||||
<< "Failed to find complete pool of agents. Giving up!";
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -225,7 +227,7 @@ bool Inception::restartingActiveAgent() {
|
|||
if (active.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
for (auto& i : active) {
|
||||
|
||||
if (i != myConfig.id() && i != "") {
|
||||
|
@ -236,9 +238,9 @@ bool Inception::restartingActiveAgent() {
|
|||
std::unordered_map<std::string, std::string>(), 2.0);
|
||||
|
||||
if (comres->status == CL_COMM_SENT) {
|
||||
|
||||
|
||||
try {
|
||||
|
||||
|
||||
auto theirActive = comres->result->getBodyVelocyPack()->
|
||||
slice().get("configuration").get("active").toJson();
|
||||
auto myActive = myConfig.activeToBuilder()->toJson();
|
||||
|
@ -252,23 +254,15 @@ bool Inception::restartingActiveAgent() {
|
|||
} else {
|
||||
i = "";
|
||||
}
|
||||
|
||||
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(FATAL, Logger::AGENCY)
|
||||
<< "Assumed active RAFT peer has no active agency list: " << e.what()
|
||||
<< "Administrative intervention needed.";
|
||||
FATAL_ERROR_EXIT();
|
||||
return false;
|
||||
}
|
||||
|
||||
} else {
|
||||
/*
|
||||
LOG_TOPIC(FATAL, Logger::AGENCY)
|
||||
<< "Assumed active RAFT peer and I disagree on active membership."
|
||||
<< "Administrative intervention needed.";
|
||||
FATAL_ERROR_EXIT();
|
||||
return false;*/
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -290,8 +284,82 @@ bool Inception::restartingActiveAgent() {
|
|||
}
|
||||
|
||||
return false;
|
||||
|
||||
|
||||
}
|
||||
|
||||
inline static int64_t timeStamp() {
|
||||
using namespace std::chrono;
|
||||
return duration_cast<milliseconds>(
|
||||
steady_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
||||
void Inception::reportIn(std::string const& peerId, uint64_t start) {
|
||||
MUTEX_LOCKER(lock, _pLock);
|
||||
_pings.push_back((double)(timeStamp()-start));
|
||||
}
|
||||
|
||||
bool Inception::estimateRAFTInterval() {
|
||||
|
||||
using namespace std::chrono;
|
||||
|
||||
auto pool = _agent->config().pool();
|
||||
std::string path("/_api/agency/config");
|
||||
for (size_t i = 0; i < 25; ++i) {
|
||||
for (auto const& peer : pool) {
|
||||
std::string clientid = peer.first + std::to_string(i);
|
||||
auto hf =
|
||||
std::make_unique<std::unordered_map<std::string, std::string>>();
|
||||
arangodb::ClusterComm::instance()->asyncRequest(
|
||||
clientid, 1, peer.second, rest::RequestType::GET, path,
|
||||
std::make_shared<std::string>(), hf,
|
||||
std::make_shared<MeasureCallback>(this, peer.second, timeStamp()),
|
||||
2.0, true);
|
||||
}
|
||||
}
|
||||
|
||||
auto s = system_clock::now();
|
||||
seconds timeout(3);
|
||||
while (true) {
|
||||
|
||||
_cv.wait(50000);
|
||||
|
||||
{
|
||||
MUTEX_LOCKER(lock, _pLock);
|
||||
if (_pings.size() == 25*pool.size()) {
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "All pings are in";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ((system_clock::now() - s) > timeout) {
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Timed out waiting for pings";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
MUTEX_LOCKER(lock, _pLock);
|
||||
size_t num = _pings.size();
|
||||
|
||||
if (num > 0) {
|
||||
|
||||
double sum, mean, sq_sum, stdev;
|
||||
sum = std::accumulate(_pings.begin(), _pings.end(), 0.0);
|
||||
mean = sum / num;
|
||||
std::vector<double> diff(num);
|
||||
std::transform(_pings.begin(), _pings.end(), _pings.begin(),
|
||||
std::bind2nd(std::minus<double>(), mean));
|
||||
sq_sum =
|
||||
std::inner_product(_pings.begin(), _pings.end(), _pings.begin(), 0.0);
|
||||
stdev = std::sqrt(sq_sum / num);
|
||||
|
||||
LOG(DEBUG) << "mean(" << mean << ") stdev(" << stdev<< ")";
|
||||
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -306,24 +374,22 @@ void Inception::run() {
|
|||
// 1. If active agency, do as you're told
|
||||
if (activeAgencyFromPersistence()) {
|
||||
_agent->ready(true);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// 2. If we think that we used to be active agent
|
||||
if (restartingActiveAgent()) {
|
||||
_agent->ready(true);
|
||||
return;
|
||||
if (!_agent->ready() && restartingActiveAgent()) {
|
||||
_agent->ready(true);
|
||||
}
|
||||
|
||||
|
||||
// 3. Else gossip
|
||||
config_t config = _agent->config();
|
||||
if (!config.poolComplete()) {
|
||||
if (!_agent->ready() && !config.poolComplete()) {
|
||||
gossip();
|
||||
}
|
||||
|
||||
// 4. If still incomplete bail out :(
|
||||
config = _agent->config();
|
||||
if (!config.poolComplete()) {
|
||||
if (!_agent->ready() && !config.poolComplete()) {
|
||||
LOG_TOPIC(FATAL, Logger::AGENCY)
|
||||
<< "Failed to build environment for RAFT algorithm. Bailing out!";
|
||||
FATAL_ERROR_EXIT();
|
||||
|
@ -331,6 +397,10 @@ void Inception::run() {
|
|||
|
||||
_agent->ready(true);
|
||||
|
||||
if (_agent->ready()) {
|
||||
estimateRAFTInterval();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// @brief Graceful shutdown
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
|
||||
#include "Basics/Common.h"
|
||||
#include "Basics/ConditionVariable.h"
|
||||
#include "Basics/Mutex.h"
|
||||
#include "Basics/Thread.h"
|
||||
|
||||
#include <velocypack/Builder.h>
|
||||
|
@ -53,6 +54,9 @@ public:
|
|||
/// @brief Defualt dtor
|
||||
virtual ~Inception();
|
||||
|
||||
/// @brief Report in from callbacks
|
||||
void reportIn(std::string const&, uint64_t);
|
||||
|
||||
void beginShutdown() override;
|
||||
void run() override;
|
||||
|
||||
|
@ -60,18 +64,24 @@ public:
|
|||
|
||||
/// @brief Find active agency from persisted
|
||||
bool activeAgencyFromPersistence();
|
||||
|
||||
/// @brief We are a restarting active RAFT agent
|
||||
bool restartingActiveAgent();
|
||||
|
||||
/// @brief Find active agency from command line
|
||||
bool activeAgencyFromCommandLine();
|
||||
|
||||
/// @brief Try to estimate good RAFT min/max timeouts
|
||||
bool estimateRAFTInterval();
|
||||
|
||||
/// @brief Gossip your way into the agency
|
||||
void gossip();
|
||||
|
||||
|
||||
Agent* _agent; //< @brief The agent
|
||||
arangodb::basics::ConditionVariable _cv; //< @brief For proper shutdown
|
||||
|
||||
std::vector<double> _pings; //< @brief pings
|
||||
mutable arangodb::Mutex _pLock; //< @brief Guard pings
|
||||
|
||||
};
|
||||
|
||||
}}
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// DISCLAIMER
|
||||
///
|
||||
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
|
||||
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
|
||||
///
|
||||
/// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
/// you may not use this file except in compliance with the License.
|
||||
/// You may obtain a copy of the License at
|
||||
///
|
||||
/// http://www.apache.org/licenses/LICENSE-2.0
|
||||
///
|
||||
/// Unless required by applicable law or agreed to in writing, software
|
||||
/// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
/// See the License for the specific language governing permissions and
|
||||
/// limitations under the License.
|
||||
///
|
||||
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
///
|
||||
/// @author Kaveh Vahedipour
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "MeasureCallback.h"
|
||||
#include "Agent.h"
|
||||
|
||||
using namespace arangodb::consensus;
|
||||
using namespace arangodb::velocypack;
|
||||
|
||||
MeasureCallback::MeasureCallback(
|
||||
Inception* inc, std::string const& peerId, uint64_t sent) :
|
||||
_inc(inc), _peerId(peerId), _sent(sent){}
|
||||
|
||||
bool MeasureCallback::operator()(arangodb::ClusterCommResult* res) {
|
||||
if (res->status == CL_COMM_SENT && res->result->getHttpReturnCode() == 200) {
|
||||
_inc->reportIn(_peerId, _sent);
|
||||
}
|
||||
return true;
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// DISCLAIMER
|
||||
///
|
||||
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
|
||||
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
|
||||
///
|
||||
/// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
/// you may not use this file except in compliance with the License.
|
||||
/// You may obtain a copy of the License at
|
||||
///
|
||||
/// http://www.apache.org/licenses/LICENSE-2.0
|
||||
///
|
||||
/// Unless required by applicable law or agreed to in writing, software
|
||||
/// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
/// See the License for the specific language governing permissions and
|
||||
/// limitations under the License.
|
||||
///
|
||||
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
||||
///
|
||||
/// @author Kaveh Vahedipour
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef ARANGOD_CONSENSUS_MEASURE_CALLBACK_H
|
||||
#define ARANGOD_CONSENSUS_MEASURE_CALLBACK_H 1
|
||||
|
||||
#include "Cluster/ClusterComm.h"
|
||||
|
||||
namespace arangodb {
|
||||
namespace consensus {
|
||||
|
||||
class Inception;
|
||||
|
||||
class MeasureCallback : public arangodb::ClusterCommCallback {
|
||||
|
||||
public:
|
||||
explicit MeasureCallback(Inception*, std::string const&, uint64_t);
|
||||
|
||||
virtual bool operator()(arangodb::ClusterCommResult*) override final;
|
||||
|
||||
private:
|
||||
Inception* _inc;
|
||||
std::string const _peerId;
|
||||
uint64_t const _sent;
|
||||
};
|
||||
}
|
||||
} // namespace
|
||||
|
||||
#endif
|
|
@ -12,7 +12,7 @@
|
|||
///
|
||||
/// Unless required by applicable law or agreed to in writing, software
|
||||
/// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
/// WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
/// See the License for the specific language governing permissions and
|
||||
/// limitations under the License.
|
||||
///
|
||||
|
@ -223,8 +223,6 @@ size_t State::removeConflicts(query_t const& transactions) { // Under MUTEX in A
|
|||
queryResult.details);
|
||||
}
|
||||
|
||||
queryResult.result->slice();
|
||||
|
||||
// volatile logs
|
||||
_log.erase(_log.begin() + pos, _log.end());
|
||||
|
||||
|
@ -274,7 +272,7 @@ std::vector<log_t> State::get(arangodb::consensus::index_t start,
|
|||
std::vector<VPackSlice> State::slices(arangodb::consensus::index_t start,
|
||||
arangodb::consensus::index_t end) const {
|
||||
std::vector<VPackSlice> slices;
|
||||
MUTEX_LOCKER(mutexLocker, _logLock); // Cannot be read lock (Compaction)
|
||||
// MUTEX_LOCKER(mutexLocker, _logLock); // Cannot be read lock (Compaction)
|
||||
|
||||
if (_log.empty()) {
|
||||
return slices;
|
||||
|
@ -393,6 +391,7 @@ bool State::loadCollections(TRI_vocbase_t* vocbase,
|
|||
_options.waitForSync = waitForSync;
|
||||
_options.silent = true;
|
||||
|
||||
|
||||
if (loadPersisted()) {
|
||||
MUTEX_LOCKER(logLock, _logLock);
|
||||
if (_log.empty()) {
|
||||
|
@ -420,7 +419,9 @@ bool State::loadPersisted() {
|
|||
loadOrPersistConfiguration();
|
||||
|
||||
if (checkCollection("log") && checkCollection("compact")) {
|
||||
return (loadCompacted() && loadRemaining());
|
||||
bool lc = loadCompacted();
|
||||
bool lr = loadRemaining();
|
||||
return (lc && lr);
|
||||
}
|
||||
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Couldn't find persisted log";
|
||||
|
@ -435,6 +436,7 @@ bool State::loadCompacted() {
|
|||
bindVars->openObject();
|
||||
bindVars->close();
|
||||
|
||||
|
||||
std::string const aql(
|
||||
std::string("FOR c IN compact SORT c._key DESC LIMIT 1 RETURN c"));
|
||||
arangodb::aql::Query query(false, _vocbase, aql.c_str(), aql.size(), bindVars,
|
||||
|
@ -451,10 +453,11 @@ bool State::loadCompacted() {
|
|||
if (result.isArray() && result.length()) {
|
||||
MUTEX_LOCKER(logLock, _logLock);
|
||||
for (auto const& i : VPackArrayIterator(result)) {
|
||||
auto ii = i.resolveExternals();
|
||||
buffer_t tmp = std::make_shared<arangodb::velocypack::Buffer<uint8_t>>();
|
||||
(*_agent) = i;
|
||||
(*_agent) = ii;
|
||||
try {
|
||||
_cur = std::stoul(i.get("_key").copyString());
|
||||
_cur = std::stoul(ii.get("_key").copyString());
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::AGENCY) << e.what() << " " << __FILE__
|
||||
<< __LINE__;
|
||||
|
@ -490,8 +493,10 @@ bool State::loadOrPersistConfiguration() {
|
|||
result.length()) { // We already have a persisted conf
|
||||
|
||||
try {
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY) << "Merging configuration " << result[0].toJson();
|
||||
_agent->mergeConfiguration(result[0]);
|
||||
LOG_TOPIC(DEBUG, Logger::AGENCY)
|
||||
<< "Merging configuration " << result[0].resolveExternals().toJson();
|
||||
_agent->mergeConfiguration(result[0].resolveExternals());
|
||||
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::AGENCY)
|
||||
<< "Failed to merge persisted configuration into runtime "
|
||||
|
@ -551,37 +556,40 @@ bool State::loadRemaining() {
|
|||
nullptr, arangodb::aql::PART_MAIN);
|
||||
|
||||
auto queryResult = query.execute(QueryRegistryFeature::QUERY_REGISTRY);
|
||||
|
||||
|
||||
if (queryResult.code != TRI_ERROR_NO_ERROR) {
|
||||
THROW_ARANGO_EXCEPTION_MESSAGE(queryResult.code, queryResult.details);
|
||||
}
|
||||
|
||||
|
||||
auto result = queryResult.result->slice();
|
||||
|
||||
|
||||
MUTEX_LOCKER(logLock, _logLock);
|
||||
if (result.isArray()) {
|
||||
_log.clear();
|
||||
for (auto const& i : VPackArrayIterator(result)) {
|
||||
|
||||
_log.clear();
|
||||
|
||||
for (auto const& i : VPackArrayIterator(result)) {
|
||||
buffer_t tmp = std::make_shared<arangodb::velocypack::Buffer<uint8_t>>();
|
||||
auto req = i.get("request");
|
||||
tmp->append(req.startAs<char const>(), req.byteSize());
|
||||
try {
|
||||
auto ii = i.resolveExternals();
|
||||
auto req = ii.get("request");
|
||||
tmp->append(req.startAs<char const>(), req.byteSize());
|
||||
try {
|
||||
_log.push_back(
|
||||
log_t(std::stoi(i.get(StaticStrings::KeyString).copyString()),
|
||||
static_cast<term_t>(i.get("term").getUInt()), tmp));
|
||||
log_t(std::stoi(ii.get(StaticStrings::KeyString).copyString()),
|
||||
static_cast<term_t>(ii.get("term").getUInt()), tmp));
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::AGENCY)
|
||||
<< "Failed to convert " +
|
||||
i.get(StaticStrings::KeyString).copyString() +
|
||||
ii.get(StaticStrings::KeyString).copyString() +
|
||||
" to integer via std::stoi."
|
||||
<< e.what();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
_agent->rebuildDBs();
|
||||
TRI_ASSERT(!_log.empty());
|
||||
_agent->lastCommitted(_log.back().index);
|
||||
TRI_ASSERT(!_log.empty());
|
||||
_agent->lastCommitted(_log.back().index);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -95,6 +95,7 @@ SET(ARANGOD_SOURCES
|
|||
Agency/GossipCallback.cpp
|
||||
Agency/Inception.cpp
|
||||
Agency/Job.cpp
|
||||
Agency/MeasureCallback.cpp
|
||||
Agency/MoveShard.cpp
|
||||
Agency/NotifyCallback.cpp
|
||||
Agency/Node.cpp
|
||||
|
|
|
@ -59,7 +59,6 @@
|
|||
|
||||
// loop over all databases
|
||||
for (const database of databases) {
|
||||
console.log(database);
|
||||
db._useDatabase(database);
|
||||
// and initialize Foxx applications
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue