mirror of https://gitee.com/bigwinds/arangodb
Fixing recovery
This commit is contained in:
parent
11998b4e5b
commit
ac37e7c85e
|
@ -111,11 +111,25 @@ VertexComputation<float, float, float>* PageRankAlgorithm::createComputation(
|
|||
return new PRComputation(_threshold);
|
||||
}
|
||||
|
||||
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
|
||||
if (name == "convergence") {
|
||||
return new FloatMaxAggregator(-1);
|
||||
} else if (name == "nonfailedCount") {
|
||||
return new SumAggregator<uint32_t>(0);
|
||||
} else if (name == "totalrank") {
|
||||
return new SumAggregator<float>(0);
|
||||
} else if (name == "step") {
|
||||
return new ValueAggregator<uint32_t>(0);
|
||||
} else if (name == "scale") {
|
||||
return new ValueAggregator<float>(-1);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct PRCompensation : public VertexCompensation<float, float, float> {
|
||||
PRCompensation() {}
|
||||
void compensate(bool inLostPartition) override {
|
||||
const uint32_t* step = getAggregatedValue<uint32_t>("step");
|
||||
if (step) {
|
||||
if (*step == 0 && !inLostPartition) {
|
||||
uint32_t c = 1;
|
||||
aggregate("nonfailedCount", &c);
|
||||
|
@ -132,7 +146,6 @@ struct PRCompensation : public VertexCompensation<float, float, float> {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
|
||||
|
@ -140,21 +153,6 @@ VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
|
|||
return new PRCompensation();
|
||||
}
|
||||
|
||||
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
|
||||
if (name == "convergence") {
|
||||
return new FloatMaxAggregator(-1);
|
||||
} else if (name == "nonfailedCount") {
|
||||
return new SumAggregator<uint32_t>(0);
|
||||
} else if (name == "totalrank") {
|
||||
return new SumAggregator<float>(0);
|
||||
} else if (name == "step") {
|
||||
return new ValueAggregator<uint32_t>(0);
|
||||
} else if (name == "scale") {
|
||||
return new ValueAggregator<float>(-1);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct PRMasterContext : public MasterContext {
|
||||
PRMasterContext(VPackSlice params) : MasterContext(params){};
|
||||
bool postGlobalSuperstep(uint64_t gss) {
|
||||
|
|
|
@ -75,6 +75,7 @@ void Conductor::start(std::string const& algoName, VPackSlice userConfig) {
|
|||
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
|
||||
"Algorithm not found");
|
||||
}
|
||||
_masterContext.reset(_algorithm->masterContext(userConfig));
|
||||
_aggregators.reset(new AggregatorHandler(_algorithm.get()));
|
||||
// configure the async mode as optional
|
||||
VPackSlice async = _userParams.slice().get("async");
|
||||
|
@ -270,7 +271,12 @@ void Conductor::finishedRecovery(VPackSlice& data) {
|
|||
if (_masterContext) {
|
||||
proceed = proceed || _masterContext->postCompensation(_globalSuperstep);
|
||||
}
|
||||
if (proceed) {
|
||||
if (!proceed) {
|
||||
LOG(INFO) << "Recovery finished. Proceeding normally";
|
||||
_startGlobalStep();
|
||||
return;
|
||||
}
|
||||
|
||||
VPackBuilder b;
|
||||
b.openObject();
|
||||
b.add(Utils::executionNumberKey, VPackValue(_executionNumber));
|
||||
|
@ -290,9 +296,6 @@ void Conductor::finishedRecovery(VPackSlice& data) {
|
|||
cancel();
|
||||
LOG(INFO) << "Recovery failed";
|
||||
}
|
||||
} else {
|
||||
_startGlobalStep();
|
||||
}
|
||||
} else {
|
||||
LOG(ERR) << "Recovery not supported";
|
||||
}
|
||||
|
@ -419,23 +422,27 @@ int Conductor::_initializeWorkers(std::string const& suffix,
|
|||
std::map<CollectionID, std::string> collectionPlanIdMap;
|
||||
std::map<ServerID, std::map<CollectionID, std::vector<ShardID>>> vertexMap,
|
||||
edgeMap;
|
||||
std::vector<ShardID> allShardIDs;
|
||||
std::vector<ShardID> shardList;
|
||||
|
||||
// resolve plan id's and shards on the servers
|
||||
for (auto& collection : _vertexCollections) {
|
||||
collectionPlanIdMap.emplace(collection->name(),
|
||||
collection->planId_as_string());
|
||||
resolveShards(collection.get(), vertexMap, allShardIDs);
|
||||
resolveShards(collection.get(), vertexMap, shardList);
|
||||
}
|
||||
for (auto& collection : _edgeCollections) {
|
||||
collectionPlanIdMap.emplace(collection->name(),
|
||||
collection->planId_as_string());
|
||||
resolveShards(collection.get(), edgeMap, allShardIDs);
|
||||
resolveShards(collection.get(), edgeMap, shardList);
|
||||
}
|
||||
_dbServers.clear();
|
||||
for (auto const& pair : vertexMap) {
|
||||
_dbServers.push_back(pair.first);
|
||||
}
|
||||
// do not reload all shard ids; this list must stay in the same order
|
||||
if (_allShards.size() == 0) {
|
||||
_allShards = shardList;
|
||||
}
|
||||
|
||||
std::string const path =
|
||||
Utils::baseUrl(_vocbaseGuard.vocbase()->name()) + suffix;
|
||||
|
@ -459,7 +466,9 @@ int Conductor::_initializeWorkers(std::string const& suffix,
|
|||
b.add(Utils::asyncMode, VPackValue(_asyncMode));
|
||||
b.add(Utils::lazyLoading, VPackValue(_lazyLoading));
|
||||
if (additional.isObject()) {
|
||||
b.add(additional);
|
||||
for (auto const& pair : VPackObjectIterator(additional)) {
|
||||
b.add(pair.key.copyString(), pair.value);
|
||||
}
|
||||
}
|
||||
|
||||
b.add(Utils::vertexShardsKey, VPackValue(VPackValueType::Object));
|
||||
|
@ -486,7 +495,7 @@ int Conductor::_initializeWorkers(std::string const& suffix,
|
|||
}
|
||||
b.close();
|
||||
b.add(Utils::globalShardListKey, VPackValue(VPackValueType::Array));
|
||||
for (std::string const& shard : allShardIDs) {
|
||||
for (std::string const& shard : _allShards) {
|
||||
b.add(VPackValue(shard));
|
||||
}
|
||||
b.close();
|
||||
|
|
|
@ -64,6 +64,7 @@ class Conductor {
|
|||
std::vector<std::shared_ptr<LogicalCollection>> _vertexCollections;
|
||||
std::vector<std::shared_ptr<LogicalCollection>> _edgeCollections;
|
||||
std::vector<ServerID> _dbServers;
|
||||
std::vector<ShardID> _allShards;// persistent shard list
|
||||
|
||||
// initialized on startup
|
||||
std::unique_ptr<AggregatorHandler> _aggregators;
|
||||
|
|
|
@ -128,6 +128,7 @@ void PregelFeature::cleanupAll() {
|
|||
}
|
||||
_conductors.clear();
|
||||
for (auto it : _workers) {
|
||||
it.second->cancelGlobalStep(VPackSlice());
|
||||
delete (it.second);
|
||||
}
|
||||
_workers.clear();
|
||||
|
|
|
@ -117,9 +117,11 @@ template <typename V, typename E, typename M>
|
|||
Worker<V, E, M>::~Worker() {
|
||||
LOG(INFO) << "Called ~Worker()";
|
||||
_state = WorkerState::DONE;
|
||||
usleep(5000);//5ms wait for threads to die
|
||||
delete _readCache;
|
||||
delete _writeCache;
|
||||
delete _writeCacheNextGSS;
|
||||
_writeCache = nullptr;
|
||||
}
|
||||
|
||||
template <typename V, typename E, typename M>
|
||||
|
@ -287,7 +289,8 @@ void Worker<V, E, M>::_startProcessing() {
|
|||
return;
|
||||
}
|
||||
auto vertices = _graphStore->vertexIterator(start, end);
|
||||
if (_processVertices(vertices)) { // should work like a join operation
|
||||
// should work like a join operation
|
||||
if (_processVertices(vertices) && _state == WorkerState::COMPUTING) {
|
||||
_finishedProcessing(); // last thread turns the lights out
|
||||
}
|
||||
});
|
||||
|
@ -367,6 +370,9 @@ bool Worker<V, E, M>::_processVertices(
|
|||
}
|
||||
// ==================== send messages to other shards ====================
|
||||
outCache->flushMessages();
|
||||
if (!_writeCache) {// ~Worker was called
|
||||
return false;
|
||||
}
|
||||
if (vertexComputation->_nextPhase) {
|
||||
_requestedNextGSS = true;
|
||||
_nextGSSSendMessageCount += outCache->sendCountNextGSS();
|
||||
|
@ -511,6 +517,7 @@ void Worker<V, E, M>::finalizeExecution(VPackSlice body) {
|
|||
|
||||
template <typename V, typename E, typename M>
|
||||
void Worker<V, E, M>::startRecovery(VPackSlice data) {
|
||||
{// other methods might lock _commandMutex
|
||||
MUTEX_LOCKER(guard, _commandMutex);
|
||||
|
||||
_state = WorkerState::RECOVERING;
|
||||
|
@ -519,6 +526,7 @@ void Worker<V, E, M>::startRecovery(VPackSlice data) {
|
|||
if (_writeCacheNextGSS) {
|
||||
_writeCacheNextGSS->clear();
|
||||
}
|
||||
}
|
||||
|
||||
VPackSlice method = data.get(Utils::recoveryMethodKey);
|
||||
if (method.compareString(Utils::compensate) == 0) {
|
||||
|
|
|
@ -86,7 +86,7 @@ done
|
|||
# Currently the agency does not wait for all servers to shutdown
|
||||
# This causes a race condition where all servers wait to tell the agency
|
||||
# they are shutting down
|
||||
sleep 5
|
||||
sleep 10
|
||||
|
||||
echo Shutting down agency ...
|
||||
for aid in `seq 0 $(( $NRAGENTS - 1 ))`; do
|
||||
|
|
Loading…
Reference in New Issue