
Fixing recovery

Simon Grätzer 2017-01-16 13:58:30 +01:00
parent 11998b4e5b
commit ac37e7c85e
6 changed files with 80 additions and 63 deletions

View File

@ -111,11 +111,25 @@ VertexComputation<float, float, float>* PageRankAlgorithm::createComputation(
return new PRComputation(_threshold);
}
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
if (name == "convergence") {
return new FloatMaxAggregator(-1);
} else if (name == "nonfailedCount") {
return new SumAggregator<uint32_t>(0);
} else if (name == "totalrank") {
return new SumAggregator<float>(0);
} else if (name == "step") {
return new ValueAggregator<uint32_t>(0);
} else if (name == "scale") {
return new ValueAggregator<float>(-1);
}
return nullptr;
}
struct PRCompensation : public VertexCompensation<float, float, float> {
PRCompensation() {}
void compensate(bool inLostPartition) override {
const uint32_t* step = getAggregatedValue<uint32_t>("step");
if (step) {
if (*step == 0 && !inLostPartition) {
uint32_t c = 1;
aggregate("nonfailedCount", &c);
@ -132,7 +146,6 @@ struct PRCompensation : public VertexCompensation<float, float, float> {
}
}
}
}
};
VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
@ -140,21 +153,6 @@ VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
return new PRCompensation();
}
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
if (name == "convergence") {
return new FloatMaxAggregator(-1);
} else if (name == "nonfailedCount") {
return new SumAggregator<uint32_t>(0);
} else if (name == "totalrank") {
return new SumAggregator<float>(0);
} else if (name == "step") {
return new ValueAggregator<uint32_t>(0);
} else if (name == "scale") {
return new ValueAggregator<float>(-1);
}
return nullptr;
}
struct PRMasterContext : public MasterContext {
PRMasterContext(VPackSlice params) : MasterContext(params){};
bool postGlobalSuperstep(uint64_t gss) {

View File

@ -75,6 +75,7 @@ void Conductor::start(std::string const& algoName, VPackSlice userConfig) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
"Algorithm not found");
}
_masterContext.reset(_algorithm->masterContext(userConfig));
_aggregators.reset(new AggregatorHandler(_algorithm.get()));
// configure the async mode as optional
VPackSlice async = _userParams.slice().get("async");
@ -270,7 +271,12 @@ void Conductor::finishedRecovery(VPackSlice& data) {
if (_masterContext) {
proceed = proceed || _masterContext->postCompensation(_globalSuperstep);
}
if (proceed) {
if (!proceed) {
LOG(INFO) << "Recovery finished. Proceeding normally";
_startGlobalStep();
return;
}
VPackBuilder b;
b.openObject();
b.add(Utils::executionNumberKey, VPackValue(_executionNumber));
@ -290,9 +296,6 @@ void Conductor::finishedRecovery(VPackSlice& data) {
cancel();
LOG(INFO) << "Recovery failed";
}
} else {
_startGlobalStep();
}
} else {
LOG(ERR) << "Recovery not supported";
}
@ -419,23 +422,27 @@ int Conductor::_initializeWorkers(std::string const& suffix,
std::map<CollectionID, std::string> collectionPlanIdMap;
std::map<ServerID, std::map<CollectionID, std::vector<ShardID>>> vertexMap,
edgeMap;
std::vector<ShardID> allShardIDs;
std::vector<ShardID> shardList;
// resolve plan id's and shards on the servers
for (auto& collection : _vertexCollections) {
collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string());
resolveShards(collection.get(), vertexMap, allShardIDs);
resolveShards(collection.get(), vertexMap, shardList);
}
for (auto& collection : _edgeCollections) {
collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string());
resolveShards(collection.get(), edgeMap, allShardIDs);
resolveShards(collection.get(), edgeMap, shardList);
}
_dbServers.clear();
for (auto const& pair : vertexMap) {
_dbServers.push_back(pair.first);
}
// do not reload all shard ids, this list must stay in the same order
if (_allShards.size() == 0) {
_allShards = shardList;
}
std::string const path =
Utils::baseUrl(_vocbaseGuard.vocbase()->name()) + suffix;
@ -459,7 +466,9 @@ int Conductor::_initializeWorkers(std::string const& suffix,
b.add(Utils::asyncMode, VPackValue(_asyncMode));
b.add(Utils::lazyLoading, VPackValue(_lazyLoading));
if (additional.isObject()) {
b.add(additional);
for (auto const& pair : VPackObjectIterator(additional)) {
b.add(pair.key.copyString(), pair.value);
}
}
b.add(Utils::vertexShardsKey, VPackValue(VPackValueType::Object));
@ -486,7 +495,7 @@ int Conductor::_initializeWorkers(std::string const& suffix,
}
b.close();
b.add(Utils::globalShardListKey, VPackValue(VPackValueType::Array));
for (std::string const& shard : allShardIDs) {
for (std::string const& shard : _allShards) {
b.add(VPackValue(shard));
}
b.close();
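
The shard-list change above is the core of the recovery fix in this file: the resolved shards are kept in _allShards and only filled on the first initialization, so a re-initialization during recovery sends workers the global shard list in its original order. A minimal sketch of that resolve-once pattern, with hypothetical names (ConductorSketch, the resolvedNow argument) standing in for the real resolver and RPC code:

// Sketch only, not the ArangoDB implementation. It illustrates why the list
// is filled exactly once: per the comment in the diff, the order must not
// change between the initial start and a recovery re-initialization.
#include <string>
#include <utility>
#include <vector>

struct ConductorSketch {
  std::vector<std::string> _allShards;  // persistent, order-sensitive

  void initializeWorkers(std::vector<std::string> resolvedNow) {
    if (_allShards.empty()) {
      _allShards = std::move(resolvedNow);  // first run: remember the order
    }
    // ... always ship _allShards, not resolvedNow, to the workers ...
  }
};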

View File

@ -64,6 +64,7 @@ class Conductor {
std::vector<std::shared_ptr<LogicalCollection>> _vertexCollections;
std::vector<std::shared_ptr<LogicalCollection>> _edgeCollections;
std::vector<ServerID> _dbServers;
std::vector<ShardID> _allShards;// persistent shard list
// initialized on startup
std::unique_ptr<AggregatorHandler> _aggregators;

View File

@ -128,6 +128,7 @@ void PregelFeature::cleanupAll() {
}
_conductors.clear();
for (auto it : _workers) {
it.second->cancelGlobalStep(VPackSlice());
delete (it.second);
}
_workers.clear();

View File

@ -117,9 +117,11 @@ template <typename V, typename E, typename M>
Worker<V, E, M>::~Worker() {
LOG(INFO) << "Called ~Worker()";
_state = WorkerState::DONE;
usleep(5000);//5ms wait for threads to die
delete _readCache;
delete _writeCache;
delete _writeCacheNextGSS;
_writeCache = nullptr;
}
template <typename V, typename E, typename M>
@ -287,7 +289,8 @@ void Worker<V, E, M>::_startProcessing() {
return;
}
auto vertices = _graphStore->vertexIterator(start, end);
if (_processVertices(vertices)) { // should work like a join operation
// should work like a join operation
if (_processVertices(vertices) && _state == WorkerState::COMPUTING) {
_finishedProcessing(); // last thread turns the lights out
}
});
@ -367,6 +370,9 @@ bool Worker<V, E, M>::_processVertices(
}
// ==================== send messages to other shards ====================
outCache->flushMessages();
if (!_writeCache) {// ~Worker was called
return false;
}
if (vertexComputation->_nextPhase) {
_requestedNextGSS = true;
_nextGSSSendMessageCount += outCache->sendCountNextGSS();
@ -511,6 +517,7 @@ void Worker<V, E, M>::finalizeExecution(VPackSlice body) {
template <typename V, typename E, typename M>
void Worker<V, E, M>::startRecovery(VPackSlice data) {
{// other methods might lock _commandMutex
MUTEX_LOCKER(guard, _commandMutex);
_state = WorkerState::RECOVERING;
@ -519,6 +526,7 @@ void Worker<V, E, M>::startRecovery(VPackSlice data) {
if (_writeCacheNextGSS) {
_writeCacheNextGSS->clear();
}
}
VPackSlice method = data.get(Utils::recoveryMethodKey);
if (method.compareString(Utils::compensate) == 0) {
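
The worker-side changes guard against races between the destructor, recovery, and the processing threads: ~Worker() flips the state to DONE and nulls _writeCache, a thread that sees the cache gone bails out of _processVertices, and only a thread still in the COMPUTING state triggers _finishedProcessing(). A hedged sketch of the join-like "last thread turns the lights out" idea the comments describe, using illustrative names rather than the real Worker members:

// Sketch, not the actual Worker code: a shared counter joins the processing
// threads; only the last one to finish, and only while the worker is still
// computing, runs the finish step.
#include <atomic>
#include <functional>

enum class StateSketch { COMPUTING, RECOVERING, DONE };

struct ProcessingJoin {
  std::atomic<int> runningThreads;
  std::atomic<StateSketch> state{StateSketch::COMPUTING};

  explicit ProcessingJoin(int threadCount) : runningThreads(threadCount) {}

  // Each processing thread calls this after handling its vertex range.
  void threadFinished(std::function<void()> const& finishedProcessing) {
    bool lastThread = runningThreads.fetch_sub(1) == 1;
    if (lastThread && state.load() == StateSketch::COMPUTING) {
      finishedProcessing();  // the last thread "turns the lights out"
    }
  }
};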

View File

@ -86,7 +86,7 @@ done
# Currently the agency does not wait for all servers to shut down
# This causes a race condition where all servers wait to tell the agency
# they are shutting down
sleep 5
sleep 10
echo Shutting down agency ...
for aid in `seq 0 $(( $NRAGENTS - 1 ))`; do