1
0
Fork 0

Fixing recovery

This commit is contained in:
Simon Grätzer 2017-01-16 13:58:30 +01:00
parent 11998b4e5b
commit ac37e7c85e
6 changed files with 80 additions and 63 deletions

View File

@ -111,11 +111,25 @@ VertexComputation<float, float, float>* PageRankAlgorithm::createComputation(
return new PRComputation(_threshold); return new PRComputation(_threshold);
} }
/// Factory for the named aggregators used by the PageRank algorithm.
///
/// @param name  aggregator identifier; the names handled are "convergence",
///              "nonfailedCount", "totalrank", "step" and "scale".
/// @return a newly allocated aggregator for a handled name, or nullptr when
///         the name is not one of the above.
// NOTE(review): the constructor arguments (-1 / 0) are presumably the
// aggregators' initial values — confirm against the aggregator classes.
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
  if (name == "convergence") {
    return new FloatMaxAggregator(-1);
  }
  if (name == "nonfailedCount") {
    return new SumAggregator<uint32_t>(0);
  }
  if (name == "totalrank") {
    return new SumAggregator<float>(0);
  }
  if (name == "step") {
    return new ValueAggregator<uint32_t>(0);
  }
  if (name == "scale") {
    return new ValueAggregator<float>(-1);
  }
  // no aggregator registered under this name
  return nullptr;
}
struct PRCompensation : public VertexCompensation<float, float, float> { struct PRCompensation : public VertexCompensation<float, float, float> {
PRCompensation() {} PRCompensation() {}
void compensate(bool inLostPartition) override { void compensate(bool inLostPartition) override {
const uint32_t* step = getAggregatedValue<uint32_t>("step"); const uint32_t* step = getAggregatedValue<uint32_t>("step");
if (step) {
if (*step == 0 && !inLostPartition) { if (*step == 0 && !inLostPartition) {
uint32_t c = 1; uint32_t c = 1;
aggregate("nonfailedCount", &c); aggregate("nonfailedCount", &c);
@ -132,7 +146,6 @@ struct PRCompensation : public VertexCompensation<float, float, float> {
} }
} }
} }
}
}; };
VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation( VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
@ -140,21 +153,6 @@ VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
return new PRCompensation(); return new PRCompensation();
} }
/// Factory for the named aggregators used by the PageRank algorithm.
///
/// @param name  aggregator identifier; the names handled are "convergence",
///              "nonfailedCount", "totalrank", "step" and "scale".
/// @return a newly allocated aggregator for a handled name, or nullptr when
///         the name is not one of the above.
// NOTE(review): the constructor arguments (-1 / 0) are presumably the
// aggregators' initial values — confirm against the aggregator classes.
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
  if (name == "convergence") {
    return new FloatMaxAggregator(-1);
  }
  if (name == "nonfailedCount") {
    return new SumAggregator<uint32_t>(0);
  }
  if (name == "totalrank") {
    return new SumAggregator<float>(0);
  }
  if (name == "step") {
    return new ValueAggregator<uint32_t>(0);
  }
  if (name == "scale") {
    return new ValueAggregator<float>(-1);
  }
  // no aggregator registered under this name
  return nullptr;
}
struct PRMasterContext : public MasterContext { struct PRMasterContext : public MasterContext {
PRMasterContext(VPackSlice params) : MasterContext(params){}; PRMasterContext(VPackSlice params) : MasterContext(params){};
bool postGlobalSuperstep(uint64_t gss) { bool postGlobalSuperstep(uint64_t gss) {

View File

@ -75,6 +75,7 @@ void Conductor::start(std::string const& algoName, VPackSlice userConfig) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER, THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
"Algorithm not found"); "Algorithm not found");
} }
_masterContext.reset(_algorithm->masterContext(userConfig));
_aggregators.reset(new AggregatorHandler(_algorithm.get())); _aggregators.reset(new AggregatorHandler(_algorithm.get()));
// configure the async mode as optional // configure the async mode as optional
VPackSlice async = _userParams.slice().get("async"); VPackSlice async = _userParams.slice().get("async");
@ -270,7 +271,12 @@ void Conductor::finishedRecovery(VPackSlice& data) {
if (_masterContext) { if (_masterContext) {
proceed = proceed || _masterContext->postCompensation(_globalSuperstep); proceed = proceed || _masterContext->postCompensation(_globalSuperstep);
} }
if (proceed) { if (!proceed) {
LOG(INFO) << "Recovery finished. Proceeding normally";
_startGlobalStep();
return;
}
VPackBuilder b; VPackBuilder b;
b.openObject(); b.openObject();
b.add(Utils::executionNumberKey, VPackValue(_executionNumber)); b.add(Utils::executionNumberKey, VPackValue(_executionNumber));
@ -290,9 +296,6 @@ void Conductor::finishedRecovery(VPackSlice& data) {
cancel(); cancel();
LOG(INFO) << "Recovery failed"; LOG(INFO) << "Recovery failed";
} }
} else {
_startGlobalStep();
}
} else { } else {
LOG(ERR) << "Recovery not supported"; LOG(ERR) << "Recovery not supported";
} }
@ -419,23 +422,27 @@ int Conductor::_initializeWorkers(std::string const& suffix,
std::map<CollectionID, std::string> collectionPlanIdMap; std::map<CollectionID, std::string> collectionPlanIdMap;
std::map<ServerID, std::map<CollectionID, std::vector<ShardID>>> vertexMap, std::map<ServerID, std::map<CollectionID, std::vector<ShardID>>> vertexMap,
edgeMap; edgeMap;
std::vector<ShardID> allShardIDs; std::vector<ShardID> shardList;
// resolve plan id's and shards on the servers // resolve plan id's and shards on the servers
for (auto& collection : _vertexCollections) { for (auto& collection : _vertexCollections) {
collectionPlanIdMap.emplace(collection->name(), collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string()); collection->planId_as_string());
resolveShards(collection.get(), vertexMap, allShardIDs); resolveShards(collection.get(), vertexMap, shardList);
} }
for (auto& collection : _edgeCollections) { for (auto& collection : _edgeCollections) {
collectionPlanIdMap.emplace(collection->name(), collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string()); collection->planId_as_string());
resolveShards(collection.get(), edgeMap, allShardIDs); resolveShards(collection.get(), edgeMap, shardList);
} }
_dbServers.clear(); _dbServers.clear();
for (auto const& pair : vertexMap) { for (auto const& pair : vertexMap) {
_dbServers.push_back(pair.first); _dbServers.push_back(pair.first);
} }
// do not reload all shard IDs; this list must stay in the same order
if (_allShards.size() == 0) {
_allShards = shardList;
}
std::string const path = std::string const path =
Utils::baseUrl(_vocbaseGuard.vocbase()->name()) + suffix; Utils::baseUrl(_vocbaseGuard.vocbase()->name()) + suffix;
@ -459,7 +466,9 @@ int Conductor::_initializeWorkers(std::string const& suffix,
b.add(Utils::asyncMode, VPackValue(_asyncMode)); b.add(Utils::asyncMode, VPackValue(_asyncMode));
b.add(Utils::lazyLoading, VPackValue(_lazyLoading)); b.add(Utils::lazyLoading, VPackValue(_lazyLoading));
if (additional.isObject()) { if (additional.isObject()) {
b.add(additional); for (auto const& pair : VPackObjectIterator(additional)) {
b.add(pair.key.copyString(), pair.value);
}
} }
b.add(Utils::vertexShardsKey, VPackValue(VPackValueType::Object)); b.add(Utils::vertexShardsKey, VPackValue(VPackValueType::Object));
@ -486,7 +495,7 @@ int Conductor::_initializeWorkers(std::string const& suffix,
} }
b.close(); b.close();
b.add(Utils::globalShardListKey, VPackValue(VPackValueType::Array)); b.add(Utils::globalShardListKey, VPackValue(VPackValueType::Array));
for (std::string const& shard : allShardIDs) { for (std::string const& shard : _allShards) {
b.add(VPackValue(shard)); b.add(VPackValue(shard));
} }
b.close(); b.close();

View File

@ -64,6 +64,7 @@ class Conductor {
std::vector<std::shared_ptr<LogicalCollection>> _vertexCollections; std::vector<std::shared_ptr<LogicalCollection>> _vertexCollections;
std::vector<std::shared_ptr<LogicalCollection>> _edgeCollections; std::vector<std::shared_ptr<LogicalCollection>> _edgeCollections;
std::vector<ServerID> _dbServers; std::vector<ServerID> _dbServers;
std::vector<ShardID> _allShards;// persistent shard list
// initialized on startup // initialized on startup
std::unique_ptr<AggregatorHandler> _aggregators; std::unique_ptr<AggregatorHandler> _aggregators;

View File

@ -128,6 +128,7 @@ void PregelFeature::cleanupAll() {
} }
_conductors.clear(); _conductors.clear();
for (auto it : _workers) { for (auto it : _workers) {
it.second->cancelGlobalStep(VPackSlice());
delete (it.second); delete (it.second);
} }
_workers.clear(); _workers.clear();

View File

@ -117,9 +117,11 @@ template <typename V, typename E, typename M>
Worker<V, E, M>::~Worker() { Worker<V, E, M>::~Worker() {
LOG(INFO) << "Called ~Worker()"; LOG(INFO) << "Called ~Worker()";
_state = WorkerState::DONE; _state = WorkerState::DONE;
usleep(5000);//5ms wait for threads to die
delete _readCache; delete _readCache;
delete _writeCache; delete _writeCache;
delete _writeCacheNextGSS; delete _writeCacheNextGSS;
_writeCache = nullptr;
} }
template <typename V, typename E, typename M> template <typename V, typename E, typename M>
@ -287,7 +289,8 @@ void Worker<V, E, M>::_startProcessing() {
return; return;
} }
auto vertices = _graphStore->vertexIterator(start, end); auto vertices = _graphStore->vertexIterator(start, end);
if (_processVertices(vertices)) { // should work like a join operation // should work like a join operation
if (_processVertices(vertices) && _state == WorkerState::COMPUTING) {
_finishedProcessing(); // last thread turns the lights out _finishedProcessing(); // last thread turns the lights out
} }
}); });
@ -367,6 +370,9 @@ bool Worker<V, E, M>::_processVertices(
} }
// ==================== send messages to other shards ==================== // ==================== send messages to other shards ====================
outCache->flushMessages(); outCache->flushMessages();
if (!_writeCache) {// ~Worker was called
return false;
}
if (vertexComputation->_nextPhase) { if (vertexComputation->_nextPhase) {
_requestedNextGSS = true; _requestedNextGSS = true;
_nextGSSSendMessageCount += outCache->sendCountNextGSS(); _nextGSSSendMessageCount += outCache->sendCountNextGSS();
@ -511,6 +517,7 @@ void Worker<V, E, M>::finalizeExecution(VPackSlice body) {
template <typename V, typename E, typename M> template <typename V, typename E, typename M>
void Worker<V, E, M>::startRecovery(VPackSlice data) { void Worker<V, E, M>::startRecovery(VPackSlice data) {
{// other methods might lock _commandMutex
MUTEX_LOCKER(guard, _commandMutex); MUTEX_LOCKER(guard, _commandMutex);
_state = WorkerState::RECOVERING; _state = WorkerState::RECOVERING;
@ -519,6 +526,7 @@ void Worker<V, E, M>::startRecovery(VPackSlice data) {
if (_writeCacheNextGSS) { if (_writeCacheNextGSS) {
_writeCacheNextGSS->clear(); _writeCacheNextGSS->clear();
} }
}
VPackSlice method = data.get(Utils::recoveryMethodKey); VPackSlice method = data.get(Utils::recoveryMethodKey);
if (method.compareString(Utils::compensate) == 0) { if (method.compareString(Utils::compensate) == 0) {

View File

@ -86,7 +86,7 @@ done
# Currently the agency does not wait for all servers to shutdown # Currently the agency does not wait for all servers to shutdown
# This causes a race condition where all servers wait to tell the agency # This causes a race condition where all servers wait to tell the agency
# they are shutting down # they are shutting down
sleep 5 sleep 10
echo Shutting down agency ... echo Shutting down agency ...
for aid in `seq 0 $(( $NRAGENTS - 1 ))`; do for aid in `seq 0 $(( $NRAGENTS - 1 ))`; do