
Fixing recovery

Simon Grätzer 2017-01-16 13:58:30 +01:00
parent 11998b4e5b
commit ac37e7c85e
6 changed files with 80 additions and 63 deletions

View File

@ -111,11 +111,25 @@ VertexComputation<float, float, float>* PageRankAlgorithm::createComputation(
return new PRComputation(_threshold);
}
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
if (name == "convergence") {
return new FloatMaxAggregator(-1);
} else if (name == "nonfailedCount") {
return new SumAggregator<uint32_t>(0);
} else if (name == "totalrank") {
return new SumAggregator<float>(0);
} else if (name == "step") {
return new ValueAggregator<uint32_t>(0);
} else if (name == "scale") {
return new ValueAggregator<float>(-1);
}
return nullptr;
}
struct PRCompensation : public VertexCompensation<float, float, float> {
PRCompensation() {}
void compensate(bool inLostPartition) override {
const uint32_t* step = getAggregatedValue<uint32_t>("step");
if (step) {
if (*step == 0 && !inLostPartition) {
uint32_t c = 1;
aggregate("nonfailedCount", &c);
@ -132,7 +146,6 @@ struct PRCompensation : public VertexCompensation<float, float, float> {
}
}
}
}
};
VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
@ -140,21 +153,6 @@ VertexCompensation<float, float, float>* PageRankAlgorithm::createCompensation(
return new PRCompensation();
}
Aggregator* PageRankAlgorithm::aggregator(std::string const& name) const {
if (name == "convergence") {
return new FloatMaxAggregator(-1);
} else if (name == "nonfailedCount") {
return new SumAggregator<uint32_t>(0);
} else if (name == "totalrank") {
return new SumAggregator<float>(0);
} else if (name == "step") {
return new ValueAggregator<uint32_t>(0);
} else if (name == "scale") {
return new ValueAggregator<float>(-1);
}
return nullptr;
}
struct PRMasterContext : public MasterContext {
PRMasterContext(VPackSlice params) : MasterContext(params){};
bool postGlobalSuperstep(uint64_t gss) {

View File

@ -75,6 +75,7 @@ void Conductor::start(std::string const& algoName, VPackSlice userConfig) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
"Algorithm not found");
}
_masterContext.reset(_algorithm->masterContext(userConfig));
_aggregators.reset(new AggregatorHandler(_algorithm.get()));
// configure the async mode as optional
VPackSlice async = _userParams.slice().get("async");
@ -270,7 +271,12 @@ void Conductor::finishedRecovery(VPackSlice& data) {
if (_masterContext) {
proceed = proceed || _masterContext->postCompensation(_globalSuperstep);
}
if (proceed) {
if (!proceed) {
LOG(INFO) << "Recovery finished. Proceeding normally";
_startGlobalStep();
return;
}
VPackBuilder b;
b.openObject();
b.add(Utils::executionNumberKey, VPackValue(_executionNumber));
@ -290,9 +296,6 @@ void Conductor::finishedRecovery(VPackSlice& data) {
cancel();
LOG(INFO) << "Recovery failed";
}
} else {
_startGlobalStep();
}
} else {
LOG(ERR) << "Recovery not supported";
}
@ -419,23 +422,27 @@ int Conductor::_initializeWorkers(std::string const& suffix,
std::map<CollectionID, std::string> collectionPlanIdMap;
std::map<ServerID, std::map<CollectionID, std::vector<ShardID>>> vertexMap,
edgeMap;
std::vector<ShardID> allShardIDs;
std::vector<ShardID> shardList;
// resolve plan id's and shards on the servers
for (auto& collection : _vertexCollections) {
collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string());
resolveShards(collection.get(), vertexMap, allShardIDs);
resolveShards(collection.get(), vertexMap, shardList);
}
for (auto& collection : _edgeCollections) {
collectionPlanIdMap.emplace(collection->name(),
collection->planId_as_string());
resolveShards(collection.get(), edgeMap, allShardIDs);
resolveShards(collection.get(), edgeMap, shardList);
}
_dbServers.clear();
for (auto const& pair : vertexMap) {
_dbServers.push_back(pair.first);
}
// do not reload all shard ids, this list must stay in the same order
if (_allShards.size() == 0) {
_allShards = shardList;
}
std::string const path =
Utils::baseUrl(_vocbaseGuard.vocbase()->name()) + suffix;
@ -459,7 +466,9 @@ int Conductor::_initializeWorkers(std::string const& suffix,
b.add(Utils::asyncMode, VPackValue(_asyncMode));
b.add(Utils::lazyLoading, VPackValue(_lazyLoading));
if (additional.isObject()) {
b.add(additional);
for (auto const& pair : VPackObjectIterator(additional)) {
b.add(pair.key.copyString(), pair.value);
}
}
b.add(Utils::vertexShardsKey, VPackValue(VPackValueType::Object));
@ -486,7 +495,7 @@ int Conductor::_initializeWorkers(std::string const& suffix,
}
b.close();
b.add(Utils::globalShardListKey, VPackValue(VPackValueType::Array));
for (std::string const& shard : allShardIDs) {
for (std::string const& shard : _allShards) {
b.add(VPackValue(shard));
}
b.close();
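
The shard-list change above is the core of the recovery fix in this file: the resolved shards are kept in _allShards and only filled on the first initialization, so a re-initialization during recovery sends workers the global shard list in its original order. A minimal sketch of that resolve-once pattern, with hypothetical names (ConductorSketch, the resolvedNow argument) standing in for the real resolver and RPC code:

// Sketch only, not the ArangoDB implementation. It illustrates why the list
// is filled exactly once: per the comment in the diff, the order must not
// change between the initial start and a recovery re-initialization.
#include <string>
#include <utility>
#include <vector>

struct ConductorSketch {
  std::vector<std::string> _allShards;  // persistent, order-sensitive

  void initializeWorkers(std::vector<std::string> resolvedNow) {
    if (_allShards.empty()) {
      _allShards = std::move(resolvedNow);  // first run: remember the order
    }
    // ... always ship _allShards, not resolvedNow, to the workers ...
  }
};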

View File

@ -64,6 +64,7 @@ class Conductor {
std::vector<std::shared_ptr<LogicalCollection>> _vertexCollections;
std::vector<std::shared_ptr<LogicalCollection>> _edgeCollections;
std::vector<ServerID> _dbServers;
std::vector<ShardID> _allShards;// persistent shard list
// initialized on startup
std::unique_ptr<AggregatorHandler> _aggregators;

View File

@ -128,6 +128,7 @@ void PregelFeature::cleanupAll() {
}
_conductors.clear();
for (auto it : _workers) {
it.second->cancelGlobalStep(VPackSlice());
delete (it.second);
}
_workers.clear();

View File

@ -117,9 +117,11 @@ template <typename V, typename E, typename M>
Worker<V, E, M>::~Worker() {
LOG(INFO) << "Called ~Worker()";
_state = WorkerState::DONE;
usleep(5000);//5ms wait for threads to die
delete _readCache;
delete _writeCache;
delete _writeCacheNextGSS;
_writeCache = nullptr;
}
template <typename V, typename E, typename M>
@ -287,7 +289,8 @@ void Worker<V, E, M>::_startProcessing() {
return;
}
auto vertices = _graphStore->vertexIterator(start, end);
if (_processVertices(vertices)) { // should work like a join operation
// should work like a join operation
if (_processVertices(vertices) && _state == WorkerState::COMPUTING) {
_finishedProcessing(); // last thread turns the lights out
}
});
@ -367,6 +370,9 @@ bool Worker<V, E, M>::_processVertices(
}
// ==================== send messages to other shards ====================
outCache->flushMessages();
if (!_writeCache) {// ~Worker was called
return false;
}
if (vertexComputation->_nextPhase) {
_requestedNextGSS = true;
_nextGSSSendMessageCount += outCache->sendCountNextGSS();
@ -511,6 +517,7 @@ void Worker<V, E, M>::finalizeExecution(VPackSlice body) {
template <typename V, typename E, typename M>
void Worker<V, E, M>::startRecovery(VPackSlice data) {
{// other methods might lock _commandMutex
MUTEX_LOCKER(guard, _commandMutex);
_state = WorkerState::RECOVERING;
@ -519,6 +526,7 @@ void Worker<V, E, M>::startRecovery(VPackSlice data) {
if (_writeCacheNextGSS) {
_writeCacheNextGSS->clear();
}
}
VPackSlice method = data.get(Utils::recoveryMethodKey);
if (method.compareString(Utils::compensate) == 0) {
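
The worker-side changes guard against races between the destructor, recovery, and the processing threads: ~Worker() flips the state to DONE and nulls _writeCache, a thread that sees the cache gone bails out of _processVertices, and only a thread still in the COMPUTING state triggers _finishedProcessing(). A hedged sketch of the join-like "last thread turns the lights out" idea the comments describe, using illustrative names rather than the real Worker members:

// Sketch, not the actual Worker code: a shared counter joins the processing
// threads; only the last one to finish, and only while the worker is still
// computing, runs the finish step.
#include <atomic>
#include <functional>

enum class StateSketch { COMPUTING, RECOVERING, DONE };

struct ProcessingJoin {
  std::atomic<int> runningThreads;
  std::atomic<StateSketch> state{StateSketch::COMPUTING};

  explicit ProcessingJoin(int threadCount) : runningThreads(threadCount) {}

  // Each processing thread calls this after handling its vertex range.
  void threadFinished(std::function<void()> const& finishedProcessing) {
    bool lastThread = runningThreads.fetch_sub(1) == 1;
    if (lastThread && state.load() == StateSketch::COMPUTING) {
      finishedProcessing();  // the last thread "turns the lights out"
    }
  }
};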

View File

@ -86,7 +86,7 @@ done
# Currently the agency does not wait for all servers to shut down
# This causes a race condition where all servers wait to tell the agency
# they are shutting down
sleep 5
sleep 10
echo Shutting down agency ...
for aid in `seq 0 $(( $NRAGENTS - 1 ))`; do