arangodb/arangod/Pregel/Recovery.cpp
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////
#include "Recovery.h"
#include <algorithm>
#include "Agency/Supervision.h"
#include "Basics/MutexLocker.h"
#include "Cluster/ClusterInfo.h"
#include "Pregel/Conductor.h"
#include "Pregel/PregelFeature.h"
#include "Pregel/ThreadPool.h"
#include "Pregel/Utils.h"
#include "Pregel/WorkerConfig.h"
#include "VocBase/LogicalCollection.h"
using namespace arangodb;
using namespace arangodb::pregel;
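
// The RecoveryManager keeps track of the primary DBServer responsible for
// each monitored shard. When a primary changes (failover), all conductors
// registered for the affected shard are told to start their recovery run.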
RecoveryManager::RecoveryManager() {}
// RecoveryManager::RecoveryManager(AgencyCallbackRegistry* registry)
//     : _agencyCallbackRegistry(registry) {}

RecoveryManager::~RecoveryManager() {
  // for (auto const& call : _agencyCallbacks) {
  //   _agencyCallbackRegistry->unregisterCallback(call.second);
  // }
  // _agencyCallbacks.clear();
  _listeners.clear();
}
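
// Deregister the given conductor from every shard it was listening on, so
// that a later primary change no longer triggers recovery for it.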
void RecoveryManager::stopMonitoring(Conductor* listener) {
  MUTEX_LOCKER(guard, _lock);

  for (auto& pair : _listeners) {
    if (pair.second.find(listener) != pair.second.end()) {
      pair.second.erase(listener);
    }
    // if (pair.second.size() == 0) {
    //   std::shared_ptr<AgencyCallback> callback =
    //       _agencyCallbacks[pair.first];
    //   _agencyCallbackRegistry->unregisterCallback(callback);
    //   _agencyCallbacks.erase(pair.first);
    // }
  }
}
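
// Register a conductor as listener for every shard of the given collections
// and cache each shard's current primary server, so that a later failover
// can be detected by comparison.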
void RecoveryManager::monitorCollections(
    std::vector<std::shared_ptr<LogicalCollection>> const& collections,
    Conductor* listener) {
  MUTEX_LOCKER(guard, _lock);

  for (auto const& coll : collections) {
    CollectionID cid = coll->cid_as_string();
    std::shared_ptr<std::vector<ShardID>> shards =
        ClusterInfo::instance()->getShardList(cid);
    if (!shards) {
      continue;
    }

    for (ShardID const& shard : *shards) {
      std::set<Conductor*>& conductors = _listeners[shard];
      if (conductors.find(listener) != conductors.end()) {
        continue;
      }
      conductors.insert(listener);
      //_monitorShard(coll->dbName(), cid, shard);
      std::shared_ptr<std::vector<ServerID>> servers =
          ClusterInfo::instance()->getResponsibleServer(shard);
      if (servers && !servers->empty()) {  // guard against a missing entry
        // _lock is already held
        _primaryServers[shard] = servers->at(0);
      }
    }
  }
}

/*
/// Only call while holding _lock
void RecoveryManager::_monitorShard(DatabaseID const& databaseName,
                                    CollectionID const& cid,
                                    ShardID const& shard) {
  std::function<bool(VPackSlice const& result)> listener =
      [this, shard](VPackSlice const& result) {
        MUTEX_LOCKER(guard, _lock);  // we are editing _primaryServers
        auto const& conductors = _listeners.find(shard);
        if (conductors == _listeners.end()) {
          return false;
        }

        if (result.isArray()) {
          if (result.length() > 0) {
            ServerID nextPrimary = result.at(0).copyString();
            auto const& currentPrimary = _primaryServers.find(shard);
            if (currentPrimary != _primaryServers.end() &&
                currentPrimary->second != nextPrimary) {
              _primaryServers[shard] = nextPrimary;
              for (Conductor* cc : conductors->second) {
                cc->startRecovery();
              }
            }
          } else {  // what a terrible failure
            for (Conductor* cc : conductors->second) {
              cc->cancel();
            }
          }
        }
        LOG_TOPIC(INFO, Logger::PREGEL) << result.toString();
        return true;
      };

  std::string path =
      "Plan/Collections/" + databaseName + "/" + cid + "/shards/" + shard;

  // first let's resolve the primary so we know if it has changed later
  // AgencyCommResult result = _agency.getValues(path);
  std::shared_ptr<std::vector<ServerID>> servers =
      ClusterInfo::instance()->getResponsibleServer(shard);
  if (servers->size() > 0) {
    // _lock is already held
    _primaryServers[shard] = servers->at(0);

    auto call =
        std::make_shared<AgencyCallback>(_agency, path, listener, true, false);
    _agencyCallbacks.emplace(shard, call);
    _agencyCallbackRegistry->registerCallback(call);
  }
}
*/
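
// Copy every server from `servers` whose status in the agency's
// Supervision/Health record is GOOD into `goodServers`. Returns
// TRI_ERROR_NO_ERROR on success, or the agency error code otherwise.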
int RecoveryManager::filterGoodServers(std::vector<ServerID> const& servers,
                                       std::vector<ServerID>& goodServers) {
  // TODO I could also use ClusterInfo::failedServers
  AgencyCommResult result = _agency.getValues("Supervision/Health");
  if (result.successful()) {
    VPackSlice serversRegistered =
        result.slice()[0].get(std::vector<std::string>(
            {AgencyCommManager::path(), "Supervision", "Health"}));
    LOG_TOPIC(INFO, Logger::PREGEL)
        << "Server Status: " << serversRegistered.toJson();

    if (serversRegistered.isObject()) {
      for (auto const& res : VPackObjectIterator(serversRegistered)) {
        VPackSlice serverId = res.key;
        VPackSlice slice = res.value;
        if (slice.isObject() && slice.hasKey("Status")) {
          VPackSlice status = slice.get("Status");
          if (status.compareString(
                  consensus::Supervision::HEALTH_STATUS_GOOD) == 0) {
            ServerID name = serverId.copyString();
            if (std::find(servers.begin(), servers.end(), name) !=
                servers.end()) {
              goodServers.push_back(name);
            }
          }
        }
      }
    }
  } else {
    return result.errorCode();
  }
  return TRI_ERROR_NO_ERROR;
}
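
// Compare the cached shard primaries against ClusterInfo's failed-server
// list and, for every shard whose primary has failed, schedule a
// _renewPrimaryServer() check on the Pregel thread pool.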
void RecoveryManager::updatedFailedServers() {
  MUTEX_LOCKER(guard, _lock);  // we are accessing _primaryServers

  std::vector<std::string> const failed =
      ClusterInfo::instance()->getFailedServers();
  for (auto const& pair : _primaryServers) {
    auto const& it = std::find(failed.begin(), failed.end(), pair.second);
    if (it != failed.end()) {
      // found a failed server
      ShardID const& shard = pair.first;
      ThreadPool* pool = PregelFeature::instance()->threadPool();
      pool->enqueue([this, shard] { _renewPrimaryServer(shard); });
    }
  }
}

// Tries to figure out whether the primary server for a shard has changed.
// It doesn't really matter if this is called multiple times; that should
// not affect the outcome.
// Do not call while holding _lock.
void RecoveryManager::_renewPrimaryServer(ShardID const& shard) {
  MUTEX_LOCKER(guard, _lock);  // we are editing _primaryServers

  ClusterInfo* ci = ClusterInfo::instance();
  auto const& conductors = _listeners.find(shard);
  auto const& currentPrimary = _primaryServers.find(shard);
  if (conductors == _listeners.end() ||
      currentPrimary == _primaryServers.end()) {
    LOG_TOPIC(ERR, Logger::PREGEL) << "Shard is not properly registered";
    return;
  }

  int tries = 0;
  do {
    std::shared_ptr<std::vector<ServerID>> servers =
        ci->getResponsibleServer(shard);
    if (servers && !servers->empty()) {  // front() on an empty list would be UB
      ServerID const& nextPrimary = servers->front();
      if (currentPrimary->second != nextPrimary) {
        _primaryServers[shard] = nextPrimary;
        for (Conductor* cc : conductors->second) {
          cc->startRecovery();
        }
        LOG_TOPIC(INFO, Logger::PREGEL) << "Recovery action was initiated";
        break;
      }
    }
    usleep(100000);  // sleep 100ms between retries
    tries++;
  } while (tries < 3);
}
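
// A minimal usage sketch, not part of the original file: a Conductor would
// typically register and deregister itself roughly like this (assuming
// PregelFeature exposes the RecoveryManager instance, as other Pregel code
// in this tree does):
//
//   RecoveryManager* mngr = PregelFeature::instance()->recoveryManager();
//   mngr->monitorCollections(vertexCollections, this);  // before superstep 0
//   // ... execution, possibly interrupted by startRecovery() ...
//   mngr->stopMonitoring(this);  // on completion or shutdown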