arangodb/arangod/Aql/EngineInfoContainerDBServer...

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Michael Hackstein
////////////////////////////////////////////////////////////////////////////////

#include "EngineInfoContainerDBServer.h"

#include "Aql/ClusterNodes.h"
#include "Aql/Collection.h"
#include "Aql/ExecutionEngine.h"
#include "Aql/ExecutionNode.h"
#include "Aql/GraphNode.h"
#include "Aql/IndexNode.h"
#include "Aql/ModificationNodes.h"
#include "Aql/Query.h"
#include "Basics/Result.h"
#include "Cluster/ClusterComm.h"
#include "Cluster/ServerState.h"
#include "Cluster/TraverserEngineRegistry.h"
#include "Graph/BaseOptions.h"
#include "StorageEngine/TransactionState.h"
#include "VocBase/LogicalCollection.h"
#include "VocBase/ticks.h"
#ifdef USE_IRESEARCH
#include "IResearch/IResearchViewNode.h"
#endif

using namespace arangodb;
using namespace arangodb::aql;
// Timeout that is handed to ClusterComm::syncRequest when the query snippets
// are deployed on a DBServer (see buildEngines).
// NOTE(review): presumably given in seconds -- confirm against ClusterComm.
static const double SETUP_TIMEOUT = 25.0;

/// @brief Splits a snippet key of the form "<remoteId>:<shardId>" into its
///        two components.
///
/// @param keySlice The key to split, must be a string slice
/// @param remoteId Out: the numeric part in front of the first ':'
/// @param shardId  Out: everything behind the first ':'
///
/// @return TRI_ERROR_NO_ERROR on success. TRI_ERROR_CLUSTER_AQL_COMMUNICATION
///         if the key has no ':', if the parsed remote id is 0 or if the
///         shard part is empty.
static Result ExtractRemoteAndShard(VPackSlice keySlice, size_t& remoteId, std::string& shardId) {
  TRI_ASSERT(keySlice.isString()); // used as a key in Json

  // Every malformed key is reported with the very same error.
  auto const malformedKey = []() -> Result {
    return {TRI_ERROR_CLUSTER_AQL_COMMUNICATION,
      "Unexpected response from DBServer during setup"};
  };

  StringRef key(keySlice);
  size_t separator = key.find(':');
  if (separator == std::string::npos) {
    return malformedKey();
  }

  // part in front of the separator => id of the remote node
  StringRef idPart = key.substr(0, separator);
  remoteId = basics::StringUtils::uint64(idPart.begin(), idPart.length());
  if (remoteId == 0) {
    return malformedKey();
  }

  // part behind the separator => shard
  shardId = key.substr(separator + 1).toString();
  if (shardId.empty()) {
    return malformedKey();
  }

  return {TRI_ERROR_NO_ERROR};
}

/// @brief Creates an empty snippet description.
///        It remembers the id of the remote node it belongs to; the id of
///        the other query engine starts at 0 and no collection is attached yet.
EngineInfoContainerDBServer::EngineInfo::EngineInfo(size_t idOfRemoteNode) noexcept
    : _idOfRemoteNode(idOfRemoteNode),
      _otherId(0),
      _collection(nullptr) {
}

/// @brief Destructor. Nothing is freed here, it only checks (in maintainer
///        mode) that the snippet contains at least one node.
EngineInfoContainerDBServer::EngineInfo::~EngineInfo() {
  // This container is not responsible for nodes
  // they are managed by the AST somewhere else
  // We are also not responsible for the collection.
  TRI_ASSERT(!_nodes.empty());
}

/// @brief Move constructor.
///        Takes over the nodes, the wiring ids and the collection of `other`.
///        In addition the shard restriction and the type of the snippet are
///        transferred, otherwise the new object would silently behave like an
///        unrestricted collection snippet.
///
/// @param other The snippet to take the content from. Its node list is
///              moved-from afterwards.
EngineInfoContainerDBServer::EngineInfo::EngineInfo(EngineInfo&& other) noexcept
    : _nodes(std::move(other._nodes)),
      _idOfRemoteNode(other._idOfRemoteNode),
      _otherId(other._otherId),
      _collection(other._collection) {
  // Assigned in the body on purpose: this keeps the initializer list
  // independent of the declaration order of these members.
  _restrictedShard = std::move(other._restrictedShard);
  _type = other._type;
  // NOTE(review): `_view` is not transferred explicitly. Confirm in the class
  // declaration whether it shares its storage with `_collection`.

  TRI_ASSERT(!_nodes.empty());
  TRI_ASSERT(_collection != nullptr);
}

/// @brief Appends a node to this snippet.
///        Collection accessing nodes (enumerate, index, modification) that
///        are restricted to a single shard store this shard in the snippet.
///        A view node turns the snippet into a view snippet and stores the
///        view. All other node types are appended without further action.
///
/// @param node The node to append, it is not owned by the snippet
void EngineInfoContainerDBServer::EngineInfo::addNode(ExecutionNode* node) {
  auto const nodeType = node->getType();

  if (ExecutionNode::ENUMERATE_COLLECTION == nodeType) {
    TRI_ASSERT(_type == ExecutionNode::MAX_NODE_TYPE_VALUE);
    auto enumerateNode = ExecutionNode::castTo<EnumerateCollectionNode*>(node);
    if (enumerateNode->isRestricted()) {
      // only one restriction per snippet is expected
      TRI_ASSERT(_restrictedShard.empty());
      _restrictedShard = enumerateNode->restrictedShard();
    }
  } else if (ExecutionNode::INDEX == nodeType) {
    TRI_ASSERT(_type == ExecutionNode::MAX_NODE_TYPE_VALUE);
    auto indexNode = ExecutionNode::castTo<IndexNode*>(node);
    if (indexNode->isRestricted()) {
      TRI_ASSERT(_restrictedShard.empty());
      _restrictedShard = indexNode->restrictedShard();
    }
#ifdef USE_IRESEARCH
  } else if (ExecutionNode::ENUMERATE_IRESEARCH_VIEW == nodeType) {
    TRI_ASSERT(_type == ExecutionNode::MAX_NODE_TYPE_VALUE);
    auto* viewNode = ExecutionNode::castTo<iresearch::IResearchViewNode*>(node);
    TRI_ASSERT(viewNode);
    // from now on this snippet is treated as a view snippet
    _type = ExecutionNode::ENUMERATE_IRESEARCH_VIEW;
    _view = &viewNode->view();
#endif
  } else if (ExecutionNode::INSERT == nodeType ||
             ExecutionNode::UPDATE == nodeType ||
             ExecutionNode::REMOVE == nodeType ||
             ExecutionNode::REPLACE == nodeType ||
             ExecutionNode::UPSERT == nodeType) {
    TRI_ASSERT(_type == ExecutionNode::MAX_NODE_TYPE_VALUE);
    auto modificationNode = ExecutionNode::castTo<ModificationNode*>(node);
    if (modificationNode->isRestricted()) {
      TRI_ASSERT(_restrictedShard.empty());
      _restrictedShard = modificationNode->restrictedShard();
    }
  }
  // every other node type needs no special treatment

  _nodes.emplace_back(node);
}

/// @brief Returns the collection attached to this snippet.
///        Must only be called on snippets that have not been turned into a
///        view snippet (checked by assertion).
Collection const* EngineInfoContainerDBServer::EngineInfo::collection() const noexcept {
  TRI_ASSERT(_type == ExecutionNode::MAX_NODE_TYPE_VALUE);
  return _collection;
}

/// @brief Returns the view attached to this snippet.
///        Must only be called on view snippets (checked by assertion).
LogicalView const* EngineInfoContainerDBServer::EngineInfo::view() const noexcept {
  TRI_ASSERT(_type == ExecutionNode::ENUMERATE_IRESEARCH_VIEW);
  return _view;
}

/// @brief Serializes this snippet once for a whole list of shards.
///        Adds a key of the form "<idOfRemoteNode>:<shard>:<shard>..." to the
///        builder, followed by the serialized plan fragment as its value.
///
/// @param query       The query, its AST is used to create a temporary plan
/// @param shards      The shards this snippet is responsible for
/// @param infoBuilder The builder to write into, must be an open object
void EngineInfoContainerDBServer::EngineInfo::serializeSnippet(
    Query* query,
    std::vector<ShardID> const& shards,
    VPackBuilder& infoBuilder
) const {
  // The key is required to build up the queryId mapping later
  std::string snippetKey = arangodb::basics::StringUtils::itoa(_idOfRemoteNode);
  for (auto const& shard : shards) {
    snippetKey.push_back(':');
    snippetKey.append(shard);
  }
  infoBuilder.add(VPackValue(snippetKey));

  TRI_ASSERT(!_nodes.empty());

  // Clone the relevant fragment of the plan for the list of shards.
  // There are no SubqueryNodes in these parts of the query, as all of them
  // live on the coordinator.
  ExecutionPlan plan(query->ast());
  ExecutionNode* lastClone = nullptr;

  // the nodes are visited back to front, each clone depends on the clone
  // that was created right before it
  for (auto nodeIt = _nodes.crbegin(); nodeIt != _nodes.crend(); ++nodeIt) {
    ExecutionNode const* original = *nodeIt;
    auto* copy = original->clone(&plan, false, false);
    auto const copyType = copy->getType();

    // The "varUsageComputed" flag is set below, so the nodes have to be
    // counted by type manually here.
    plan.increaseCounter(copyType);

    if (ExecutionNode::REMOTE == copyType) {
      // inject the information about the query into the remote node
      auto remote = ExecutionNode::castTo<RemoteNode*>(copy);
      remote->server("server:" + arangodb::ServerState::instance()->getId());
      remote->ownName(snippetKey);
      remote->queryId(_otherId);

      // Only one of the remote blocks is responsible for forwarding the
      // initializeCursor and shutDown requests. For simplicity the first
      // remote block is used if there is more than one.

      // Do we still need this???
      remote->isResponsibleForInitializeCursor(true);
    }
#ifdef USE_IRESEARCH
    else if (ExecutionNode::ENUMERATE_IRESEARCH_VIEW == copyType) {
      // the view node needs to know the shards it has to work on
      auto* viewNode = ExecutionNode::castTo<iresearch::IResearchViewNode*>(copy);
      viewNode->shards() = shards;
    }
#endif

    if (lastClone != nullptr) {
      copy->addDependency(lastClone);
    }
    lastClone = copy;
  }
  TRI_ASSERT(lastClone != nullptr);

  plan.root(lastClone);
  plan.setVarUsageComputed();

  // the serialization is always verbose
  unsigned const serializationFlags = ExecutionNode::SERIALIZE_DETAILS;
  plan.root()->toVelocyPack(infoBuilder, serializationFlags, /*keepTopLevelOpen*/false);
}

/// @brief Serializes this snippet for exactly one shard.
///        Adds the key "<idOfRemoteNode>:<shard>" to the builder, followed by
///        the serialized plan fragment as its value. If the snippet is
///        restricted to a shard, nothing is written for any other shard.
///
/// @param query                The query, its AST is used to create a
///                             temporary plan
/// @param id                   The shard to serialize the snippet for
/// @param infoBuilder          The builder to write into, must be an open
///                             object
/// @param isResponsibleForInit Forwarded to the remote node of the snippet.
///                             Forced to true for a restricted snippet.
void EngineInfoContainerDBServer::EngineInfo::serializeSnippet(
    Query* query,
    ShardID id,
    VPackBuilder& infoBuilder,
    bool isResponsibleForInit
) const {
  if (!_restrictedShard.empty()) {
    if (id != _restrictedShard) {
      return;
    }
    // We only have one shard it has to be responsible!
    isResponsibleForInit = true;
  }
  // The Key is required to build up the queryId mapping later
  infoBuilder.add(VPackValue(
      arangodb::basics::StringUtils::itoa(_idOfRemoteNode) + ":" + id));

  TRI_ASSERT(!_nodes.empty());
  // copy the relevant fragment of the plan for each shard
  // Note that in these parts of the query there are no SubqueryNodes,
  // since they are all on the coordinator!
  // Also note: As _collection is set to the correct current shard
  // this clone does the translation collection => shardId implicitly
  // at the relevant parts of the query.

  _collection->setCurrentShard(id);
  // Undo the shard selection on every way out of this function. Cloning and
  // serializing below may throw, and the collection object is shared with
  // the rest of the query, so it must not stay pinned to this shard.
  TRI_DEFER(_collection->resetCurrentShard());

  ExecutionPlan plan(query->ast());
  ExecutionNode* previous = nullptr;

  // walk the nodes back to front and chain the clones together
  for (auto enIt = _nodes.rbegin(), end = _nodes.rend(); enIt != end; ++enIt) {
    ExecutionNode const* current = *enIt;
    auto clone = current->clone(&plan, false, false);
    auto const nodeType = clone->getType();

    // we need to count nodes by type ourselves, as we will set the "varUsageComputed"
    // flag below (which will handle the counting)
    plan.increaseCounter(nodeType);

    if (ExecutionNode::REMOTE == nodeType) {
      auto rem = ExecutionNode::castTo<RemoteNode*>(clone);
      // update the remote node with the information about the query
      rem->server("server:" + arangodb::ServerState::instance()->getId());
      rem->ownName(id);
      rem->queryId(_otherId);

      // only one of the remote blocks is responsible for forwarding the
      // initializeCursor and shutDown requests
      // for simplicity, we always use the first remote block if we have more
      // than one

      // Do we still need this???
      rem->isResponsibleForInitializeCursor(isResponsibleForInit);
    }

    if (previous != nullptr) {
      clone->addDependency(previous);
    }

    previous = clone;
  }
  TRI_ASSERT(previous != nullptr);

  plan.root(previous);
  plan.setVarUsageComputed();
  const unsigned flags = ExecutionNode::SERIALIZE_DETAILS;
  plan.root()->toVelocyPack(infoBuilder, flags, /*keepTopLevelOpen*/false);
}

void EngineInfoContainerDBServer::CollectionInfo::mergeShards(std::shared_ptr<std::vector<ShardID>> const& shards) {
  for (auto const& s : *shards) {
    usedShards.emplace(s);
  }
}

/// @brief Creates an empty container for the given query.
///        Only the pointer is stored, the query is not owned.
EngineInfoContainerDBServer::EngineInfoContainerDBServer(Query* query) noexcept
  : _query(query) {
}

/// @brief Adds a node to the snippet that is currently open and registers
///        all collections the node works on.
///        Reading nodes register their collection with READ access,
///        modification nodes with WRITE access. If the node is restricted to
///        a single shard only this shard is registered.
///
/// @param node The node to add. A snippet has to be open.
void EngineInfoContainerDBServer::addNode(ExecutionNode* node) {
  TRI_ASSERT(!_engineStack.empty());
  _engineStack.top()->addNode(node);

  auto const nodeType = node->getType();

  if (ExecutionNode::ENUMERATE_COLLECTION == nodeType) {
    auto enumerateNode = ExecutionNode::castTo<EnumerateCollectionNode*>(node);
    auto collection = enumerateNode->collection();
    if (!enumerateNode->isRestricted()) {
      handleCollection(collection, AccessMode::Type::READ);
    } else {
      std::unordered_set<std::string> onlyShard{enumerateNode->restrictedShard()};
      handleCollection(collection, AccessMode::Type::READ, onlyShard);
    }
    updateCollection(collection);
  } else if (ExecutionNode::INDEX == nodeType) {
    auto indexNode = ExecutionNode::castTo<IndexNode*>(node);
    auto collection = indexNode->collection();
    if (!indexNode->isRestricted()) {
      handleCollection(collection, AccessMode::Type::READ);
    } else {
      std::unordered_set<std::string> onlyShard{indexNode->restrictedShard()};
      handleCollection(collection, AccessMode::Type::READ, onlyShard);
    }
    updateCollection(collection);
#ifdef USE_IRESEARCH
  } else if (ExecutionNode::ENUMERATE_IRESEARCH_VIEW == nodeType) {
    auto* viewNode = ExecutionNode::castTo<iresearch::IResearchViewNode*>(node);
    TRI_ASSERT(viewNode);

    // every collection of the view is read, and remembers the view
    for (aql::Collection const& viewCollection : viewNode->collections()) {
      auto& collectionInfo = handleCollection(&viewCollection, AccessMode::Type::READ);
      collectionInfo.views.push_back(&viewNode->view());
    }
#endif
  } else if (ExecutionNode::INSERT == nodeType ||
             ExecutionNode::UPDATE == nodeType ||
             ExecutionNode::REMOVE == nodeType ||
             ExecutionNode::REPLACE == nodeType ||
             ExecutionNode::UPSERT == nodeType) {
    auto modificationNode = ExecutionNode::castTo<ModificationNode*>(node);
    auto collection = modificationNode->collection();
    if (!modificationNode->isRestricted()) {
      handleCollection(collection, AccessMode::Type::WRITE);
    } else {
      std::unordered_set<std::string> onlyShard{modificationNode->restrictedShard()};
      handleCollection(collection, AccessMode::Type::WRITE, onlyShard);
    }
    updateCollection(collection);
  }
  // all other node types do not access any collection
}

/// @brief Opens a new snippet and puts it on top of the snippet stack.
///
/// @param idOfRemoteNode The id of the remote node the snippet belongs to
void EngineInfoContainerDBServer::openSnippet(size_t idOfRemoteNode) {
  auto snippet = std::make_shared<EngineInfo>(idOfRemoteNode);
  _engineStack.emplace(std::move(snippet));
}

// Closing a snippet is done in three steps:
// 1. The snippet is taken from the top of the stack.
// 2. It is wired up with the given coordinator engine id.
// 3. It is stored in the View => Engines map if it is a view snippet,
//    otherwise in the Collection => Engines map.
//
// Throws TRI_ERROR_INTERNAL if the collection of the snippet has never been
// registered.

void EngineInfoContainerDBServer::closeSnippet(QueryId coordinatorEngineId) {
  TRI_ASSERT(!_engineStack.empty());
  auto snippet = _engineStack.top();
  TRI_ASSERT(snippet);
  _engineStack.pop();

  snippet->connectQueryId(coordinatorEngineId);

#ifdef USE_IRESEARCH
  if (ExecutionNode::ENUMERATE_IRESEARCH_VIEW == snippet->type()) {
    auto const* view = snippet->view();
    _viewInfos[view].emplace_back(std::move(snippet));
    return;
  }
#endif

  TRI_ASSERT(snippet->collection() != nullptr);
  auto collectionInfo = _collectionInfos.find(snippet->collection());
  // A snippet without any collection involved is not possible
  TRI_ASSERT(collectionInfo != _collectionInfos.end());
  if (collectionInfo == _collectionInfos.end()) {
    THROW_ARANGO_EXCEPTION_MESSAGE(
      TRI_ERROR_INTERNAL,
      "Created a DBServer QuerySnippet without a Collection. This should not happen. Please report this query to ArangoDB"
    );
  }
  collectionInfo->second.engines.emplace_back(std::move(snippet));
}

/**
 * @brief Registers a collection access. Upgrades the lock type of the
 *        collection if necessary and merges the accessed shards into the
 *        list of used shards of this collection.
 *
 * @param col The collection that is accessed
 * @param accessType The lock type required by this access
 * @param restrictedShards The shards this access is limited to (a subset of
 *        the shards of the collection). If empty, the shard ids of the
 *        query options are used instead.
 *
 * @return The bookkeeping entry of the collection
 */
EngineInfoContainerDBServer::CollectionInfo& EngineInfoContainerDBServer::handleCollection(
    Collection const* col,
    AccessMode::Type const& accessType,
    std::unordered_set<std::string> const& restrictedShards /*= {}*/
) {
  // an explicit restriction wins over the restriction of the query options
  auto const& shardFilter = restrictedShards.empty()
      ? _query->queryOptions().shardIds
      : restrictedShards;
  auto const shards = col->shardIds(shardFilter);

  // What if we have an empty shard list here?
  if (shards->empty()) {
    // TODO FIXME
    LOG_TOPIC(WARN, arangodb::Logger::AQL) << "TEMPORARY: A collection access of a query has no result in any shard";
  }

  // creates the entry on first access of the collection
  auto& collectionInfo = _collectionInfos[col];

  // the lock is only ever upgraded, never downgraded
  collectionInfo.lockType = std::max(collectionInfo.lockType, accessType);
  collectionInfo.mergeShards(shards);

  return collectionInfo;
}

// Attaches the given collection to the snippet that is currently on top of
// the stack. Only compiled if USE_ENTERPRISE is not defined.
#ifndef USE_ENTERPRISE
void EngineInfoContainerDBServer::updateCollection(Collection const* col) {
  TRI_ASSERT(!_engineStack.empty());
  // the snippet takes a non-const collection pointer, hence the const_cast
  _engineStack.top()->collection(const_cast<Collection*>(col));
}
#endif

/// @brief Remembers that the given shard has to be locked with the given
///        lock type on this server.
void EngineInfoContainerDBServer::DBServerInfo::addShardLock(
    AccessMode::Type const& lock, ShardID const& id) {
  auto& shardsOfLockType = _shardLocking[lock];
  shardsOfLockType.emplace_back(id);
}

/// @brief Remembers that the given snippet has to be instantiated for the
///        given shard on this server.
void EngineInfoContainerDBServer::DBServerInfo::addEngine(
    std::shared_ptr<EngineInfoContainerDBServer::EngineInfo> info,
    ShardID const& id) {
  auto& shardsOfEngine = _engineInfos[info];
  shardsOfEngine.emplace_back(id);
}

/// @brief Builds the setup message for this server. The message is an object
///        with the attributes "lockInfo", "options", "variables", "snippets"
///        and, if there are any, "traverserEngines".
///
/// @param query       The query to take options and variables from
/// @param infoBuilder The builder to write into, must be empty
void EngineInfoContainerDBServer::DBServerInfo::buildMessage(
    Query* query, VPackBuilder& infoBuilder) const {
  TRI_ASSERT(infoBuilder.isEmpty());

  infoBuilder.openObject();

  // lockInfo: lock type => [shards]
  infoBuilder.add(VPackValue("lockInfo"));
  infoBuilder.openObject();
  for (auto const& lockEntry : _shardLocking) {
    infoBuilder.add(VPackValue(AccessMode::typeString(lockEntry.first)));
    infoBuilder.openArray();
    for (auto const& shard : lockEntry.second) {
      infoBuilder.add(VPackValue(shard));
    }
    infoBuilder.close();  // shards of this lock type
  }
  infoBuilder.close();  // lockInfo

  // options
  infoBuilder.add(VPackValue("options"));
  injectQueryOptions(query, infoBuilder);

  // variables, toVelocyPack opens and closes the object on its own
  infoBuilder.add(VPackValue("variables"));
  query->ast()->variables()->toVelocyPack(infoBuilder);

  // snippets
  infoBuilder.add(VPackValue("snippets"));
  infoBuilder.openObject();
  for (auto const& engineEntry : _engineInfos) {
    TRI_ASSERT(engineEntry.first);
    auto const& snippet = *engineEntry.first;
    auto const& shardsOfSnippet = engineEntry.second;

#ifdef USE_IRESEARCH
    // view snippets are serialized once for the whole list of shards
    if (engineEntry.first->type() == ExecutionNode::ENUMERATE_IRESEARCH_VIEW) {
      snippet.serializeSnippet(query, shardsOfSnippet, infoBuilder);
      continue;
    }
#endif

    // all other snippets are serialized once per shard, only the first
    // one is flagged as responsible for the initialization
    bool firstShard = true;
    for (auto const& shard : shardsOfSnippet) {
      snippet.serializeSnippet(query, shard, infoBuilder, firstShard);
      firstShard = false;
    }
  }
  infoBuilder.close();  // snippets

  injectTraverserEngines(infoBuilder);

  infoBuilder.close();  // message
}

/// @brief Adds the attribute "traverserEngines" to the message, an array
///        with one object per traverser engine required on this server.
///        Nothing is added if no traverser engine is required.
///
/// @param infoBuilder The builder to write into, must be an open object
void EngineInfoContainerDBServer::DBServerInfo::injectTraverserEngines(
    VPackBuilder& infoBuilder) const {
  if (_traverserEngineInfos.empty()) {
    return;
  }
  TRI_ASSERT(infoBuilder.isOpenObject());

  infoBuilder.add(VPackValue("traverserEngines"));
  infoBuilder.openArray();

  for (auto const& engineEntry : _traverserEngineInfos) {
    GraphNode* graphNode = engineEntry.first;
    auto const& shardLists = engineEntry.second;

    infoBuilder.openObject();

    // the options of the graph node
    infoBuilder.add(VPackValue("options"));
    graphNode->options()->buildEngineInfo(infoBuilder);

    // the condition variables, only added if there are any
    std::vector<aql::Variable const*> conditionVariables;
    graphNode->getConditionVariables(conditionVariables);
    if (!conditionVariables.empty()) {
      infoBuilder.add(VPackValue("variables"));
      infoBuilder.openArray();
      for (auto variable : conditionVariables) {
        variable->toVelocyPack(infoBuilder);
      }
      infoBuilder.close();  // variables
    }

    infoBuilder.add(VPackValue("shards"));
    infoBuilder.openObject();

    // vertices: collection name => [shards]
    infoBuilder.add(VPackValue("vertices"));
    infoBuilder.openObject();
    for (auto const& vertexCollection : shardLists.vertexCollections) {
      infoBuilder.add(VPackValue(vertexCollection.first));
      infoBuilder.openArray();
      for (auto const& shard : vertexCollection.second) {
        infoBuilder.add(VPackValue(shard));
      }
      infoBuilder.close();  // shards of this collection
    }
    infoBuilder.close();  // vertices

    // edges: one array of shards per edge collection
    infoBuilder.add(VPackValue("edges"));
    infoBuilder.openArray();
    for (auto const& shardsOfEdgeCollection : shardLists.edgeCollections) {
      infoBuilder.openArray();
      for (auto const& shard : shardsOfEdgeCollection) {
        infoBuilder.add(VPackValue(shard));
      }
      infoBuilder.close();  // shards of this edge collection
    }
    infoBuilder.close();  // edges

    infoBuilder.close();  // shards

    // the graph node may add additional information
    graphNode->enhanceEngineInfo(infoBuilder);

    infoBuilder.close();  // this engine
  }

  infoBuilder.close();  // traverserEngines
}

/// @brief Wires the traverser engine ids returned by the server to the
///        graph nodes of this server info.
///
/// @param serverID The server that created the engines
/// @param ids      The engine ids created by the server, one per
///                 requested traverser engine
///
/// @throws TRI_ERROR_CLUSTER_AQL_COMMUNICATION if the number of ids differs
///         from the number of requested traverser engines
void EngineInfoContainerDBServer::DBServerInfo::combineTraverserEngines(
    ServerID const& serverID, VPackSlice const ids) {
  if (_traverserEngineInfos.size() != ids.length()) {
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CLUSTER_AQL_COMMUNICATION,
                                   "The DBServer was not able to create enough "
                                   "traversal engines. This can happen during "
                                   "failover. Please check; " +
                                       serverID);
  }

  // The ids are consumed in the order of the traverserEngineInfos, so that
  // each GraphNode is wired to its matching engine id.
  VPackArrayIterator engineIdIt(ids);
  for (auto const& engineEntry : _traverserEngineInfos) {
    auto const engineId =
        engineIdIt.value().getNumber<traverser::TraverserEngineID>();
    engineEntry.first->addEngine(engineId, serverID);
    engineIdIt.next();
  }
}

/// @brief Remembers that a traverser engine for the given graph node has to
///        be created on this server.
///
/// @param node   The graph node the engine is created for
/// @param shards The shards the engine is responsible for, they are moved
///               into this server info
void EngineInfoContainerDBServer::DBServerInfo::addTraverserEngine(
    GraphNode* node, TraverserEngineShardLists&& shards) {
  _traverserEngineInfos.emplace_back(node, std::move(shards));
}

/// @brief Serializes the options of the query into the builder.
///        The caller has to add the attribute name beforehand.
void EngineInfoContainerDBServer::DBServerInfo::injectQueryOptions(
    Query* query, VPackBuilder& infoBuilder) const {
  // toVelocyPack opens and closes the "options" object on its own
  auto&& options = query->queryOptions();
  options.toVelocyPack(infoBuilder, true);
}

std::map<ServerID, EngineInfoContainerDBServer::DBServerInfo>
EngineInfoContainerDBServer::createDBServerMapping(
    std::unordered_set<ShardID>& lockedShards) const {
  auto* ci = ClusterInfo::instance();
  TRI_ASSERT(ci);

  std::map<ServerID, DBServerInfo> dbServerMapping;

  for (auto const& it : _collectionInfos) {
    // it.first => Collection const*
    // it.second.lockType => Lock Type
    // it.second.engines => All Engines using this collection
    // it.second.usedShards => All shards of this collection releveant for this query
    auto const& colInfo = it.second;

    for (auto const& s : colInfo.usedShards) {
      lockedShards.emplace(s);

      auto const servers = ci->getResponsibleServer(s);

      if (!servers || servers->empty()) {
        THROW_ARANGO_EXCEPTION_MESSAGE(
          TRI_ERROR_CLUSTER_BACKEND_UNAVAILABLE,
          "Could not find responsible server for shard " + s
        );
      }

      auto& responsible = (*servers)[0];
      auto& mapping = dbServerMapping[responsible];

      mapping.addShardLock(colInfo.lockType, s);

      for (auto& e : colInfo.engines) {
        mapping.addEngine(e, s);
      }

      for (auto const* view : colInfo.views) {
        auto const viewInfo = _viewInfos.find(view);

        if (viewInfo == _viewInfos.end()) {
          continue;
        }

        for (auto const& viewEngine : viewInfo->second) {
          mapping.addEngine(viewEngine, s);
        }
      }
    }
  }

#ifdef USE_ENTERPRISE
  prepareSatellites(dbServerMapping);
#endif

  return dbServerMapping;
}

/// @brief Adds one traverser engine per graph node to every server that is
///        responsible for at least one shard of the graph node.
///        For each graph node the shards of its edge and vertex collections
///        are sorted by their responsible server (the first server of the
///        list). Without explicit vertex collections (no named graph) all
///        collections known to the query that are not edge collections of
///        the node are used as vertex collections.
///
/// @param dbServerMapping The mapping created by createDBServerMapping. It
///        has to contain an entry for every involved server already.
///
/// @throws TRI_ERROR_CLUSTER_BACKEND_UNAVAILABLE if no responsible server
///         is known for a shard
/// @throws TRI_ERROR_INTERNAL if a server is missing in dbServerMapping
void EngineInfoContainerDBServer::injectGraphNodesToMapping(
    std::map<ServerID, EngineInfoContainerDBServer::DBServerInfo>&
        dbServerMapping) const {
  if (_graphNodes.empty()) {
    return;
  }

#ifdef USE_ENTERPRISE
  transaction::Methods* trx = _query->trx();
  transaction::Options& trxOps = _query->trx()->state()->options();
#endif

  auto clusterInfo = arangodb::ClusterInfo::instance();

  /// Typedef for a complicated mapping used in TraverserEngines.
  typedef std::unordered_map<
      ServerID, EngineInfoContainerDBServer::TraverserEngineShardLists>
      Serv2ColMap;

  for (GraphNode* en : _graphNodes) {
    // Every node needs its own Serv2ColMap
    Serv2ColMap mappingServerToCollections;
    en->prepareOptions();

    std::vector<std::unique_ptr<arangodb::aql::Collection>> const& edges =
        en->edgeColls();

    // Here we create a mapping
    // ServerID => ResponsibleShards
    // Where Responsible shards is divided in edgeCollections and
    // vertexCollections
    // For edgeCollections the Ordering is important for the index access.
    // Also the same edgeCollection can be included twice (iff direction is ANY)
    size_t length = edges.size();

    // Returns the entry of the server responsible for the given shard and
    // creates the entry if the server has not been seen before.
    auto findServerLists = [&](ShardID const& shard) -> Serv2ColMap::iterator {
      auto serverList = clusterInfo->getResponsibleServer(shard);
      // the list may be missing altogether, not only be empty
      if (!serverList || serverList->empty()) {
        THROW_ARANGO_EXCEPTION_MESSAGE(
            TRI_ERROR_CLUSTER_BACKEND_UNAVAILABLE,
            "Could not find responsible server for shard " + shard);
      }
      auto& leader = (*serverList)[0];
      auto pair = mappingServerToCollections.find(leader);
      if (pair == mappingServerToCollections.end()) {
        // emplace hands out the iterator of the new entry, no second lookup
        pair = mappingServerToCollections
                   .emplace(leader, TraverserEngineShardLists{length})
                   .first;
      }
      return pair;
    };

    std::unordered_set<std::string> const& restrictToShards =
        _query->queryOptions().shardIds;
    for (size_t i = 0; i < length; ++i) {
      auto shardIds = edges[i]->shardIds(restrictToShards);
      for (auto const& shard : *shardIds) {
        auto pair = findServerLists(shard);
        pair->second.edgeCollections[i].emplace_back(shard);
      }
    }

    std::vector<std::unique_ptr<arangodb::aql::Collection>> const& vertices =
        en->vertexColls();
    if (vertices.empty()) {
      std::unordered_set<std::string> knownEdges;
      for (auto const& it : edges) {
        knownEdges.emplace(it->getName());
      }
      // This case indicates we do not have a named graph. We simply use
      // ALL collections known to this query.
      std::map<std::string, Collection*>* cs =
          _query->collections()->collections();
      for (auto const& collection : (*cs)) {
        if (knownEdges.find(collection.second->getName()) == knownEdges.end()) {
          // This collection is not one of the edge collections used in this
          // graph.
          auto shardIds = collection.second->shardIds(restrictToShards);
          for (ShardID const& shard : *shardIds) {
            auto pair = findServerLists(shard);
            pair->second.vertexCollections[collection.second->getName()]
                .emplace_back(shard);
#ifdef USE_ENTERPRISE
            if (trx->isInaccessibleCollectionId(
                    collection.second->getPlanId())) {
              TRI_ASSERT(
                  ServerState::instance()->isSingleServerOrCoordinator());
              TRI_ASSERT(trxOps.skipInaccessibleCollections);
              pair->second.inaccessibleShards.insert(shard);
              pair->second.inaccessibleShards.insert(
                  std::to_string(collection.second->getCollection()->id()));
            }
#endif
          }
        }
      }
      // We have to make sure that all engines at least know all vertex
      // collections.
      // Thanks to fanout...
      for (auto const& collection : (*cs)) {
        for (auto& entry : mappingServerToCollections) {
          auto it =
              entry.second.vertexCollections.find(collection.second->getName());
          if (it == entry.second.vertexCollections.end()) {
            entry.second.vertexCollections.emplace(collection.second->getName(),
                                                   std::vector<ShardID>());
          }
        }
      }
    } else {
      // This Traversal is started with a GRAPH. It knows all relevant
      // collections.
      for (auto const& it : vertices) {
        auto shardIds = it->shardIds(restrictToShards);
        for (ShardID const& shard : *shardIds) {
          auto pair = findServerLists(shard);
          pair->second.vertexCollections[it->getName()].emplace_back(shard);
#ifdef USE_ENTERPRISE
          if (trx->isInaccessibleCollectionId(it->getPlanId())) {
            TRI_ASSERT(trxOps.skipInaccessibleCollections);
            pair->second.inaccessibleShards.insert(shard);
            pair->second.inaccessibleShards.insert(
                std::to_string(it->getCollection()->id()));
          }
#endif
        }
      }
      // We have to make sure that all engines at least know all vertex
      // collections.
      // Thanks to fanout...
      for (auto const& it : vertices) {
        for (auto& entry : mappingServerToCollections) {
          auto vIt = entry.second.vertexCollections.find(it->getName());
          if (vIt == entry.second.vertexCollections.end()) {
            entry.second.vertexCollections.emplace(it->getName(),
                                                   std::vector<ShardID>());
          }
        }
      }
    }

    // Now we have sorted all collections on the db servers. Hand the engines
    // over to the server builder.

    // NOTE: The shard lists are moved out of the local map, so its values
    // must not be read again afterwards. This is fine, as the map is not
    // used anymore after this loop. The entries are bound by reference, a
    // by-value loop variable would deep-copy every shard list first.
    for (auto& it : mappingServerToCollections) {
      // All shards have been prepared for locking in the EngineInfos, so
      // the server is expected to be known already.
      auto target = dbServerMapping.find(it.first);
      TRI_ASSERT(target != dbServerMapping.end());
      if (target == dbServerMapping.end()) {
        // do not dereference end() in non-maintainer builds
        THROW_ARANGO_EXCEPTION_MESSAGE(
            TRI_ERROR_INTERNAL,
            "Could not find DBServer " + it.first +
                " in the list of servers prepared for this query");
      }
      target->second.addTraverserEngine(en, std::move(it.second));
    }
  }
}

/// @brief Deploys all snippets and traverser engines on the DBServers.
///        One synchronous setup request is sent per server. The ids of the
///        created snippets are stored in queryIds, the ids of the traverser
///        engines are handed to the graph nodes.
///        Whenever this function is left before all servers have answered
///        properly, cleanupEngines is called for everything that has been
///        deployed so far.
///
/// @param queryIds     Out: remote node id => server => ids of the snippets
/// @param lockedShards Out: all shards used by this query
///
/// @return TRI_ERROR_NO_ERROR on success, TRI_ERROR_SHUTTING_DOWN if there
///         is no ClusterComm, the error of the request if a server could not
///         be reached, TRI_ERROR_CLUSTER_AQL_COMMUNICATION if an answer is
///         malformed.
Result EngineInfoContainerDBServer::buildEngines(
    MapRemoteToSnippet& queryIds,
    std::unordered_set<ShardID>& lockedShards) const {
  TRI_ASSERT(_engineStack.empty());

  // We create a map for DBServer => All Query snippets executed there
  auto dbServerMapping = createDBServerMapping(lockedShards);
  // This Mapping does not contain Traversal Engines
  //
  // We add traversal engines if necessary
  injectGraphNodesToMapping(dbServerMapping);

  auto cc = ClusterComm::instance();

  if (cc == nullptr) {
    // nullptr only happens on controlled shutdown
    return {TRI_ERROR_SHUTTING_DOWN};
  }

  std::string const url(
    "/_db/"
    + arangodb::basics::StringUtils::urlEncode(_query->vocbase().name())
    + "/_api/aql/setup"
  );

  // Engines that have been deployed already are shut down again on every
  // exit path, unless the flag is cleared at the very end.
  bool needCleanup = true;
  auto cleanup = [&]() {
    if (needCleanup) {
      cleanupEngines(
        cc, TRI_ERROR_INTERNAL, _query->vocbase().name(), queryIds
      );
    }
  };
  TRI_DEFER(cleanup());

  // every malformed answer of a server is reported with the same error
  auto const deploymentFailed = [](ServerID const& server) -> Result {
    return {TRI_ERROR_CLUSTER_AQL_COMMUNICATION,
            "Unable to deploy query on all required "
            "servers. This can happen during "
            "Failover. Please check: " +
                server};
  };

  std::unordered_map<std::string, std::string> headers;
  // Build Lookup Infos
  VPackBuilder infoBuilder;

  for (auto& it : dbServerMapping) {
    std::string const serverDest = "server:" + it.first;

    LOG_TOPIC(DEBUG, arangodb::Logger::AQL) << "Building Engine Info for "
                                            << it.first;
    infoBuilder.clear();
    it.second.buildMessage(_query, infoBuilder);
    LOG_TOPIC(DEBUG, arangodb::Logger::AQL) << "Sending the Engine info: "
                                            << infoBuilder.toJson();

    // Now we send to DBServers.
    // We expect a body with {snippets: {id => engineId}, traverserEngines:
    // [engineId]}}

    CoordTransactionID coordTransactionID = TRI_NewTickServer();
    auto res = cc->syncRequest("", coordTransactionID, serverDest,
                               RequestType::POST, url, infoBuilder.toJson(),
                               headers, SETUP_TIMEOUT);

    if (res->getErrorCode() != TRI_ERROR_NO_ERROR) {
      LOG_TOPIC(DEBUG, Logger::AQL) << it.first << " responded with "
                                    << res->getErrorCode() << " -> "
                                    << res->stringifyErrorMessage();
      LOG_TOPIC(TRACE, Logger::AQL) << infoBuilder.toJson();
      return {res->getErrorCode(), res->stringifyErrorMessage()};
    }

    std::shared_ptr<VPackBuilder> builder = res->result->getBodyVelocyPack();
    VPackSlice response = builder->slice();

    if (!response.isObject() || !response.get("result").isObject()) {
      LOG_TOPIC(ERR, Logger::AQL) << "Received error information from "
                                  << it.first << " : " << response.toJson();
      return deploymentFailed(it.first);
    }

    VPackSlice result = response.get("result");
    VPackSlice snippets = result.get("snippets");

    // Iterating anything but an object would throw, report a malformed
    // answer like all the other ones instead.
    if (!snippets.isObject()) {
      return deploymentFailed(it.first);
    }

    for (auto const& resEntry : VPackObjectIterator(snippets)) {
      if (!resEntry.value.isString()) {
        return deploymentFailed(it.first);
      }
      size_t remoteId = 0;
      std::string shardId;
      auto extracted = ExtractRemoteAndShard(resEntry.key, remoteId, shardId);
      if (!extracted.ok()) {
        return extracted;
      }
      TRI_ASSERT(remoteId != 0);
      TRI_ASSERT(!shardId.empty());
      // remote node id => server => [snippet ids]
      auto& remote = queryIds[remoteId];
      auto& thisServer = remote[serverDest];
      thisServer.emplace_back(resEntry.value.copyString());
    }

    // traverser engines are optional in the answer
    VPackSlice travEngines = result.get("traverserEngines");
    if (!travEngines.isNone()) {
      if (!travEngines.isArray()) {
        return deploymentFailed(it.first);
      }

      it.second.combineTraverserEngines(it.first, travEngines);
    }
  }

#ifdef USE_ENTERPRISE
  resetSatellites();
#endif
  needCleanup = false;
  return TRI_ERROR_NO_ERROR;
}

/// @brief Registers a graph node and all collections it reads from.
///        Traversals never write, so every collection is registered with
///        READ access. Without explicit vertex collections (no named graph)
///        all collections known to the query are registered.
///
/// @param node The graph node to register
void EngineInfoContainerDBServer::addGraphNode(GraphNode* node) {
  // the edge collections
  for (auto const& edgeCollection : node->edgeColls()) {
    handleCollection(edgeCollection.get(), AccessMode::Type::READ);
  }

  // the vertex collections
  auto& vertexCollections = node->vertexColls();
  if (!vertexCollections.empty()) {
    for (auto const& vertexCollection : vertexCollections) {
      handleCollection(vertexCollection.get(), AccessMode::Type::READ);
    }
  } else {
    // No named graph: ALL collections known to this query are used.
    std::map<std::string, Collection*>* knownCollections =
        _query->collections()->collections();
    for (auto const& known : *knownCollections) {
      handleCollection(known.second, AccessMode::Type::READ);
    }
  }

  _graphNodes.emplace_back(node);
}

/**
 * @brief Will send a shutdown to all engines registered in the list of
 * queryIds.
 * NOTE: This function will ignore all queryids where the key is not of
 * the expected format
 * they may be leftovers from Coordinator.
 * Will also clear the list of queryIds after return.
 *
 * @param cc The ClusterComm
 * @param errorCode error Code to be send to DBServers for logging.
 * @param dbname Name of the database this query is executed in.
 * @param queryIds A map of QueryIds of the format: (remoteNodeId:shardId) ->
 * queryid.
 */
void EngineInfoContainerDBServer::cleanupEngines(
    std::shared_ptr<ClusterComm> cc, int errorCode,
    std::string const& dbname, MapRemoteToSnippet& queryIds) const {
  // Shutdown query snippets
  std::string url("/_db/" + arangodb::basics::StringUtils::urlEncode(dbname) +
                  "/_api/aql/shutdown/");
  std::vector<ClusterCommRequest> requests;
  auto body = std::make_shared<std::string>("{\"code\":" +
                                            std::to_string(errorCode) + "}");
  for (auto const& it : queryIds) {
    // it.first == RemoteNodeId, we don't need this
    // it.second server -> [snippets]
    for (auto const& serToSnippets : it.second) {
      auto server = serToSnippets.first;
      for (auto const& shardId : serToSnippets.second) {
        requests.emplace_back(server, rest::RequestType::PUT, url + shardId,
                              body);
      }
    }
  }

  // Shutdown traverser engines
  url = "/_db/" + arangodb::basics::StringUtils::urlEncode(dbname) +
                  "/_internal/traverser/";
  std::shared_ptr<std::string> noBody;
  for (auto const& gn : _graphNodes) {
    auto allEngines = gn->engines();
    for (auto const& engine : *allEngines) {
      requests.emplace_back(engine.first, rest::RequestType::DELETE_REQ,
                            url + basics::StringUtils::itoa(engine.second), noBody);
    }
  }

  cc->fireAndForgetRequests(requests);
  queryIds.clear();
}