////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
/// @author Dan Larkin-York
////////////////////////////////////////////////////////////////////////////////

#include "DumpFeature.h"
|
|
|
|
#include <chrono>
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
#include <velocypack/Builder.h>
|
|
#include <velocypack/Collection.h>
|
|
#include <velocypack/Iterator.h>
|
|
#include <velocypack/velocypack-aliases.h>
|
|
#include <boost/algorithm/clamp.hpp>
|
|
|
|
#include "ApplicationFeatures/ApplicationServer.h"
|
|
#include "Basics/FileUtils.h"
|
|
#include "Basics/MutexLocker.h"
|
|
#include "Basics/Result.h"
|
|
#include "Basics/StaticStrings.h"
|
|
#include "Basics/StringUtils.h"
|
|
#include "Basics/VelocyPackHelper.h"
|
|
#include "Maskings/Maskings.h"
|
|
#include "ProgramOptions/ProgramOptions.h"
|
|
#include "Random/RandomGenerator.h"
|
|
#include "Shell/ClientFeature.h"
|
|
#include "SimpleHttpClient/SimpleHttpClient.h"
|
|
#include "SimpleHttpClient/SimpleHttpResult.h"
|
|
#include "Ssl/SslInterface.h"
|
|
#include "Utils/ManagedDirectory.h"
|
|
|
|
#ifdef USE_ENTERPRISE
|
|
#include "Enterprise/Encryption/EncryptionFeature.h"
|
|
#endif
|
|
|
|
namespace {

/// @brief fake client id we will send to the server. the server keeps
/// track of all connected clients
static uint64_t clientId = 0;

/// @brief name of the feature to report to application server
constexpr auto FeatureName = "Dump";

/// @brief minimum amount of data to fetch from server in a single batch
constexpr uint64_t MinChunkSize = 1024 * 128;

/// @brief maximum amount of data to fetch from server in a single batch
// NB: larger values may cause TCP/transfer issues between client and server;
// the exact limits have not been verified
constexpr uint64_t MaxChunkSize = 1024 * 1024 * 96;

/// @brief generic error for when the server returns bad/unexpected JSON
const arangodb::Result ErrorMalformedJsonResponse = {
    TRI_ERROR_INTERNAL, "got malformed JSON response from server"};

/// @brief check whether HTTP response is valid, complete, and not an error
arangodb::Result checkHttpResponse(
    arangodb::httpclient::SimpleHttpClient& client,
    std::unique_ptr<arangodb::httpclient::SimpleHttpResult> const& response) {
  using arangodb::basics::StringUtils::itoa;
  if (response == nullptr || !response->isComplete()) {
    return {TRI_ERROR_INTERNAL,
            "got invalid response from server: " + client.getErrorMessage()};
  }
  if (response->wasHttpError()) {
    int errorNum = TRI_ERROR_INTERNAL;
    std::string errorMsg = response->getHttpReturnMessage();
    std::shared_ptr<arangodb::velocypack::Builder> bodyBuilder(
        response->getBodyVelocyPack());
    arangodb::velocypack::Slice error = bodyBuilder->slice();
    if (!error.isNone() && error.hasKey(arangodb::StaticStrings::ErrorMessage)) {
      errorNum = error.get(arangodb::StaticStrings::ErrorNum).getNumericValue<int>();
      errorMsg = error.get(arangodb::StaticStrings::ErrorMessage).copyString();
    }
    return {errorNum, "got invalid response from server: HTTP " +
                          itoa(response->getHttpReturnCode()) + ": " + errorMsg};
  }
  return {TRI_ERROR_NO_ERROR};
}

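// Hedged illustration: the function above extracts errorNum/errorMessage from
// a standard ArangoDB error body, which typically looks like
//   { "error": true, "errorNum": 1228, "errorMessage": "database not found",
//     "code": 404 }
// (the exact payload shape is an assumption; only the two extracted fields
// are guaranteed by the code above).
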
/// @brief checks that a file pointer is valid and file status is ok
bool fileOk(arangodb::ManagedDirectory::File* file) {
  return (file && file->status().ok());
}

/// @brief assuming file pointer is not ok, generate/extract proper error
arangodb::Result fileError(arangodb::ManagedDirectory::File* file,
                           bool isWritable) {
  if (!file) {
    if (isWritable) {
      return {TRI_ERROR_CANNOT_WRITE_FILE};
    } else {
      return {TRI_ERROR_CANNOT_READ_FILE};
    }
  }
  return file->status();
}

/// @brief start a batch via the replication API
std::pair<arangodb::Result, uint64_t> startBatch(
    arangodb::httpclient::SimpleHttpClient& client, std::string const& DBserver) {
  using arangodb::basics::VelocyPackHelper;
  using arangodb::basics::StringUtils::uint64;

  std::string url = "/_api/replication/batch?serverId=" + std::to_string(clientId);
  std::string const body = "{\"ttl\":300}";
  if (!DBserver.empty()) {
    url += "&DBserver=" + DBserver;
  }

  std::unique_ptr<arangodb::httpclient::SimpleHttpResult> response(
      client.request(arangodb::rest::RequestType::POST, url,
                     body.c_str(), body.size()));
  auto check = ::checkHttpResponse(client, response);
  if (check.fail()) {
    LOG_TOPIC(ERR, arangodb::Logger::DUMP)
        << "An error occurred while creating dump context: "
        << check.errorMessage();
    return {check, 0};
  }

  // extract vpack body from response
  std::shared_ptr<VPackBuilder> parsedBody;
  try {
    parsedBody = response->getBodyVelocyPack();
  } catch (...) {
    return {::ErrorMalformedJsonResponse, 0};
  }
  VPackSlice const resBody = parsedBody->slice();

  // look up "id" value
  std::string const id = VelocyPackHelper::getStringValue(resBody, "id", "");

  return {{TRI_ERROR_NO_ERROR}, uint64(id)};
}

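// Sketch of the exchange startBatch() performs, assembled from the code above
// (the response shape is inferred from the "id" lookup and is otherwise an
// assumption):
//   POST /_api/replication/batch?serverId=<clientId>   body: {"ttl":300}
//   200 OK -> { "id": "<batchId>", ... }
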
/// @brief prolongs a batch to ensure we can complete our dump
void extendBatch(arangodb::httpclient::SimpleHttpClient& client,
                 std::string const& DBserver, uint64_t batchId) {
  using arangodb::basics::StringUtils::itoa;
  TRI_ASSERT(batchId > 0);

  std::string url = "/_api/replication/batch/" + itoa(batchId) +
                    "?serverId=" + std::to_string(clientId);
  std::string const body = "{\"ttl\":300}";
  if (!DBserver.empty()) {
    url += "&DBserver=" + DBserver;
  }

  std::unique_ptr<arangodb::httpclient::SimpleHttpResult> response(
      client.request(arangodb::rest::RequestType::PUT, url,
                     body.c_str(), body.size()));
  // ignore any return value
}

/// @brief mark our batch finished so resources can be freed on server
void endBatch(arangodb::httpclient::SimpleHttpClient& client,
              std::string DBserver, uint64_t& batchId) {
  using arangodb::basics::StringUtils::itoa;
  TRI_ASSERT(batchId > 0);

  std::string url = "/_api/replication/batch/" + itoa(batchId) +
                    "?serverId=" + std::to_string(clientId);
  if (!DBserver.empty()) {
    url += "&DBserver=" + DBserver;
  }

  std::unique_ptr<arangodb::httpclient::SimpleHttpResult> response(
      client.request(arangodb::rest::RequestType::DELETE_REQ, url,
                     nullptr, 0));
  // ignore any return value

  // overwrite the input id
  batchId = 0;
}

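// Batch lifecycle, as implemented above: startBatch() POSTs to create a dump
// context with a 300-second TTL, extendBatch() PUTs to renew that TTL while a
// long dump is in progress, and endBatch() DELETEs the context so the server
// can free its resources without waiting for the TTL to expire.
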
/// @brief execute a WAL flush request
void flushWal(arangodb::httpclient::SimpleHttpClient& client) {
  static std::string const url =
      "/_admin/wal/flush?waitForSync=true&waitForCollector=true";

  std::unique_ptr<arangodb::httpclient::SimpleHttpResult> response(
      client.request(arangodb::rest::RequestType::PUT, url, nullptr, 0));
  auto check = ::checkHttpResponse(client, response);
  if (check.fail()) {
    // TODO should we abort early here?
    LOG_TOPIC(ERR, arangodb::Logger::DUMP)
        << "Got invalid response from server when flushing WAL: "
        << check.errorMessage();
  }
}

bool isIgnoredHiddenEnterpriseCollection(
    arangodb::DumpFeature::Options const& options, std::string const& name) {
#ifdef USE_ENTERPRISE
  if (!options.force && name[0] == '_') {
    if (strncmp(name.c_str(), "_local_", 7) == 0 ||
        strncmp(name.c_str(), "_from_", 6) == 0 ||
        strncmp(name.c_str(), "_to_", 4) == 0) {
      LOG_TOPIC(INFO, arangodb::Logger::DUMP)
          << "Dump is ignoring collection '" << name
          << "'. It will be created via SmartGraphs from a full dump. If you "
             "want to dump this collection anyway, use 'arangodump --force'. "
             "However, this is not recommended; dump the EdgeCollection of "
             "the SmartGraph instead.";
      return true;
    }
  }
#endif
  return false;
}

arangodb::Result dumpJsonObjects(arangodb::DumpFeature::JobData& jobData,
                                 arangodb::ManagedDirectory::File& file,
                                 arangodb::basics::StringBuffer const& body) {
  arangodb::basics::StringBuffer masked(1, false);
  arangodb::basics::StringBuffer const* result = &body;

  if (jobData.maskings != nullptr) {
    jobData.maskings->mask(jobData.name, body, masked);
    result = &masked;
  }

  file.write(result->c_str(), result->length());

  if (file.status().fail()) {
    return {TRI_ERROR_CANNOT_WRITE_FILE};
  }

  jobData.stats.totalWritten += static_cast<uint64_t>(result->length());

  return {TRI_ERROR_NO_ERROR};
}

/// @brief dump the actual data from an individual collection
arangodb::Result dumpCollection(arangodb::httpclient::SimpleHttpClient& client,
                                arangodb::DumpFeature::JobData& jobData,
                                arangodb::ManagedDirectory::File& file,
                                std::string const& name,
                                std::string const& server, uint64_t batchId,
                                uint64_t minTick, uint64_t maxTick) {
  using arangodb::basics::StringUtils::boolean;
  using arangodb::basics::StringUtils::itoa;
  using arangodb::basics::StringUtils::uint64;
  using arangodb::basics::StringUtils::urlEncode;

  uint64_t fromTick = minTick;
  uint64_t chunkSize =
      jobData.options.initialChunkSize;  // will grow adaptively up to max
  std::string baseUrl = "/_api/replication/dump?collection=" + urlEncode(name) +
                        "&batchId=" + itoa(batchId) + "&ticks=false";
  if (jobData.options.clusterMode) {
    // we are in cluster mode, must specify dbserver
    baseUrl += "&DBserver=" + server;
  } else {
    // we are in single-server mode, we already flushed the wal
    baseUrl += "&flush=false";
  }

  while (true) {
    std::string url =
        baseUrl + "&from=" + itoa(fromTick) + "&chunkSize=" + itoa(chunkSize);
    if (maxTick > 0) {  // limit to a certain timeframe
      url += "&to=" + itoa(maxTick);
    }

    ++(jobData.stats.totalBatches);  // count how many chunks we are fetching

    // make the actual request for data
    std::unique_ptr<arangodb::httpclient::SimpleHttpResult> response(
        client.request(arangodb::rest::RequestType::GET, url, nullptr, 0));
    auto check = ::checkHttpResponse(client, response);
    if (check.fail()) {
      LOG_TOPIC(ERR, arangodb::Logger::DUMP)
          << "An error occurred while dumping collection '" << name
          << "': " << check.errorMessage();
      return check;
    }

    // find out whether there are more results to fetch
    bool checkMore = false;

    bool headerExtracted = false;
    std::string header = response->getHeaderField(
        arangodb::StaticStrings::ReplicationHeaderCheckMore, headerExtracted);
    if (headerExtracted) {
      // first check the basic flag
      checkMore = boolean(header);
      if (checkMore) {
        // now check if the actual tick has changed
        header = response->getHeaderField(
            arangodb::StaticStrings::ReplicationHeaderLastIncluded,
            headerExtracted);
        if (headerExtracted) {
          uint64_t tick = uint64(header);
          if (tick > fromTick) {
            fromTick = tick;
          } else {
            // we got the same tick again, this indicates we're at the end
            checkMore = false;
          }
        }
      }
    }
    if (!headerExtracted) {  // NOT an else; fallthrough from outer or inner if above
      return {TRI_ERROR_REPLICATION_INVALID_RESPONSE,
              std::string("got invalid response from server: required header "
                          "is missing while dumping collection '") +
                  name + "'"};
    }

    // now actually write retrieved data to dump file
    arangodb::basics::StringBuffer const& body = response->getBody();
    arangodb::Result result = dumpJsonObjects(jobData, file, body);

    if (result.fail()) {
      return result;
    }

    if (!checkMore || fromTick == 0) {
      // all done, return successful
      return {TRI_ERROR_NO_ERROR};
    }

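    // Worked example of the adaptive growth below, assuming a chunk size
    // starting at the 128 KB minimum and the 1.5x factor used in the code:
    // 128 KB -> 192 KB -> 288 KB -> 432 KB -> ... capped at
    // jobData.options.maxChunkSize (at most MaxChunkSize, 96 MB).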
    // more data to retrieve, adaptively increase chunksize
    if (chunkSize < jobData.options.maxChunkSize) {
      chunkSize = static_cast<uint64_t>(chunkSize * 1.5);
      if (chunkSize > jobData.options.maxChunkSize) {
        chunkSize = jobData.options.maxChunkSize;
      }
    }
  }

  // should never get here, but need to make compiler play nice
  TRI_ASSERT(false);
  return {TRI_ERROR_INTERNAL};
}

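// Paging protocol used by dumpCollection(), as read from the header handling
// above: each GET /_api/replication/dump response carries a "check more" flag
// and a "last included tick" header (referenced via StaticStrings; the usual
// literal names are x-arango-replication-checkmore and
// x-arango-replication-lastincluded, though that is an assumption here). The
// loop resumes from the last included tick until no more data is reported or
// the tick stops advancing.
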
/// @brief processes a single collection dumping job in single-server mode
arangodb::Result handleCollection(
    arangodb::httpclient::SimpleHttpClient& client,
    arangodb::DumpFeature::JobData& jobData,
    arangodb::ManagedDirectory::File& file) {
  // keep the batch alive
  ::extendBatch(client, "", jobData.batchId);

  // do the hard work in another function...
  return ::dumpCollection(client, jobData, file, jobData.name, "",
                          jobData.batchId, jobData.options.tickStart,
                          jobData.options.tickEnd);
}

/// @brief handle a single collection dumping job in cluster mode
arangodb::Result handleCollectionCluster(
    arangodb::httpclient::SimpleHttpClient& client,
    arangodb::DumpFeature::JobData& jobData,
    arangodb::ManagedDirectory::File& file) {
  arangodb::Result result{TRI_ERROR_NO_ERROR};

  // First we have to go through all the shards, what are they?
  VPackSlice const parameters = jobData.collectionInfo.get("parameters");
  VPackSlice const shards = parameters.get("shards");

  // Iterate over the map of shardId to server list
  for (auto const it : VPackObjectIterator(shards)) {
    // extract shard name
    TRI_ASSERT(it.key.isString());
    std::string shardName = it.key.copyString();

    // extract dbserver id
    if (!it.value.isArray() || it.value.length() == 0 ||
        !it.value[0].isString()) {
      return {TRI_ERROR_BAD_PARAMETER,
              "unexpected value for 'shards' attribute"};
    }
    std::string DBserver = it.value[0].copyString();

    if (jobData.options.progress) {
      LOG_TOPIC(INFO, arangodb::Logger::DUMP)
          << "# Dumping shard '" << shardName << "' from DBserver '"
          << DBserver << "' ...";
    }

    // make sure we have a batch on this dbserver
    uint64_t batchId;
    std::tie(result, batchId) = ::startBatch(client, DBserver);
    if (result.ok()) {
      // do the hard work elsewhere
      result = ::dumpCollection(client, jobData, file, shardName, DBserver,
                                batchId, 0, UINT64_MAX);
      ::endBatch(client, DBserver, batchId);
    }

    if (result.fail()) {
      // fail early for the collection if a given shard fails
      break;
    }
  }

  return result;
}

/// @brief process a single job from the queue
arangodb::Result processJob(arangodb::httpclient::SimpleHttpClient& client,
                            arangodb::DumpFeature::JobData& jobData) {
  using arangodb::velocypack::ObjectBuilder;

  arangodb::Result result{TRI_ERROR_NO_ERROR};

  bool dumpStructure = true;

  if (dumpStructure && jobData.maskings != nullptr) {
    dumpStructure = jobData.maskings->shouldDumpStructure(jobData.name);
  }

  if (!dumpStructure) {
    if (jobData.options.progress) {
      LOG_TOPIC(INFO, arangodb::Logger::DUMP)
          << "# Skipping collection '" << jobData.name << "'...";
    }

    return result;
  }

  // prep hex string of collection name
  std::string const hexString(
      arangodb::rest::SslInterface::sslMD5(jobData.name));

  // found a collection!
  if (jobData.options.progress) {
    LOG_TOPIC(INFO, arangodb::Logger::DUMP)
        << "# Dumping collection '" << jobData.name << "'...";
  }
  ++(jobData.stats.totalCollections);

  {
    // save meta data
    auto file = jobData.directory.writableFile(
        jobData.name + (jobData.options.clusterMode ? "" : ("_" + hexString)) +
            ".structure.json",
        true);
    if (!::fileOk(file.get())) {
      return ::fileError(file.get(), true);
    }

    VPackBuilder excludes;
    {  // { parameters: { shadowCollections: null } }
      ObjectBuilder object(&excludes);
      {
        ObjectBuilder subObject(&excludes, "parameters");
        subObject->add("shadowCollections", VPackSlice::nullSlice());
      }
    }

    VPackBuilder collectionWithExcludedParametersBuilder =
        VPackCollection::merge(jobData.collectionInfo, excludes.slice(), true,
                               true);

    std::string const collectionInfo =
        collectionWithExcludedParametersBuilder.slice().toJson();

    file->write(collectionInfo.c_str(), collectionInfo.size());
    if (file->status().fail()) {
      // close file and bail out
      result = file->status();
    }
  }

  if (result.ok()) {
    bool dumpData = jobData.options.dumpData;

    if (dumpData && jobData.maskings != nullptr) {
      dumpData = jobData.maskings->shouldDumpData(jobData.name);
    }

    if (dumpData) {
      // save the actual data
      auto file = jobData.directory.writableFile(
          jobData.name + "_" + hexString + ".data.json", true);
      if (!::fileOk(file.get())) {
        return ::fileError(file.get(), true);
      }

      if (jobData.options.clusterMode) {
        result = ::handleCollectionCluster(client, jobData, *file);
      } else {
        result = ::handleCollection(client, jobData, *file);
      }
    }
  }

  return result;
}

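// Files produced per collection by processJob(), per the code above:
//   <name>_<md5(name)>.structure.json  (collection definition; in cluster
//                                       mode the hash suffix is omitted)
//   <name>_<md5(name)>.data.json       (documents; only when data dumping is
//                                       enabled and not masked out)
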
/// @brief handle the result of a single job
void handleJobResult(std::unique_ptr<arangodb::DumpFeature::JobData>&& jobData,
                     arangodb::Result const& result) {
  if (result.fail()) {
    jobData->feature.reportError(result);
  }
}

}  // namespace

namespace arangodb {

DumpFeature::JobData::JobData(ManagedDirectory& dir, DumpFeature& feat,
                              Options const& opts, maskings::Maskings* maskings,
                              Stats& stat, VPackSlice const& info,
                              uint64_t const batch, std::string const& c,
                              std::string const& n, std::string const& t)
    : directory{dir},
      feature{feat},
      options{opts},
      maskings{maskings},
      stats{stat},
      collectionInfo{info},
      batchId{batch},
      cid{c},
      name{n},
      type{t} {}

DumpFeature::DumpFeature(application_features::ApplicationServer& server,
                         int& exitCode)
    : ApplicationFeature(server, DumpFeature::featureName()),
      _clientManager{Logger::DUMP},
      _clientTaskQueue{::processJob, ::handleJobResult},
      _exitCode{exitCode} {
  requiresElevatedPrivileges(false);
  setOptional(false);
  startsAfter("BasicsPhase");

  using arangodb::basics::FileUtils::buildFilename;
  using arangodb::basics::FileUtils::currentDirectory;
  _options.outputPath = buildFilename(currentDirectory().result(), "dump");
}

std::string DumpFeature::featureName() { return ::FeatureName; }

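// Note: _clientTaskQueue is wired to the free functions above, so every
// JobData queued by runDump()/runClusterDump() is executed by ::processJob on
// a worker thread, and failures are funneled back through ::handleJobResult
// into reportError().
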
void DumpFeature::collectOptions(
    std::shared_ptr<options::ProgramOptions> options) {
  using arangodb::options::BooleanParameter;
  using arangodb::options::StringParameter;
  using arangodb::options::UInt32Parameter;
  using arangodb::options::UInt64Parameter;
  using arangodb::options::VectorParameter;

  options->addOption(
      "--collection",
      "restrict to collection name (can be specified multiple times)",
      new VectorParameter<StringParameter>(&_options.collections));

  options->addOption("--initial-batch-size",
                     "initial size for individual data batches (in bytes)",
                     new UInt64Parameter(&_options.initialChunkSize));

  options->addOption("--batch-size",
                     "maximum size for individual data batches (in bytes)",
                     new UInt64Parameter(&_options.maxChunkSize));

  options->addOption("--threads",
                     "maximum number of collections to process in parallel",
                     new UInt32Parameter(&_options.threadCount));

  options->addOption("--dump-data", "dump collection data",
                     new BooleanParameter(&_options.dumpData));

  options->addOption(
      "--force", "continue dumping even in the face of some server-side errors",
      new BooleanParameter(&_options.force));

  options->addOption(
      "--ignore-distribute-shards-like-errors",
      "continue dump even if sharding prototype collection is "
      "not backed up along",
      new BooleanParameter(&_options.ignoreDistributeShardsLikeErrors));

  options->addOption("--include-system-collections",
                     "include system collections",
                     new BooleanParameter(&_options.includeSystemCollections));

  options->addOption("--output-directory", "output directory",
                     new StringParameter(&_options.outputPath));

  options->addOption("--overwrite", "overwrite data in output directory",
                     new BooleanParameter(&_options.overwrite));

  options->addOption("--progress", "show progress",
                     new BooleanParameter(&_options.progress));

  options->addOption("--tick-start", "only include data after this tick",
                     new UInt64Parameter(&_options.tickStart));

  options->addOption("--tick-end", "last tick to be included in data dump",
                     new UInt64Parameter(&_options.tickEnd));

  options->addOption("--maskings", "file with maskings definition",
                     new StringParameter(&_options.maskingsFile));
}

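// Hedged usage sketch for the options registered above (flag names taken from
// the addOption() calls; collection names and values are illustrative only):
//   arangodump --output-directory dump --overwrite true \
//     --collection users --collection orders --threads 4 --progress true
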
void DumpFeature::validateOptions(
    std::shared_ptr<options::ProgramOptions> options) {
  auto const& positionals = options->processingResult()._positionals;
  size_t n = positionals.size();

  if (1 == n) {
    _options.outputPath = positionals[0];
  } else if (1 < n) {
    LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
        << "expecting at most one directory, got " +
               arangodb::basics::StringUtils::join(positionals, ", ");
    FATAL_ERROR_EXIT();
  }

  // clamp chunk values to allowed ranges
  _options.initialChunkSize = boost::algorithm::clamp(
      _options.initialChunkSize, ::MinChunkSize, ::MaxChunkSize);
  _options.maxChunkSize = boost::algorithm::clamp(
      _options.maxChunkSize, _options.initialChunkSize, ::MaxChunkSize);

  // the tick range is invalid if an upper bound is set but lies before the
  // lower bound (tick-end == 0 means "no upper bound")
  if (_options.tickEnd != 0 && _options.tickStart > _options.tickEnd) {
    LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
        << "invalid values for --tick-start or --tick-end";
    FATAL_ERROR_EXIT();
  }

  // trim trailing slash from path because it may cause problems on Windows
  if (!_options.outputPath.empty() &&
      _options.outputPath.back() == TRI_DIR_SEPARATOR_CHAR) {
    TRI_ASSERT(_options.outputPath.size() > 0);
    _options.outputPath.pop_back();
  }

  uint32_t clamped = boost::algorithm::clamp(
      _options.threadCount, 1,
      4 * static_cast<uint32_t>(TRI_numberProcessors()));
  if (_options.threadCount != clamped) {
    LOG_TOPIC(WARN, Logger::FIXME) << "capping --threads value to " << clamped;
    _options.threadCount = clamped;
  }
}

// dump data from server
Result DumpFeature::runDump(httpclient::SimpleHttpClient& client,
                            std::string const& dbName) {
  Result result;
  uint64_t batchId;
  std::tie(result, batchId) = ::startBatch(client, "");
  if (result.fail()) {
    return result;
  }
  TRI_DEFER(::endBatch(client, "", batchId));

  // flush the WAL so we know we are getting everything
  flushWal(client);

  // fetch the collection inventory
  std::string const url =
      "/_api/replication/inventory?includeSystem=" +
      std::string(_options.includeSystemCollections ? "true" : "false") +
      "&batchId=" + basics::StringUtils::itoa(batchId);
  std::unique_ptr<httpclient::SimpleHttpResult> response(
      client.request(rest::RequestType::GET, url, nullptr, 0));
  auto check = ::checkHttpResponse(client, response);
  if (check.fail()) {
    LOG_TOPIC(ERR, arangodb::Logger::DUMP)
        << "An error occurred while fetching inventory: "
        << check.errorMessage();
    return check;
  }

  // extract the vpack body from the inventory response
  std::shared_ptr<VPackBuilder> parsedBody;
  try {
    parsedBody = response->getBodyVelocyPack();
  } catch (...) {
    return ::ErrorMalformedJsonResponse;
  }
  VPackSlice const body = parsedBody->slice();
  if (!body.isObject()) {
    return ::ErrorMalformedJsonResponse;
  }

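  // Shape of the inventory consumed below, inferred from the accesses in this
  // file (other fields omitted; this is an assumption, not a full schema):
  //   { "tick": "<lastTick>",
  //     "collections": [ { "parameters": { "id", "name", "deleted", "type",
  //                                        ... } }, ... ],
  //     "views": [ ... ] }
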
  // get the collections list
  VPackSlice const collections = body.get("collections");
  if (!collections.isArray()) {
    return ::ErrorMalformedJsonResponse;
  }

  // get the view list
  VPackSlice views = body.get("views");
  if (!views.isArray()) {
    views = VPackSlice::emptyArraySlice();
  }

  // Step 1. Store the dump.json meta file
  Result res = storeDumpJson(body, dbName);
  if (res.fail()) {
    return res;
  }

  // Step 2. Store view definition files
  res = storeViews(views);
  if (res.fail()) {
    return res;
  }

  // create a lookup table for collections
  std::map<std::string, bool> restrictList;
  for (size_t i = 0; i < _options.collections.size(); ++i) {
    restrictList.insert(
        std::pair<std::string, bool>(_options.collections[i], true));
  }

  // Step 3. iterate over collections, queue dump jobs
  for (VPackSlice const& collection : VPackArrayIterator(collections)) {
    // extract parameters about the individual collection
    if (!collection.isObject()) {
      return ::ErrorMalformedJsonResponse;
    }
    VPackSlice const parameters = collection.get("parameters");
    if (!parameters.isObject()) {
      return ::ErrorMalformedJsonResponse;
    }

    // extract basic info about the collection
    uint64_t const cid = basics::VelocyPackHelper::extractIdValue(parameters);
    std::string const name = arangodb::basics::VelocyPackHelper::getStringValue(
        parameters, StaticStrings::DataSourceName, "");
    bool const deleted = arangodb::basics::VelocyPackHelper::getBooleanValue(
        parameters, StaticStrings::DataSourceDeleted.c_str(), false);
    int type = arangodb::basics::VelocyPackHelper::getNumericValue<int>(
        parameters, StaticStrings::DataSourceType.c_str(), 2);
    std::string const collectionType(type == 2 ? "document" : "edge");

    // basic filtering
    if (cid == 0 || name == "") {
      return ::ErrorMalformedJsonResponse;
    }

    if (deleted) {
      continue;
    }

    if (name[0] == '_' && !_options.includeSystemCollections) {
      continue;
    }

    // filter by specified names
    if (!restrictList.empty() &&
        restrictList.find(name) == restrictList.end()) {
      // collection name not in list
      continue;
    }

    // queue job to actually dump collection
    auto jobData = std::make_unique<JobData>(
        *_directory, *this, _options, _maskings.get(), _stats, collection,
        batchId, std::to_string(cid), name, collectionType);
    _clientTaskQueue.queueJob(std::move(jobData));
  }

  // wait for all jobs to finish, then check for errors
  _clientTaskQueue.waitForIdle();
  {
    MUTEX_LOCKER(lock, _workerErrorLock);
    if (!_workerErrors.empty()) {
      return _workerErrors.front();
    }
  }

  return {TRI_ERROR_NO_ERROR};
}

// dump data from cluster via a coordinator
Result DumpFeature::runClusterDump(httpclient::SimpleHttpClient& client,
                                   std::string const& dbname) {
  // get the cluster inventory
  std::string const url =
      "/_api/replication/clusterInventory?includeSystem=" +
      std::string(_options.includeSystemCollections ? "true" : "false");
  std::unique_ptr<httpclient::SimpleHttpResult> response(
      client.request(rest::RequestType::GET, url, nullptr, 0));
  auto check = ::checkHttpResponse(client, response);
  if (check.fail()) {
    LOG_TOPIC(ERR, arangodb::Logger::DUMP)
        << "An error occurred while fetching inventory: "
        << check.errorMessage();
    return check;
  }

  // parse the inventory vpack body
  std::shared_ptr<VPackBuilder> parsedBody;
  try {
    parsedBody = response->getBodyVelocyPack();
  } catch (...) {
    return ::ErrorMalformedJsonResponse;
  }
  VPackSlice const body = parsedBody->slice();
  if (!body.isObject()) {
    return ::ErrorMalformedJsonResponse;
  }

  // parse collections array
  VPackSlice const collections = body.get("collections");
  if (!collections.isArray()) {
    return ::ErrorMalformedJsonResponse;
  }

  // get the view list
  VPackSlice views = body.get("views");
  if (!views.isArray()) {
    views = VPackSlice::emptyArraySlice();
  }

  // Step 1. Store the dump.json meta file
  Result res = storeDumpJson(body, dbname);
  if (res.fail()) {
    return res;
  }

  // Step 2. Store view definition files
  res = storeViews(views);
  if (res.fail()) {
    return res;
  }

  // create a lookup table for collections
  std::map<std::string, bool> restrictList;
  for (size_t i = 0; i < _options.collections.size(); ++i) {
    restrictList.insert(
        std::pair<std::string, bool>(_options.collections[i], true));
  }

  // Step 3. iterate over collections
  for (auto const& collection : VPackArrayIterator(collections)) {
    // extract parameters about the individual collection
    if (!collection.isObject()) {
      return ::ErrorMalformedJsonResponse;
    }
    VPackSlice const parameters = collection.get("parameters");

    if (!parameters.isObject()) {
      return ::ErrorMalformedJsonResponse;
    }

    // extract basic info about the collection
    uint64_t const cid = basics::VelocyPackHelper::extractIdValue(parameters);
    std::string const name =
        basics::VelocyPackHelper::getStringValue(parameters, "name", "");
    bool const deleted =
        basics::VelocyPackHelper::getBooleanValue(parameters, "deleted", false);

    // simple filtering
    if (cid == 0 || name == "") {
      return ::ErrorMalformedJsonResponse;
    }
    if (deleted) {
      continue;
    }
    if (name[0] == '_' && !_options.includeSystemCollections) {
      continue;
    }

    // filter by specified names
    if (!restrictList.empty() &&
        restrictList.find(name) == restrictList.end()) {
      // collection name not in list
      continue;
    }

    if (isIgnoredHiddenEnterpriseCollection(_options, name)) {
      continue;
    }

    // verify distributeShardsLike info
    if (!_options.ignoreDistributeShardsLikeErrors) {
      std::string prototypeCollection =
          basics::VelocyPackHelper::getStringValue(parameters,
                                                   "distributeShardsLike", "");

      if (!prototypeCollection.empty() && !restrictList.empty()) {
        if (std::find(_options.collections.begin(), _options.collections.end(),
                      prototypeCollection) == _options.collections.end()) {
          return {
              TRI_ERROR_INTERNAL,
              std::string("Collection ") + name +
                  "'s shard distribution is based on that of collection " +
                  prototypeCollection +
                  ", which is not dumped along. You may dump the collection "
                  "regardless of the missing prototype collection by using "
                  "the --ignore-distribute-shards-like-errors parameter."};
        }
      }
    }

    // queue job to actually dump collection
    auto jobData = std::make_unique<JobData>(
        *_directory, *this, _options, _maskings.get(), _stats, collection,
        0 /* batchId */, std::to_string(cid), name, "" /* collectionType */);
    _clientTaskQueue.queueJob(std::move(jobData));
  }

  // wait for all jobs to finish, then check for errors
  _clientTaskQueue.waitForIdle();
  {
    MUTEX_LOCKER(lock, _workerErrorLock);
    if (!_workerErrors.empty()) {
      return _workerErrors.front();
    }
  }

  return {TRI_ERROR_NO_ERROR};
}

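// Cluster dumps differ from the single-server path above in two visible ways:
// there is no WAL flush and no tick range (tick options are rejected in
// cluster mode in start() below), and replication batches are created per
// DBserver inside handleCollectionCluster() rather than once up front.
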
Result DumpFeature::storeDumpJson(VPackSlice const& body,
                                  std::string const& dbName) const {
  // read the server's max tick value
  std::string const tickString =
      basics::VelocyPackHelper::getStringValue(body, "tick", "");
  if (tickString == "") {
    return ::ErrorMalformedJsonResponse;
  }
  LOG_TOPIC(INFO, Logger::DUMP)
      << "Last tick provided by server is: " << tickString;

  try {
    VPackBuilder meta;
    meta.openObject();
    meta.add("database", VPackValue(dbName));
    meta.add("lastTickAtDumpStart", VPackValue(tickString));
    meta.close();

    // save last tick in file
    auto file = _directory->writableFile("dump.json", true);
    if (!::fileOk(file.get())) {
      return ::fileError(file.get(), true);
    }

    std::string const metaString = meta.slice().toJson();
    file->write(metaString.c_str(), metaString.size());
    if (file->status().fail()) {
      return file->status();
    }
  } catch (basics::Exception const& ex) {
    return {ex.code(), ex.what()};
  } catch (std::exception const& ex) {
    return {TRI_ERROR_INTERNAL, ex.what()};
  } catch (...) {
    return {TRI_ERROR_OUT_OF_MEMORY, "out of memory"};
  }
  return {};
}

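// Example of the dump.json written above (values illustrative only):
//   {"database":"_system","lastTickAtDumpStart":"123456789"}
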
Result DumpFeature::storeViews(VPackSlice const& views) const {
  for (VPackSlice view : VPackArrayIterator(views)) {
    auto nameSlice = view.get(StaticStrings::DataSourceName);
    if (!nameSlice.isString() || nameSlice.getStringLength() == 0) {
      continue;  // ignore
    }

    try {
      std::string fname = nameSlice.copyString();
      fname.append(".view.json");
      // save the view definition in a file
      auto file = _directory->writableFile(fname, true);
      if (!::fileOk(file.get())) {
        return ::fileError(file.get(), true);
      }

      std::string const viewString = view.toJson();
      file->write(viewString.c_str(), viewString.size());
      if (file->status().fail()) {
        return file->status();
      }
    } catch (basics::Exception const& ex) {
      return {ex.code(), ex.what()};
    } catch (std::exception const& ex) {
      return {TRI_ERROR_INTERNAL, ex.what()};
    } catch (...) {
      return {TRI_ERROR_OUT_OF_MEMORY, "out of memory"};
    }
  }
  return {};
}

void DumpFeature::reportError(Result const& error) {
  try {
    MUTEX_LOCKER(lock, _workerErrorLock);
    _workerErrors.emplace(error);
    _clientTaskQueue.clearQueue();
  } catch (...) {
  }
}

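// Worker error handling: reportError() records the first failure and clears
// the remaining queue; runDump()/runClusterDump() surface
// _workerErrors.front() to the caller once waitForIdle() returns.
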
void DumpFeature::start() {
  if (!_options.maskingsFile.empty()) {
    maskings::MaskingsResult m = maskings::Maskings::fromFile(_options.maskingsFile);

    if (m.status != maskings::MaskingsResult::VALID) {
      LOG_TOPIC(FATAL, Logger::CONFIG) << m.message;
      FATAL_ERROR_EXIT();
    }

    _maskings = std::move(m.maskings);
  }

  _exitCode = EXIT_SUCCESS;

  // generate a fake client id that we send to the server
  ::clientId = RandomGenerator::interval(static_cast<uint64_t>(0x0000FFFFFFFFFFFFULL));

  double const start = TRI_microtime();

  // set up the output directory, not much else
  _directory =
      std::make_unique<ManagedDirectory>(_options.outputPath, !_options.overwrite, true);
  if (_directory->status().fail()) {
    switch (_directory->status().errorNumber()) {
      case TRI_ERROR_FILE_EXISTS:
        LOG_TOPIC(FATAL, Logger::FIXME) << "cannot write to output directory '"
                                        << _options.outputPath << "'";
        break;
      case TRI_ERROR_CANNOT_OVERWRITE_FILE:
        LOG_TOPIC(FATAL, Logger::FIXME)
            << "output directory '" << _options.outputPath
            << "' already exists. use \"--overwrite true\" to "
               "overwrite data in it";
        break;
      default:
        LOG_TOPIC(ERR, Logger::FIXME) << _directory->status().errorMessage();
        break;
    }
    FATAL_ERROR_EXIT();
  }

  // get database name to operate on
  auto client =
      application_features::ApplicationServer::getFeature<ClientFeature>(
          "Client");
  auto dbName = client->databaseName();

  // get a client to use in main thread
  auto httpClient = _clientManager.getConnectedClient(_options.force, true, true);

  // check if we are in cluster or single-server mode
  Result result{TRI_ERROR_NO_ERROR};
  std::tie(result, _options.clusterMode) =
      _clientManager.getArangoIsCluster(*httpClient);
  if (result.fail()) {
    LOG_TOPIC(FATAL, arangodb::Logger::DUMP)
        << "Error: could not detect ArangoDB instance type: "
        << result.errorMessage();
    FATAL_ERROR_EXIT();
  }

  // special cluster-mode parameter checks
  if (_options.clusterMode) {
    if (_options.tickStart != 0 || _options.tickEnd != 0) {
      LOG_TOPIC(ERR, Logger::FIXME)
          << "Error: cannot use tick-start or tick-end on a cluster";
      FATAL_ERROR_EXIT();
    }
  }

  // set up threads and workers
  _clientTaskQueue.spawnWorkers(_clientManager, _options.threadCount);

  if (_options.progress) {
    LOG_TOPIC(INFO, Logger::DUMP)
        << "Connected to ArangoDB '" << client->endpoint() << "', database: '"
        << dbName << "', username: '" << client->username() << "'";

    LOG_TOPIC(INFO, Logger::DUMP)
        << "Writing dump to output directory '" << _directory->path()
        << "' with " << _options.threadCount << " thread(s)";
  }

  Result res;
  try {
    if (!_options.clusterMode) {
      res = runDump(*httpClient, dbName);
    } else {
      res = runClusterDump(*httpClient, dbName);
    }
  } catch (basics::Exception const& ex) {
    LOG_TOPIC(ERR, Logger::FIXME) << "caught exception: " << ex.what();
    res = {ex.code(), ex.what()};
  } catch (std::exception const& ex) {
    LOG_TOPIC(ERR, Logger::FIXME) << "caught exception: " << ex.what();
    res = {TRI_ERROR_INTERNAL, ex.what()};
  } catch (...) {
    LOG_TOPIC(ERR, Logger::FIXME) << "caught unknown exception";
    res = {TRI_ERROR_INTERNAL};
  }

  if (res.fail()) {
    LOG_TOPIC(ERR, Logger::FIXME) << "An error occurred: " << res.errorMessage();
    _exitCode = EXIT_FAILURE;
  }

  if (_options.progress) {
    double totalTime = TRI_microtime() - start;

    if (_options.dumpData) {
      LOG_TOPIC(INFO, Logger::DUMP)
          << "Processed " << _stats.totalCollections.load()
          << " collection(s) in " << Logger::FIXED(totalTime, 6) << " s,"
          << " wrote " << _stats.totalWritten.load()
          << " byte(s) into datafiles, sent " << _stats.totalBatches.load()
          << " batch(es)";
    } else {
      LOG_TOPIC(INFO, Logger::DUMP)
          << "Processed " << _stats.totalCollections.load()
          << " collection(s) in " << Logger::FIXED(totalTime, 6) << " s";
    }
  }
}

}  // namespace arangodb