1
0
Fork 0
arangodb/arangosh/Import/ImportFeature.cpp

348 lines
12 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
////////////////////////////////////////////////////////////////////////////////
#include "ImportFeature.h"
#include "ApplicationFeatures/ClientFeature.h"
#include "Basics/StringUtils.h"
#include "Basics/FileUtils.h"
#include "Import/ImportHelper.h"
#include "Logger/Logger.h"
#include "ProgramOptions/ProgramOptions.h"
#include "SimpleHttpClient/GeneralClientConnection.h"
#include "SimpleHttpClient/SimpleHttpClient.h"
using namespace arangodb;
using namespace arangodb::basics;
using namespace arangodb::httpclient;
using namespace arangodb::options;
/// @brief sets up the "Import" feature with its built-in option defaults
///
/// @param server  application server passed on to the ApplicationFeature base
/// @param result  location that start() writes the process exit code to
ImportFeature::ImportFeature(application_features::ApplicationServer* server,
                             int* result)
    : ApplicationFeature(server, "Import"),
      // input source
      _filename(""),
      _useBackslash(false),
      // 16 MiB per batch unless overridden via --batch-size
      _chunkSize(16 * 1024 * 1024),
      // target collection
      _collectionName(""),
      _fromCollectionPrefix(""),
      _toCollectionPrefix(""),
      _createCollection(false),
      _createCollectionType("document"),
      // input format
      _typeImport("json"),
      _overwrite(false),
      _quote("\""),
      _separator(","),
      _progress(true),
      _onDuplicateAction("error"),
      _result(result) {
  // the import tool is mandatory and needs no special privileges
  requiresElevatedPrivileges(false);
  setOptional(false);

  // features that have to be started before this one
  startsAfter("Client");
  startsAfter("Config");
  startsAfter("Logger");
}
/// @brief registers all command-line options understood by the import tool
///
/// Every option is bound to the member variable that stores its value. The
/// options restricted to a fixed set of values (--create-collection-type,
/// --type, --on-duplicate) are registered as DiscreteValuesParameter and list
/// their allowed values in the help text.
///
/// @param options  program options registry the options are added to
void ImportFeature::collectOptions(
    std::shared_ptr<options::ProgramOptions> options) {
  LOG_TOPIC(TRACE, Logger::STARTUP) << name() << "::collectOptions";

  options->addOption("--file", "file name (\"-\" for STDIN)",
                     new StringParameter(&_filename));

  options->addOption(
      "--backslash-escape",
      "use backslash as the escape character for quotes, used for csv",
      new BooleanParameter(&_useBackslash));

  options->addOption("--batch-size",
                     "size for individual data batches (in bytes)",
                     new UInt64Parameter(&_chunkSize));

  options->addOption("--collection", "collection name",
                     new StringParameter(&_collectionName));

  options->addOption("--from-collection-prefix",
                     "_from collection name prefix (will be prepended to all "
                     "values in '_from')",
                     new StringParameter(&_fromCollectionPrefix));

  options->addOption("--to-collection-prefix",
                     "_to collection name prefix (will be prepended to all "
                     "values in '_to')",
                     new StringParameter(&_toCollectionPrefix));

  options->addOption("--create-collection",
                     "create collection if it does not yet exist",
                     new BooleanParameter(&_createCollection));

  // allowed collection types, also rendered into the help text
  std::unordered_set<std::string> types = {"document", "edge"};
  std::vector<std::string> typesVector(types.begin(), types.end());
  std::string typesJoined = StringUtils::join(typesVector, " or ");

  options->addOption(
      "--create-collection-type",
      "type of collection if collection is created (" + typesJoined + ")",
      new DiscreteValuesParameter<StringParameter>(&_createCollectionType,
                                                   types));

  // allowed input file formats
  std::unordered_set<std::string> imports = {"csv", "tsv", "json"};
  std::vector<std::string> importsVector(imports.begin(), imports.end());
  std::string importsJoined = StringUtils::join(importsVector, ", ");

  options->addOption(
      "--type", "type of file (" + importsJoined + ")",
      new DiscreteValuesParameter<StringParameter>(&_typeImport, imports));

  options->addOption(
      "--overwrite",
      "overwrite collection if it exist (WARNING: this will remove any data "
      "from the collection)",
      new BooleanParameter(&_overwrite, false));

  options->addOption("--quote", "quote character(s), used for csv",
                     new StringParameter(&_quote));

  options->addOption("--separator", "field separator, used for csv",
                     new StringParameter(&_separator));

  options->addOption("--progress", "show progress",
                     new BooleanParameter(&_progress));

  // allowed reactions to a unique key constraint violation
  std::unordered_set<std::string> actions = {"error", "update", "replace",
                                             "ignore"};
  std::vector<std::string> actionsVector(actions.begin(), actions.end());
  std::string actionsJoined = StringUtils::join(actionsVector, ", ");

  // must be bound to _onDuplicateAction: binding it to _typeImport would
  // overwrite the file type and leave the duplicate action at its default
  options->addOption(
      "--on-duplicate",
      "action to perform when a unique key constraint "
      "violation occurs. Possible values: " +
          actionsJoined,
      new DiscreteValuesParameter<StringParameter>(&_onDuplicateAction,
                                                   actions));
}
/// @brief checks the positional arguments given on the command line
///
/// At most one positional argument is accepted and it is interpreted as the
/// input file name. More than one positional argument is a fatal error.
///
/// @param options  parsed program options, including the positional arguments
void ImportFeature::validateOptions(
    std::shared_ptr<options::ProgramOptions> options) {
  LOG_TOPIC(TRACE, Logger::STARTUP) << name() << "::validateOptions";

  auto const& positionals = options->processingResult()._positionals;
  size_t const count = positionals.size();

  // more than one file name cannot be handled
  if (count > 1) {
    LOG(FATAL) << "expecting at most one filename, got " +
                      StringUtils::join(positionals, ", ");
    FATAL_ERROR_EXIT();
  }

  // an explicit --file option takes precedence over the positional file name
  if (count == 1 && !options->processingResult().touched("--file")) {
    _filename = positionals[0];
  }
}
/// @brief connects to the server, validates the settings and runs the import
///
/// The exit code is written to the location passed to the constructor:
/// EXIT_SUCCESS if the import helper reported success, EXIT_FAILURE if it
/// reported an error or an exception was caught during the import. Invalid
/// settings and connection problems terminate the process through
/// FATAL_ERROR_EXIT().
void ImportFeature::start() {
  LOG_TOPIC(TRACE, Logger::STARTUP) << name() << "::start";

  ClientFeature* client =
      dynamic_cast<ClientFeature*>(server()->feature("Client"));

  // report success unless the import itself fails further below
  int ret = EXIT_SUCCESS;
  *_result = ret;

  std::unique_ptr<SimpleHttpClient> httpClient;

  try {
    httpClient = client->createHttpClient();
  } catch (...) {
    LOG(FATAL) << "cannot create server connection, giving up!";
    FATAL_ERROR_EXIT();
  }

  httpClient->setLocationRewriter(static_cast<void*>(client),
                                  &rewriteLocation);
  httpClient->setUserNamePassword("/", client->username(), client->password());

  // must stay here in order to establish the connection; the result is kept
  // so that the banner below does not trigger a second request
  std::string const versionString = httpClient->getServerVersion();

  if (!httpClient->isConnected()) {
    LOG(ERR) << "Could not connect to endpoint '" << client->endpoint()
             << "', database: '" << client->databaseName() << "', username: '"
             << client->username() << "'";
    LOG(FATAL) << httpClient->getErrorMessage() << "'";
    FATAL_ERROR_EXIT();
  }

  // successfully connected
  std::cout << "Connected to ArangoDB '"
            << httpClient->getEndpointSpecification() << "', version "
            << versionString << ", database: '" << client->databaseName()
            << "', username: '" << client->username() << "'" << std::endl;

  // print a summary of the effective settings
  std::cout << "----------------------------------------" << std::endl;
  std::cout << "database: " << client->databaseName() << std::endl;
  std::cout << "collection: " << _collectionName << std::endl;
  if (!_fromCollectionPrefix.empty()) {
    std::cout << "from collection prefix: " << _fromCollectionPrefix
              << std::endl;
  }
  if (!_toCollectionPrefix.empty()) {
    std::cout << "to collection prefix: " << _toCollectionPrefix << std::endl;
  }
  std::cout << "create: " << (_createCollection ? "yes" : "no") << std::endl;
  std::cout << "source filename: " << _filename << std::endl;
  std::cout << "file type: " << _typeImport << std::endl;

  if (_typeImport == "csv") {
    std::cout << "quote: " << _quote << std::endl;
    std::cout << "separator: " << _separator << std::endl;
  }

  std::cout << "connect timeout: " << client->connectionTimeout() << std::endl;
  std::cout << "request timeout: " << client->requestTimeout() << std::endl;
  std::cout << "----------------------------------------" << std::endl;

  arangodb::import::ImportHelper ih(httpClient.get(), _chunkSize);

  // create collection
  if (_createCollection) {
    ih.setCreateCollection(true);
  }

  if (_createCollectionType == "document" || _createCollectionType == "edge") {
    ih.setCreateCollectionType(_createCollectionType);
  }

  ih.setOverwrite(_overwrite);
  ih.useBackslash(_useBackslash);

  // quote: may be empty or a single character
  if (_quote.length() <= 1) {
    ih.setQuote(_quote);
  } else {
    LOG(FATAL) << "Wrong length of quote character.";
    FATAL_ERROR_EXIT();
  }

  // separator: exactly one character
  if (_separator.length() == 1) {
    ih.setSeparator(_separator);
  } else {
    LOG(FATAL) << "_separator must be exactly one character.";
    FATAL_ERROR_EXIT();
  }

  // collection name
  if (_collectionName == "") {
    LOG(FATAL) << "Collection name is missing.";
    FATAL_ERROR_EXIT();
  }

  // filename
  if (_filename == "") {
    LOG(FATAL) << "File name is missing.";
    FATAL_ERROR_EXIT();
  }

  // "-" means STDIN, everything else must be an existing regular file
  if (_filename != "-" && !FileUtils::isRegularFile(_filename)) {
    if (!FileUtils::exists(_filename)) {
      LOG(FATAL) << "Cannot open file '" << _filename << "'. File not found.";
    } else if (FileUtils::isDirectory(_filename)) {
      LOG(FATAL) << "Specified file '" << _filename
                 << "' is a directory. Please use a regular file.";
    } else {
      LOG(FATAL) << "Cannot open '" << _filename << "'. Invalid file type.";
    }

    FATAL_ERROR_EXIT();
  }

  // progress
  if (_progress) {
    ih.setProgress(true);
  }

  if (_onDuplicateAction != "error" && _onDuplicateAction != "update" &&
      _onDuplicateAction != "replace" && _onDuplicateAction != "ignore") {
    LOG(FATAL)
        << "Invalid value for '--on-duplicate'. Possible values: 'error', "
           "'update', 'replace', 'ignore'.";
    FATAL_ERROR_EXIT();
  }

  ih.setOnDuplicateAction(_onDuplicateAction);

  try {
    bool ok = false;

    // set prefixes
    ih.setFrom(_fromCollectionPrefix);
    ih.setTo(_toCollectionPrefix);

    // import type
    if (_typeImport == "csv") {
      std::cout << "Starting CSV import..." << std::endl;
      ok = ih.importDelimited(_collectionName, _filename,
                              arangodb::import::ImportHelper::CSV);
    } else if (_typeImport == "tsv") {
      std::cout << "Starting TSV import..." << std::endl;
      ih.setQuote("");
      // NOTE(review): this is the two characters backslash + 't', not a tab;
      // presumably the import helper picks the TSV separator itself - confirm
      ih.setSeparator("\\t");
      ok = ih.importDelimited(_collectionName, _filename,
                              arangodb::import::ImportHelper::TSV);
    } else if (_typeImport == "json") {
      std::cout << "Starting JSON import..." << std::endl;
      ok = ih.importJson(_collectionName, _filename);
    } else {
      LOG(FATAL) << "Wrong type '" << _typeImport << "'.";
      FATAL_ERROR_EXIT();
    }

    std::cout << std::endl;

    // give information about import
    if (ok) {
      std::cout << "created: " << ih.getNumberCreated() << std::endl;
      std::cout << "warnings/errors: " << ih.getNumberErrors() << std::endl;
      std::cout << "updated/replaced: " << ih.getNumberUpdated() << std::endl;
      std::cout << "ignored: " << ih.getNumberIgnored() << std::endl;

      if (_typeImport == "csv" || _typeImport == "tsv") {
        std::cout << "lines read: " << ih.getReadLines() << std::endl;
      }
    } else {
      LOG(ERR) << "error message: " << ih.getErrorMessage();
      // a failed import must be visible in the exit code
      ret = EXIT_FAILURE;
    }
  } catch (std::exception const& ex) {
    LOG(ERR) << "Caught exception " << ex.what() << " during import";
    ret = EXIT_FAILURE;
  } catch (...) {
    LOG(ERR) << "Got an unknown exception during import";
    ret = EXIT_FAILURE;
  }

  *_result = ret;
}