1
0
Fork 0
arangodb/arangosh/Export/ExportFeature.cpp

337 lines
11 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Manuel Baesler
////////////////////////////////////////////////////////////////////////////////
#include "ExportFeature.h"
#include "ApplicationFeatures/ApplicationServer.h"
#include "Basics/FileUtils.h"
#include "Basics/StringUtils.h"
#include "Logger/Logger.h"
#include "ProgramOptions/ProgramOptions.h"
#include "Shell/ClientFeature.h"
#include "SimpleHttpClient/GeneralClientConnection.h"
#include "SimpleHttpClient/SimpleHttpClient.h"
#include "SimpleHttpClient/SimpleHttpResult.h"
using namespace arangodb;
using namespace arangodb::basics;
using namespace arangodb::httpclient;
using namespace arangodb::options;
ExportFeature::ExportFeature(application_features::ApplicationServer* server,
int* result)
: ApplicationFeature(server, "Export"),
_collections(),
_graphName(),
_typeExport("json"),
_outputDirectory(),
_overwrite(false),
_progress(true),
_firstLine(true),
_result(result) {
requiresElevatedPrivileges(false);
setOptional(false);
startsAfter("Client");
startsAfter("Config");
startsAfter("Logger");
_outputDirectory =
FileUtils::buildFilename(FileUtils::currentDirectory(), "export");
}
void ExportFeature::collectOptions(
std::shared_ptr<options::ProgramOptions> options) {
options->addOption(
"--collection",
"restrict to collection name (can be specified multiple times)",
new VectorParameter<StringParameter>(&_collections));
options->addOption("--graph-name", "name of a graph to export",
new StringParameter(&_graphName));
options->addOption("--output-directory", "output directory",
new StringParameter(&_outputDirectory));
options->addOption("--overwrite", "overwrite data in output directory",
new BooleanParameter(&_overwrite));
options->addOption("--progress", "show progress",
new BooleanParameter(&_progress));
std::unordered_set<std::string> exportsWithUpperCase = {"csv", "json", "jsonl", "xgmml",
"CSV", "JSON", "JSONL", "XGMML"};
std::unordered_set<std::string> exports = {"csv", "json", "jsonl", "xgmml"};
std::vector<std::string> exportsVector(exports.begin(), exports.end());
std::string exportsJoined = StringUtils::join(exportsVector, ", ");
options->addOption(
"--type", "type of export (" + exportsJoined + ")",
new DiscreteValuesParameter<StringParameter>(&_typeExport, exportsWithUpperCase));
}
void ExportFeature::validateOptions(
std::shared_ptr<options::ProgramOptions> options) {
auto const& positionals = options->processingResult()._positionals;
size_t n = positionals.size();
if (1 == n) {
_outputDirectory = positionals[0];
} else if (1 < n) {
LOG(FATAL) << "expecting at most one directory, got " +
StringUtils::join(positionals, ", ");
FATAL_ERROR_EXIT();
}
// trim trailing slash from path because it may cause problems on ...
// Windows
if (!_outputDirectory.empty() &&
_outputDirectory.back() == TRI_DIR_SEPARATOR_CHAR) {
TRI_ASSERT(_outputDirectory.size() > 0);
_outputDirectory.pop_back();
}
if (_graphName.empty() && _collections.empty()) {
LOG(FATAL) << "expecting at least one collection ore one graph name";
FATAL_ERROR_EXIT();
}
std::transform(_typeExport.begin(), _typeExport.end(), _typeExport.begin(), ::tolower);
}
void ExportFeature::prepare() {
bool isDirectory = false;
bool isEmptyDirectory = false;
if (!_outputDirectory.empty()) {
isDirectory = TRI_IsDirectory(_outputDirectory.c_str());
if (isDirectory) {
std::vector<std::string> files(TRI_FullTreeDirectory(_outputDirectory.c_str()));
// we don't care if the target directory is empty
isEmptyDirectory = (files.size() <= 1); // TODO: TRI_FullTreeDirectory always returns at least one element (""), even if directory is empty?
}
}
if (_outputDirectory.empty() ||
(TRI_ExistsFile(_outputDirectory.c_str()) && !isDirectory)) {
LOG(FATAL) << "cannot write to output directory '" << _outputDirectory
<< "'";
FATAL_ERROR_EXIT();
}
if (isDirectory && !isEmptyDirectory && !_overwrite) {
LOG(FATAL) << "output directory '" << _outputDirectory
<< "' already exists. use \"--overwrite true\" to "
"overwrite data in it";
FATAL_ERROR_EXIT();
}
if (!isDirectory) {
long systemError;
std::string errorMessage;
int res = TRI_CreateDirectory(_outputDirectory.c_str(), systemError,
errorMessage);
if (res != TRI_ERROR_NO_ERROR) {
LOG(ERR) << "unable to create output directory '" << _outputDirectory
<< "': " << errorMessage;
FATAL_ERROR_EXIT();
}
}
}
void ExportFeature::start() {
ClientFeature* client = application_features::ApplicationServer::getFeature<ClientFeature>("Client");
int ret = EXIT_SUCCESS;
*_result = ret;
std::unique_ptr<SimpleHttpClient> httpClient;
try {
httpClient = client->createHttpClient();
} catch (...) {
LOG(FATAL) << "cannot create server connection, giving up!";
FATAL_ERROR_EXIT();
}
httpClient->setLocationRewriter(static_cast<void*>(client), &rewriteLocation);
httpClient->setUserNamePassword("/", client->username(), client->password());
// must stay here in order to establish the connection
httpClient->getServerVersion();
if (!httpClient->isConnected()) {
LOG(ERR) << "Could not connect to endpoint '" << client->endpoint()
<< "', database: '" << client->databaseName() << "', username: '"
<< client->username() << "'";
LOG(FATAL) << httpClient->getErrorMessage() << "'";
FATAL_ERROR_EXIT();
}
// successfully connected
std::cout << "Connected to ArangoDB '"
<< httpClient->getEndpointSpecification() << "', version "
<< httpClient->getServerVersion() << ", database: '"
<< client->databaseName() << "', username: '" << client->username()
<< "'" << std::endl;
if (_typeExport == "xgmml") {
graphExport(httpClient.get());
}
if (_typeExport == "json" || _typeExport == "jsonl" || _typeExport == "csv") {
if (_collections.size()) {
collectionExport(httpClient.get());
} else if (_graphName.size()) {
graphExport(httpClient.get());
}
}
*_result = ret;
}
void ExportFeature::collectionExport(SimpleHttpClient* httpClient) {
std::cout << "collection export" << std::endl;
std::string errorMsg;
for (auto const& collection : _collections) {
if (_progress) {
std::cout << "# Exporting collection '" << collection << "'..." << std::endl;
}
std::string fileName =
_outputDirectory + TRI_DIR_SEPARATOR_STR + collection + "." + _typeExport;
// remove an existing file first
if (TRI_ExistsFile(fileName.c_str())) {
TRI_UnlinkFile(fileName.c_str());
}
int fd = TRI_CREATE(fileName.c_str(), O_CREAT | O_EXCL | O_RDWR | TRI_O_CLOEXEC,
S_IRUSR | S_IWUSR);
if (fd < 0) {
errorMsg = "cannot write to file '" + fileName + "'";
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CANNOT_WRITE_FILE, errorMsg);
}
TRI_DEFER(TRI_CLOSE(fd));
std::string const url = "/_api/export?collection="+StringUtils::urlEncode(collection);
_firstLine = true;
if (_typeExport == "json") {
std::string openingBracket = "[\n";
writeToFile(fd, openingBracket, fileName);
}
std::shared_ptr<VPackBuilder> parsedBody = httpCall(httpClient, url, rest::RequestType::POST);
VPackSlice body = parsedBody->slice();
writeCollectionBatch(fd, VPackArrayIterator(body.get("result")), fileName);
while (body.hasKey("id")) {
std::string const url = "/_api/export/"+body.get("id").copyString();
parsedBody = httpCall(httpClient, url, rest::RequestType::PUT);
body = parsedBody->slice();
writeCollectionBatch(fd, VPackArrayIterator(body.get("result")), fileName);
}
if (_typeExport == "json") {
std::string closingBracket = "]";
writeToFile(fd, closingBracket , fileName);
}
}
}
void ExportFeature::writeCollectionBatch(int fd, VPackArrayIterator it, std::string const& fileName) {
std::string line;
for (auto const& doc : it) {
line.clear();
if (_firstLine && _typeExport == "json") {
_firstLine = false;
} else if(!_firstLine && _typeExport == "json") {
line.push_back(',');
}
line += doc.toJson();
line.push_back('\n');
writeToFile(fd, line, fileName);
}
}
void ExportFeature::writeToFile(int fd, std::string& line, std::string const& fileName) {
if (!TRI_WritePointer(fd, line.c_str(), line.size())) {
std::string errorMsg = "cannot write to file '" + fileName + "'";
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CANNOT_WRITE_FILE, errorMsg);
}
}
std::shared_ptr<VPackBuilder> ExportFeature::httpCall(SimpleHttpClient* httpClient, std::string const& url, rest::RequestType requestType) {
std::string errorMsg;
std::unique_ptr<SimpleHttpResult> response(
httpClient->request(requestType, url, "", 0));
if (response == nullptr || !response->isComplete()) {
errorMsg =
"got invalid response from server: " + httpClient->getErrorMessage();
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg);
}
if (response->wasHttpError()) {
if (response->getHttpReturnCode() == 404) {
LOG(FATAL) << "Collection not found.";
} else {
errorMsg = "got invalid response from server: HTTP " +
StringUtils::itoa(response->getHttpReturnCode()) + ": " +
response->getHttpReturnMessage();
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg);
}
}
std::shared_ptr<VPackBuilder> parsedBody;
try {
parsedBody = response->getBodyVelocyPack();
} catch (...) {
errorMsg = "got malformed JSON response from server";
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg);
}
VPackSlice body = parsedBody->slice();
if (!body.isObject()) {
errorMsg = "got malformed JSON response from server";
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg);
}
return parsedBody;
}
void ExportFeature::graphExport(SimpleHttpClient* httpClient) {
if (_progress) {
std::cout << "graph export" << std::endl;
}
}