From bf187f4b50958d07f16f54943f15c8d3c6fb6470 Mon Sep 17 00:00:00 2001 From: Jan Date: Wed, 9 Oct 2019 15:01:51 +0200 Subject: [PATCH] Bug fix 3.5/issue 10193 (#10194) * fixed issue #10193: Arangoexport does not handle line feeds when exporting as csvi * escape \r too --- CHANGELOG | 3 + arangosh/Export/ExportFeature.cpp | 137 ++++++++++-------- arangosh/Export/ExportFeature.h | 10 +- .../modules/@arangodb/testsuites/export.js | 31 +++- lib/Basics/files.cpp | 2 +- .../js/server/export/export-setup-cluster.js | 1 + tests/js/server/export/export-setup.js | 1 + 7 files changed, 115 insertions(+), 70 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f23d27ab81..617c76ef7c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,9 @@ v3.5.2 (XXXX-XX-XX) ------------------- +* Fixed issue #10193: Arangoexport does not handle line feeds when exporting as + csv. + * Removed debug log messages "found comm task ..." that could be logged on server shutdown. diff --git a/arangosh/Export/ExportFeature.cpp b/arangosh/Export/ExportFeature.cpp index 5b638d59b6..94b5c3d35d 100644 --- a/arangosh/Export/ExportFeature.cpp +++ b/arangosh/Export/ExportFeature.cpp @@ -34,8 +34,12 @@ #include "SimpleHttpClient/SimpleHttpClient.h" #include "SimpleHttpClient/SimpleHttpResult.h" -#include #include +#include +#include +#include +#include +#include #include #include #include @@ -63,8 +67,6 @@ ExportFeature::ExportFeature(application_features::ApplicationServer& server, in _graphName(), _xgmmlLabelAttribute("label"), _typeExport("json"), - _csvFieldOptions(), - _csvFields(), _xgmmlLabelOnly(false), _outputDirectory(), _overwrite(false), @@ -111,7 +113,7 @@ void ExportFeature::collectOptions(std::shared_ptr opti options->addOption("--progress", "show progress", new BooleanParameter(&_progress)); options->addOption("--fields", - "comma separated list of fileds to export into a csv file", + "comma separated list of fields to export into a csv file", new StringParameter(&_csvFieldOptions)); std::unordered_set exports = {"csv", "json", "jsonl", "xgmml", @@ -178,7 +180,7 @@ void ExportFeature::validateOptions(std::shared_ptr opt FATAL_ERROR_EXIT(); } - boost::split(_csvFields, _csvFieldOptions, boost::is_any_of(",")); + _csvFields = StringUtils::split(_csvFieldOptions, ','); } } @@ -341,10 +343,10 @@ void ExportFeature::collectionExport(SimpleHttpClient* httpClient) { if (_typeExport == "json") { std::string closingBracket = "\n]"; - writeToFile(*fd, closingBracket, fileName); + writeToFile(*fd, closingBracket); } else if (_typeExport == "xml") { std::string xmlFooter = ""; - writeToFile(*fd, xmlFooter, fileName); + writeToFile(*fd, xmlFooter); } } } @@ -394,10 +396,10 @@ void ExportFeature::queryExport(SimpleHttpClient* httpClient) { if (_typeExport == "json") { std::string closingBracket = "\n]"; - writeToFile(*fd, closingBracket, fileName); + writeToFile(*fd, closingBracket); } else if (_typeExport == "xml") { std::string xmlFooter = ""; - writeToFile(*fd, xmlFooter, fileName); + writeToFile(*fd, xmlFooter); } } @@ -406,7 +408,7 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons _firstLine = true; if (_typeExport == "json") { std::string openingBracket = "["; - writeToFile(fd, openingBracket, fileName); + writeToFile(fd, openingBracket); } else if (_typeExport == "xml") { std::string xmlHeader = @@ -414,10 +416,10 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons "\n"); - writeToFile(fd, xmlHeader, fileName); + writeToFile(fd, xmlHeader); } else if (_typeExport == "csv") { - std::string firstLine = ""; + std::string firstLine; bool isFirstValue = true; for (auto const& str : _csvFields) { if (isFirstValue) { @@ -428,22 +430,28 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons } } firstLine += "\n"; - writeToFile(fd, firstLine, fileName); + writeToFile(fd, firstLine); } } - void ExportFeature::writeBatch(ManagedDirectory::File & fd, VPackArrayIterator it, std::string const& fileName) { +void ExportFeature::writeBatch(ManagedDirectory::File & fd, VPackArrayIterator it, std::string const& fileName) { std::string line; line.reserve(1024); if (_typeExport == "jsonl") { + VPackStringSink sink(&line); + VPackDumper dumper(&sink); + for (auto const& doc : it) { line.clear(); - line += doc.toJson(); + dumper.dump(doc); line.push_back('\n'); - writeToFile(fd, line, fileName); + writeToFile(fd, line); } } else if (_typeExport == "json") { + VPackStringSink sink(&line); + VPackDumper dumper(&sink); + for (auto const& doc : it) { line.clear(); if (!_firstLine) { @@ -452,8 +460,8 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons line.append("\n ", 3); _firstLine = false; } - line += doc.toJson(); - writeToFile(fd, line, fileName); + dumper.dump(doc); + writeToFile(fd, line); } } else if (_typeExport == "csv") { for (auto const& doc : it) { @@ -461,39 +469,50 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons bool isFirstValue = true; for (auto const& key : _csvFields) { - std::string value = ""; - if (isFirstValue) { isFirstValue = false; } else { - line.append(","); + line.push_back(','); } - if (doc.hasKey(key)) { - VPackSlice val = doc.get(key); - + VPackSlice val = doc.get(key); + if (!val.isNone()) { + std::string value; + bool escape = false; if (val.isArray() || val.isObject()) { value = val.toJson(); + escape = true; } else { if (val.isString()) { value = val.copyString(); + escape = true; } else { value = val.toString(); } } - value = std::regex_replace(value, std::regex("\""), "\"\""); + if (escape) { + value = std::regex_replace(value, std::regex("\""), "\"\""); - if (value.find(",") != std::string::npos || - value.find("\"\"") != std::string::npos) { - value = "\"" + value; - value.append("\""); + if (value.find(',') != std::string::npos || + value.find('\"') != std::string::npos || + value.find('\r') != std::string::npos || + value.find('\n') != std::string::npos) { + // escape value and put it in quotes + line.push_back('\"'); + line.append(value); + line.push_back('\"'); + + continue; + } } + + // write unescaped + line.append(value); } - line.append(value); } - line.append("\n"); - writeToFile(fd, line, fileName); + line.push_back('\n'); + writeToFile(fd, line); } } else if (_typeExport == "xml") { for (auto const& doc : it) { @@ -501,18 +520,18 @@ void ExportFeature::writeFirstLine(ManagedDirectory::File & fd, std::string cons line.append("\n"); - writeToFile(fd, line, fileName); + writeToFile(fd, line); for (auto const& att : VPackObjectIterator(doc)) { - xgmmlWriteOneAtt(fd, fileName, att.value, att.key.copyString(), 2); + xgmmlWriteOneAtt(fd, att.value, att.key.copyString(), 2); } line.clear(); line.append("\n"); - writeToFile(fd, line, fileName); + writeToFile(fd, line); } } } -void ExportFeature::writeToFile(ManagedDirectory::File & fd, std::string const& line, std::string const& fileName) { +void ExportFeature::writeToFile(ManagedDirectory::File & fd, std::string const& line) { fd.write(line.c_str(), line.size()); } @@ -613,14 +632,14 @@ void ExportFeature::graphExport(SimpleHttpClient* httpClient) { std::string xmlHeader = R"( )"; - writeToFile(*fd, xmlHeader, fileName); + writeToFile(*fd, xmlHeader); for (auto const& collection : _collections) { if (_progress) { @@ -656,7 +675,7 @@ directed="1"> } } std::string closingGraphTag = "\n"; - writeToFile(*fd, closingGraphTag, fileName); + writeToFile(*fd, closingGraphTag); if (_skippedDeepNested) { std::cout << "skipped " << _skippedDeepNested @@ -677,21 +696,21 @@ void ExportFeature::writeGraphBatch(ManagedDirectory::File & fd, VPackArrayItera "\" source=\"" + encode_char_entities(doc.get("_from").copyString()) + "\" target=\"" + encode_char_entities(doc.get("_to").copyString()) + "\""; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); if (!_xgmmlLabelOnly) { xmlTag = ">\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); for (auto const& it : VPackObjectIterator(doc)) { - xgmmlWriteOneAtt(fd, fileName, it.value, it.key.copyString()); + xgmmlWriteOneAtt(fd, it.value, it.key.copyString()); } xmlTag = "\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } else { xmlTag = " />\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } } else { @@ -702,27 +721,27 @@ void ExportFeature::writeGraphBatch(ManagedDirectory::File & fd, VPackArrayItera ? doc.get(_xgmmlLabelAttribute).copyString() : "Default-Label") + "\" id=\"" + encode_char_entities(doc.get("_id").copyString()) + "\""; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); if (!_xgmmlLabelOnly) { xmlTag = ">\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); for (auto const& it : VPackObjectIterator(doc)) { - xgmmlWriteOneAtt(fd, fileName, it.value, it.key.copyString()); + xgmmlWriteOneAtt(fd, it.value, it.key.copyString()); } xmlTag = "\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } else { xmlTag = " />\n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } } } } -void ExportFeature::xgmmlWriteOneAtt(ManagedDirectory::File & fd, std::string const& fileName, +void ExportFeature::xgmmlWriteOneAtt(ManagedDirectory::File & fd, VPackSlice const& slice, std::string const& name, int deep) { std::string value, type, xmlTag; @@ -761,38 +780,38 @@ void ExportFeature::xgmmlWriteOneAtt(ManagedDirectory::File & fd, std::string co xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); return; } if (!type.empty()) { xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } else if (slice.isArray()) { xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); for (VPackSlice val : VPackArrayIterator(slice)) { - xgmmlWriteOneAtt(fd, fileName, val, name, deep + 1); + xgmmlWriteOneAtt(fd, val, name, deep + 1); } xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } else if (slice.isObject()) { xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); for (auto const& it : VPackObjectIterator(slice)) { - xgmmlWriteOneAtt(fd, fileName, it.value, it.key.copyString(), deep + 1); + xgmmlWriteOneAtt(fd, it.value, it.key.copyString(), deep + 1); } xmlTag = " \n"; - writeToFile(fd, xmlTag, fileName); + writeToFile(fd, xmlTag); } } diff --git a/arangosh/Export/ExportFeature.h b/arangosh/Export/ExportFeature.h index ab2b8492d5..f2bfaff433 100644 --- a/arangosh/Export/ExportFeature.h +++ b/arangosh/Export/ExportFeature.h @@ -53,14 +53,14 @@ class ExportFeature final : public application_features::ApplicationFeature, private: void collectionExport(httpclient::SimpleHttpClient* httpClient); void queryExport(httpclient::SimpleHttpClient* httpClient); - void writeFirstLine(ManagedDirectory::File & fd, std::string const& fileName, std::string const& collection); - void writeBatch(ManagedDirectory::File & fd, VPackArrayIterator it, std::string const& fileName); + void writeFirstLine(ManagedDirectory::File& fd, std::string const& fileName, std::string const& collection); + void writeBatch(ManagedDirectory::File& fd, VPackArrayIterator it, std::string const& fileName); void graphExport(httpclient::SimpleHttpClient* httpClient); - void writeGraphBatch(ManagedDirectory::File &fd, VPackArrayIterator it, std::string const& fileName); - void xgmmlWriteOneAtt(ManagedDirectory::File & fd, std::string const& fileName, VPackSlice const& slice, + void writeGraphBatch(ManagedDirectory::File& fd, VPackArrayIterator it, std::string const& fileName); + void xgmmlWriteOneAtt(ManagedDirectory::File& fd, VPackSlice const& slice, std::string const& name, int deep = 0); - void writeToFile(ManagedDirectory::File & fd, std::string const& string, std::string const& fileName); + void writeToFile(ManagedDirectory::File& fd, std::string const& string); std::shared_ptr httpCall(httpclient::SimpleHttpClient* httpClient, std::string const& url, arangodb::rest::RequestType, std::string postBody = ""); diff --git a/js/client/modules/@arangodb/testsuites/export.js b/js/client/modules/@arangodb/testsuites/export.js index 6033fdbf1c..ea8aafe6fb 100644 --- a/js/client/modules/@arangodb/testsuites/export.js +++ b/js/client/modules/@arangodb/testsuites/export.js @@ -28,8 +28,7 @@ const functionsDocumentation = { 'export': 'export formats tests' }; -const optionsDocumentation = [ -]; +const optionsDocumentation = []; const fs = require('fs'); const pu = require('@arangodb/process-utils'); @@ -100,7 +99,7 @@ function exportTest (options) { 'overwrite': true, 'output-directory': tmpPath }; - const results = {failed: 0}; + let results = {failed: 0}; function shutdown () { print(CYAN + 'Shutting down...' + RESET); @@ -333,7 +332,7 @@ function exportTest (options) { results.exportQueryGz = pu.executeAndWait(pu.ARANGOEXPORT_BIN, toArgv(args), options, 'arangosh', tmpPath, false, options.coreCheck); results.exportQueryGz.failed = results.exportQuery.status ? 0 : 1; try { - fs.readGzip(fs.join(tmpPath, 'query.jsonl')).split('\n') + fs.readGzip(fs.join(tmpPath, 'query.jsonl.gz')).split('\n') .filter(line => line.trim() !== '') .forEach(line => JSON.parse(line)); results.parseQueryResultGz = { @@ -341,7 +340,6 @@ function exportTest (options) { status: true }; } catch (e) { - print(e); results.failed += 1; results.parseQueryResultGz = { failed: 1, @@ -350,6 +348,29 @@ function exportTest (options) { }; } args['compress-output'] = 'false'; + + print(CYAN + Date() + ': Export data (csv)' + RESET); + args['type'] = 'csv'; + args['query'] = 'FOR doc IN UnitTestsExport RETURN doc'; + args['fields'] = '_key,value1,value2,value3,value4'; + results.exportCsv = pu.executeAndWait(pu.ARANGOEXPORT_BIN, toArgv(args), options, 'arangosh', tmpPath, false, options.coreCheck); + results.exportCsv.failed = results.exportJsonl.status ? 0 : 1; + try { + fs.read(fs.join(tmpPath, 'query.csv')); + + results.parseCsv = { + failed: 0, + status: true + }; + } catch (e) { + results.failed += 1; + results.parseCsv = { + failed: 1, + status: false, + message: e + }; + } + delete args['fields']; return shutdown(); } diff --git a/lib/Basics/files.cpp b/lib/Basics/files.cpp index 5f7b87bd25..fb0c96266a 100644 --- a/lib/Basics/files.cpp +++ b/lib/Basics/files.cpp @@ -1109,7 +1109,7 @@ char* TRI_SlurpGzipFile(char const* filename, size_t* length) { TRI_set_errno(TRI_ERROR_NO_ERROR); gzFile gzFd(gzopen(filename,"rb")); auto fdGuard = arangodb::scopeGuard([&gzFd](){ if (nullptr != gzFd) gzclose(gzFd); }); - char * retPtr = nullptr; + char* retPtr = nullptr; if (nullptr != gzFd) { TRI_string_buffer_t result; diff --git a/tests/js/server/export/export-setup-cluster.js b/tests/js/server/export/export-setup-cluster.js index d1e379fc07..42a8e7a958 100644 --- a/tests/js/server/export/export-setup-cluster.js +++ b/tests/js/server/export/export-setup-cluster.js @@ -44,6 +44,7 @@ for (let i = 0; i < 100; ++i) { col.save({ _key: "export" + i, value1: i, value2: "this is export", value3: "export" + i, value4: "%<>\"'" }); } + col.save({ _key: "special", value1: "abc \"def\" ghi", value2: [1, 2], value3: { foo: "bar" }, value4: "abc\r\ncd" }); } return { diff --git a/tests/js/server/export/export-setup.js b/tests/js/server/export/export-setup.js index d1e379fc07..42a8e7a958 100644 --- a/tests/js/server/export/export-setup.js +++ b/tests/js/server/export/export-setup.js @@ -44,6 +44,7 @@ for (let i = 0; i < 100; ++i) { col.save({ _key: "export" + i, value1: i, value2: "this is export", value3: "export" + i, value4: "%<>\"'" }); } + col.save({ _key: "special", value1: "abc \"def\" ghi", value2: [1, 2], value3: { foo: "bar" }, value4: "abc\r\ncd" }); } return {