diff --git a/CHANGELOG b/CHANGELOG index 9349ba40d9..2010f59d64 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,8 @@ devel ----- +* arangoimp now has a `--remove-attribute` option + * added V8 context lifetime control options `--javascript.v8-contexts-max-invocations` and `--javascript.v8-contexts-max-age` diff --git a/Documentation/Books/Manual/Administration/Arangoimp.md b/Documentation/Books/Manual/Administration/Arangoimp.md index 3047c51e65..29b92c3e56 100644 --- a/Documentation/Books/Manual/Administration/Arangoimp.md +++ b/Documentation/Books/Manual/Administration/Arangoimp.md @@ -272,6 +272,24 @@ Other common cases are to rename columns in the input file to *_from* and *_to*: The *translate* option can be specified multiple types. The source attribute name and the target attribute must be separated with a *=*. + +### Ignoring Attributes + + +For the CSV and TSV input formats, attributes can be excluded from the import by name. +This can come in handy in an ArangoDB cluster +when your documents already contain a `_key` attribute +and your collection has a sharding attribute other than `_key`: in the cluster, importing +documents with user-defined `_key` values is not supported, because ArangoDB needs to guarantee the uniqueness of the `_key` +attribute in *all* shards of the collection. + + > arangoimp --file "data.csv" --type csv --remove-attribute "_key" + +The same applies if your data contains an *_id* attribute: + + > arangoimp --file "data.csv" --type csv --remove-attribute "_id" + + ### Importing into an Edge Collection arangoimp can also be used to import data into an existing edge collection.
diff --git a/arangosh/Import/ImportFeature.cpp b/arangosh/Import/ImportFeature.cpp index 5b4eb506c5..a1f751d023 100644 --- a/arangosh/Import/ImportFeature.cpp +++ b/arangosh/Import/ImportFeature.cpp @@ -118,6 +118,11 @@ void ImportFeature::collectOptions( "translate an attribute name (use as --translate " "\"from=to\", for csv and tsv only)", new VectorParameter<StringParameter>(&_translations)); + + options->addOption("--remove-attribute", + "remove an attribute before inserting documents" + " into a collection (for csv and tsv only)", + new VectorParameter<StringParameter>(&_removeAttributes)); std::unordered_set<std::string> types = {"document", "edge"}; std::vector<std::string> typesVector(types.begin(), types.end()); @@ -222,6 +227,14 @@ void ImportFeature::validateOptions( FATAL_ERROR_EXIT(); } } + for (std::string& str : _removeAttributes) { + StringUtils::trimInPlace(str); + if (str.empty()) { + LOG_TOPIC(FATAL, arangodb::Logger::FIXME) + << "cannot remove an empty attribute"; + FATAL_ERROR_EXIT(); + } + } } void ImportFeature::start() { @@ -347,6 +360,7 @@ void ImportFeature::start() { } ih.setTranslations(translations); + ih.setRemoveAttributes(_removeAttributes); // quote if (_quote.length() <= 1) { diff --git a/arangosh/Import/ImportFeature.h b/arangosh/Import/ImportFeature.h index 5d600e3b97..e9b1633acc 100644 --- a/arangosh/Import/ImportFeature.h +++ b/arangosh/Import/ImportFeature.h @@ -57,6 +57,7 @@ class ImportFeature final : public application_features::ApplicationFeature, std::string _createCollectionType; std::string _typeImport; std::vector<std::string> _translations; + std::vector<std::string> _removeAttributes; bool _overwrite; std::string _quote; std::string _separator; diff --git a/arangosh/Import/ImportHelper.cpp b/arangosh/Import/ImportHelper.cpp index 7f6275f7c5..c30a9b1d31 100644 --- a/arangosh/Import/ImportHelper.cpp +++ b/arangosh/Import/ImportHelper.cpp @@ -162,6 +162,8 @@ ImportHelper::ImportHelper(ClientFeature const* client, _collectionName(), _lineBuffer(TRI_UNKNOWN_MEM_ZONE),
_outputBuffer(TRI_UNKNOWN_MEM_ZONE), + _firstLine(""), + _columnNames(), _hasError(false) { for (uint32_t i = 0; i < threadCount; i++) { auto http = client->createHttpClient(endpoint, params); @@ -508,29 +510,36 @@ void ImportHelper::ProcessCsvAdd(TRI_csv_parser_t* parser, char const* field, size_t fieldLength, size_t row, size_t column, bool escaped) { auto importHelper = static_cast<ImportHelper*>(parser->_dataAdd); - - if (importHelper->getRowsRead() < importHelper->getRowsToSkip()) { - return; - } - importHelper->addField(field, fieldLength, row, column, escaped); } void ImportHelper::addField(char const* field, size_t fieldLength, size_t row, size_t column, bool escaped) { + if (_rowsRead < _rowsToSkip) { + return; + } + // we read the first line if we get here + if (row == _rowsToSkip) { + std::string name = std::string(field, fieldLength); + if (fieldLength > 0) { // translate field + auto it = _translations.find(name); + if (it != _translations.end()) { + field = (*it).second.c_str(); + fieldLength = (*it).second.size(); + } + } + _columnNames.push_back(std::move(name)); + } + // skip removable attributes; fields beyond the header's column count have no name and are kept + if (!_removeAttributes.empty() && column < _columnNames.size() && + _removeAttributes.find(_columnNames[column]) != _removeAttributes.end()) { + return; + } + if (column > 0) { _lineBuffer.appendChar(','); } - - if (row == _rowsToSkip && fieldLength > 0) { - // translate field - auto it = _translations.find(std::string(field, fieldLength)); - if (it != _translations.end()) { - field = (*it).second.c_str(); - fieldLength = (*it).second.size(); - } - } - + if (_keyColumn == -1 && row == _rowsToSkip && fieldLength == 4 && memcmp(field, "_key", 4) == 0) { _keyColumn = column; diff --git a/arangosh/Import/ImportHelper.h b/arangosh/Import/ImportHelper.h index 84e9dd77d8..87f67bd5ce 100644 --- a/arangosh/Import/ImportHelper.h +++ b/arangosh/Import/ImportHelper.h @@ -153,6 +153,12 @@ class ImportHelper { std::unordered_map<std::string, std::string> const& translations) { _translations = translations; } + + void
setRemoveAttributes(std::vector<std::string> const& attr) { + for (std::string const& str : attr) { + _removeAttributes.insert(str); + } + } ////////////////////////////////////////////////////////////////////////////// /// @brief whether or not to overwrite existing data in the collection @@ -289,8 +295,10 @@ class ImportHelper { arangodb::basics::StringBuffer _lineBuffer; arangodb::basics::StringBuffer _outputBuffer; std::string _firstLine; + std::vector<std::string> _columnNames; std::unordered_map<std::string, std::string> _translations; + std::unordered_set<std::string> _removeAttributes; bool _hasError; std::vector<std::string> _errorMessages; diff --git a/js/client/modules/@arangodb/process-utils.js b/js/client/modules/@arangodb/process-utils.js index df15eb0b11..28174842b5 100644 --- a/js/client/modules/@arangodb/process-utils.js +++ b/js/client/modules/@arangodb/process-utils.js @@ -523,6 +523,9 @@ function runArangoImp (options, instanceInfo, what) { if (what.convert !== undefined) { args['convert'] = what.convert ? 'true' : 'false'; } + if (what.removeAttribute !== undefined) { + args['remove-attribute'] = what.removeAttribute; + } return executeAndWait(ARANGOIMP_BIN, toArgv(args), options, 'arangoimp', instanceInfo.rootDir); } diff --git a/js/client/modules/@arangodb/testsuites/importing.js b/js/client/modules/@arangodb/testsuites/importing.js index a71f80fe6b..02ce041dfd 100644 --- a/js/client/modules/@arangodb/testsuites/importing.js +++ b/js/client/modules/@arangodb/testsuites/importing.js @@ -159,6 +159,13 @@ const impTodos = [{ type: 'json', create: 'false', onDuplicate: 'replace' +}, { + id: 'removeAttribute', + data: tu.makePathUnix('js/common/test-data/import/import-1.csv'), + coll: 'UnitTestsImportRemoveAttribute', + type: 'csv', + create: 'true', + removeAttribute: 'a' }]; function importing (options) { diff --git a/js/server/tests/import/import-setup.js b/js/server/tests/import/import-setup.js index 306bb66142..eafcd7e152 100644 --- a/js/server/tests/import/import-setup.js +++ b/js/server/tests/import/import-setup.js
@@ -50,6 +50,7 @@ db._drop("UnitTestsImportEdge"); db._drop("UnitTestsImportIgnore"); db._drop("UnitTestsImportUniqueConstraints"); + db._drop("UnitTestsImportRemoveAttribute"); db._create("UnitTestsImportJson1"); db._create("UnitTestsImportJson2"); diff --git a/js/server/tests/import/import.js b/js/server/tests/import/import.js index b80002fed8..ce459da3e4 100644 --- a/js/server/tests/import/import.js +++ b/js/server/tests/import/import.js @@ -405,7 +405,24 @@ function importTestSuite () { var actual = getQueryResults("FOR i IN UnitTestsImportUniqueConstraints SORT i._key RETURN i", true); assertEqual(expected, actual); - } + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test csv import with --remove-attribute: the removed attribute "a" must not appear in any imported document +//////////////////////////////////////////////////////////////////////////////// + + testCsvImportRemoveAttribute : function () { + var expected = [ + { "b": 1, "c": "1.3", "e": -5, "id": 1 }, + { "b": "", "c": 3.1, "d": -2.5, "e": "ddd \" ' ffd", "id": 2 }, + { "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 }, + { "b": 20.5, "c": -42, "d": " null ", "e": false, "id": 6 }, + { "b": 1.05e-2, "c": true, "d": false, "id": 7 } + ]; + + var actual = getQueryResults("FOR i IN UnitTestsImportRemoveAttribute SORT i.id RETURN i"); + assertEqual(expected, actual); + }, }; }