Remove attribute option for arangoimp (#3141)

* Adding a remove attribute option to arangoimp (only for csv) * Adding documentation * Fixed log statement * Update CHANGELOG
2017-08-30 10:45:02 +02:00 · 2017-08-30 10:45:02 +02:00 · 885ea3dd5f
parent 00650e6a3f
commit 885ea3dd5f
10 changed files with 96 additions and 16 deletions
--- a/2
+++ b/2
@ -1,6 +1,8 @@
 devel
 -----

+* arangoimp now has a `--remove-attribute` option (for CSV and TSV input only)
+
 * added V8 context lifetime control options
  `--javascript.v8-contexts-max-invocations` and `--javascript.v8-contexts-max-age`

--- a/Documentation/Books/Manual/Administration/Arangoimp.md
+++ b/Documentation/Books/Manual/Administration/Arangoimp.md
@ -272,6 +272,24 @@ Other common cases are to rename columns in the input file to *_from* and *_to*:
 The *translate* option can be specified multiple times. The source attribute name
 and the target attribute must be separated with a *=*.

+
+### Ignoring Attributes 
+
+
+For the CSV and TSV input formats, selected attributes can be removed from the input
+before the documents are imported. The option can be specified multiple times.
+This comes in handy in an ArangoDB cluster when your documents already contain a
+`_key` attribute but your collection has a sharding attribute other than `_key`:
+this configuration is not supported in the cluster, because ArangoDB needs to
+guarantee the uniqueness of the `_key` attribute in *all* shards of the collection.
+
+    > arangoimp --file "data.csv" --type csv --remove-attribute "_key"
+
+The same applies if your data contains an *_id* attribute:
+
+    > arangoimp --file "data.csv" --type csv --remove-attribute "_id"
+
+
 ### Importing into an Edge Collection

 arangoimp can also be used to import data into an existing edge collection.
--- a/arangosh/Import/ImportFeature.cpp
+++ b/arangosh/Import/ImportFeature.cpp
@ -118,6 +118,11 @@ void ImportFeature::collectOptions(
                     "translate an attribute name (use as --translate "
                     "\"from=to\", for csv and tsv only)",
                     new VectorParameter<StringParameter>(&_translations));
+  
+  options->addOption("--remove-attribute",
+                     "remove an attribute before inserting an attribute"
+                     " into a collection (for csv and tsv only)",
+                     new VectorParameter<StringParameter>(&_removeAttributes));

  std::unordered_set<std::string> types = {"document", "edge"};
  std::vector<std::string> typesVector(types.begin(), types.end());
@ -222,6 +227,14 @@ void ImportFeature::validateOptions(
      FATAL_ERROR_EXIT();
    }
  }
+  for (std::string& str : _removeAttributes) {
+    StringUtils::trimInPlace(str);
+    if (str.empty()) {
+      LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
+        << "cannot remove an empty attribute";
+      FATAL_ERROR_EXIT();
+    }
+  }
 }

 void ImportFeature::start() {
@ -347,6 +360,7 @@ void ImportFeature::start() {
  }

  ih.setTranslations(translations);
+  ih.setRemoveAttributes(_removeAttributes);

  // quote
  if (_quote.length() <= 1) {
--- a/arangosh/Import/ImportFeature.h
+++ b/arangosh/Import/ImportFeature.h
@ -57,6 +57,7 @@ class ImportFeature final : public application_features::ApplicationFeature,
  std::string _createCollectionType;
  std::string _typeImport;
  std::vector<std::string> _translations;
+  std::vector<std::string> _removeAttributes;
  bool _overwrite;
  std::string _quote;
  std::string _separator;
--- a/arangosh/Import/ImportHelper.cpp
+++ b/arangosh/Import/ImportHelper.cpp
@ -162,6 +162,8 @@ ImportHelper::ImportHelper(ClientFeature const* client,
      _collectionName(),
      _lineBuffer(TRI_UNKNOWN_MEM_ZONE),
      _outputBuffer(TRI_UNKNOWN_MEM_ZONE),
+      _firstLine(""),
+      _columnNames(),
      _hasError(false) {
  for (uint32_t i = 0; i < threadCount; i++) {
    auto http = client->createHttpClient(endpoint, params);
@ -508,29 +510,36 @@ void ImportHelper::ProcessCsvAdd(TRI_csv_parser_t* parser, char const* field,
                                 size_t fieldLength, size_t row, size_t column,
                                 bool escaped) {
  auto importHelper = static_cast<ImportHelper*>(parser->_dataAdd);
-
-  if (importHelper->getRowsRead() < importHelper->getRowsToSkip()) {
-    return;
-  }
-
  importHelper->addField(field, fieldLength, row, column, escaped);
 }

 void ImportHelper::addField(char const* field, size_t fieldLength, size_t row,
                            size_t column, bool escaped) {
+  if (_rowsRead < _rowsToSkip) {
+    return;
+  }
+  // we read the first line if we get here
+  if (row == _rowsToSkip) {
+    std::string name = std::string(field, fieldLength);
+    if (fieldLength > 0) { // translate field
+      auto it = _translations.find(name);
+      if (it != _translations.end()) {
+        field = (*it).second.c_str();
+        fieldLength = (*it).second.size();
+      }
+    }
+    _columnNames.push_back(std::move(name));
+  }
+  // skip removable attributes
+  if (!_removeAttributes.empty() &&
+      _removeAttributes.find(_columnNames[column]) != _removeAttributes.end()) {
+    return;
+  }
+  
  if (column > 0) {
    _lineBuffer.appendChar(',');
  }
-
-  if (row == _rowsToSkip && fieldLength > 0) {
-    // translate field
-    auto it = _translations.find(std::string(field, fieldLength));
-    if (it != _translations.end()) {
-      field = (*it).second.c_str();
-      fieldLength = (*it).second.size();
-    }
-  }
-
+  
  if (_keyColumn == -1 && row == _rowsToSkip && fieldLength == 4 &&
      memcmp(field, "_key", 4) == 0) {
    _keyColumn = column;
--- a/arangosh/Import/ImportHelper.h
+++ b/arangosh/Import/ImportHelper.h
@ -153,6 +153,12 @@ class ImportHelper {
      std::unordered_map<std::string, std::string> const& translations) {
    _translations = translations;
  }
+  
+  void setRemoveAttributes(std::vector<std::string> const& attr) {
+    for (std::string const& str : attr) {
+      _removeAttributes.insert(str);
+    }
+  }

  //////////////////////////////////////////////////////////////////////////////
  /// @brief whether or not to overwrite existing data in the collection
@ -289,8 +295,10 @@ class ImportHelper {
  arangodb::basics::StringBuffer _lineBuffer;
  arangodb::basics::StringBuffer _outputBuffer;
  std::string _firstLine;
+  std::vector<std::string> _columnNames;

  std::unordered_map<std::string, std::string> _translations;
+  std::unordered_set<std::string> _removeAttributes;

  bool _hasError;
  std::vector<std::string> _errorMessages;
--- a/js/client/modules/@arangodb/process-utils.js
+++ b/js/client/modules/@arangodb/process-utils.js
@ -523,6 +523,9 @@ function runArangoImp (options, instanceInfo, what) {
  if (what.convert !== undefined) {
    args['convert'] = what.convert ? 'true' : 'false';
  }
+  if (what.removeAttribute !== undefined) {
+    args['remove-attribute'] = what.removeAttribute;
+  }

  return executeAndWait(ARANGOIMP_BIN, toArgv(args), options, 'arangoimp', instanceInfo.rootDir);
 }
--- a/js/client/modules/@arangodb/testsuites/importing.js
+++ b/js/client/modules/@arangodb/testsuites/importing.js
@ -159,6 +159,13 @@ const impTodos = [{
  type: 'json',
  create: 'false',
  onDuplicate: 'replace'
+}, {
+  id: 'removeAttribute',
+  data: tu.makePathUnix('js/common/test-data/import/import-1.csv'),
+  coll: 'UnitTestsImportRemoveAttribute',
+  type: 'csv',
+  create: 'true',
+  removeAttribute: 'a'
 }];

 function importing (options) {
--- a/js/server/tests/import/import-setup.js
+++ b/js/server/tests/import/import-setup.js
@ -50,6 +50,7 @@
  db._drop("UnitTestsImportEdge");
  db._drop("UnitTestsImportIgnore");
  db._drop("UnitTestsImportUniqueConstraints");
+  db._drop("UnitTestsImportRemoveAttribute");

  db._create("UnitTestsImportJson1");
  db._create("UnitTestsImportJson2");
--- a/js/server/tests/import/import.js
+++ b/js/server/tests/import/import.js
@ -405,7 +405,24 @@ function importTestSuite () {

      var actual = getQueryResults("FOR i IN UnitTestsImportUniqueConstraints SORT i._key RETURN i", true);
      assertEqual(expected, actual);
-    }
+    },
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief test csv import removing attribute
+////////////////////////////////////////////////////////////////////////////////
+        
+    testCsvImportRemoveAttribute : function () {
+      var expected = [ 
+        { "b": 1, "c": "1.3", "e": -5, "id": 1 }, 
+        { "b": "", "c": 3.1, "d": -2.5, "e": "ddd \" ' ffd", "id": 2 }, 
+        { "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 },
+        { "b": 20.5, "c": -42, "d": " null ", "e": false, "id": 6 },
+        { "b": 1.05e-2, "c": true, "d": false, "id": 7 }
+      ];
+
+      var actual = getQueryResults("FOR i IN UnitTestsImportRemoveAttribute SORT i.id RETURN i");
+      assertEqual(expected, actual);
+    },

  };
 }