Remove attribute option for arangoimp (#3141)

* Adding a remove attribute option to arangoimp (only for csv) * Adding documentation * Fixed log statement * Update CHANGELOG
2017-08-30 10:45:02 +02:00 · 2017-08-30 10:45:02 +02:00 · 885ea3dd5f
parent 00650e6a3f
commit 885ea3dd5f
10 changed files with 96 additions and 16 deletions
--- a/2
+++ b/2
@ -1,6 +1,8 @@
 devel
 -----
 * arangoimp now has a --remove-attribute option
 * added V8 context lifetime control options
  `--javascript.v8-contexts-max-invocations` and `--javascript.v8-contexts-max-age`
--- a/Documentation/Books/Manual/Administration/Arangoimp.md
+++ b/Documentation/Books/Manual/Administration/Arangoimp.md
@ -272,6 +272,24 @@ Other common cases are to rename columns in the input file to *_from* and *_to*:
 The *translate* option can be specified multiple times. The source attribute name
 and the target attribute must be separated with a *=*.
 ### Ignoring Attributes 
 For the CSV and TSV input formats, certain attribute names can be ignored on imports.
 In an ArangoDB cluster this can come in handy when your documents already
 contain a `_key` attribute and your collection has a sharding attribute other
 than `_key`. In the cluster this configuration is not supported, because
 ArangoDB needs to guarantee the uniqueness of the `_key` attribute in *all*
 shards of the collection. Removing the `_key` attribute during the import
 works around this:
    > arangoimp --file "data.csv" --type csv --remove-attribute "_key"
 The same thing would apply if your data contains an *_id* attribute:
    > arangoimp --file "data.csv" --type csv --remove-attribute "_id"
 ### Importing into an Edge Collection
 arangoimp can also be used to import data into an existing edge collection.
--- a/arangosh/Import/ImportFeature.cpp
+++ b/arangosh/Import/ImportFeature.cpp
@ -118,6 +118,11 @@ void ImportFeature::collectOptions(
                     "translate an attribute name (use as --translate "
                     "\"from=to\", for csv and tsv only)",
                     new VectorParameter<StringParameter>(&_translations));
  options->addOption("--remove-attribute",
                     "remove an attribute before inserting an attribute"
                     " into a collection (for csv and tsv only)",
                     new VectorParameter<StringParameter>(&_removeAttributes));
  std::unordered_set<std::string> types = {"document", "edge"};
  std::vector<std::string> typesVector(types.begin(), types.end());
@ -222,6 +227,14 @@ void ImportFeature::validateOptions(
      FATAL_ERROR_EXIT();
    }
  }
  for (std::string& str : _removeAttributes) {
    StringUtils::trimInPlace(str);
    if (str.empty()) {
      LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
        << "cannot remove an empty attribute";
      FATAL_ERROR_EXIT();
    }
  }
 }
 void ImportFeature::start() {
@ -347,6 +360,7 @@ void ImportFeature::start() {
  }
  ih.setTranslations(translations);
  ih.setRemoveAttributes(_removeAttributes);
  // quote
  if (_quote.length() <= 1) {
--- a/arangosh/Import/ImportFeature.h
+++ b/arangosh/Import/ImportFeature.h
@ -57,6 +57,7 @@ class ImportFeature final : public application_features::ApplicationFeature,
  std::string _createCollectionType;
  std::string _typeImport;
  std::vector<std::string> _translations;
  std::vector<std::string> _removeAttributes;
  bool _overwrite;
  std::string _quote;
  std::string _separator;
--- a/arangosh/Import/ImportHelper.cpp
+++ b/arangosh/Import/ImportHelper.cpp
@ -162,6 +162,8 @@ ImportHelper::ImportHelper(ClientFeature const* client,
      _collectionName(),
      _lineBuffer(TRI_UNKNOWN_MEM_ZONE),
      _outputBuffer(TRI_UNKNOWN_MEM_ZONE),
      _firstLine(""),
      _columnNames(),
      _hasError(false) {
  for (uint32_t i = 0; i < threadCount; i++) {
    auto http = client->createHttpClient(endpoint, params);
@ -508,29 +510,36 @@ void ImportHelper::ProcessCsvAdd(TRI_csv_parser_t* parser, char const* field,
                                 size_t fieldLength, size_t row, size_t column,
                                 bool escaped) {
  auto importHelper = static_cast<ImportHelper*>(parser->_dataAdd);
  if (importHelper->getRowsRead() < importHelper->getRowsToSkip()) {
    return;
  }
  importHelper->addField(field, fieldLength, row, column, escaped);
 }
 void ImportHelper::addField(char const* field, size_t fieldLength, size_t row,
                            size_t column, bool escaped) {
  if (_rowsRead < _rowsToSkip) {
    return;
  }
  // we read the first line if we get here
  if (row == _rowsToSkip) {
    std::string name = std::string(field, fieldLength);
    if (fieldLength > 0) { // translate field
      auto it = _translations.find(name);
      if (it != _translations.end()) {
        field = (*it).second.c_str();
        fieldLength = (*it).second.size();
      }
    }
    _columnNames.push_back(std::move(name));
  }
  // skip removable attributes
  if (!_removeAttributes.empty() &&
      _removeAttributes.find(_columnNames[column]) != _removeAttributes.end()) {
    return;
  }
  if (column > 0) {
    _lineBuffer.appendChar(',');
  }
-
+  
  if (row == _rowsToSkip && fieldLength > 0) {
    // translate field
    auto it = _translations.find(std::string(field, fieldLength));
    if (it != _translations.end()) {
      field = (*it).second.c_str();
      fieldLength = (*it).second.size();
    }
  }
  if (_keyColumn == -1 && row == _rowsToSkip && fieldLength == 4 &&
      memcmp(field, "_key", 4) == 0) {
    _keyColumn = column;
--- a/arangosh/Import/ImportHelper.h
+++ b/arangosh/Import/ImportHelper.h
@ -153,6 +153,12 @@ class ImportHelper {
      std::unordered_map<std::string, std::string> const& translations) {
    _translations = translations;
  }
  //////////////////////////////////////////////////////////////////////////////
  /// @brief adds every name in attr to the set of attributes to remove;
  /// names added by earlier calls are kept and duplicates collapse
  //////////////////////////////////////////////////////////////////////////////
  void setRemoveAttributes(std::vector<std::string> const& attr) {
    _removeAttributes.insert(attr.begin(), attr.end());
  }
  //////////////////////////////////////////////////////////////////////////////
  /// @brief whether or not to overwrite existing data in the collection
@ -289,8 +295,10 @@ class ImportHelper {
  arangodb::basics::StringBuffer _lineBuffer;
  arangodb::basics::StringBuffer _outputBuffer;
  std::string _firstLine;
  std::vector<std::string> _columnNames;
  std::unordered_map<std::string, std::string> _translations;
  std::unordered_set<std::string> _removeAttributes;
  bool _hasError;
  std::vector<std::string> _errorMessages;
--- a/js/client/modules/@arangodb/process-utils.js
+++ b/js/client/modules/@arangodb/process-utils.js
@ -523,6 +523,9 @@ function runArangoImp (options, instanceInfo, what) {
  if (what.convert !== undefined) {
    args['convert'] = what.convert ? 'true' : 'false';
  }
  if (what.removeAttribute !== undefined) {
    args['remove-attribute'] = what.removeAttribute;
  }
  return executeAndWait(ARANGOIMP_BIN, toArgv(args), options, 'arangoimp', instanceInfo.rootDir);
 }
--- a/js/client/modules/@arangodb/testsuites/importing.js
+++ b/js/client/modules/@arangodb/testsuites/importing.js
@ -159,6 +159,13 @@ const impTodos = [{
  type: 'json',
  create: 'false',
  onDuplicate: 'replace'
 }, {
  id: 'removeAttribute',
  data: tu.makePathUnix('js/common/test-data/import/import-1.csv'),
  coll: 'UnitTestsImportRemoveAttribute',
  type: 'csv',
  create: 'true',
  removeAttribute: 'a'
 }];
 function importing (options) {
--- a/js/server/tests/import/import-setup.js
+++ b/js/server/tests/import/import-setup.js
@ -50,6 +50,7 @@
  db._drop("UnitTestsImportEdge");
  db._drop("UnitTestsImportIgnore");
  db._drop("UnitTestsImportUniqueConstraints");
  db._drop("UnitTestsImportRemoveAttribute");
  db._create("UnitTestsImportJson1");
  db._create("UnitTestsImportJson2");
--- a/js/server/tests/import/import.js
+++ b/js/server/tests/import/import.js
@ -405,7 +405,24 @@ function importTestSuite () {
      var actual = getQueryResults("FOR i IN UnitTestsImportUniqueConstraints SORT i._key RETURN i", true);
      assertEqual(expected, actual);
-    }
+    },
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief test csv import removing attribute
 ////////////////////////////////////////////////////////////////////////////////
    testCsvImportRemoveAttribute : function () {
      var expected = [ 
        { "b": 1, "c": "1.3", "e": -5, "id": 1 }, 
        { "b": "", "c": 3.1, "d": -2.5, "e": "ddd \" ' ffd", "id": 2 }, 
        { "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 },
        { "b": 20.5, "c": -42, "d": " null ", "e": false, "id": 6 },
        { "b": 1.05e-2, "c": true, "d": false, "id": 7 }
      ];
      var actual = getQueryResults("FOR i IN UnitTestsImportRemoveAttribute SORT i.id RETURN i");
      assertEqual(expected, actual);
    },
  };
 }