1
0
Fork 0

Remove attribute option for arangoimp (#3141)

* Adding a remove attribute option to arangoimp (only for csv)

* Adding documentation

* Fixed log statement

* Update CHANGELOG
This commit is contained in:
Simon Grätzer 2017-08-30 10:45:02 +02:00 committed by Frank Celler
parent 00650e6a3f
commit 885ea3dd5f
10 changed files with 96 additions and 16 deletions

View File

@ -1,6 +1,8 @@
devel
-----
* arangoimp now has a `--remove-attribute` option to drop attributes from CSV and TSV input before importing
* added V8 context lifetime control options
`--javascript.v8-contexts-max-invocations` and `--javascript.v8-contexts-max-age`

View File

@ -272,6 +272,24 @@ Other common cases are to rename columns in the input file to *_from* and *_to*:
The *translate* option can be specified multiple times. The source attribute name
and the target attribute name must be separated with a *=*.
### Ignoring Attributes
For the CSV and TSV input formats, certain attribute names can be ignored on imports.
This can come in handy in an ArangoDB cluster when your documents already contain
a `_key` attribute and your collection has a sharding attribute other than `_key`.
In the cluster this configuration is not supported, because ArangoDB needs to
guarantee the uniqueness of the `_key` attribute in *all* shards of the collection.
> arangoimp --file "data.csv" --type csv --remove-attribute "_key"
The same thing would apply if your data contains an *_id* attribute:
> arangoimp --file "data.csv" --type csv --remove-attribute "_id"
### Importing into an Edge Collection
arangoimp can also be used to import data into an existing edge collection.

View File

@ -118,6 +118,11 @@ void ImportFeature::collectOptions(
"translate an attribute name (use as --translate "
"\"from=to\", for csv and tsv only)",
new VectorParameter<StringParameter>(&_translations));
options->addOption("--remove-attribute",
"remove an attribute before inserting an attribute"
" into a collection (for csv and tsv only)",
new VectorParameter<StringParameter>(&_removeAttributes));
std::unordered_set<std::string> types = {"document", "edge"};
std::vector<std::string> typesVector(types.begin(), types.end());
@ -222,6 +227,14 @@ void ImportFeature::validateOptions(
FATAL_ERROR_EXIT();
}
}
for (std::string& str : _removeAttributes) {
StringUtils::trimInPlace(str);
if (str.empty()) {
LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
<< "cannot remove an empty attribute";
FATAL_ERROR_EXIT();
}
}
}
void ImportFeature::start() {
@ -347,6 +360,7 @@ void ImportFeature::start() {
}
ih.setTranslations(translations);
ih.setRemoveAttributes(_removeAttributes);
// quote
if (_quote.length() <= 1) {

View File

@ -57,6 +57,7 @@ class ImportFeature final : public application_features::ApplicationFeature,
std::string _createCollectionType;
std::string _typeImport;
std::vector<std::string> _translations;
std::vector<std::string> _removeAttributes;
bool _overwrite;
std::string _quote;
std::string _separator;

View File

@ -162,6 +162,8 @@ ImportHelper::ImportHelper(ClientFeature const* client,
_collectionName(),
_lineBuffer(TRI_UNKNOWN_MEM_ZONE),
_outputBuffer(TRI_UNKNOWN_MEM_ZONE),
_firstLine(""),
_columnNames(),
_hasError(false) {
for (uint32_t i = 0; i < threadCount; i++) {
auto http = client->createHttpClient(endpoint, params);
@ -508,29 +510,36 @@ void ImportHelper::ProcessCsvAdd(TRI_csv_parser_t* parser, char const* field,
size_t fieldLength, size_t row, size_t column,
bool escaped) {
auto importHelper = static_cast<ImportHelper*>(parser->_dataAdd);
if (importHelper->getRowsRead() < importHelper->getRowsToSkip()) {
return;
}
importHelper->addField(field, fieldLength, row, column, escaped);
}
void ImportHelper::addField(char const* field, size_t fieldLength, size_t row,
size_t column, bool escaped) {
if (_rowsRead < _rowsToSkip) {
return;
}
// we read the first line if we get here
if (row == _rowsToSkip) {
std::string name = std::string(field, fieldLength);
if (fieldLength > 0) { // translate field
auto it = _translations.find(name);
if (it != _translations.end()) {
field = (*it).second.c_str();
fieldLength = (*it).second.size();
}
}
_columnNames.push_back(std::move(name));
}
// skip removable attributes
if (!_removeAttributes.empty() &&
_removeAttributes.find(_columnNames[column]) != _removeAttributes.end()) {
return;
}
if (column > 0) {
_lineBuffer.appendChar(',');
}
if (row == _rowsToSkip && fieldLength > 0) {
// translate field
auto it = _translations.find(std::string(field, fieldLength));
if (it != _translations.end()) {
field = (*it).second.c_str();
fieldLength = (*it).second.size();
}
}
if (_keyColumn == -1 && row == _rowsToSkip && fieldLength == 4 &&
memcmp(field, "_key", 4) == 0) {
_keyColumn = column;

View File

@ -153,6 +153,12 @@ class ImportHelper {
std::unordered_map<std::string, std::string> const& translations) {
_translations = translations;
}
/// @brief adds the given attribute names to the set of attributes that are
/// removed from the input; names already in the set are kept
void setRemoveAttributes(std::vector<std::string> const& attr) {
  // range insert: duplicates are ignored by the set, same as inserting one by one
  _removeAttributes.insert(attr.begin(), attr.end());
}
//////////////////////////////////////////////////////////////////////////////
/// @brief whether or not to overwrite existing data in the collection
@ -289,8 +295,10 @@ class ImportHelper {
arangodb::basics::StringBuffer _lineBuffer;
arangodb::basics::StringBuffer _outputBuffer;
std::string _firstLine;
std::vector<std::string> _columnNames;
std::unordered_map<std::string, std::string> _translations;
std::unordered_set<std::string> _removeAttributes;
bool _hasError;
std::vector<std::string> _errorMessages;

View File

@ -523,6 +523,9 @@ function runArangoImp (options, instanceInfo, what) {
if (what.convert !== undefined) {
args['convert'] = what.convert ? 'true' : 'false';
}
if (what.removeAttribute !== undefined) {
args['remove-attribute'] = what.removeAttribute;
}
return executeAndWait(ARANGOIMP_BIN, toArgv(args), options, 'arangoimp', instanceInfo.rootDir);
}

View File

@ -159,6 +159,13 @@ const impTodos = [{
type: 'json',
create: 'false',
onDuplicate: 'replace'
}, {
id: 'removeAttribute',
data: tu.makePathUnix('js/common/test-data/import/import-1.csv'),
coll: 'UnitTestsImportRemoveAttribute',
type: 'csv',
create: 'true',
removeAttribute: 'a'
}];
function importing (options) {

View File

@ -50,6 +50,7 @@
db._drop("UnitTestsImportEdge");
db._drop("UnitTestsImportIgnore");
db._drop("UnitTestsImportUniqueConstraints");
db._drop("UnitTestsImportRemoveAttribute");
db._create("UnitTestsImportJson1");
db._create("UnitTestsImportJson2");

View File

@ -405,7 +405,24 @@ function importTestSuite () {
var actual = getQueryResults("FOR i IN UnitTestsImportUniqueConstraints SORT i._key RETURN i", true);
assertEqual(expected, actual);
}
},
////////////////////////////////////////////////////////////////////////////////
/// @brief test csv import removing attribute
////////////////////////////////////////////////////////////////////////////////
testCsvImportRemoveAttribute : function () {
var expected = [
{ "b": 1, "c": "1.3", "e": -5, "id": 1 },
{ "b": "", "c": 3.1, "d": -2.5, "e": "ddd \" ' ffd", "id": 2 },
{ "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 },
{ "b": 20.5, "c": -42, "d": " null ", "e": false, "id": 6 },
{ "b": 1.05e-2, "c": true, "d": false, "id": 7 }
];
var actual = getQueryResults("FOR i IN UnitTestsImportRemoveAttribute SORT i.id RETURN i");
assertEqual(expected, actual);
},
};
}