diff --git a/Documentation/Books/Users/Arangoimp/README.mdpp b/Documentation/Books/Users/Arangoimp/README.mdpp index f52faf2104..69105b0914 100644 --- a/Documentation/Books/Users/Arangoimp/README.mdpp +++ b/Documentation/Books/Users/Arangoimp/README.mdpp @@ -210,6 +210,10 @@ multine password!" "Bartholomew ""Bart"" Simpson","Milhouse" ``` +Extra whitespace at the end of each line will be ignored. Whitespace at the +start of lines or between field values will not be ignored, so please make sure +that there is no extra whitespace in front of values or between them. + !SUBSECTION Importing TSV Data diff --git a/UnitTests/Makefile.unittests b/UnitTests/Makefile.unittests index 765a05085c..b725a50c61 100755 --- a/UnitTests/Makefile.unittests +++ b/UnitTests/Makefile.unittests @@ -731,9 +731,9 @@ unittests-import: @echo $(VALGRIND) @builddir@/bin/arangosh $(CLIENT_OPT) --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --javascript.unit-tests @top_srcdir@/js/server/tests/import-setup.js || test "x$(FORCE)" == "x1" - for i in 1 2 3 4; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.json --collection UnitTestsImportJson$$i --type json || test "x$(FORCE)" == "x1"; done + for i in 1 2 3 4 5; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.json --collection UnitTestsImportJson$$i --type json || test "x$(FORCE)" == "x1"; done for i in 1 2 3; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.csv --collection UnitTestsImportCsv$$i --create-collection true --type csv || test "x$(FORCE)" == "x1"; done - for i in 4; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.csv --collection UnitTestsImportCsv$$i --create-collection true --type csv --backslash-escape true --separator ";" || test "x$(FORCE)" == "x1"; done + for i in 4 5; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.csv --collection UnitTestsImportCsv$$i --create-collection true --type csv --backslash-escape true --separator ";" || test "x$(FORCE)" == "x1"; done for i in 1 2; do $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-$$i.tsv --collection UnitTestsImportTsv$$i --create-collection true --type tsv || test "x$(FORCE)" == "x1"; done $(VALGRIND) @builddir@/bin/arangoimp --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --file UnitTests/import-edges.json --collection UnitTestsImportEdge --create-collection false --type json || test "x$(FORCE)" == "x1" $(VALGRIND) @builddir@/bin/arangosh $(CLIENT_OPT) --server.username "$(USERNAME)" --server.password "$(PASSWORD)" --server.endpoint unix://$(VOCDIR)/arango.sock --javascript.unit-tests @top_srcdir@/js/server/tests/import.js || test "x$(FORCE)" == "x1" diff --git a/UnitTests/import-5.csv b/UnitTests/import-5.csv new file mode 100644 index 0000000000..f243a1e398 --- /dev/null +++ b/UnitTests/import-5.csv @@ -0,0 +1,6 @@ +"name";"password" +"foo ";"wow +this is a multiline password!" +"bar";"foo" + + diff --git a/UnitTests/import-5.json b/UnitTests/import-5.json new file mode 100644 index 0000000000..f7cefed751 --- /dev/null +++ b/UnitTests/import-5.json @@ -0,0 +1,5 @@ + { "id": 0, "value": "something", "active": true } + +{ "id": 1, "value": "foobar" } + + { "id" : 2, "data": 99.5, "true": "a\t\r\nb c " } diff --git a/arangod/RestHandler/RestImportHandler.cpp b/arangod/RestHandler/RestImportHandler.cpp index 3b47052a89..b50306490f 100644 --- a/arangod/RestHandler/RestImportHandler.cpp +++ b/arangod/RestHandler/RestImportHandler.cpp @@ -181,7 +181,7 @@ int RestImportHandler::handleSingleDocument (RestImportTransaction& trx, char const* from = extractJsonStringValue(json, TRI_VOC_ATTRIBUTE_FROM); char const* to = extractJsonStringValue(json, TRI_VOC_ATTRIBUTE_TO); - if (from == 0 || to == 0) { + if (from == nullptr || to == nullptr) { errorMsg = positionise(i) + "missing '_from' or '_to' attribute"; return TRI_ERROR_ARANGO_INVALID_EDGE_ATTRIBUTE; @@ -206,10 +206,10 @@ int RestImportHandler::handleSingleDocument (RestImportTransaction& trx, res = (res1 != TRI_ERROR_NO_ERROR ? res1 : res2); } - if (edge._fromKey != 0) { + if (edge._fromKey != nullptr) { TRI_Free(TRI_CORE_MEM_ZONE, edge._fromKey); } - if (edge._toKey != 0) { + if (edge._toKey != nullptr) { TRI_Free(TRI_CORE_MEM_ZONE, edge._toKey); } } @@ -617,7 +617,6 @@ bool RestImportHandler::createFromJson (string const& type) { return false; } - // find and load collection given by name or identifier RestImportTransaction trx(_vocbase, collection); @@ -646,35 +645,54 @@ bool RestImportHandler::createFromJson (string const& type) { // each line is a separate JSON document char const* ptr = _request->body(); char const* end = ptr + _request->bodySize(); - string line; size_t i = 0; + string errorMsg; while (ptr < end) { // read line until done i++; + + TRI_ASSERT(ptr != nullptr); + + // trim whitespace at start of line + while (ptr < end && + (*ptr == ' ' || *ptr == '\t' || *ptr == '\r')) { + ++ptr; + } + + if (ptr == end || *ptr == '\0') { + break; + } + + // now find end of line char const* pos = strchr(ptr, '\n'); - if (pos == 0) { - line.assign(ptr, (size_t) (end - ptr)); - ptr = end; - } - else { - line.assign(ptr, (size_t) (pos - ptr)); - ptr = pos + 1; - } + TRI_json_t* json = nullptr; - StringUtils::trimInPlace(line, "\r\n\t "); - if (line.length() == 0) { + if (pos == ptr) { + // line starting with \n, i.e. empty line + ptr = pos + 1; ++result._numEmpty; continue; } + else if (pos != nullptr) { + // non-empty line + *(const_cast(pos)) = '\0'; + TRI_ASSERT(ptr != nullptr); + json = parseJsonLine(ptr); + ptr = pos + 1; + } + else { + // last-line, non-empty + TRI_ASSERT(pos == nullptr); + TRI_ASSERT(ptr != nullptr); + json = parseJsonLine(ptr); + ptr = end; + } - TRI_json_t* json = parseJsonLine(line); - - string errorMsg; res = handleSingleDocument(trx, json, errorMsg, isEdgeCollection, waitForSync, i); - if (json != 0) { + if (json != nullptr) { TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json); } @@ -696,10 +714,10 @@ bool RestImportHandler::createFromJson (string const& type) { else { // the entire request body is one JSON document - TRI_json_t* documents = TRI_Json2String(TRI_UNKNOWN_MEM_ZONE, _request->body(), 0); + TRI_json_t* documents = TRI_Json2String(TRI_UNKNOWN_MEM_ZONE, _request->body(), nullptr); if (! TRI_IsListJson(documents)) { - if (documents != 0) { + if (documents != nullptr) { TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, documents); } @@ -709,12 +727,12 @@ bool RestImportHandler::createFromJson (string const& type) { return false; } + string errorMsg; size_t const n = documents->_value._objects._length; for (size_t i = 0; i < n; ++i) { - TRI_json_t const* json = (TRI_json_t const*) TRI_AtVector(&documents->_value._objects, i); + TRI_json_t const* json = static_cast(TRI_AtVector(&documents->_value._objects, i)); - string errorMsg; res = handleSingleDocument(trx, json, errorMsg, isEdgeCollection, waitForSync, i + 1); if (res == TRI_ERROR_NO_ERROR) { @@ -1034,26 +1052,35 @@ bool RestImportHandler::createFromKeyValueList () { lineNumber = StringUtils::int64(lineNumValue); } - size_t start = 0; - string body(_request->body(), _request->bodySize()); - size_t next = body.find('\n', start); + char const* current = _request->body(); + char const* bodyEnd = current + _request->bodySize(); - if (next == string::npos) { + // process header + char const* next = strchr(current, '\n'); + + if (next == nullptr) { generateError(HttpResponse::BAD, TRI_ERROR_HTTP_BAD_PARAMETER, "no JSON list found in second line"); return false; } - - string line = body.substr(start, next); - StringUtils::trimInPlace(line, "\r\n\t "); - - // get first line - TRI_json_t* keys = 0; - - if (line != "") { - keys = parseJsonLine(line); + + char const* lineStart = current; + char const* lineEnd = next; + + // trim line + while (lineStart < bodyEnd && + (*lineStart == ' ' || *lineStart == '\t' || *lineStart == '\r' || *lineStart == '\n')) { + ++lineStart; } + + while (lineEnd > lineStart && + (*(lineEnd - 1) == ' ' || *(lineEnd - 1) == '\t' || *(lineEnd - 1) == '\r' || *(lineEnd - 1) == '\n')) { + --lineEnd; + } + + *(const_cast(lineEnd)) = '\0'; + TRI_json_t* keys = parseJsonLine(lineStart, lineEnd); if (! checkKeys(keys)) { LOG_WARNING("no JSON string list in first line found"); @@ -1061,13 +1088,13 @@ bool RestImportHandler::createFromKeyValueList () { TRI_ERROR_HTTP_BAD_PARAMETER, "no JSON string list in first line found"); - if (keys != 0) { + if (keys != nullptr) { TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, keys); } return false; } - start = next + 1; + current = next + 1; // find and load collection given by name or identifier @@ -1097,35 +1124,51 @@ bool RestImportHandler::createFromKeyValueList () { size_t i = (size_t) lineNumber; - while (next != string::npos && start < body.length()) { + while (current != nullptr && current < bodyEnd) { i++; - next = body.find('\n', start); + next = static_cast(memchr(current, '\n', bodyEnd - current)); - if (next == string::npos) { - line = body.substr(start); + char const* lineStart = current; + char const* lineEnd = next; + + if (next == nullptr) { + // reached the end + lineEnd = bodyEnd; + current = nullptr; } else { - line = body.substr(start, next - start); - start = next + 1; + // got more to read + current = next + 1; + *(const_cast(lineEnd)) = '\0'; } - StringUtils::trimInPlace(line, "\r\n\t "); - if (line.length() == 0) { + // trim line + while (lineStart < bodyEnd && + (*lineStart == ' ' || *lineStart == '\t' || *lineStart == '\r' || *lineStart == '\n')) { + ++lineStart; + } + + while (lineEnd > lineStart && + (*(lineEnd - 1) == ' ' || *(lineEnd - 1) == '\t' || *(lineEnd - 1) == '\r' || *(lineEnd - 1) == '\n')) { + --lineEnd; + } + + if (lineStart == lineEnd) { ++result._numEmpty; continue; } - TRI_json_t* values = parseJsonLine(line); + TRI_json_t* values = parseJsonLine(lineStart, lineEnd); - if (values != 0) { + if (values != nullptr) { // build the json object from the list string errorMsg; - TRI_json_t* json = createJsonObject(keys, values, errorMsg, line, i); + TRI_json_t* json = createJsonObject(keys, values, errorMsg, i); TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, values); - if (json != 0) { + if (json != nullptr) { res = handleSingleDocument(trx, json, errorMsg, isEdgeCollection, waitForSync, i); TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json); } @@ -1214,10 +1257,19 @@ void RestImportHandler::generateDocumentsCreated (RestImportResult const& result //////////////////////////////////////////////////////////////////////////////// TRI_json_t* RestImportHandler::parseJsonLine (string const& line) { - char* errmsg = 0; - TRI_json_t* json = TRI_Json2String(TRI_UNKNOWN_MEM_ZONE, line.c_str(), &errmsg); + return parseJsonLine(line.c_str(), line.c_str() + line.size()); +} - if (errmsg != 0) { +//////////////////////////////////////////////////////////////////////////////// +/// @brief parse a single document line +//////////////////////////////////////////////////////////////////////////////// + +TRI_json_t* RestImportHandler::parseJsonLine (char const* start, + char const* end) { + char* errmsg = nullptr; + TRI_json_t* json = TRI_Json2String(TRI_UNKNOWN_MEM_ZONE, start, &errmsg); + + if (errmsg != nullptr) { // must free this error message, otherwise we'll have a memleak TRI_FreeString(TRI_CORE_MEM_ZONE, errmsg); } @@ -1231,26 +1283,25 @@ TRI_json_t* RestImportHandler::parseJsonLine (string const& line) { TRI_json_t* RestImportHandler::createJsonObject (TRI_json_t const* keys, TRI_json_t const* values, string& errorMsg, - string const& line, size_t lineNumber) { if (values->_type != TRI_JSON_LIST) { errorMsg = positionise(lineNumber) + "no valid JSON list data"; - return 0; + return nullptr; } size_t const n = keys->_value._objects._length; if (n != values->_value._objects._length) { errorMsg = positionise(lineNumber) + "wrong number of JSON values"; - return 0; + return nullptr; } TRI_json_t* result = TRI_CreateArray2Json(TRI_UNKNOWN_MEM_ZONE, n); - if (result == 0) { + if (result == nullptr) { LOG_ERROR("out of memory"); - return 0; + return nullptr; } for (size_t i = 0; i < n; ++i) { @@ -1270,7 +1321,7 @@ TRI_json_t* RestImportHandler::createJsonObject (TRI_json_t const* keys, /// @brief validate keys //////////////////////////////////////////////////////////////////////////////// -bool RestImportHandler::checkKeys (TRI_json_t const* keys) { +bool RestImportHandler::checkKeys (TRI_json_t const* keys) const { if (! TRI_IsListJson(keys)) { return false; } @@ -1282,7 +1333,7 @@ bool RestImportHandler::checkKeys (TRI_json_t const* keys) { } for (size_t i = 0; i < n; ++i) { - TRI_json_t* key = (TRI_json_t*) TRI_AtVector(&keys->_value._objects, i); + TRI_json_t const* key = static_cast(TRI_AtVector(&keys->_value._objects, i)); if (! JsonHelper::isString(key)) { return false; diff --git a/arangod/RestHandler/RestImportHandler.h b/arangod/RestHandler/RestImportHandler.h index 37eb736113..cea480f9c5 100644 --- a/arangod/RestHandler/RestImportHandler.h +++ b/arangod/RestHandler/RestImportHandler.h @@ -169,6 +169,13 @@ namespace triagens { TRI_json_t* parseJsonLine (std::string const&); +//////////////////////////////////////////////////////////////////////////////// +/// @brief parses a string +//////////////////////////////////////////////////////////////////////////////// + + TRI_json_t* parseJsonLine (char const*, + char const*); + //////////////////////////////////////////////////////////////////////////////// /// @brief builds a TRI_json_t object from a key and value list //////////////////////////////////////////////////////////////////////////////// @@ -176,14 +183,13 @@ namespace triagens { TRI_json_t* createJsonObject (TRI_json_t const*, TRI_json_t const*, std::string&, - std::string const&, size_t); //////////////////////////////////////////////////////////////////////////////// /// @brief checks the keys, returns true if all values in the list are strings. //////////////////////////////////////////////////////////////////////////////// - bool checkKeys (TRI_json_t const*); + bool checkKeys (TRI_json_t const*) const; }; } diff --git a/js/server/tests/import-setup.js b/js/server/tests/import-setup.js index eedf354c48..971e38d59e 100644 --- a/js/server/tests/import-setup.js +++ b/js/server/tests/import-setup.js @@ -32,10 +32,12 @@ db._drop("UnitTestsImportJson2"); db._drop("UnitTestsImportJson3"); db._drop("UnitTestsImportJson4"); + db._drop("UnitTestsImportJson5"); db._drop("UnitTestsImportCsv1"); db._drop("UnitTestsImportCsv2"); db._drop("UnitTestsImportCsv3"); db._drop("UnitTestsImportCsv4"); + db._drop("UnitTestsImportCsv5"); db._drop("UnitTestsImportTsv1"); db._drop("UnitTestsImportTsv2"); db._drop("UnitTestsImportVertex"); @@ -45,6 +47,7 @@ db._create("UnitTestsImportJson2"); db._create("UnitTestsImportJson3"); db._create("UnitTestsImportJson4"); + db._create("UnitTestsImportJson5"); db._create("UnitTestsImportTsv1"); db._create("UnitTestsImportTsv2"); db._create("UnitTestsImportVertex"); diff --git a/js/server/tests/import-teardown.js b/js/server/tests/import-teardown.js index 7bdf6792ce..6e44009a28 100644 --- a/js/server/tests/import-teardown.js +++ b/js/server/tests/import-teardown.js @@ -32,10 +32,12 @@ db._drop("UnitTestsImportJson2"); db._drop("UnitTestsImportJson3"); db._drop("UnitTestsImportJson4"); + db._drop("UnitTestsImportJson5"); db._drop("UnitTestsImportCsv1"); db._drop("UnitTestsImportCsv2"); db._drop("UnitTestsImportCsv3"); db._drop("UnitTestsImportCsv4"); + db._drop("UnitTestsImportCsv5"); db._drop("UnitTestsImportTsv1"); db._drop("UnitTestsImportTsv2"); db._drop("UnitTestsImportVertex"); diff --git a/js/server/tests/import.js b/js/server/tests/import.js index 90dd31493b..da6f29a2a3 100644 --- a/js/server/tests/import.js +++ b/js/server/tests/import.js @@ -135,7 +135,13 @@ function importTestSuite () { //////////////////////////////////////////////////////////////////////////////// testJsonImport3 : function () { - var expected = [ { "id": 1, "one": 1, "three": 3, "two": 2 }, { "a": 1234, "b": "the quick fox", "id": 2, "jumped": "over the fox", "null": null }, { "id": 3, "not": "important", "spacing": "is" }, { " c ": "h\"'ihi", "a": true, "b": false, "d": "", "id": 4 }, { "id": 5 } ]; + var expected = [ + { "id": 1, "one": 1, "three": 3, "two": 2 }, + { "a": 1234, "b": "the quick fox", "id": 2, "jumped": "over the fox", "null": null }, + { "id": 3, "not": "important", "spacing": "is" }, + { " c ": "h\"'ihi", "a": true, "b": false, "d": "", "id": 4 }, + { "id": 5 } + ]; var actual = getQueryResults("FOR i IN UnitTestsImportJson3 SORT i.id RETURN i"); assertEqual(expected, actual); }, @@ -153,6 +159,21 @@ function importTestSuite () { assertEqual(expected, actual); }, +//////////////////////////////////////////////////////////////////////////////// +/// @brief test json import +//////////////////////////////////////////////////////////////////////////////// + + testJsonImport5 : function () { + var expected = [ + { "active": true, "id": 0, "value": "something" }, + { "id": 1, "value": "foobar" }, + { "id": 2, "data": 99.5, "true": "a\t\r\nb c " } + ]; + + var actual = getQueryResults("FOR i IN UnitTestsImportJson5 SORT i.id RETURN i"); + assertEqual(expected, actual); + }, + //////////////////////////////////////////////////////////////////////////////// /// @brief test csv import //////////////////////////////////////////////////////////////////////////////// @@ -208,6 +229,20 @@ function importTestSuite () { assertEqual(expected, actual); }, +//////////////////////////////////////////////////////////////////////////////// +/// @brief test csv import +//////////////////////////////////////////////////////////////////////////////// + + testCsvImport5 : function () { + var expected = [ + { name: "bar", password: "foo" }, + { name: "foo\t", password: "wow \t \r\nthis is \t\ra multiline password!" } + ]; + + var actual = getQueryResults("FOR i IN UnitTestsImportCsv5 SORT i.name RETURN i"); + assertEqual(expected, actual); + }, + //////////////////////////////////////////////////////////////////////////////// /// @brief test tsv import ////////////////////////////////////////////////////////////////////////////////