1
0
Fork 0

less string copying, less regex slowdown when importing CSV files

This commit is contained in:
Jan Steemann 2014-09-30 23:20:12 +02:00
parent b9bd496681
commit ce9499eac8
9 changed files with 215 additions and 109 deletions

View File

@ -1,6 +1,8 @@
v2.3.0 (XXXX-XX-XX)
-------------------
* improved performance of CSV import in arangoimp
* fixed issue #1027: Stack traces are off-by-one
* fixed issue #1026: Modules loaded in different files within the same app
@ -105,6 +107,10 @@ v2.3.0 (XXXX-XX-XX)
v2.2.4 (2014-XX-XX)
-------------------
* fixed issue #1030: arangoimp 2.2.3 crashing, not logging on large Windows CSV file
* fixed issue #1025: Traversal not as expected in undirected graph
* fixed issue #1020
This requires re-introducing the startup option `--database.force-sync-properties`.

View File

@ -86,7 +86,7 @@ struct CCsvSetup {
me->column = 0;
}
static void ProcessCsvAdd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
static void ProcessCsvAdd (TRI_csv_parser_t* parser, char const* field, size_t, size_t row, size_t column, bool escaped) {
CCsvSetup* me = reinterpret_cast<CCsvSetup*> (parser->_dataAdd);
if (me->column++ > 0) {
@ -96,7 +96,7 @@ struct CCsvSetup {
me->out << (escaped ? "ESC" : "") << field << (escaped ? "ESC" : "");
}
static void ProcessCsvEnd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
static void ProcessCsvEnd (TRI_csv_parser_t* parser, char const* field, size_t, size_t row, size_t column, bool escaped) {
CCsvSetup* me = reinterpret_cast<CCsvSetup*> (parser->_dataAdd);
if (me->column++ > 0) {

View File

@ -5,3 +5,5 @@
3,"this","is"
4,let's,see,what,happens
5,9999999999999999999999999999999999,test,-99999999,true,-888.4434
6,10e4,20.5,-42, null ,false
7,-1.05e2,1.05e-2,true,false,null

Can't render this file because it has a wrong number of fields in line 5.

View File

@ -46,6 +46,92 @@ using namespace triagens::httpclient;
using namespace triagens::rest;
using namespace std;
////////////////////////////////////////////////////////////////////////////////
/// @brief helper function to determine if a field value is an integer
/// this function is here to avoid usage of regexes, which are too slow
///
/// accepts an optional leading '+' or '-' followed by one or more ASCII digits
/// and nothing else. the whole field (fieldLength bytes) is inspected.
/// returns false for an empty field and for a field consisting of a sign only
////////////////////////////////////////////////////////////////////////////////
static bool IsInteger (char const* field,
                       size_t fieldLength) {
  char const* ptr = field;
  char const* const end = field + fieldLength;

  // optional sign; never look at bytes beyond the field
  if (ptr < end && (*ptr == '+' || *ptr == '-')) {
    ++ptr;
  }

  // at least one digit is required, otherwise "", "+" and "-" would be
  // reported as integers
  if (ptr == end) {
    return false;
  }

  while (ptr < end) {
    if (*ptr < '0' || *ptr > '9') {
      return false;
    }
    ++ptr;
  }

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief helper function to determine if a field value is a decimal value
/// this function is here to avoid usage of regexes, which are too slow
///
/// the whole field (fieldLength bytes) is inspected. accepted format:
///   [+-] digits* [ '.' digits+ ] [ (e|E) [+-] digits+ ]
/// with at least one digit in the mantissa. a '.' must be followed by a
/// digit, so "1." is rejected while ".5" and "1.5" are accepted. plain
/// integers are accepted, too.
/// returns false for empty fields, sign-only fields, values with more than
/// one '.' or exponent, and values with a '.' inside the exponent
////////////////////////////////////////////////////////////////////////////////
static bool IsDecimal (char const* field,
                       size_t fieldLength) {
  char const* ptr = field;
  char const* const end = ptr + fieldLength;

  // optional sign; never look at bytes beyond the field
  if (ptr < end && (*ptr == '+' || *ptr == '-')) {
    ++ptr;
  }

  // integral part of the mantissa (may be empty, e.g. ".5")
  bool mantissaHasDigit = false;
  while (ptr < end && *ptr >= '0' && *ptr <= '9') {
    mantissaHasDigit = true;
    ++ptr;
  }

  // optional fractional part
  if (ptr < end && *ptr == '.') {
    ++ptr;
    // expect a number after the .
    if (ptr >= end || *ptr < '0' || *ptr > '9') {
      return false;
    }
    while (ptr < end && *ptr >= '0' && *ptr <= '9') {
      ++ptr;
    }
    mantissaHasDigit = true;
  }

  if (! mantissaHasDigit) {
    // "", "+", "-", "e5" etc.
    return false;
  }

  // optional exponent
  if (ptr < end && (*ptr == 'e' || *ptr == 'E')) {
    ++ptr;
    // skip over optional + or -
    if (ptr < end && (*ptr == '+' || *ptr == '-')) {
      ++ptr;
    }
    // expect a number after the exponent
    if (ptr >= end || *ptr < '0' || *ptr > '9') {
      return false;
    }
    while (ptr < end && *ptr >= '0' && *ptr <= '9') {
      ++ptr;
    }
  }

  // anything left over (second '.', second exponent, other characters)
  // makes the value non-numeric
  return ptr == end;
}
namespace triagens {
namespace v8client {
@ -53,7 +139,7 @@ namespace triagens {
/// initialise step value for progress reports
////////////////////////////////////////////////////////////////////////////////
const double ImportHelper::ProgressStep = 2.0;
const double ImportHelper::ProgressStep = 3.0;
////////////////////////////////////////////////////////////////////////////////
/// constructor and destructor
@ -73,14 +159,10 @@ namespace triagens {
_lineBuffer(TRI_UNKNOWN_MEM_ZONE),
_outputBuffer(TRI_UNKNOWN_MEM_ZONE) {
regcomp(&_doubleRegex, "^[-+]?([0-9]+\\.?[0-9]*|\\.[0-9]+)([eE][-+]?[0-8]+)?$", REG_EXTENDED);
regcomp(&_intRegex, "^[-+]?([0-9]+)$", REG_EXTENDED);
_hasError = false;
}
ImportHelper::~ImportHelper () {
regfree(&_doubleRegex);
regfree(&_intRegex);
}
////////////////////////////////////////////////////////////////////////////////
@ -365,11 +447,7 @@ namespace triagens {
////////////////////////////////////////////////////////////////////////////////
void ImportHelper::ProcessCsvBegin (TRI_csv_parser_t* parser, size_t row) {
ImportHelper* ih = reinterpret_cast<ImportHelper*> (parser->_dataAdd);
if (ih) {
ih->beginLine(row);
}
static_cast<ImportHelper*>(parser->_dataAdd)->beginLine(row);
}
void ImportHelper::beginLine (size_t row) {
@ -391,15 +469,20 @@ namespace triagens {
/// @brief adds a new CSV field
////////////////////////////////////////////////////////////////////////////////
void ImportHelper::ProcessCsvAdd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
ImportHelper* ih = reinterpret_cast<ImportHelper*> (parser->_dataAdd);
if (ih) {
ih->addField(field, row, column, escaped);
}
void ImportHelper::ProcessCsvAdd (TRI_csv_parser_t* parser,
char const* field,
size_t fieldLength,
size_t row,
size_t column,
bool escaped) {
static_cast<ImportHelper*>(parser->_dataAdd)->addField(field, fieldLength, row, column, escaped);
}
void ImportHelper::addField (char const* field, size_t row, size_t column, bool escaped) {
void ImportHelper::addField (char const* field,
size_t fieldLength,
size_t row,
size_t column,
bool escaped) {
if (column > 0) {
_lineBuffer.appendChar(',');
}
@ -407,63 +490,74 @@ namespace triagens {
if (row == 0 || escaped) {
// head line or escaped value
_lineBuffer.appendChar('"');
_lineBuffer.appendText(StringUtils::escapeUnicode(field));
_lineBuffer.appendJsonEncoded(field);
_lineBuffer.appendChar('"');
return;
}
if (*field == '\0') {
// empty unquoted field: emit a JSON null
_lineBuffer.appendText("null", strlen("null"));
return;
}
// check for literals null, false and true
if (fieldLength == 4 &&
(memcmp(field, "true", 4) == 0 ||
memcmp(field, "null", 4) == 0)) {
_lineBuffer.appendText(field, fieldLength);
return;
}
else if (fieldLength == 5 && memcmp(field, "false", 5) == 0) {
_lineBuffer.appendText(field, fieldLength);
return;
}
if (IsInteger(field, fieldLength)) {
// integer value
// conversion might fail with out-of-range error
try {
if (fieldLength > 8) {
// long integer numbers might be problematic. check if we get out of range
std::stoll(std::string(field, fieldLength)); // this will fail if the number cannot be converted
}
int64_t num = StringUtils::int64(field, fieldLength);
_lineBuffer.appendInteger(num);
}
catch (...) {
// conversion failed
_lineBuffer.appendChar('"');
_lineBuffer.appendJsonEncoded(field);
_lineBuffer.appendChar('"');
}
}
else if (IsDecimal(field, fieldLength)) {
// double value
// conversion might fail with out-of-range error
try {
double num = StringUtils::doubleDecimal(field, fieldLength);
bool failed = (num != num || num == HUGE_VAL || num == -HUGE_VAL);
if (! failed) {
_lineBuffer.appendDecimal(num);
return;
}
// NaN, +inf, -inf
// fall-through to appending the number as a string
}
catch (...) {
// conversion failed
// fall-through to appending the number as a string
}
_lineBuffer.appendChar('"');
_lineBuffer.appendText(field, fieldLength);
_lineBuffer.appendChar('"');
}
else {
string s(field);
if (s.empty()) {
// do nothing
_lineBuffer.appendText("null", strlen("null"));
}
else if ("true" == s || "false" == s || "null" == s) {
_lineBuffer.appendText(s);
}
else {
if (regexec(&_intRegex, s.c_str(), 0, 0, 0) == 0) {
// integer value
// conversion might fail with out-of-range error
try {
std::stoll(s); // this will fail if the number cannot be converted
int64_t num = StringUtils::int64(s);
_lineBuffer.appendInteger(num);
}
catch (...) {
// conversion failed
_lineBuffer.appendChar('"');
_lineBuffer.appendText(StringUtils::escapeUnicode(s));
_lineBuffer.appendChar('"');
}
}
else if (regexec(&_doubleRegex, s.c_str(), 0, 0, 0) == 0) {
// double value
// conversion might fail with out-of-range error
try {
double num = StringUtils::doubleDecimal(s);
bool failed = (num != num || num == HUGE_VAL || num == -HUGE_VAL);
if (! failed) {
_lineBuffer.appendDecimal(num);
}
else {
// NaN, +inf, -inf
_lineBuffer.appendChar('"');
_lineBuffer.appendText(StringUtils::escapeUnicode(s));
_lineBuffer.appendChar('"');
}
}
catch (...) {
// conversion failed
_lineBuffer.appendChar('"');
_lineBuffer.appendText(StringUtils::escapeUnicode(s));
_lineBuffer.appendChar('"');
}
}
else {
_lineBuffer.appendChar('"');
_lineBuffer.appendText(StringUtils::escapeUnicode(s));
_lineBuffer.appendChar('"');
}
}
_lineBuffer.appendChar('"');
_lineBuffer.appendJsonEncoded(field);
_lineBuffer.appendChar('"');
}
}
@ -471,23 +565,32 @@ namespace triagens {
/// @brief ends a CSV line
////////////////////////////////////////////////////////////////////////////////
void ImportHelper::ProcessCsvEnd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
ImportHelper* ih = reinterpret_cast<ImportHelper*> (parser->_dataAdd);
void ImportHelper::ProcessCsvEnd (TRI_csv_parser_t* parser,
char const* field,
size_t fieldLength,
size_t row,
size_t column,
bool escaped) {
ImportHelper* ih = static_cast<ImportHelper*>(parser->_dataAdd);
if (ih) {
ih->addLastField(field, row, column, escaped);
ih->addLastField(field, fieldLength, row, column, escaped);
ih->incRowsRead();
}
}
void ImportHelper::addLastField (char const* field, size_t row, size_t column, bool escaped) {
if (column == 0 && StringUtils::trim(field) == "") {
void ImportHelper::addLastField (char const* field,
size_t fieldLength,
size_t row,
size_t column,
bool escaped) {
if (column == 0 && *field == '\0') {
// ignore empty line
_lineBuffer.clear();
_lineBuffer.reset();
return;
}
addField(field, row, column, escaped);
addField(field, fieldLength, row, column, escaped);
_lineBuffer.appendChar(']');
@ -552,22 +655,22 @@ namespace triagens {
}
void ImportHelper::handleResult (SimpleHttpResult* result) {
if (result == 0) {
if (result == nullptr) {
return;
}
TRI_json_t* json = TRI_JsonString(TRI_UNKNOWN_MEM_ZONE,
result->getBody().c_str());
if (json != 0) {
if (json != nullptr) {
// error details
TRI_json_t const* details = TRI_LookupArrayJson(json, "details");
if (TRI_IsListJson(details)) {
const size_t n = details->_value._objects._length;
size_t const n = details->_value._objects._length;
for (size_t i = 0; i < n; ++i) {
TRI_json_t const* detail = (TRI_json_t const*) TRI_AtVector(&details->_value._objects, i);
TRI_json_t const* detail = static_cast<TRI_json_t const*>(TRI_AtVector(&details->_value._objects, i));
if (TRI_IsStringJson(detail)) {
LOG_WARNING("%s", detail->_value._string.data);

View File

@ -33,10 +33,6 @@
#include "Basics/Common.h"
#include <regex.h>
#include <v8.h>
#include "Basics/csv.h"
#include "Basics/StringBuffer.h"
@ -203,16 +199,16 @@ namespace triagens {
}
private:
static void ProcessCsvBegin (TRI_csv_parser_t* , size_t );
static void ProcessCsvAdd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped);
static void ProcessCsvEnd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped);
static void ProcessCsvBegin (TRI_csv_parser_t*, size_t);
static void ProcessCsvAdd (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool);
static void ProcessCsvEnd (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool);
void reportProgress (int64_t, int64_t, double&);
std::string getCollectionUrlPart ();
void beginLine (size_t row);
void addField (char const* field, size_t row, size_t column, bool escaped);
void addLastField (char const* field, size_t row, size_t column, bool escaped);
void addField (char const*, size_t, size_t row, size_t column, bool escaped);
void addLastField (char const*, size_t, size_t row, size_t column, bool escaped);
void sendCsvBuffer ();
void sendJsonBuffer (char const* str, size_t len, bool isArray);
@ -242,9 +238,6 @@ namespace triagens {
triagens::basics::StringBuffer _outputBuffer;
std::string _firstLine;
regex_t _doubleRegex;
regex_t _intRegex;
bool _hasError;
std::string _errorMessage;

View File

@ -161,7 +161,9 @@ function importTestSuite () {
var expected = [
{ "a": "1", "b": 1, "c": "1.3", "e": -5, "id": 1 },
{ "b": "", "c": 3.1, "d": -2.5, "e": "ddd \" ' ffd", "id": 2 },
{ "a": "9999999999999999999999999999999999", "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 }
{ "a": "9999999999999999999999999999999999", "b": "test", "c" : -99999999, "d": true, "e": -888.4434, "id": 5 },
{ "a": 10e4, "b": 20.5, "c": -42, "d": " null ", "e": false, "id": 6 },
{ "a": -1.05e2, "b": 1.05e-2, "c": true, "d": false, "id": 7 }
];
var actual = getQueryResults("FOR i IN UnitTestsImportCsv1 SORT i.id RETURN i");

View File

@ -40,8 +40,8 @@
void TRI_InitCsvParser (TRI_csv_parser_t* parser,
TRI_memory_zone_t* zone,
void (*begin) (TRI_csv_parser_t*, size_t),
void (*add) (TRI_csv_parser_t*, char const*, size_t, size_t, bool),
void (*end) (TRI_csv_parser_t*, char const*, size_t, size_t, bool)) {
void (*add) (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool),
void (*end) (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool)) {
size_t length;
parser->_state = TRI_CSV_PARSER_BOL;
@ -316,7 +316,7 @@ int TRI_ParseCsvString2 (TRI_csv_parser_t* parser, char const* line, size_t leng
if (*ptr == parser->_separator) {
*qtr = '\0';
parser->add(parser, parser->_start, parser->_row, parser->_column, false);
parser->add(parser, parser->_start, qtr - parser->_start, parser->_row, parser->_column, false);
ptr++;
parser->_column++;
@ -328,7 +328,7 @@ int TRI_ParseCsvString2 (TRI_csv_parser_t* parser, char const* line, size_t leng
char c = *ptr;
*qtr = '\0';
parser->end(parser, parser->_start, parser->_row, parser->_column, false);
parser->end(parser, parser->_start, qtr - parser->_start, parser->_row, parser->_column, false);
parser->_row++;
if (c == '\r') {
parser->_state = TRI_CSV_PARSER_BOL2;
@ -389,7 +389,7 @@ int TRI_ParseCsvString2 (TRI_csv_parser_t* parser, char const* line, size_t leng
if (*ptr == parser->_separator) {
*qtr = '\0';
parser->add(parser, parser->_start, parser->_row, parser->_column, true);
parser->add(parser, parser->_start, qtr - parser->_start, parser->_row, parser->_column, true);
ptr++;
parser->_column++;
@ -400,7 +400,7 @@ int TRI_ParseCsvString2 (TRI_csv_parser_t* parser, char const* line, size_t leng
char c = *ptr;
*qtr = '\0';
parser->end(parser, parser->_start, parser->_row, parser->_column, true);
parser->end(parser, parser->_start, qtr - parser->_start, parser->_row, parser->_column, true);
parser->_row++;
if (c == '\r') {

View File

@ -79,8 +79,8 @@ typedef struct TRI_csv_parser_s {
void* _dataEnd;
void (*begin) (struct TRI_csv_parser_s*, size_t row);
void (*add) (struct TRI_csv_parser_s*, char const*, size_t row, size_t column, bool escaped);
void (*end) (struct TRI_csv_parser_s*, char const*, size_t row, size_t column, bool escaped);
void (*add) (struct TRI_csv_parser_s*, char const*, size_t, size_t row, size_t column, bool escaped);
void (*end) (struct TRI_csv_parser_s*, char const*, size_t, size_t row, size_t column, bool escaped);
size_t _nResize;
size_t _nMemmove;
@ -99,8 +99,8 @@ TRI_csv_parser_t;
void TRI_InitCsvParser (TRI_csv_parser_t*,
TRI_memory_zone_t*,
void (*) (TRI_csv_parser_t*, size_t),
void (*) (TRI_csv_parser_t*, char const*, size_t, size_t, bool),
void (*) (TRI_csv_parser_t*, char const*, size_t, size_t, bool));
void (*) (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool),
void (*) (TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool));
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a CSV parser

View File

@ -66,7 +66,7 @@ static void ProcessCsvBegin (TRI_csv_parser_t* parser, size_t row) {
/// @brief adds a new CSV field
////////////////////////////////////////////////////////////////////////////////
static void ProcessCsvAdd (TRI_csv_parser_t* parser, const char* field, size_t row, size_t column, bool escaped) {
static void ProcessCsvAdd (TRI_csv_parser_t* parser, const char* field, size_t, size_t row, size_t column, bool escaped) {
v8::Handle<v8::Array>* array = reinterpret_cast<v8::Handle<v8::Array>*>(parser->_dataBegin);
(*array)->Set((uint32_t) column, v8::String::New(field));
@ -76,7 +76,7 @@ static void ProcessCsvAdd (TRI_csv_parser_t* parser, const char* field, size_t r
/// @brief ends a CSV line
////////////////////////////////////////////////////////////////////////////////
static void ProcessCsvEnd (TRI_csv_parser_t* parser, const char* field, size_t row, size_t column, bool escaped) {
static void ProcessCsvEnd (TRI_csv_parser_t* parser, const char* field, size_t, size_t row, size_t column, bool escaped) {
v8::Handle<v8::Array>* array = reinterpret_cast<v8::Handle<v8::Array>*>(parser->_dataBegin);
(*array)->Set((uint32_t) column, v8::String::New(field));