mirror of https://gitee.com/bigwinds/arangodb
407 lines
12 KiB
C++
407 lines
12 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
|
|
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
|
///
|
|
/// @author Dr. Frank Celler
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "csv.h"
|
|
#include <cstring>
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief inits a CSV parser
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_InitCsvParser(
|
|
TRI_csv_parser_t* parser, void (*begin)(TRI_csv_parser_t*, size_t),
|
|
void (*add)(TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool),
|
|
void (*end)(TRI_csv_parser_t*, char const*, size_t, size_t, size_t, bool), void* vData) {
|
|
size_t length;
|
|
|
|
parser->_state = TRI_CSV_PARSER_BOL;
|
|
parser->_data = vData;
|
|
TRI_SetQuoteCsvParser(parser, '"', true);
|
|
TRI_SetSeparatorCsvParser(parser, ';');
|
|
TRI_UseBackslashCsvParser(parser, false);
|
|
|
|
length = 1024;
|
|
|
|
parser->_row = 0;
|
|
parser->_column = 0;
|
|
|
|
parser->_begin = static_cast<char*>(TRI_Allocate(length));
|
|
|
|
if (parser->_begin == nullptr) {
|
|
length = 0;
|
|
}
|
|
|
|
parser->_start = parser->_begin;
|
|
parser->_written = parser->_begin;
|
|
parser->_current = parser->_begin;
|
|
parser->_stop = parser->_begin;
|
|
parser->_end = parser->_begin + length;
|
|
|
|
parser->_dataBegin = nullptr;
|
|
parser->_dataAdd = nullptr;
|
|
parser->_dataEnd = nullptr;
|
|
|
|
parser->begin = begin;
|
|
parser->add = add;
|
|
parser->end = end;
|
|
|
|
parser->_nResize = 0;
|
|
parser->_nMemmove = 0;
|
|
parser->_nMemcpy = 0;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a CSV parser
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_DestroyCsvParser(TRI_csv_parser_t* parser) {
|
|
if (parser->_begin != nullptr) {
|
|
TRI_Free(parser->_begin);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief set the separator
|
|
///
|
|
/// note that the separator string must be valid until the parser is destroyed
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_SetSeparatorCsvParser(TRI_csv_parser_t* parser, char separator) {
|
|
parser->_separator = separator;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief set the quote character
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_SetQuoteCsvParser(TRI_csv_parser_t* parser, char quote, bool useQuote) {
|
|
parser->_quote = quote;
|
|
parser->_useQuote = useQuote;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief whether or not a backslash is used to escape quotes
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_UseBackslashCsvParser(TRI_csv_parser_t* parser, bool value) {
|
|
parser->_useBackslash = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief parses a CSV line
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_ParseCsvString(TRI_csv_parser_t* parser, char const* line, size_t length) {
|
|
char* ptr;
|
|
char* qtr;
|
|
|
|
// append line to buffer
|
|
if (line != nullptr) {
|
|
TRI_ASSERT(parser->_begin <= parser->_start);
|
|
TRI_ASSERT(parser->_start <= parser->_written);
|
|
TRI_ASSERT(parser->_written <= parser->_current);
|
|
TRI_ASSERT(parser->_current <= parser->_stop);
|
|
TRI_ASSERT(parser->_stop <= parser->_end);
|
|
|
|
// there is enough room between STOP and END
|
|
if (parser->_stop + length <= parser->_end) {
|
|
memcpy(parser->_stop, line, length);
|
|
|
|
parser->_stop += length;
|
|
parser->_nMemcpy++;
|
|
} else {
|
|
size_t l1 = parser->_start - parser->_begin;
|
|
size_t l2 = parser->_end - parser->_stop;
|
|
size_t l3;
|
|
|
|
// not enough room, but enough room between BEGIN and START plus STOP and
|
|
// END
|
|
if (length <= l1 + l2) {
|
|
l3 = parser->_stop - parser->_start;
|
|
|
|
if (0 < l3) {
|
|
memmove(parser->_begin, parser->_start, l3);
|
|
}
|
|
|
|
memcpy(parser->_begin + l3, line, length);
|
|
|
|
parser->_start = parser->_begin;
|
|
parser->_written = parser->_written - l1;
|
|
parser->_current = parser->_current - l1;
|
|
parser->_stop = parser->_begin + l3 + length;
|
|
parser->_nMemmove++;
|
|
}
|
|
|
|
// really not enough room
|
|
else {
|
|
size_t l4, l5;
|
|
|
|
l2 = parser->_stop - parser->_start;
|
|
l3 = parser->_end - parser->_begin + length;
|
|
l4 = parser->_written - parser->_start;
|
|
l5 = parser->_current - parser->_start;
|
|
|
|
ptr = static_cast<char*>(TRI_Allocate(l3));
|
|
|
|
if (ptr == nullptr) {
|
|
return TRI_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
memcpy(ptr, parser->_start, l2);
|
|
memcpy(ptr + l2, line, length);
|
|
TRI_Free(parser->_begin);
|
|
|
|
parser->_begin = ptr;
|
|
parser->_start = ptr;
|
|
parser->_written = ptr + l4;
|
|
parser->_current = ptr + l5;
|
|
parser->_stop = ptr + l2 + length;
|
|
parser->_end = ptr + l3;
|
|
parser->_nResize++;
|
|
}
|
|
}
|
|
|
|
// start parsing or continue
|
|
ptr = parser->_current;
|
|
qtr = parser->_written;
|
|
|
|
while (true) {
|
|
switch (parser->_state) {
|
|
case TRI_CSV_PARSER_BOL:
|
|
if (ptr == parser->_stop) {
|
|
parser->_written = ptr;
|
|
parser->_current = ptr;
|
|
return false;
|
|
}
|
|
|
|
parser->begin(parser, parser->_row);
|
|
|
|
parser->_column = 0;
|
|
parser->_state = TRI_CSV_PARSER_BOF;
|
|
|
|
break;
|
|
|
|
case TRI_CSV_PARSER_BOL2:
|
|
if (ptr == parser->_stop) {
|
|
parser->_written = ptr;
|
|
parser->_current = ptr;
|
|
return false;
|
|
}
|
|
|
|
if (*ptr == '\n') {
|
|
ptr++;
|
|
}
|
|
parser->_state = TRI_CSV_PARSER_BOL;
|
|
|
|
break;
|
|
|
|
case TRI_CSV_PARSER_BOF:
|
|
if (ptr == parser->_stop) {
|
|
parser->_written = ptr;
|
|
parser->_current = ptr;
|
|
return TRI_ERROR_CORRUPTED_CSV;
|
|
}
|
|
|
|
else if (parser->_useQuote && *ptr == parser->_quote) {
|
|
if (ptr + 1 == parser->_stop) {
|
|
parser->_written = qtr;
|
|
parser->_current = ptr;
|
|
return TRI_ERROR_CORRUPTED_CSV;
|
|
}
|
|
|
|
parser->_state = TRI_CSV_PARSER_WITHIN_QUOTED_FIELD;
|
|
parser->_start = ++ptr;
|
|
|
|
qtr = parser->_written = ptr;
|
|
} else {
|
|
parser->_state = TRI_CSV_PARSER_WITHIN_FIELD;
|
|
parser->_start = ptr;
|
|
|
|
qtr = parser->_written = ptr;
|
|
}
|
|
|
|
break;
|
|
|
|
case TRI_CSV_PARSER_CORRUPTED:
|
|
while (ptr < parser->_stop && *ptr != parser->_separator && *ptr != '\n') {
|
|
ptr++;
|
|
}
|
|
|
|
// found separator or eol
|
|
if (ptr < parser->_stop) {
|
|
// found separator
|
|
if (*ptr == parser->_separator) {
|
|
ptr++;
|
|
|
|
parser->_state = TRI_CSV_PARSER_BOF;
|
|
}
|
|
|
|
// found eol
|
|
else {
|
|
ptr++;
|
|
|
|
parser->_row++;
|
|
parser->_state = TRI_CSV_PARSER_BOL;
|
|
}
|
|
}
|
|
|
|
// need more input
|
|
else {
|
|
parser->_written = qtr;
|
|
parser->_current = ptr;
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
break;
|
|
|
|
case TRI_CSV_PARSER_WITHIN_FIELD:
|
|
while (ptr < parser->_stop && *ptr != parser->_separator &&
|
|
*ptr != '\r' && *ptr != '\n') {
|
|
*qtr++ = *ptr++;
|
|
}
|
|
|
|
// found separator or eol
|
|
if (ptr < parser->_stop) {
|
|
// found separator
|
|
if (*ptr == parser->_separator) {
|
|
*qtr = '\0';
|
|
|
|
parser->add(parser, parser->_start, qtr - parser->_start,
|
|
parser->_row, parser->_column, false);
|
|
|
|
ptr++;
|
|
parser->_column++;
|
|
parser->_state = TRI_CSV_PARSER_BOF;
|
|
}
|
|
|
|
// found eol
|
|
else {
|
|
char c = *ptr;
|
|
*qtr = '\0';
|
|
|
|
parser->end(parser, parser->_start, qtr - parser->_start,
|
|
parser->_row, parser->_column, false);
|
|
parser->_row++;
|
|
if (c == '\r') {
|
|
parser->_state = TRI_CSV_PARSER_BOL2;
|
|
} else {
|
|
parser->_state = TRI_CSV_PARSER_BOL;
|
|
}
|
|
|
|
ptr++;
|
|
}
|
|
}
|
|
|
|
// need more input
|
|
else {
|
|
parser->_written = qtr;
|
|
parser->_current = ptr;
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
break;
|
|
|
|
case TRI_CSV_PARSER_WITHIN_QUOTED_FIELD:
|
|
TRI_ASSERT(parser->_useQuote);
|
|
|
|
while (ptr < parser->_stop && *ptr != parser->_quote &&
|
|
(!parser->_useBackslash || *ptr != '\\')) {
|
|
*qtr++ = *ptr++;
|
|
}
|
|
|
|
// found quote or a backslash, need at least another quote, a
|
|
// separator, or an eol
|
|
if (ptr + 1 < parser->_stop) {
|
|
bool foundBackslash = (parser->_useBackslash && *ptr == '\\');
|
|
|
|
++ptr;
|
|
|
|
if (foundBackslash) {
|
|
if (*ptr == parser->_quote || *ptr == '\\') {
|
|
// backslash-escaped quote or literal backslash
|
|
*qtr++ = *ptr;
|
|
ptr++;
|
|
break;
|
|
}
|
|
} else if (*ptr == parser->_quote) {
|
|
// a real quote
|
|
*qtr++ = parser->_quote;
|
|
ptr++;
|
|
break;
|
|
}
|
|
|
|
// ignore spaces
|
|
while ((*ptr == ' ' || *ptr == '\t') && (ptr + 1) < parser->_stop) {
|
|
++ptr;
|
|
}
|
|
|
|
// found separator
|
|
if (*ptr == parser->_separator) {
|
|
*qtr = '\0';
|
|
|
|
parser->add(parser, parser->_start, qtr - parser->_start,
|
|
parser->_row, parser->_column, true);
|
|
|
|
ptr++;
|
|
parser->_column++;
|
|
parser->_state = TRI_CSV_PARSER_BOF;
|
|
}
|
|
|
|
else if (*ptr == '\r' || *ptr == '\n') {
|
|
char c = *ptr;
|
|
*qtr = '\0';
|
|
|
|
parser->end(parser, parser->_start, qtr - parser->_start,
|
|
parser->_row, parser->_column, true);
|
|
parser->_row++;
|
|
|
|
if (c == '\r') {
|
|
parser->_state = TRI_CSV_PARSER_BOL2;
|
|
} else {
|
|
parser->_state = TRI_CSV_PARSER_BOL;
|
|
}
|
|
|
|
ptr++;
|
|
}
|
|
|
|
// ups
|
|
else {
|
|
parser->_state = TRI_CSV_PARSER_CORRUPTED;
|
|
}
|
|
}
|
|
|
|
// need more input
|
|
else {
|
|
parser->_written = qtr;
|
|
parser->_current = ptr;
|
|
return true;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return TRI_ERROR_CORRUPTED_CSV;
|
|
}
|