arangodb/arangosh/V8Client/ImportHelper.cpp

////////////////////////////////////////////////////////////////////////////////
/// @brief import helper
///
/// @file
///
/// DISCLAIMER
///
/// Copyright by triAGENS GmbH - All rights reserved.
///
/// The Programs (which include both the software and documentation)
/// contain proprietary information of triAGENS GmbH; they are
/// provided under a license agreement containing restrictions on use and
/// disclosure and are also protected by copyright, patent and other
/// intellectual and industrial property laws. Reverse engineering,
/// disassembly or decompilation of the Programs, except to the extent
/// required to obtain interoperability with other independently created
/// software or as specified by law, is prohibited.
///
/// The Programs are not intended for use in any nuclear, aviation, mass
/// transit, medical, or other inherently dangerous applications. It shall
/// be the licensee's responsibility to take all appropriate fail-safe,
/// backup, redundancy, and other measures to ensure the safe use of such
/// applications if the Programs are used for such purposes, and triAGENS
/// GmbH disclaims liability for any damages caused by such use of
/// the Programs.
///
/// This software is the confidential and proprietary information of
/// triAGENS GmbH. You shall not disclose such confidential and
/// proprietary information and shall use it only in accordance with the
/// terms of the license agreement you entered into with triAGENS GmbH.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Achim Brandt
/// @author Copyright 2008-2011, triagens GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////

#include "ImportHelper.h"

#include <sstream>

#include "Basics/StringUtils.h"
#include "BasicsC/json.h"
#include "BasicsC/strings.h"
#include "Rest/HttpRequest.h"
#include "SimpleHttpClient/SimpleHttpClient.h"
#include "SimpleHttpClient/SimpleHttpResult.h"

using namespace triagens::basics;
using namespace triagens::httpclient;
using namespace triagens::rest;
using namespace std;

namespace triagens {
  namespace v8client {

    ////////////////////////////////////////////////////////////////////////////////
    /// constructor and destructor
    ////////////////////////////////////////////////////////////////////////////////

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief creates an import helper
    ///
    /// Stores the HTTP client and the maximum upload size, sets the default quote,
    /// separator and end-of-line strings, disables collection creation and
    /// compiles the regular expressions used to recognise numeric CSV fields.
    ///
    /// @param _client        HTTP client used to send the import requests
    /// @param maxUploadSize  buffer size above which buffered data is uploaded
    ////////////////////////////////////////////////////////////////////////////////

    ImportHelper::ImportHelper (httpclient::SimpleHttpClient* _client, size_t maxUploadSize)
    : _client(_client),
      _maxUploadSize(maxUploadSize),
      _lineBuffer(TRI_UNKNOWN_MEM_ZONE),
      _outputBuffer(TRI_UNKNOWN_MEM_ZONE) {
      _quote = "\"";
      _separator = ",";
      // stored in escaped form (backslash + 'n'); importDelimited unescapes it
      _eol = "\\n";
      _createCollection = false;
      // floating point: optional sign, digits with optional fraction (or a
      // fraction alone), optional exponent. The exponent must accept every
      // decimal digit, otherwise values such as "1e9" would be treated as strings
      regcomp(&_doubleRegex, "^[-+]?([0-9]+\\.?[0-9]*|\\.[0-9]+)([eE][-+]?[0-9]+)?$", REG_EXTENDED);
      // integer: optional sign followed by digits only
      regcomp(&_intRegex, "^[-+]?([0-9]+)$", REG_EXTENDED);
      _hasError = false;
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief destroys the import helper, releasing both compiled regular
    /// expressions
    ////////////////////////////////////////////////////////////////////////////////

    ImportHelper::~ImportHelper () {
      regfree(&_intRegex);
      regfree(&_doubleRegex);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// public functions
    ////////////////////////////////////////////////////////////////////////////////

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief imports a delimited file
    ///
    /// Reads the input in chunks (from standard input if the file name is "-"),
    /// feeds the data to a CSV parser configured with the helper's separator,
    /// end-of-line and quote settings, and uploads whatever is left in the output
    /// buffer once the input is exhausted.
    ///
    /// @param collectionName  name of the target collection
    /// @param fileName        path of the input file, or "-" for standard input
    /// @param typeImport      for ImportHelper::CSV the first quote character is
    ///                        used (if a quote is set); otherwise quoting is off
    ///
    /// @return true if the input was read completely and no error was flagged,
    ///         false otherwise
    ////////////////////////////////////////////////////////////////////////////////

    bool ImportHelper::importDelimited (const string& collectionName,
                                        const string& fileName,
                                        const DelimitedImportType typeImport) {
      _collectionName = collectionName;
      _firstLine = "";
      _numberLines = 0;
      _numberOk = 0;
      _numberError = 0;
      _outputBuffer.clear();
      _lineBuffer.clear();
      _errorMessage = "";
      _hasError = false;

      // read and convert
      bool const useStdin = (fileName == "-");
      int fd;

      if (useStdin) {
        fd = STDIN_FILENO;
      }
      else {
        fd = open(fileName.c_str(), O_RDONLY);
      }

      if (fd < 0) {
        _errorMessage = TRI_LAST_ERROR_STR;
        return false;
      }

      size_t separatorLength;
      char* separator = TRI_UnescapeUtf8StringZ(TRI_UNKNOWN_MEM_ZONE, _separator.c_str(), _separator.size(), &separatorLength);
      if (separator == NULL) {
        _errorMessage = "out of memory";
        // do not leak the descriptor we opened ourselves
        if (! useStdin) {
          close(fd);
        }
        return false;
      }

      size_t eolLength;
      char* eol = TRI_UnescapeUtf8StringZ(TRI_UNKNOWN_MEM_ZONE, _eol.c_str(), _eol.size(), &eolLength);
      if (eol == NULL) {
        _errorMessage = "out of memory";
        // release everything acquired so far
        TRI_Free(TRI_UNKNOWN_MEM_ZONE, separator);
        if (! useStdin) {
          close(fd);
        }
        return false;
      }

      TRI_csv_parser_t parser;

      TRI_InitCsvParser(&parser,
                        TRI_UNKNOWN_MEM_ZONE,
                        ProcessCsvBegin,
                        ProcessCsvAdd,
                        ProcessCsvEnd);

      TRI_SetSeparatorCsvParser(&parser, separator, separatorLength);
      TRI_SetEolCsvParser(&parser, eol, eolLength);

      // in csv, we'll use the quote char if set
      // in tsv, we do not use the quote char
      if (typeImport == ImportHelper::CSV && _quote.size() > 0) {
        TRI_SetQuoteCsvParser(&parser, _quote[0], true);
      }
      else {
        TRI_SetQuoteCsvParser(&parser, '\0', false);
      }
      // the static Process* callbacks find this helper through _dataAdd
      parser._dataAdd = this;

      char buffer[16384];
      bool readFailed = false;

      while (! _hasError) {
        v8::HandleScope scope;

        ssize_t n = read(fd, buffer, sizeof(buffer));

        if (n < 0) {
          // capture the error text before any cleanup call can change it
          _errorMessage = TRI_LAST_ERROR_STR;
          readFailed = true;
          break;
        }
        else if (n == 0) {
          // end of input
          break;
        }

        TRI_ParseCsvString2(&parser, buffer, n);
      }

      // upload the remainder, unless reading the input failed
      if (! readFailed && _outputBuffer.length() > 0) {
        sendCsvBuffer();
      }

      // single cleanup path for both the success and the read-error case
      TRI_DestroyCsvParser(&parser);
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, separator);
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, eol);

      if (! useStdin) {
        close(fd);
      }

      _outputBuffer.clear();
      return ! readFailed && ! _hasError;
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief imports a file containing JSON documents
    ///
    /// The input either holds one JSON document per line or a single JSON array
    /// with all documents; the format is detected from the first non-whitespace
    /// character. Line-wise input is uploaded in batches that end at a newline
    /// once the buffer exceeds the maximum upload size. Array input cannot be
    /// split and is rejected when it exceeds that size.
    ///
    /// @param collectionName  name of the target collection
    /// @param fileName        path of the input file, or "-" for standard input
    ///
    /// @return true if the input was read completely and no error was flagged,
    ///         false otherwise
    ////////////////////////////////////////////////////////////////////////////////

    bool ImportHelper::importJson (const string& collectionName, const string& fileName) {
      _collectionName = collectionName;
      _firstLine = "";
      _numberLines = 0;
      _numberOk = 0;
      _numberError = 0;
      _outputBuffer.clear();
      _errorMessage = "";
      _hasError = false;

      // read and convert
      bool const useStdin = (fileName == "-");
      int fd;

      if (useStdin) {
        fd = STDIN_FILENO;
      }
      else {
        fd = open(fileName.c_str(), O_RDONLY);
      }

      if (fd < 0) {
        _errorMessage = TRI_LAST_ERROR_STR;
        return false;
      }

      char buffer[16384];
      bool isArray = false;
      bool failed = false;

      while (! _hasError) {
        ssize_t n = read(fd, buffer, sizeof(buffer));

        if (n < 0) {
          _errorMessage = TRI_LAST_ERROR_STR;
          failed = true;
          break;
        }
        else if (n == 0) {
          // end of input
          break;
        }

        if (_outputBuffer.length() == 0) {
          // detect the import file format (single lines with individual JSON objects
          // or a JSON array with all documents)
          const string firstChar = StringUtils::lTrim(string(buffer, n), "\r\n\t\f\b ").substr(0, 1);
          isArray = (firstChar == "[");
        }

        _outputBuffer.appendText(buffer, n);

        if (_outputBuffer.length() > _maxUploadSize) {
          if (isArray) {
            // an array must be sent in one piece, so it cannot be batched
            _errorMessage = "import file is too big.";
            failed = true;
            break;
          }

          // send all data before last '\n'
          const char* first = _outputBuffer.c_str();
          const char* pos = static_cast<const char*>(memrchr(first, '\n', _outputBuffer.length()));

          if (pos != NULL) {
            size_t len = pos - first + 1;
            sendJsonBuffer(first, len, isArray);
            _outputBuffer.erase_front(len);
          }

        }
      }

      // upload the remainder, unless reading or batching failed
      if (! failed && _outputBuffer.length() > 0) {
        sendJsonBuffer(_outputBuffer.c_str(), _outputBuffer.length(), isArray);
      }

      _numberLines = _numberError + _numberOk;

      // single cleanup path: the descriptor is closed on errors as well
      if (! useStdin) {
        close(fd);
      }

      _outputBuffer.clear();
      return ! failed && ! _hasError;
    }


    ////////////////////////////////////////////////////////////////////////////////
    /// private functions
    ////////////////////////////////////////////////////////////////////////////////

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief builds the collection-related part of the import URL
    ///
    /// @return "collection=" followed by the URL-encoded collection name, plus
    ///         "&createCollection=yes" if collection creation is enabled
    ////////////////////////////////////////////////////////////////////////////////

    string ImportHelper::getCollectionUrlPart () {
      string urlPart("collection=");
      urlPart += StringUtils::urlEncode(_collectionName);

      if (_createCollection) {
        urlPart.append("&createCollection=yes");
      }

      return urlPart;
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief CSV parser callback: a new line starts
    ///
    /// Forwards the call to the ImportHelper stored in the parser's _dataAdd
    /// member; does nothing if none is set.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::ProcessCsvBegin (TRI_csv_parser_t* parser, size_t row) {
      ImportHelper* helper = reinterpret_cast<ImportHelper*> (parser->_dataAdd);

      if (helper == 0) {
        return;
      }

      helper->beginLine(row);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief starts a new line in the line buffer
    ///
    /// Content still sitting in the line buffer is counted as an error and
    /// discarded. Every line but the first is preceded by a newline, and each
    /// line opens a JSON array.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::beginLine(size_t row) {
      bool const hasLeftover = (_lineBuffer.length() > 0);

      if (hasLeftover) {
        // the previous line was never completed
        ++_numberError;
        _lineBuffer.clear();
      }

      ++_numberLines;

      if (row != 0) {
        _lineBuffer.appendChar('\n');
      }

      _lineBuffer.appendChar('[');
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief CSV parser callback: a field was read
    ///
    /// Forwards the call to the ImportHelper stored in the parser's _dataAdd
    /// member; does nothing if none is set.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::ProcessCsvAdd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
      ImportHelper* helper = reinterpret_cast<ImportHelper*> (parser->_dataAdd);

      if (helper == 0) {
        return;
      }

      helper->addField(field, row, column, escaped);
    }

    void ImportHelper::addField (char const* field, size_t row, size_t column, bool escaped) {
      if (column > 0) {
        _lineBuffer.appendChar(',');
      }

      if (row == 0) {
        // head line
        _lineBuffer.appendChar('"');
        _lineBuffer.appendText(StringUtils::escapeUnicode(field));
        _lineBuffer.appendChar('"');
      }
      else {
        if (escaped) {
          _lineBuffer.appendChar('"');
          _lineBuffer.appendText(StringUtils::escapeUnicode(field));
          _lineBuffer.appendChar('"');
        }
        else {
          string s(field);
          if (s.length() == 0) {
            // do nothing
            _lineBuffer.appendText("null");
          }
          else if ("true" == s || "false" == s) {
            _lineBuffer.appendText(field);
          }
          else {
            if (regexec(&_intRegex, s.c_str(), 0, 0, 0) == 0) {
              int64_t num = StringUtils::int64(s);
              _lineBuffer.appendInteger(num);
            }
            else if (regexec(&_doubleRegex, s.c_str(), 0, 0, 0) == 0) {
              double num = StringUtils::doubleDecimal(s);
              _lineBuffer.appendDecimal(num);
            }
            else {
              _lineBuffer.appendChar('"');
              _lineBuffer.appendText(StringUtils::escapeUnicode(field));
              _lineBuffer.appendChar('"');
            }
          }
        }
      }
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief CSV parser callback: the last field of a line was read
    ///
    /// Forwards the call to the ImportHelper stored in the parser's _dataAdd
    /// member; does nothing if none is set.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::ProcessCsvEnd (TRI_csv_parser_t* parser, char const* field, size_t row, size_t column, bool escaped) {
      ImportHelper* helper = reinterpret_cast<ImportHelper*> (parser->_dataAdd);

      if (helper == 0) {
        return;
      }

      helper->addLastField(field, row, column, escaped);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief appends the last field of a line and completes the line
    ///
    /// A line whose only field is blank is dropped. Row 0 is remembered as the
    /// head line; later rows are counted as errors while no head line is stored.
    /// The finished line moves to the output buffer. Once that buffer exceeds the
    /// maximum upload size it is sent, and the head line is written again to
    /// start the next batch.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::addLastField (char const* field, size_t row, size_t column, bool escaped) {
      bool const blankLine = (column == 0 && StringUtils::trim(field) == "");

      if (blankLine) {
        // nothing to import for an empty line
        _lineBuffer.clear();
        return;
      }

      addField(field, row, column, escaped);
      _lineBuffer.appendChar(']');

      if (row == 0) {
        // remember the head line so it can start every batch
        _firstLine = _lineBuffer.c_str();
      }
      else if (_firstLine == "") {
        // a data line without a head line cannot be imported
        ++_numberError;
        _lineBuffer.clear();
        return;
      }

      // move the completed line to the output buffer
      if (_lineBuffer.length() == 0) {
        ++_numberError;
      }
      else {
        _outputBuffer.appendText(_lineBuffer);
        _lineBuffer.clear();
      }

      if (_outputBuffer.length() > _maxUploadSize) {
        sendCsvBuffer();
        _outputBuffer.appendText(_firstLine);
      }
    }


    void ImportHelper::sendCsvBuffer () {
      if (_hasError) {
        return;
      }

      map<string, string> headerFields;
      SimpleHttpResult* result = _client->request(HttpRequest::HTTP_REQUEST_POST, "/_api/import?" + getCollectionUrlPart(), _outputBuffer.c_str(), _outputBuffer.length(), headerFields);

      handleResult(result);

      _outputBuffer.clear();
    }

    void ImportHelper::sendJsonBuffer (char const* str, size_t len, bool isArray) {
      if (_hasError) {
        return;
      }

      map<string, string> headerFields;
      SimpleHttpResult* result;
      if (isArray) {
        result = _client->request(HttpRequest::HTTP_REQUEST_POST, "/_api/import?type=array&" + getCollectionUrlPart(), str, len, headerFields);
      }
      else {
        result = _client->request(HttpRequest::HTTP_REQUEST_POST, "/_api/import?type=documents&" + getCollectionUrlPart(), str, len, headerFields);
      }

      handleResult(result);
    }

    ////////////////////////////////////////////////////////////////////////////////
    /// @brief evaluates the response of an import request and deletes it
    ///
    /// The body is parsed as JSON. A boolean "error" attribute that is true flags
    /// the helper as failed and a string "errorMessage" is stored as the error
    /// message. Numeric "created" and "errors" attributes are added to the
    /// success and error counters. A null result is ignored.
    ////////////////////////////////////////////////////////////////////////////////

    void ImportHelper::handleResult (SimpleHttpResult* result) {
      if (result == 0) {
        return;
      }

      TRI_json_t* json = TRI_JsonString(TRI_UNKNOWN_MEM_ZONE, result->getBody().str().c_str());

      if (json != 0) {
        // all lookups below return pointers into json, not copies
        TRI_json_t* errorFlag = TRI_LookupArrayJson(json, "error");

        if (errorFlag != 0 && errorFlag->_type == TRI_JSON_BOOLEAN && errorFlag->_value._boolean) {
          _hasError = true;

          TRI_json_t* message = TRI_LookupArrayJson(json, "errorMessage");

          if (message != 0 && message->_type == TRI_JSON_STRING) {
            _errorMessage = string(message->_value._string.data, message->_value._string.length);
          }
        }

        // number of documents created by this request
        TRI_json_t* created = TRI_LookupArrayJson(json, "created");

        if (created != 0 && created->_type == TRI_JSON_NUMBER) {
          _numberOk += (size_t) created->_value._number;
        }

        // number of errors reported for this request
        TRI_json_t* errors = TRI_LookupArrayJson(json, "errors");

        if (errors != 0 && errors->_type == TRI_JSON_NUMBER) {
          _numberError += (size_t) errors->_value._number;
        }

        // frees the json struct together with all of its sub-elements
        TRI_FreeJson(TRI_UNKNOWN_MEM_ZONE, json);
      }

      delete result;
    }

  }
}