arangodb/arangod/Indexes/HashIndex.cpp

////////////////////////////////////////////////////////////////////////////////
/// @brief hash index
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2014 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Copyright 2014, ArangoDB GmbH, Cologne, Germany
/// @author Copyright 2011-2013, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////

#include "HashIndex.h"
#include "VocBase/transaction.h"
#include "VocBase/VocShaper.h"

using namespace triagens::arango;

// -----------------------------------------------------------------------------
// --SECTION--                                                 private functions
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief Frees an index element
////////////////////////////////////////////////////////////////////////////////

static void FreeElement(TRI_index_element_t* element) {
  TRI_index_element_t::free(element);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief determines if two elements are equal
////////////////////////////////////////////////////////////////////////////////

static bool IsEqualElementElement (TRI_index_element_t const* left,
                                   TRI_index_element_t const* right) {
  return left->document() == right->document();
}

////////////////////////////////////////////////////////////////////////////////
/// @brief given a key generates a hash integer
////////////////////////////////////////////////////////////////////////////////

static uint64_t HashKey (TRI_index_search_value_t const* key) {
  uint64_t hash = 0x0123456789abcdef;

  for (size_t j = 0;  j < key->_length;  ++j) {
    // ignore the sid for hashing
    hash = fasthash64(key->_values[j]._data.data, key->_values[j]._data.length, hash);
  }

  return hash;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief determines if a key corresponds to an element
////////////////////////////////////////////////////////////////////////////////

static bool IsEqualKeyElement (TRI_index_search_value_t const* left,
                               TRI_index_element_t const* right) {
  TRI_ASSERT_EXPENSIVE(right->document() != nullptr);

  for (size_t j = 0;  j < left->_length; ++j) {
    TRI_shaped_json_t* leftJson = &left->_values[j];
    TRI_shaped_sub_t* rightSub = &right->subObjects()[j];

    if (leftJson->_sid != rightSub->_sid) {
      return false;
    }

    auto length = leftJson->_data.length;

    char const* rightData;
    size_t rightLength;
    TRI_InspectShapedSub(rightSub, right->document(), rightData, rightLength);

    if (length != rightLength) {
      return false;
    }

    if (length > 0 && memcmp(leftJson->_data.data, rightData, length) != 0) {
      return false;
    }
  }

  return true;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief fills the index search from hash index element
////////////////////////////////////////////////////////////////////////////////

static int FillIndexSearchValueByHashIndexElement (HashIndex const* hashIndex,
                                                   TRI_index_search_value_t* key,
                                                   TRI_index_element_t const* element) {
  key->_values = static_cast<TRI_shaped_json_t*>(TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, hashIndex->keyEntrySize(), false));

  if (key->_values == nullptr) {
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  char const* ptr = element->document()->getShapedJsonPtr();  // ONLY IN INDEX
  size_t const n = hashIndex->paths().size();

  for (size_t i = 0;  i < n;  ++i) {
    auto sid = element->subObjects()[i]._sid;
    key->_values[i]._sid = sid;

    TRI_InspectShapedSub(&element->subObjects()[i], ptr, key->_values[i]);
  }
  key->_length = n;

  return TRI_ERROR_NO_ERROR;
}

// -----------------------------------------------------------------------------
// --SECTION--                                      class HashIndex::UniqueArray
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief create the unique array
////////////////////////////////////////////////////////////////////////////////

HashIndex::UniqueArray::UniqueArray (TRI_HashArray_t* hashArray,
                                     HashElementFunc* hashElement)
  : _hashArray(hashArray),
    _hashElement(hashElement) {

  TRI_ASSERT(_hashArray != nullptr);
  TRI_ASSERT(_hashElement != nullptr);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief destroy the unique array
////////////////////////////////////////////////////////////////////////////////

HashIndex::UniqueArray::~UniqueArray () {
  if (_hashArray != nullptr) {
    _hashArray->invokeOnAllElements(FreeElement);
  }

  delete _hashArray;
  delete _hashElement;
}

// -----------------------------------------------------------------------------
// --SECTION--                                       class HashIndex::MultiArray
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief create the multi array
////////////////////////////////////////////////////////////////////////////////

HashIndex::MultiArray::MultiArray (TRI_HashArrayMulti_t* hashArray,
                                   HashElementFunc* hashElement,
                                   IsEqualElementElementByKey* isEqualElElByKey)
  : _hashArray(hashArray),
    _hashElement(hashElement),
    _isEqualElElByKey(isEqualElElByKey) {

  TRI_ASSERT(_hashArray != nullptr);
  TRI_ASSERT(_hashElement != nullptr);
  TRI_ASSERT(_isEqualElElByKey != nullptr);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief destroy the multi array
////////////////////////////////////////////////////////////////////////////////

HashIndex::MultiArray::~MultiArray () {
  if (_hashArray != nullptr) {
    _hashArray->invokeOnAllElements(FreeElement);
  }

  delete _hashArray;
  delete _hashElement;
  delete _isEqualElElByKey;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                   class HashIndex
// -----------------------------------------------------------------------------

// -----------------------------------------------------------------------------
// --SECTION--                                      constructors and destructors
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief create the index
////////////////////////////////////////////////////////////////////////////////

HashIndex::HashIndex (TRI_idx_iid_t iid,
                      TRI_document_collection_t* collection,
                      std::vector<std::vector<triagens::basics::AttributeName>> const& fields,
                      bool unique,
                      bool sparse)
  : PathBasedIndex(iid, collection, fields, unique, sparse),
    _uniqueArray(nullptr) {

  uint32_t indexBuckets = 1;

  if (collection != nullptr) {
    // document is a nullptr in the coordinator case
    indexBuckets = collection->_info._indexBuckets;
  }

  std::unique_ptr<HashElementFunc> func(new HashElementFunc(_paths.size()));

  if (unique) {
    std::unique_ptr<TRI_HashArray_t> array(new TRI_HashArray_t(HashKey,
                                                               *(func.get()),
                                                               IsEqualKeyElement,
                                                               IsEqualElementElement,
                                                               indexBuckets,
                                                               [] () -> std::string { return "unique hash-array"; }));

    _uniqueArray = new HashIndex::UniqueArray(array.get(), func.get());
    array.release();
  }
  else {
    _multiArray = nullptr;

    std::unique_ptr<IsEqualElementElementByKey> compare(new IsEqualElementElementByKey(_paths.size()));

    std::unique_ptr<TRI_HashArrayMulti_t> array(new TRI_HashArrayMulti_t(HashKey,
                                                                         *(func.get()),
                                                                         IsEqualKeyElement,
                                                                         IsEqualElementElement,
                                                                         *(compare.get()),
                                                                         indexBuckets,
                                                                         64,
                                                                         [] () -> std::string { return "multi hash-array"; }));

    _multiArray = new HashIndex::MultiArray(array.get(), func.get(), compare.get());

    compare.release();
    array.release();
  }

  func.release();
}

////////////////////////////////////////////////////////////////////////////////
/// @brief destroys the index
////////////////////////////////////////////////////////////////////////////////

HashIndex::~HashIndex () {
  if (_unique) {
    delete _uniqueArray;
  }
  else {
    delete _multiArray;
  }
}

// -----------------------------------------------------------------------------
// --SECTION--                                                    public methods
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief returns a selectivity estimate for the index
////////////////////////////////////////////////////////////////////////////////

double HashIndex::selectivityEstimate () const {
  if (_unique) {
    return 1.0;
  }

  double estimate = _multiArray->_hashArray->selectivity();
  TRI_ASSERT(estimate >= 0.0 && estimate <= 1.00001); // floating-point tolerance
  return estimate;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief returns the index memory usage
////////////////////////////////////////////////////////////////////////////////

size_t HashIndex::memory () const {
  if (_unique) {
    return static_cast<size_t>(keyEntrySize() * _uniqueArray->_hashArray->size() +
                               _uniqueArray->_hashArray->memoryUsage());
  }

  return static_cast<size_t>(keyEntrySize() * _multiArray->_hashArray->size() +
                             _multiArray->_hashArray->memoryUsage());
}

////////////////////////////////////////////////////////////////////////////////
/// @brief return a JSON representation of the index
////////////////////////////////////////////////////////////////////////////////

triagens::basics::Json HashIndex::toJson (TRI_memory_zone_t* zone,
                                          bool withFigures) const {
  auto json = Index::toJson(zone, withFigures);

  json("unique", triagens::basics::Json(zone, _unique))
      ("sparse", triagens::basics::Json(zone, _sparse));

  return json;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief return a JSON representation of the index figures
////////////////////////////////////////////////////////////////////////////////

triagens::basics::Json HashIndex::toJsonFigures (TRI_memory_zone_t* zone) const {
  triagens::basics::Json json(zone, triagens::basics::Json::Object);
  json("memory", triagens::basics::Json(static_cast<double>(memory())));

  if (_unique) {
    _uniqueArray->_hashArray->appendToJson(zone, json);
  }
  else {
    _multiArray->_hashArray->appendToJson(zone, json);
  }
  return json;
}

int HashIndex::insert (TRI_doc_mptr_t const* doc,
                       bool isRollback) {
  if (_unique) {
    return insertUnique(doc, isRollback);
  }
  return insertMulti(doc, isRollback);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief removes an entry from the hash array part of the hash index
////////////////////////////////////////////////////////////////////////////////

int HashIndex::remove (TRI_doc_mptr_t const* doc,
                       bool isRollback) {

  if (_unique) {
    return removeUnique(doc, isRollback);
  }
  return removeMulti(doc, isRollback);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief provides a size hint for the hash index
////////////////////////////////////////////////////////////////////////////////

int HashIndex::sizeHint (size_t size) {
  if (_sparse) {
    // for sparse indexes, we assume that we will have less index entries
    // than if the index would be fully populated
    size /= 5;
  }

  if (_unique) {
    return _uniqueArray->_hashArray->resize(size);
  }
  else {
    return _multiArray->_hashArray->resize(size);
  }
}

////////////////////////////////////////////////////////////////////////////////
/// @brief locates entries in the hash index given shaped json objects
////////////////////////////////////////////////////////////////////////////////

int HashIndex::lookup (TRI_index_search_value_t* searchValue,
                       std::vector<TRI_doc_mptr_copy_t>& documents) const {

  if (_unique) {
    TRI_index_element_t* found = _uniqueArray->_hashArray->findByKey(searchValue);

    if (found != nullptr) {
      // unique hash index: maximum number is 1
      documents.emplace_back(*(found->document()));
    }
    return TRI_ERROR_NO_ERROR;
  }

  std::vector<TRI_index_element_t*>* results = nullptr;
  try {
    results = _multiArray->_hashArray->lookupByKey(searchValue);
  }
  catch (...) {
    return TRI_ERROR_OUT_OF_MEMORY;
  }
  if (results != nullptr) {
    try {
      for (size_t i = 0; i < results->size(); i++) {
        documents.emplace_back(*((*results)[i]->document()));
      }
      delete results;
    }
    catch (...) {
      delete results;
      return TRI_ERROR_OUT_OF_MEMORY;
    }
  }
  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief locates entries in the hash index given shaped json objects
////////////////////////////////////////////////////////////////////////////////

int HashIndex::lookup (TRI_index_search_value_t* searchValue,
                       std::vector<TRI_doc_mptr_copy_t>& documents,
                       TRI_index_element_t*& next,
                       size_t batchSize) const {

  if (_unique) {
    next = nullptr;
    TRI_index_element_t* found = _uniqueArray->_hashArray->findByKey(searchValue);

    if (found != nullptr) {
      // unique hash index: maximum number is 1
      documents.emplace_back(*(found->document()));
    }
    return TRI_ERROR_NO_ERROR;
  }

  std::vector<TRI_index_element_t*>* results = nullptr;

  if (next == nullptr) {
    try {
      results = _multiArray->_hashArray->lookupByKey(searchValue, batchSize);
    }
    catch (...) {
      return TRI_ERROR_OUT_OF_MEMORY;
    }
  }
  else {
    try {
      results = _multiArray->_hashArray->lookupByKeyContinue(next, batchSize);
    }
    catch (...) {
      return TRI_ERROR_OUT_OF_MEMORY;
    }
  }

  if (results != nullptr) {
    if (results->size() > 0) {
      next = results->back();  // for continuation the next time
      try {
        for (size_t i = 0; i < results->size(); i++) {
          documents.emplace_back(*((*results)[i]->document()));
        }
      }
      catch (...) {
        delete results;
        return TRI_ERROR_OUT_OF_MEMORY;
      }
    }
    else {
      next = nullptr;
    }
    delete results;
  }
  else {
    next = nullptr;
  }
  return TRI_ERROR_NO_ERROR;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                   private methods
// -----------------------------------------------------------------------------

int HashIndex::insertUnique (TRI_doc_mptr_t const* doc,
                             bool isRollback) {

  auto allocate = [this] () -> TRI_index_element_t* {
    return TRI_index_element_t::allocate(keyEntrySize(), false);
  };

  std::vector<TRI_index_element_t*> elements;
  int res = fillElement(allocate, elements, doc);

  auto work = [this] (TRI_index_element_t* element, bool isRollback) -> int {
    TRI_IF_FAILURE("InsertHashIndex") {
      return TRI_ERROR_DEBUG;
    }

    TRI_index_search_value_t key;
    int res = FillIndexSearchValueByHashIndexElement(this, &key, element);

    if (res != TRI_ERROR_NO_ERROR) {
      // out of memory
      return res;
    }

    res = _uniqueArray->_hashArray->insert(&key, element, isRollback);

    if (key._values != nullptr) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, key._values);
    }

    return res;
  };

  size_t count = elements.size();
  for (size_t i = 0; i < count; ++i) {
    auto hashElement = elements[i];
    res = work(hashElement, isRollback);
    if (res != TRI_ERROR_NO_ERROR) {
      for (size_t j = i; j < count; ++j) {
        // Free all elements that are not yet in the index
        FreeElement(elements[j]);
      }
      // Allready indexed elements will be removed by the rollback
      return res;
    }
  }
  return res;
}

int HashIndex::insertMulti (TRI_doc_mptr_t const* doc,
                            bool isRollback) {

  auto allocate = [this] () -> TRI_index_element_t* {
    return TRI_index_element_t::allocate(keyEntrySize(), false);
  };

  std::vector<TRI_index_element_t*> elements;
  int res = fillElement(allocate, elements, doc);

  auto work = [this] (TRI_index_element_t* element, bool isRollback) -> int {
    TRI_IF_FAILURE("InsertHashIndex") {
      return TRI_ERROR_DEBUG;
    }

    TRI_index_element_t* found = _multiArray->_hashArray->insert(element, false, true);

    if (found != nullptr) {   // bad, can only happen if we are in a rollback
      if (isRollback) {       // in which case we silently ignore it
        return TRI_ERROR_NO_ERROR;
      }
      // This is TRI_RESULT_ELEMENT_EXISTS, but this should not happen:
      return TRI_ERROR_INTERNAL;
    }

    return TRI_ERROR_NO_ERROR;
  };

  size_t const count = elements.size();
  for (size_t i = 0; i < count; ++i) {
    auto hashElement = elements[i];
    res = work(hashElement, isRollback);
    if (res != TRI_ERROR_NO_ERROR) {
      for (size_t j = i; j < count; ++j) {
        // Free all elements that are not yet in the index
        FreeElement(elements[j]);
      }
      for (size_t j = 0; j < i; ++j) {
        // Remove all allready indexed elements and free them
        removeMultiElement(elements[j], isRollback);
      }
      return res;
    }
  }
  return res;
}

int HashIndex::removeUniqueElement (TRI_index_element_t* element, bool isRollback) {
  TRI_IF_FAILURE("RemoveHashIndex") {
    return TRI_ERROR_DEBUG;
  }
  TRI_index_element_t* old = _uniqueArray->_hashArray->remove(element);

  // this might happen when rolling back
  if (old == nullptr) {
    if (isRollback) {
      return TRI_ERROR_NO_ERROR;
    }
    else {
      return TRI_ERROR_INTERNAL;
    }
  }

  FreeElement(old);

  return TRI_ERROR_NO_ERROR;
}

int HashIndex::removeUnique (TRI_doc_mptr_t const* doc, bool isRollback) {
  auto allocate = [this] () -> TRI_index_element_t* {
    return TRI_index_element_t::allocate(keyEntrySize(), false);
  };
  std::vector<TRI_index_element_t*> elements;
  int res = fillElement(allocate, elements, doc);

  if (res != TRI_ERROR_NO_ERROR) {
    for (auto& hashElement : elements) {
      FreeElement(hashElement);
    }
    return res;
  }

  for (auto& hashElement : elements) {
    res = removeUniqueElement(hashElement, isRollback);
    FreeElement(hashElement);
  }
  return res;
}

int HashIndex::removeMultiElement (TRI_index_element_t* element, bool isRollback) {
    TRI_IF_FAILURE("RemoveHashIndex") {
      return TRI_ERROR_DEBUG;
    }

    TRI_index_element_t* old = _multiArray->_hashArray->remove(element);

    if (old == nullptr) {
      // not found
      if (isRollback) {   // ignore in this case, because it can happen
        return TRI_ERROR_NO_ERROR;
      }
      else {
        return TRI_ERROR_INTERNAL;
      }
    }
    FreeElement(old);
    return TRI_ERROR_NO_ERROR;
}

int HashIndex::removeMulti (TRI_doc_mptr_t const* doc, bool isRollback) {

  auto allocate = [this] () -> TRI_index_element_t* {
    return TRI_index_element_t::allocate(keyEntrySize(), false);
  };
  std::vector<TRI_index_element_t*> elements;
  int res = fillElement(allocate, elements, doc);

  for (auto& hashElement : elements) {
    res = removeMultiElement(hashElement, isRollback);
    FreeElement(hashElement);
  }

  return res;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                       END-OF-FILE
// -----------------------------------------------------------------------------

// Local Variables:
// mode: outline-minor
// outline-regexp: "/// @brief\\|/// {@inheritDoc}\\|/// @page\\|// --SECTION--\\|/// @\\}"
// End: