//////////////////////////////////////////////////////////////////////////////// /// @brief hash index /// /// @file /// /// DISCLAIMER /// /// Copyright 2014 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Dr. Frank Celler /// @author Copyright 2014, ArangoDB GmbH, Cologne, Germany /// @author Copyright 2011-2013, triAGENS GmbH, Cologne, Germany //////////////////////////////////////////////////////////////////////////////// #include "HashIndex.h" #include "VocBase/transaction.h" #include "VocBase/VocShaper.h" using namespace triagens::arango; // ----------------------------------------------------------------------------- // --SECTION-- private functions // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief determines if two elements are equal //////////////////////////////////////////////////////////////////////////////// static bool isEqualElementElement (TRI_index_element_t const* left, TRI_index_element_t const* right) { return left->document() == right->document(); } //////////////////////////////////////////////////////////////////////////////// /// @brief given a key generates a hash integer //////////////////////////////////////////////////////////////////////////////// static uint64_t hashKey (TRI_index_search_value_t const* key) { uint64_t hash = 0x0123456789abcdef; for (size_t j = 0; j < key->_length; ++j) { // ignore the sid for hashing hash = fasthash64(key->_values[j]._data.data, key->_values[j]._data.length, hash); } return hash; } //////////////////////////////////////////////////////////////////////////////// /// @brief determines if a key corresponds to an element //////////////////////////////////////////////////////////////////////////////// static bool isEqualKeyElement (TRI_index_search_value_t const* left, TRI_index_element_t const* right) { TRI_ASSERT_EXPENSIVE(right->document() != nullptr); for (size_t j = 0; j < left->_length; ++j) { TRI_shaped_json_t* leftJson = &left->_values[j]; TRI_shaped_sub_t* rightSub = &right->subObjects()[j]; if (leftJson->_sid != rightSub->_sid) { return false; } auto length = leftJson->_data.length; char const* rightData; size_t rightLength; TRI_InspectShapedSub(rightSub, right->document(), rightData, rightLength); if (length != rightLength) { return false; } if (length > 0 && memcmp(leftJson->_data.data, rightData, length) != 0) { return false; } } return true; } //////////////////////////////////////////////////////////////////////////////// /// @brief fills the index search from hash index element //////////////////////////////////////////////////////////////////////////////// static int FillIndexSearchValueByHashIndexElement (HashIndex const* hashIndex, TRI_index_search_value_t* key, TRI_index_element_t const* element) { key->_values = static_cast(TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, hashIndex->keyEntrySize(), false)); if (key->_values == nullptr) { return TRI_ERROR_OUT_OF_MEMORY; } char const* ptr = element->document()->getShapedJsonPtr(); // ONLY IN INDEX size_t const n = hashIndex->paths().size(); for (size_t i = 0; i < n; ++i) { auto sid = element->subObjects()[i]._sid; key->_values[i]._sid = sid; TRI_InspectShapedSub(&element->subObjects()[i], ptr, key->_values[i]); } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief helper for hashing /// /// This function takes a document master pointer and creates a corresponding /// hash index element. The index element contains the document master pointer /// and a lists of offsets and sizes describing the parts of the documents to be /// hashed and the shape identifier of each part. //////////////////////////////////////////////////////////////////////////////// /* static int HashIndexHelper (HashIndex const* hashIndex, TRI_hash_index_element_t* hashElement, TRI_doc_mptr_t const* document) { TRI_shaped_json_t shapedJson; // the object behind document auto shaper = hashIndex->collection()->getShaper(); // ONLY IN INDEX, PROTECTED by RUNTIME bool const sparse = hashIndex->sparse(); // ............................................................................. // Assign the document to the TRI_hash_index_element_t structure - so that it // can later be retreived. // ............................................................................. TRI_EXTRACT_SHAPED_JSON_MARKER(shapedJson, document->getDataPtr()); // ONLY IN INDEX, PROTECTED by RUNTIME hashElement->_document = const_cast(document); char const* ptr = document->getShapedJsonPtr(); // ONLY IN INDEX // ............................................................................. // Extract the attribute values // ............................................................................. int res = TRI_ERROR_NO_ERROR; auto const& paths = hashIndex->paths(); size_t const n = paths.size(); for (size_t j = 0; j < n; ++j) { TRI_shape_pid_t path = paths[j]; // determine if document has that particular shape TRI_shape_access_t const* acc = shaper->findAccessor(shapedJson._sid, path); // field not part of the object if (acc == nullptr || acc->_resultSid == TRI_SHAPE_ILLEGAL) { hashElement->_subObjects[j]._sid = BasicShapes::TRI_SHAPE_SID_NULL; res = TRI_ERROR_ARANGO_INDEX_DOCUMENT_ATTRIBUTE_MISSING; if (sparse) { // no need to continue return res; } } } */ //////////////////////////////////////////////////////////////////////////////// /// @brief locates a key within the hash array part /// it is the callers responsibility to destroy the result //////////////////////////////////////////////////////////////////////////////// static TRI_vector_pointer_t HashIndex_find (TRI_hash_array_t const* hashArray, TRI_index_search_value_t* key) { TRI_vector_pointer_t results; TRI_InitVectorPointer(&results, TRI_UNKNOWN_MEM_ZONE); // ............................................................................. // A find request means that a set of values for the "key" was sent. We need // to locate the hash array entry by key. // ............................................................................. TRI_index_element_t* result = hashArray->findByKey(key); if (result != nullptr) { // unique hash index: maximum number is 1 TRI_PushBackVectorPointer(&results, result->document()); } return results; } //////////////////////////////////////////////////////////////////////////////// /// @brief locates a key within the hash array part //////////////////////////////////////////////////////////////////////////////// static int HashIndex_find (TRI_hash_array_t const* hashArray, TRI_index_search_value_t* key, std::vector& result) { // ............................................................................. // A find request means that a set of values for the "key" was sent. We need // to locate the hash array entry by key. // ............................................................................. TRI_index_element_t* found = hashArray->findByKey(key); if (found != nullptr) { // unique hash index: maximum number is 1 result.emplace_back(*(found->document())); } return TRI_ERROR_NO_ERROR; } // ----------------------------------------------------------------------------- // --SECTION-- class Index // ----------------------------------------------------------------------------- // ----------------------------------------------------------------------------- // --SECTION-- constructors and destructors // ----------------------------------------------------------------------------- HashIndex::HashIndex (TRI_idx_iid_t iid, TRI_document_collection_t* collection, std::vector> const& fields, bool unique, bool sparse) : Index(iid, collection, fields), _paths(fillPidPaths()), _unique(unique), _sparse(sparse) { TRI_ASSERT(! fields.empty()); TRI_ASSERT(iid != 0); if (unique) { _hashArray = nullptr; try { _hashArray = new TRI_hash_array_t(_paths.size()); } catch (...) { THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } } else { uint32_t indexBuckets = 1; if (collection != nullptr) { // document is a nullptr in the coordinator case indexBuckets = collection->_info._indexBuckets; } _multi._hashArray = nullptr; _multi._isEqualElElByKey = nullptr; _multi._hashElement = nullptr; try { _multi._hashElement = new HashElementFunc(_paths.size()); _multi._isEqualElElByKey = new IsEqualElementElementByKey(_paths.size()); _multi._hashArray = new TRI_HashArrayMulti_t(hashKey, *_multi._hashElement, isEqualKeyElement, isEqualElementElement, *_multi._isEqualElElByKey, indexBuckets); } catch (...) { delete _multi._hashElement; _multi._hashElement = nullptr; delete _multi._isEqualElElByKey; _multi._isEqualElElByKey = nullptr; _multi._hashArray = nullptr; THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } } } HashIndex::~HashIndex () { if (_unique) { delete _hashArray; _hashArray = nullptr; } else { delete _multi._hashElement; delete _multi._isEqualElElByKey; delete _multi._hashArray; // FIXME: should we free the pointers in there? } } // ----------------------------------------------------------------------------- // --SECTION-- public methods // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief returns a selectivity estimate for the index //////////////////////////////////////////////////////////////////////////////// double HashIndex::selectivityEstimate () const { if (_unique) { return 1.0; } double estimate = _multi._hashArray->selectivity(); TRI_ASSERT(estimate >= 0.0 && estimate <= 1.00001); // floating-point tolerance return estimate; } size_t HashIndex::memory () const { if (_unique) { return static_cast(keyEntrySize() * _hashArray->size() + _hashArray->memoryUsage()); } return static_cast(keyEntrySize() * _multi._hashArray->size() + _multi._hashArray->memoryUsage()); } //////////////////////////////////////////////////////////////////////////////// /// @brief return a JSON representation of the index //////////////////////////////////////////////////////////////////////////////// triagens::basics::Json HashIndex::toJson (TRI_memory_zone_t* zone) const { auto json = Index::toJson(zone); json("unique", triagens::basics::Json(zone, _unique)) ("sparse", triagens::basics::Json(zone, _sparse)); return json; } int HashIndex::insert (TRI_doc_mptr_t const* doc, bool isRollback) { if (_unique) { return insertUnique(doc, isRollback); } return insertMulti(doc, isRollback); } //////////////////////////////////////////////////////////////////////////////// /// @brief removes an entry from the hash array part of the hash index //////////////////////////////////////////////////////////////////////////////// int HashIndex::remove (TRI_doc_mptr_t const* doc, bool isRollback) { if (_unique) { return removeUnique(doc, isRollback); } return removeMulti(doc, isRollback); } //////////////////////////////////////////////////////////////////////////////// /// @brief provides a size hint for the hash index //////////////////////////////////////////////////////////////////////////////// int HashIndex::sizeHint (size_t size) { if (_sparse) { // for sparse indexes, we assume that we will have less index entries // than if the index would be fully populated size /= 5; } if (_unique) { return _hashArray->resize(this, size); } else { return _multi._hashArray->resize(size); } } //////////////////////////////////////////////////////////////////////////////// /// @brief locates entries in the hash index given shaped json objects /// it is the callers responsibility to destroy the result //////////////////////////////////////////////////////////////////////////////// // FIXME: use std::vector here as well TRI_vector_pointer_t HashIndex::lookup (TRI_index_search_value_t* searchValue) const { if (_unique) { return HashIndex_find(_hashArray, searchValue); } std::vector* results = _multi._hashArray->lookupByKey(searchValue); TRI_vector_pointer_t resultsvec; int res = TRI_InitVectorPointer(&resultsvec, TRI_UNKNOWN_MEM_ZONE, results->size()); if (res == TRI_ERROR_NO_ERROR) { for (size_t i = 0; i < results->size(); i++) { TRI_PushBackVectorPointer(&resultsvec, (*results)[i]->document()); } } delete results; return resultsvec; } //////////////////////////////////////////////////////////////////////////////// /// @brief locates entries in the hash index given shaped json objects //////////////////////////////////////////////////////////////////////////////// int HashIndex::lookup (TRI_index_search_value_t* searchValue, std::vector& documents) const { if (_unique) { return HashIndex_find(_hashArray, searchValue, documents); } std::vector* results = nullptr; try { results = _multi._hashArray->lookupByKey(searchValue); } catch (...) { return TRI_ERROR_OUT_OF_MEMORY; } if (results != nullptr) { try { for (size_t i = 0; i < results->size(); i++) { documents.emplace_back(*((*results)[i]->document())); } delete results; } catch (...) { delete results; return TRI_ERROR_OUT_OF_MEMORY; } } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief locates entries in the hash index given shaped json objects //////////////////////////////////////////////////////////////////////////////// int HashIndex::lookup (TRI_index_search_value_t* searchValue, std::vector& documents, TRI_index_element_t*& next, size_t batchSize) const { if (_unique) { next = nullptr; return HashIndex_find(_hashArray, searchValue, documents); } std::vector* results = nullptr; if (next == nullptr) { try { results = _multi._hashArray->lookupByKey(searchValue, batchSize); } catch (...) { return TRI_ERROR_OUT_OF_MEMORY; } } else { try { results = _multi._hashArray->lookupByKeyContinue(next, batchSize); } catch (...) { return TRI_ERROR_OUT_OF_MEMORY; } } if (results != nullptr) { if (results->size() > 0) { next = results->back(); // for continuation the next time try { for (size_t i = 0; i < results->size(); i++) { documents.emplace_back(*((*results)[i]->document())); } } catch (...) { delete results; return TRI_ERROR_OUT_OF_MEMORY; } } else { next = nullptr; } delete results; } else { next = nullptr; } return TRI_ERROR_NO_ERROR; } // ----------------------------------------------------------------------------- // --SECTION-- private methods // ----------------------------------------------------------------------------- int HashIndex::insertUnique (TRI_doc_mptr_t const* doc, bool isRollback) { auto allocate = [this] () -> TRI_index_element_t* { return TRI_index_element_t::allocate(keyEntrySize(), false); }; std::vector elements; int res = fillElement(allocate, elements, doc, paths(), sparse()); auto work = [this] (TRI_index_element_t const* element, bool isRollback) -> int { TRI_IF_FAILURE("InsertHashIndex") { return TRI_ERROR_DEBUG; } TRI_index_search_value_t key; int res = FillIndexSearchValueByHashIndexElement(this, &key, element); if (res != TRI_ERROR_NO_ERROR) { // out of memory return res; } res = _hashArray->insert(this, &key, element, isRollback); if (key._values != nullptr) { TRI_Free(TRI_UNKNOWN_MEM_ZONE, key._values); } return res; }; size_t count = elements.size(); for (size_t i = 0; i < count; ++i) { auto hashElement = elements[i]; res = work(hashElement, isRollback); if (res != TRI_ERROR_NO_ERROR) { for (size_t j = i; j < count; ++j) { // Free all elements that are not yet in the index TRI_index_element_t::free(elements[j]); } for (size_t j = 0; j < i; ++j) { // Remove all allready indexed elements and free them removeUniqueElement(elements[j], isRollback); } return res; } } return res; } int HashIndex::insertMulti (TRI_doc_mptr_t const* doc, bool isRollback) { auto allocate = [this] () -> TRI_index_element_t* { return TRI_index_element_t::allocate(keyEntrySize(), false); }; std::vector elements; int res = fillElement(allocate, elements, doc, paths(), sparse()); auto work = [this] (TRI_index_element_t* element, bool isRollback) -> int { TRI_IF_FAILURE("InsertHashIndex") { return TRI_ERROR_DEBUG; } TRI_index_element_t* found = _multi._hashArray->insert(element, false, true); if (found != nullptr) { // bad, can only happen if we are in a rollback if (isRollback) { // in which case we silently ignore it return TRI_ERROR_NO_ERROR; } // This is TRI_RESULT_ELEMENT_EXISTS, but this should not happen: return TRI_ERROR_INTERNAL; } return TRI_ERROR_NO_ERROR; }; size_t count = elements.size(); for (size_t i = 0; i < count; ++i) { auto hashElement = elements[i]; res = work(hashElement, isRollback); if (res != TRI_ERROR_NO_ERROR) { for (size_t j = i; j < count; ++j) { // Free all elements that are not yet in the index TRI_index_element_t::free(elements[j]); } for (size_t j = 0; j < i; ++j) { // Remove all allready indexed elements and free them removeMultiElement(elements[j], isRollback); } return res; } } return res; } int HashIndex::removeUniqueElement(TRI_index_element_t* element, bool isRollback) { TRI_IF_FAILURE("RemoveHashIndex") { return TRI_ERROR_DEBUG; } int res = _hashArray->remove (this, element); // this might happen when rolling back if (res == TRI_RESULT_ELEMENT_NOT_FOUND) { if (isRollback) { return TRI_ERROR_NO_ERROR; } else { return TRI_ERROR_INTERNAL; } } return res; } int HashIndex::removeUnique (TRI_doc_mptr_t const* doc, bool isRollback) { auto allocate = [this] () -> TRI_index_element_t* { return TRI_index_element_t::allocate(keyEntrySize(), false); }; std::vector elements; int res = fillElement(allocate, elements, doc, paths(), sparse()); if (res != TRI_ERROR_NO_ERROR) { for (auto& hashElement : elements) { TRI_index_element_t::free(hashElement); } return res; } for (auto& hashElement : elements) { res = removeUniqueElement(hashElement, isRollback); TRI_index_element_t::free(hashElement); } return res; } int HashIndex::removeMultiElement(TRI_index_element_t* element, bool isRollback) { TRI_IF_FAILURE("RemoveHashIndex") { return TRI_ERROR_DEBUG; } TRI_index_element_t* old = _multi._hashArray->remove(element); if (old == nullptr) { // not found if (isRollback) { // ignore in this case, because it can happen return TRI_ERROR_NO_ERROR; } else { return TRI_ERROR_INTERNAL; } } return TRI_ERROR_NO_ERROR; } int HashIndex::removeMulti (TRI_doc_mptr_t const* doc, bool isRollback) { auto allocate = [this] () -> TRI_index_element_t* { return TRI_index_element_t::allocate(keyEntrySize(), false); }; std::vector elements; int res = fillElement(allocate, elements, doc, paths(), sparse()); for (auto& hashElement : elements) { res = removeMultiElement(hashElement, isRollback); TRI_index_element_t::free(hashElement); } return res; } // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE // ----------------------------------------------------------------------------- // Local Variables: // mode: outline-minor // outline-regexp: "/// @brief\\|/// {@inheritDoc}\\|/// @page\\|// --SECTION--\\|/// @\\}" // End: