//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Jan Steemann /// @author Daniel H. Larkin /// @author Simon Grätzer //////////////////////////////////////////////////////////////////////////////// #include "RocksDBVPackIndex.h" #include "Aql/AstNode.h" #include "Aql/SortCondition.h" #include "Basics/StaticStrings.h" #include "Basics/VelocyPackHelper.h" #include "RocksDBEngine/RocksDBCollection.h" #include "RocksDBEngine/RocksDBColumnFamily.h" #include "RocksDBEngine/RocksDBCommon.h" #include "RocksDBEngine/RocksDBComparator.h" #include "RocksDBEngine/RocksDBCounterManager.h" #include "RocksDBEngine/RocksDBKeyBounds.h" #include "RocksDBEngine/RocksDBMethods.h" #include "RocksDBEngine/RocksDBPrimaryIndex.h" #include "RocksDBEngine/RocksDBToken.h" #include "RocksDBEngine/RocksDBTransactionState.h" #include "Transaction/Helpers.h" #include "Transaction/Methods.h" #include "VocBase/LogicalCollection.h" #include #include #include #include #include #include #include using namespace arangodb; /// @brief the _key attribute, which, when used in an index, will implictly make /// it unique static std::vector const KeyAttribute{ arangodb::basics::AttributeName("_key", false)}; // 
............................................................................. // recall for all of the following comparison functions: // // left < right return -1 // left > right return 1 // left == right return 0 // // furthermore: // // the following order is currently defined for placing an order on documents // undef < null < boolean < number < strings < lists < hash arrays // note: undefined will be treated as NULL pointer not NULL JSON OBJECT // within each type class we have the following order // boolean: false < true // number: natural order // strings: lexicographical // lists: lexicographically and within each slot according to these rules. // ........................................................................... RocksDBVPackIndexIterator::RocksDBVPackIndexIterator( LogicalCollection* collection, transaction::Methods* trx, ManagedDocumentResult* mmdr, arangodb::RocksDBVPackIndex const* index, bool reverse, VPackSlice const& left, VPackSlice const& right) : IndexIterator(collection, trx, mmdr, index), _index(index), _cmp(index->comparator()), _reverse(reverse), _bounds(index->_unique ? 
RocksDBKeyBounds::UniqueIndexRange( index->objectId(), left, right) : RocksDBKeyBounds::IndexRange(index->objectId(), left, right)) { RocksDBMethods* mthds = rocksutils::toRocksMethods(trx); rocksdb::ReadOptions options = mthds->readOptions(); if (!reverse) { // we need to have a pointer to a slice for the upper bound // so we need to assign the slice to an instance variable here _upperBound = _bounds.end(); options.iterate_upper_bound = &_upperBound; } _iterator = mthds->NewIterator(options, index->columnFamily()); if (reverse) { _iterator->SeekForPrev(_bounds.end()); } else { _iterator->Seek(_bounds.start()); } } /// @brief Reset the cursor void RocksDBVPackIndexIterator::reset() { TRI_ASSERT(_trx->state()->isRunning()); if (_reverse) { _iterator->SeekForPrev(_bounds.end()); } else { _iterator->Seek(_bounds.start()); } } bool RocksDBVPackIndexIterator::outOfRange() const { TRI_ASSERT(_trx->state()->isRunning()); if (_reverse) { return (_cmp->Compare(_iterator->key(), _bounds.start()) < 0); } else { return (_cmp->Compare(_iterator->key(), _bounds.end()) > 0); } } bool RocksDBVPackIndexIterator::next(TokenCallback const& cb, size_t limit) { TRI_ASSERT(_trx->state()->isRunning()); if (limit == 0 || !_iterator->Valid() || outOfRange()) { // No limit no data, or we are actually done. The last call should have // returned false TRI_ASSERT(limit > 0); // Someone called with limit == 0. Api broken return false; } while (limit > 0) { TRI_ASSERT(_index->objectId() == RocksDBKey::objectId(_iterator->key())); TRI_voc_rid_t revisionId = _index->_unique ? 
RocksDBValue::revisionId(_iterator->value()) : RocksDBKey::revisionId(_iterator->key()); cb(RocksDBToken(revisionId)); --limit; if (_reverse) { _iterator->Prev(); } else { _iterator->Next(); } if (!_iterator->Valid() || outOfRange()) { return false; } } return true; } uint64_t RocksDBVPackIndex::HashForKey(const rocksdb::Slice& key) { // NOTE: This function needs to use the same hashing on the // indexed VPack as the initial inserter does VPackSlice tmp = RocksDBKey::indexedVPack(key); return tmp.normalizedHash(); } /// @brief create the index RocksDBVPackIndex::RocksDBVPackIndex(TRI_idx_iid_t iid, arangodb::LogicalCollection* collection, arangodb::velocypack::Slice const& info) : RocksDBIndex(iid, collection, info, RocksDBColumnFamily::index()), _useExpansion(false), _allowPartialIndex(true), _estimator(nullptr) { _cf = _unique ? RocksDBColumnFamily::uniqueIndex() : RocksDBColumnFamily::index(); if (!_unique && !ServerState::instance()->isCoordinator()) { // We activate the estimator for all non unique-indexes. // And only on DBServers _estimator = std::make_unique>( RocksDBIndex::ESTIMATOR_SIZE); TRI_ASSERT(_estimator != nullptr); } TRI_ASSERT(!_fields.empty()); TRI_ASSERT(iid != 0); fillPaths(_paths, _expanding); for (auto const& it : _fields) { if (TRI_AttributeNamesHaveExpansion(it)) { _useExpansion = true; break; } } } /// @brief destroy the index RocksDBVPackIndex::~RocksDBVPackIndex() {} double RocksDBVPackIndex::selectivityEstimate( arangodb::StringRef const*) const { if (_unique) { return 1.0; // only valid if unique } return _estimator->computeEstimate(); } size_t RocksDBVPackIndex::memory() const { rocksdb::TransactionDB* db = rocksutils::globalRocksDB(); RocksDBKeyBounds bounds = _unique ? 
RocksDBKeyBounds::UniqueIndex(_objectId) : RocksDBKeyBounds::IndexEntries(_objectId); rocksdb::Range r(bounds.start(), bounds.end()); uint64_t out; db->GetApproximateSizes(&r, 1, &out, true); return (size_t)out; } /// @brief return a VelocyPack representation of the index void RocksDBVPackIndex::toVelocyPack(VPackBuilder& builder, bool withFigures, bool forPersistence) const { TRI_ASSERT(builder.isOpenArray() || builder.isEmpty()); builder.openObject(); RocksDBIndex::toVelocyPack(builder, withFigures, forPersistence); builder.add("unique", VPackValue(_unique)); builder.add("sparse", VPackValue(_sparse)); builder.close(); } /// @brief whether or not the index is implicitly unique /// this can be the case if the index is not declared as unique, but contains /// a /// unique attribute such as _key bool RocksDBVPackIndex::implicitlyUnique() const { if (_unique) { // a unique index is always unique return true; } if (_useExpansion) { // when an expansion such as a[*] is used, the index may not be unique, // even // if it contains attributes that are guaranteed to be unique return false; } for (auto const& it : _fields) { // if _key is contained in the index fields definition, then the index is // implicitly unique if (it == KeyAttribute) { return true; } } // _key not contained return false; } /// @brief helper function to insert a document into any index type /// Should result in an elements vector filled with the new index entries /// uses the _unique field to determine the kind of key structure int RocksDBVPackIndex::fillElement(VPackBuilder& leased, TRI_voc_rid_t revisionId, VPackSlice const& doc, std::vector& elements, std::vector& hashes) { if (doc.isNone()) { LOG_TOPIC(ERR, arangodb::Logger::FIXME) << "encountered invalid marker with slice of type None"; return TRI_ERROR_INTERNAL; } TRI_IF_FAILURE("FillElementIllegalSlice") { return TRI_ERROR_INTERNAL; } TRI_ASSERT(leased.isEmpty()); if (!_useExpansion) { // fast path for inserts... 
no array elements used leased.openArray(); size_t const n = _paths.size(); for (size_t i = 0; i < n; ++i) { TRI_ASSERT(!_paths[i].empty()); VPackSlice slice = doc.get(_paths[i]); if (slice.isNone() || slice.isNull()) { // attribute not found if (_sparse) { // if sparse we do not have to index, this is indicated by result // being shorter than n return TRI_ERROR_NO_ERROR; } // null, note that this will be copied later! leased.add(VPackSlice::nullSlice()); } else { leased.add(slice); } } leased.close(); TRI_IF_FAILURE("FillElementOOM") { return TRI_ERROR_OUT_OF_MEMORY; } TRI_IF_FAILURE("FillElementOOM2") { THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } if (_unique) { // Unique VPack index values are stored as follows: // - Key: 7 + 8-byte object ID of index + VPack array with index // value(s) // + separator (NUL) byte // - Value: primary key elements.emplace_back( RocksDBKey::UniqueIndexValue(_objectId, leased.slice())); } else { // Non-unique VPack index values are stored as follows: // - Key: 6 + 8-byte object ID of index + VPack array with index // value(s) // + separator (NUL) byte + primary key // - Value: empty elements.emplace_back( RocksDBKey::IndexValue(_objectId, leased.slice(), revisionId)); hashes.push_back(leased.slice().normalizedHash()); } } else { // other path for handling array elements, too std::vector sliceStack; buildIndexValues(leased, revisionId, doc, 0, elements, sliceStack, hashes); } return TRI_ERROR_NO_ERROR; } void RocksDBVPackIndex::addIndexValue(VPackBuilder& leased, TRI_voc_rid_t revisionId, VPackSlice const& document, std::vector& elements, std::vector& sliceStack, std::vector& hashes) { leased.clear(); leased.openArray(true); // unindexed for (VPackSlice const& s : sliceStack) { leased.add(s); } leased.close(); if (_unique) { // Unique VPack index values are stored as follows: // - Key: 7 + 8-byte object ID of index + VPack array with index value(s) // - Value: primary key elements.emplace_back( 
RocksDBKey::UniqueIndexValue(_objectId, leased.slice())); } else { // Non-unique VPack index values are stored as follows: // - Key: 6 + 8-byte object ID of index + VPack array with index value(s) // + primary key // - Value: empty elements.emplace_back( RocksDBKey::IndexValue(_objectId, leased.slice(), revisionId)); hashes.push_back(leased.slice().normalizedHash()); } } /// @brief helper function to create a set of index combinations to insert void RocksDBVPackIndex::buildIndexValues(VPackBuilder& leased, TRI_voc_rid_t revisionId, VPackSlice const doc, size_t level, std::vector& elements, std::vector& sliceStack, std::vector& hashes) { // Invariant: level == sliceStack.size() // Stop the recursion: if (level == _paths.size()) { addIndexValue(leased, revisionId, doc, elements, sliceStack, hashes); return; } if (_expanding[level] == -1) { // the trivial, non-expanding case VPackSlice slice = doc.get(_paths[level]); if (slice.isNone() || slice.isNull()) { if (_sparse) { return; } sliceStack.emplace_back(arangodb::basics::VelocyPackHelper::NullValue()); } else { sliceStack.emplace_back(slice); } buildIndexValues(leased, revisionId, doc, level + 1, elements, sliceStack, hashes); sliceStack.pop_back(); return; } // Finally, the complex case, where we have to expand one entry. // Note again that at most one step in the attribute path can be // an array step. Furthermore, if _allowPartialIndex is true and // anything goes wrong with this attribute path, we have to bottom out // with None values to be able to use the index for a prefix match. // Trivial case to bottom out with Illegal types. 
VPackSlice illegalSlice = arangodb::basics::VelocyPackHelper::IllegalValue(); auto finishWithNones = [&]() -> void { if (!_allowPartialIndex || level == 0) { return; } for (size_t i = level; i < _paths.size(); i++) { sliceStack.emplace_back(illegalSlice); } addIndexValue(leased, revisionId, doc, elements, sliceStack, hashes); for (size_t i = level; i < _paths.size(); i++) { sliceStack.pop_back(); } }; size_t const n = _paths[level].size(); // We have 0 <= _expanding[level] < n. VPackSlice current(doc); for (size_t i = 0; i <= static_cast(_expanding[level]); i++) { if (!current.isObject()) { finishWithNones(); return; } current = current.get(_paths[level][i]); if (current.isNone()) { finishWithNones(); return; } } // Now the expansion: if (!current.isArray() || current.length() == 0) { finishWithNones(); return; } std::unordered_set seen(2, arangodb::basics::VelocyPackHelper::VPackHash(), arangodb::basics::VelocyPackHelper::VPackEqual()); auto moveOn = [&](VPackSlice something) -> void { auto it = seen.find(something); if (it == seen.end()) { seen.insert(something); sliceStack.emplace_back(something); buildIndexValues(leased, revisionId, doc, level + 1, elements, sliceStack, hashes); sliceStack.pop_back(); } }; for (auto const& member : VPackArrayIterator(current)) { VPackSlice current2(member); bool doneNull = false; for (size_t i = _expanding[level] + 1; i < n; i++) { if (!current2.isObject()) { if (!_sparse) { moveOn(arangodb::basics::VelocyPackHelper::NullValue()); } doneNull = true; break; } current2 = current2.get(_paths[level][i]); if (current2.isNone()) { if (!_sparse) { moveOn(arangodb::basics::VelocyPackHelper::NullValue()); } doneNull = true; break; } } if (!doneNull) { moveOn(current2); } // Finally, if, because of sparsity, we have not inserted anything by now, // we need to play the above trick with None because of the above // mentioned // reasons: if (seen.empty()) { finishWithNones(); } } } /// @brief helper function to transform AttributeNames into 
strings. void RocksDBVPackIndex::fillPaths(std::vector>& paths, std::vector& expanding) { paths.clear(); expanding.clear(); for (std::vector const& list : _fields) { paths.emplace_back(); std::vector& interior(paths.back()); int expands = -1; int count = 0; for (auto const& att : list) { interior.emplace_back(att.name); if (att.shouldExpand) { expands = count; } ++count; } expanding.emplace_back(expands); } } /// @brief inserts a document into the index int RocksDBVPackIndex::insert(transaction::Methods* trx, TRI_voc_rid_t revisionId, VPackSlice const& doc, bool isRollback) { std::vector elements; std::vector hashes; int res; try { transaction::BuilderLeaser leased(trx); res = fillElement(*(leased.get()), revisionId, doc, elements, hashes); } catch (...) { res = TRI_ERROR_OUT_OF_MEMORY; } if (res != TRI_ERROR_NO_ERROR) { return res; } // now we are going to construct the value to insert into rocksdb // unique indexes have a different key structure RocksDBValue value = _unique ? RocksDBValue::UniqueIndexValue(revisionId) : RocksDBValue::IndexValue(); RocksDBMethods* mthds = rocksutils::toRocksMethods(trx); size_t const count = elements.size(); for (size_t i = 0; i < count; ++i) { RocksDBKey& key = elements[i]; if (_unique) { RocksDBValue existing = RocksDBValue::Empty(RocksDBEntryType::UniqueIndexValue); if (mthds->Exists(_cf, key)) { res = TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED; } } if (res == TRI_ERROR_NO_ERROR) { arangodb::Result r = mthds->Put(_cf, key, value.string(), rocksutils::index); if (!r.ok()) { // auto status = // rocksutils::convertStatus(s, rocksutils::StatusHint::index); res = r.errorNumber(); } } if (res != TRI_ERROR_NO_ERROR) { for (size_t j = 0; j < i; ++j) { mthds->Delete(_cf, elements[j]); } if (res == TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED && !_unique) { // We ignore unique_constraint violated if we are not unique res = TRI_ERROR_NO_ERROR; // TODO: remove this? seems dangerous... 
} break; } } for (auto& it : hashes) { // The estimator is only useful if we are in a non-unique indexes TRI_ASSERT(!_unique); _estimator->insert(it); } return res; } int RocksDBVPackIndex::insertRaw(RocksDBMethods* batch, TRI_voc_rid_t revisionId, VPackSlice const& doc) { std::vector elements; std::vector hashes; int res; try { VPackBuilder leased; res = fillElement(leased, revisionId, doc, elements, hashes); } catch (...) { return TRI_ERROR_OUT_OF_MEMORY; } if (res != TRI_ERROR_NO_ERROR) { return res; } // now we are going to construct the value to insert into rocksdb // unique indexes have a different key structure RocksDBValue value = _unique ? RocksDBValue::UniqueIndexValue(revisionId) : RocksDBValue::IndexValue(); for (RocksDBKey const& key : elements) { if (_unique) { rocksdb::ReadOptions readOpts; if (batch->Exists(_cf, key)) { res = TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED; } } if (res == TRI_ERROR_NO_ERROR) { batch->Put(_cf, key, value.string(), rocksutils::index); } } for (auto& it : hashes) { // The estimator is only useful if we are in a non-unique indexes TRI_ASSERT(!_unique); _estimator->insert(it); } return res; } /// @brief removes a document from the index int RocksDBVPackIndex::remove(transaction::Methods* trx, TRI_voc_rid_t revisionId, VPackSlice const& doc, bool isRollback) { std::vector elements; std::vector hashes; int res; try { transaction::BuilderLeaser leased(trx); res = fillElement(*(leased.get()), revisionId, doc, elements, hashes); } catch (...) 
{ res = TRI_ERROR_OUT_OF_MEMORY; } if (res != TRI_ERROR_NO_ERROR) { return res; } RocksDBMethods* mthds = rocksutils::toRocksMethods(trx); size_t const count = elements.size(); for (size_t i = 0; i < count; ++i) { arangodb::Result r = mthds->Delete(_cf, elements[i]); if (!r.ok()) { res = r.errorNumber(); } } for (auto& it : hashes) { // The estimator is only useful if we are in a non-unique indexes TRI_ASSERT(!_unique); _estimator->remove(it); } return res; } int RocksDBVPackIndex::removeRaw(RocksDBMethods* writeBatch, TRI_voc_rid_t revisionId, VPackSlice const& doc) { std::vector elements; std::vector hashes; int res; try { VPackBuilder leased; res = fillElement(leased, revisionId, doc, elements, hashes); } catch (...) { res = TRI_ERROR_OUT_OF_MEMORY; } if (res != TRI_ERROR_NO_ERROR) { return res; } size_t const count = elements.size(); for (size_t i = 0; i < count; ++i) { writeBatch->Delete(_cf, elements[i]); } for (auto& it : hashes) { // The estimator is only useful if we are in a non-unique indexes TRI_ASSERT(!_unique); _estimator->remove(it); } return TRI_ERROR_NO_ERROR; } /// @brief called when the index is dropped int RocksDBVPackIndex::drop() { // First drop the cache all indexes can work without it. 
RocksDBIndex::drop(); if (_unique) { return rocksutils::removeLargeRange( rocksutils::globalRocksDB(), RocksDBKeyBounds::UniqueIndex(_objectId)) .errorNumber(); } else { return rocksutils::removeLargeRange( rocksutils::globalRocksDB(), RocksDBKeyBounds::IndexEntries(_objectId)) .errorNumber(); } } /// @brief attempts to locate an entry in the index /// Warning: who ever calls this function is responsible for destroying /// the RocksDBVPackIndexIterator* results RocksDBVPackIndexIterator* RocksDBVPackIndex::lookup( transaction::Methods* trx, ManagedDocumentResult* mmdr, VPackSlice const searchValues, bool reverse) const { TRI_ASSERT(searchValues.isArray()); TRI_ASSERT(searchValues.length() <= _fields.size()); VPackBuilder leftSearch; VPackBuilder rightSearch; VPackSlice lastNonEq; leftSearch.openArray(); for (auto const& it : VPackArrayIterator(searchValues)) { TRI_ASSERT(it.isObject()); VPackSlice eq = it.get(StaticStrings::IndexEq); if (eq.isNone()) { lastNonEq = it; break; } leftSearch.add(eq); } VPackSlice leftBorder; VPackSlice rightBorder; if (lastNonEq.isNone()) { // We only have equality! 
rightSearch = leftSearch; leftSearch.add(VPackSlice::minKeySlice()); leftSearch.close(); rightSearch.add(VPackSlice::maxKeySlice()); rightSearch.close(); leftBorder = leftSearch.slice(); rightBorder = rightSearch.slice(); } else { // Copy rightSearch = leftSearch for right border rightSearch = leftSearch; // Define Lower-Bound VPackSlice lastLeft = lastNonEq.get(StaticStrings::IndexGe); if (!lastLeft.isNone()) { TRI_ASSERT(!lastNonEq.hasKey(StaticStrings::IndexGt)); leftSearch.add(lastLeft); leftSearch.add(VPackSlice::minKeySlice()); leftSearch.close(); VPackSlice search = leftSearch.slice(); leftBorder = search; } else { lastLeft = lastNonEq.get(StaticStrings::IndexGt); if (!lastLeft.isNone()) { leftSearch.add(lastLeft); leftSearch.add(VPackSlice::maxKeySlice()); leftSearch.close(); VPackSlice search = leftSearch.slice(); leftBorder = search; } else { // No lower bound set default to (null <= x) leftSearch.add(VPackSlice::minKeySlice()); leftSearch.close(); VPackSlice search = leftSearch.slice(); leftBorder = search; } } // Define upper-bound VPackSlice lastRight = lastNonEq.get(StaticStrings::IndexLe); if (!lastRight.isNone()) { TRI_ASSERT(!lastNonEq.hasKey(StaticStrings::IndexLt)); rightSearch.add(lastRight); rightSearch.add(VPackSlice::maxKeySlice()); rightSearch.close(); VPackSlice search = rightSearch.slice(); rightBorder = search; } else { lastRight = lastNonEq.get(StaticStrings::IndexLt); if (!lastRight.isNone()) { rightSearch.add(lastRight); rightSearch.add(VPackSlice::minKeySlice()); rightSearch.close(); VPackSlice search = rightSearch.slice(); rightBorder = search; } else { // No upper bound set default to (x <= INFINITY) rightSearch.add(VPackSlice::maxKeySlice()); rightSearch.close(); VPackSlice search = rightSearch.slice(); rightBorder = search; } } } // Secured by trx. The shared_ptr index stays valid in // _collection at least as long as trx is running. 
// Same for the iterator auto physical = static_cast(_collection->getPhysical()); return new RocksDBVPackIndexIterator(_collection, trx, mmdr, this, reverse, leftBorder, rightBorder); } bool RocksDBVPackIndex::accessFitsIndex( arangodb::aql::AstNode const* access, arangodb::aql::AstNode const* other, arangodb::aql::AstNode const* op, arangodb::aql::Variable const* reference, std::unordered_map>& found, std::unordered_set& nonNullAttributes, bool isExecution) const { if (!canUseConditionPart(access, other, op, reference, nonNullAttributes, isExecution)) { return false; } arangodb::aql::AstNode const* what = access; std::pair> attributeData; if (op->type != arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN) { if (!what->isAttributeAccessForVariable(attributeData) || attributeData.first != reference) { // this access is not referencing this collection return false; } if (arangodb::basics::TRI_AttributeNamesHaveExpansion( attributeData.second)) { // doc.value[*] == 'value' return false; } if (isAttributeExpanded(attributeData.second)) { // doc.value == 'value' (with an array index) return false; } } else { // ok, we do have an IN here... 
check if it's something like 'value' IN // doc.value[*] TRI_ASSERT(op->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN); bool canUse = false; if (what->isAttributeAccessForVariable(attributeData) && attributeData.first == reference && !arangodb::basics::TRI_AttributeNamesHaveExpansion( attributeData.second) && attributeMatches(attributeData.second)) { // doc.value IN 'value' // can use this index canUse = true; } else { // check for 'value' IN doc.value AND 'value' IN doc.value[*] what = other; if (what->isAttributeAccessForVariable(attributeData) && attributeData.first == reference && isAttributeExpanded(attributeData.second) && attributeMatches(attributeData.second)) { canUse = true; } } if (!canUse) { return false; } } std::vector const& fieldNames = attributeData.second; for (size_t i = 0; i < _fields.size(); ++i) { if (_fields[i].size() != fieldNames.size()) { // attribute path length differs continue; } if (this->isAttributeExpanded(i) && op->type != arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN) { // If this attribute is correct or not, it could only serve for IN continue; } bool match = arangodb::basics::AttributeName::isIdentical(_fields[i], fieldNames, true); if (match) { // mark ith attribute as being covered auto it = found.find(i); if (it == found.end()) { found.emplace(i, std::vector{op}); } else { (*it).second.emplace_back(op); } TRI_IF_FAILURE("PersistentIndex::accessFitsIndex") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("SkiplistIndex::accessFitsIndex") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("HashIndex::accessFitsIndex") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } return true; } } return false; } void RocksDBVPackIndex::matchAttributes( arangodb::aql::AstNode const* node, arangodb::aql::Variable const* reference, std::unordered_map>& found, size_t& values, std::unordered_set& nonNullAttributes, bool isExecution) const { for (size_t i = 0; i < node->numMembers(); ++i) { auto op = node->getMember(i); switch 
(op->type) { case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_EQ: case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LT: case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LE: case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GT: case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GE: TRI_ASSERT(op->numMembers() == 2); accessFitsIndex(op->getMember(0), op->getMember(1), op, reference, found, nonNullAttributes, isExecution); accessFitsIndex(op->getMember(1), op->getMember(0), op, reference, found, nonNullAttributes, isExecution); break; case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN: if (accessFitsIndex(op->getMember(0), op->getMember(1), op, reference, found, nonNullAttributes, isExecution)) { auto m = op->getMember(1); if (m->isArray() && m->numMembers() > 1) { // attr IN [ a, b, c ] => this will produce multiple items, so // count them! values += m->numMembers() - 1; } } break; default: break; } } } bool RocksDBVPackIndex::supportsFilterCondition( arangodb::aql::AstNode const* node, arangodb::aql::Variable const* reference, size_t itemsInIndex, size_t& estimatedItems, double& estimatedCost) const { // mmfiles failure point compat if (this->type() == Index::TRI_IDX_TYPE_HASH_INDEX) { TRI_IF_FAILURE("SimpleAttributeMatcher::accessFitsIndex") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } } std::unordered_map> found; std::unordered_set nonNullAttributes; size_t values = 0; matchAttributes(node, reference, found, values, nonNullAttributes, false); bool lastContainsEquality = true; size_t attributesCovered = 0; size_t attributesCoveredByEquality = 0; double equalityReductionFactor = 20.0; estimatedCost = static_cast(itemsInIndex); for (size_t i = 0; i < _fields.size(); ++i) { auto it = found.find(i); if (it == found.end()) { // index attribute not covered by condition break; } // check if the current condition contains an equality condition auto const& nodes = (*it).second; bool containsEquality = false; for (size_t j = 0; j < nodes.size(); ++j) { if (nodes[j]->type == 
arangodb::aql::NODE_TYPE_OPERATOR_BINARY_EQ || nodes[j]->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN) { containsEquality = true; break; } } if (!lastContainsEquality) { // unsupported condition. must abort break; } ++attributesCovered; if (containsEquality) { ++attributesCoveredByEquality; estimatedCost /= equalityReductionFactor; // decrease the effect of the equality reduction factor equalityReductionFactor *= 0.25; if (equalityReductionFactor < 2.0) { // equalityReductionFactor shouldn't get too low equalityReductionFactor = 2.0; } } else { // quick estimate for the potential reductions caused by the conditions if (nodes.size() >= 2) { // at least two (non-equality) conditions. probably a range with lower // and upper bound defined estimatedCost /= 7.5; } else { // one (non-equality). this is either a lower or a higher bound estimatedCost /= 2.0; } } lastContainsEquality = containsEquality; } if (values == 0) { values = 1; } if (attributesCoveredByEquality == _fields.size() && unique()) { // index is unique and condition covers all attributes by equality if (estimatedItems >= values) { // reduce costs due to uniqueness estimatedItems = values; estimatedCost = static_cast(estimatedItems); } else { // cost is already low... 
now slightly prioritize the unique index estimatedCost *= 0.995; } return true; } if (attributesCovered > 0 && (!_sparse || attributesCovered == _fields.size())) { // if the condition contains at least one index attribute and is not // sparse, // or the index is sparse and all attributes are covered by the condition, // then it can be used (note: additional checks for condition parts in // sparse indexes are contained in Index::canUseConditionPart) estimatedItems = static_cast((std::max)( static_cast(estimatedCost * values), static_cast(1))); estimatedCost *= static_cast(values); return true; } // no condition estimatedItems = itemsInIndex; estimatedCost = static_cast(estimatedItems); return false; }

/// @brief checks whether the index can deliver the requested sort order and
/// estimates the cost of doing so
///
/// @param sortCondition      sort condition to satisfy (must not be nullptr)
/// @param reference          variable whose attributes are being sorted on
/// @param itemsInIndex       number of items in the index
/// @param estimatedCost      [out] estimated cost of producing the order
/// @param coveredAttributes  [out] number of sort attributes covered by the
///                           index fields; set to 0 if the index is not used
/// @return true if the index can be used for (a prefix of) the sort
bool RocksDBVPackIndex::supportsSortCondition(
    arangodb::aql::SortCondition const* sortCondition,
    arangodb::aql::Variable const* reference, size_t itemsInIndex,
    double& estimatedCost, size_t& coveredAttributes) const {
  TRI_ASSERT(sortCondition != nullptr);

  // only non-sparse indexes without array expansion can be used for sorting,
  // and only for unidirectional sorts on plain attribute accesses
  if (!_sparse && !_useExpansion && sortCondition->isUnidirectional() &&
      sortCondition->isOnlyAttributeAccess()) {
    coveredAttributes = sortCondition->coveredAttributes(reference, _fields);

    if (coveredAttributes >= sortCondition->numAttributes()) {
      // sort is fully covered by index. no additional sort costs!
      // forward iteration does not have high costs
      estimatedCost = itemsInIndex * 0.001;
      if (sortCondition->isDescending()) {
        // reverse iteration has higher costs than forward iteration
        estimatedCost *= 4;
      }
      return true;
    }

    if (coveredAttributes > 0) {
      // sort is only partially covered by the index
      if (itemsInIndex > 0) {
        // note: itemsInIndex / coveredAttributes is an integer division
        estimatedCost = (itemsInIndex / coveredAttributes) *
                        std::log2(static_cast<double>(itemsInIndex));
        if (sortCondition->isDescending()) {
          // reverse iteration is more expensive
          estimatedCost *= 4;
        }
      } else {
        // avoid 0 * log2(0), which would yield NaN
        estimatedCost = 0.0;
      }
      return true;
    }
  }

  // by default no sort conditions are supported
  coveredAttributes = 0;
  if (itemsInIndex > 0) {
    estimatedCost = itemsInIndex * std::log2(static_cast<double>(itemsInIndex));
    // slightly penalize this type of index against other indexes which
    // are in memory
    estimatedCost *= 1.05;
  } else {
    estimatedCost = 0.0;
  }
  return false;
}

/// @brief creates an iterator for the given (already specialized) condition
///
/// Translates the condition into a velocypack search description: an outer
/// array containing one inner array with one object per used index field.
/// Leading fields are described by `IndexEq` / `IndexIn` entries, optionally
/// followed by one object carrying the range bounds for the next field.
/// If a non-expanded IN lookup is present, the search values are expanded
/// and one iterator per combination is wrapped in a MultiIndexIterator.
///
/// @param trx        transaction to run the lookup in
/// @param mmdr       document result handed through to the iterators
/// @param node       condition to evaluate, or nullptr if the index is only
///                   used for sorting
/// @param reference  variable the condition's attribute accesses refer to
/// @param reverse    whether to iterate in reverse order
/// @return a newly allocated iterator; the caller takes ownership
IndexIterator* RocksDBVPackIndex::iteratorForCondition(
    transaction::Methods* trx, ManagedDocumentResult* mmdr,
    arangodb::aql::AstNode const* node,
    arangodb::aql::Variable const* reference, bool reverse) {
  VPackBuilder searchValues;
  searchValues.openArray();
  bool needNormalize = false;

  if (node == nullptr) {
    // We only use this index for sort. Empty searchValue
    VPackArrayBuilder guard(&searchValues);

    TRI_IF_FAILURE("PersistentIndex::noSortIterator") {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
    }
    TRI_IF_FAILURE("SkiplistIndex::noSortIterator") {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
    }
    TRI_IF_FAILURE("HashIndex::noSortIterator") {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
    }
  } else {
    // Create the search Values for the lookup
    VPackArrayBuilder guard(&searchValues);

    std::unordered_map<size_t, std::vector<arangodb::aql::AstNode const*>>
        found;
    std::unordered_set<std::string> nonNullAttributes;
    size_t unused = 0;
    matchAttributes(node, reference, found, unused, nonNullAttributes, true);

    // found contains all attributes that are relevant for this node.
    // It might be less than fields().
    //
    // Handle the first attributes. They can only be == or IN and only
    // one node per attribute

    // Splits a binary comparison into the attribute access on `reference`
    // and the value it is compared to. Returns true if the operands had to
    // be swapped, i.e. the attribute access is the right-hand member.
    auto getValueAccess = [&](arangodb::aql::AstNode const* comp,
                              arangodb::aql::AstNode const*& access,
                              arangodb::aql::AstNode const*& value) -> bool {
      access = comp->getMember(0);
      value = comp->getMember(1);
      std::pair<arangodb::aql::Variable const*,
                std::vector<arangodb::basics::AttributeName>>
          paramPair;
      if (!(access->isAttributeAccessForVariable(paramPair) &&
            paramPair.first == reference)) {
        access = comp->getMember(1);
        value = comp->getMember(0);
        if (!(access->isAttributeAccessForVariable(paramPair) &&
              paramPair.first == reference)) {
          // Both side do not have a correct AttributeAccess, this should not
          // happen and indicates
          // an error in the optimizer
          TRI_ASSERT(false);
        }
        return true;
      }
      return false;
    };

    size_t usedFields = 0;
    for (; usedFields < _fields.size(); ++usedFields) {
      auto it = found.find(usedFields);
      if (it == found.end()) {
        // We are either done
        // or this is a range.
        // Continue with more complicated loop
        break;
      }

      auto comp = it->second[0];
      TRI_ASSERT(comp->numMembers() == 2);
      arangodb::aql::AstNode const* access = nullptr;
      arangodb::aql::AstNode const* value = nullptr;
      getValueAccess(comp, access, value);

      // We found an access for this field
      if (comp->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_EQ) {
        searchValues.openObject();
        searchValues.add(VPackValue(StaticStrings::IndexEq));

        TRI_IF_FAILURE("PersistentIndex::permutationEQ") {
          THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
        }
        TRI_IF_FAILURE("SkiplistIndex::permutationEQ") {
          THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
        }
        TRI_IF_FAILURE("HashIndex::permutationEQ") {
          THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
        }
      } else if (comp->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN) {
        if (isAttributeExpanded(usedFields)) {
          // IN on an expanded attribute is looked up like an equality
          searchValues.openObject();
          searchValues.add(VPackValue(StaticStrings::IndexEq));

          TRI_IF_FAILURE("PersistentIndex::permutationArrayIN") {
            THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
          }
          TRI_IF_FAILURE("SkiplistIndex::permutationArrayIN") {
            THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
          }
          TRI_IF_FAILURE("HashIndex::permutationArrayIN") {
            THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
          }
        } else {
          // regular IN: search values must be expanded further below
          needNormalize = true;
          searchValues.openObject();
          searchValues.add(VPackValue(StaticStrings::IndexIn));
        }
      } else {
        // This is a one-sided range
        break;
      }

      // We have to add the value always, the key was added before
      value->toVelocyPackValue(searchValues);
      searchValues.close();
    }

    // Now handle the next element, which might be a range
    if (usedFields < _fields.size()) {
      auto it = found.find(usedFields);
      if (it != found.end()) {
        // bind by reference, the conditions are only read here
        auto const& rangeConditions = it->second;
        TRI_ASSERT(rangeConditions.size() <= 2);

        VPackObjectBuilder searchElement(&searchValues);
        for (auto const& comp : rangeConditions) {
          TRI_ASSERT(comp->numMembers() == 2);
          arangodb::aql::AstNode const* access = nullptr;
          arangodb::aql::AstNode const* value = nullptr;
          bool isReverseOrder = getValueAccess(comp, access, value);

          // Add the key. If the operands were swapped, the operator has to
          // be mirrored as well
          switch (comp->type) {
            case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LT:
              searchValues.add(VPackValue(isReverseOrder
                                              ? StaticStrings::IndexGt
                                              : StaticStrings::IndexLt));
              break;
            case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LE:
              searchValues.add(VPackValue(isReverseOrder
                                              ? StaticStrings::IndexGe
                                              : StaticStrings::IndexLe));
              break;
            case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GT:
              searchValues.add(VPackValue(isReverseOrder
                                              ? StaticStrings::IndexLt
                                              : StaticStrings::IndexGt));
              break;
            case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GE:
              searchValues.add(VPackValue(isReverseOrder
                                              ? StaticStrings::IndexLe
                                              : StaticStrings::IndexGe));
              break;
            default:
              // unsupported right now. Should have been rejected by
              // supportsFilterCondition
              TRI_ASSERT(false);
              return new EmptyIndexIterator(_collection, trx, mmdr, this);
          }
          value->toVelocyPackValue(searchValues);
        }
      }
    }
  }
  searchValues.close();

  TRI_IF_FAILURE("PersistentIndex::noIterator") {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
  }
  TRI_IF_FAILURE("SkiplistIndex::noIterator") {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
  }
  TRI_IF_FAILURE("HashIndex::noIterator") {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
  }

  if (needNormalize) {
    VPackBuilder expandedSearchValues;
    expandInSearchValues(searchValues.slice(), expandedSearchValues);
    VPackSlice expandedSlice = expandedSearchValues.slice();
    std::vector<IndexIterator*> iterators;
    try {
      for (auto const& val : VPackArrayIterator(expandedSlice)) {
        auto iterator = lookup(trx, mmdr, val, reverse);
        try {
          iterators.push_back(iterator);
        } catch (...) {
          // avoid leak
          delete iterator;
          throw;
        }
      }
      if (reverse) {
        std::reverse(iterators.begin(), iterators.end());
      }
    } catch (...) {
      // free all iterators created so far before propagating the error
      for (auto& it : iterators) {
        delete it;
      }
      throw;
    }
    return new MultiIndexIterator(_collection, trx, mmdr, this, iterators);
  }

  // without IN expansion there is exactly one set of search values
  VPackSlice searchSlice = searchValues.slice();
  TRI_ASSERT(searchSlice.length() == 1);
  searchSlice = searchSlice.at(0);
  return lookup(trx, mmdr, searchSlice, reverse);
}

/// @brief specializes the condition for use with the index
///
/// Removes all members from `node` and re-adds only those condition parts
/// that the index can use: for each index field in order, the matching
/// conditions sorted by sortWeight() with duplicate or related operators
/// dropped. Processing stops at the first field that is not covered, or
/// after the first field that is not covered by an equality (== or IN).
///
/// @param node       condition to specialize; modified in place
/// @param reference  variable the condition's attribute accesses refer to
/// @return the modified `node`
arangodb::aql::AstNode* RocksDBVPackIndex::specializeCondition(
    arangodb::aql::AstNode* node,
    arangodb::aql::Variable const* reference) const {
  // mmfiles failure compat
  if (this->type() == Index::TRI_IDX_TYPE_HASH_INDEX) {
    TRI_IF_FAILURE("SimpleAttributeMatcher::specializeAllChildrenEQ") {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
    }
    TRI_IF_FAILURE("SimpleAttributeMatcher::specializeAllChildrenIN") {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
    }
  }

  std::unordered_map<size_t, std::vector<arangodb::aql::AstNode const*>> found;
  std::unordered_set<std::string> nonNullAttributes;
  size_t values = 0;
  matchAttributes(node, reference, found, values, nonNullAttributes, false);

  std::vector<arangodb::aql::AstNode const*> children;
  bool lastContainsEquality = true;

  for (size_t i = 0; i < _fields.size(); ++i) {
    auto it = found.find(i);
    if (it == found.end()) {
      // index attribute not covered by condition
      break;
    }

    // check if the current condition contains an equality condition
    auto& nodes = (*it).second;
    bool containsEquality = false;
    for (size_t j = 0; j < nodes.size(); ++j) {
      if (nodes[j]->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_EQ ||
          nodes[j]->type == arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN) {
        containsEquality = true;
        break;
      }
    }

    if (!lastContainsEquality) {
      // unsupported condition. must abort
      break;
    }

    std::sort(nodes.begin(), nodes.end(),
              [](arangodb::aql::AstNode const* lhs,
                 arangodb::aql::AstNode const* rhs) -> bool {
                return sortWeight(lhs) < sortWeight(rhs);
              });

    lastContainsEquality = containsEquality;
    std::unordered_set<int> operatorsFound;
    for (auto const& candidate : nodes) {
      // do not let duplicate or related operators pass
      if (isDuplicateOperator(candidate, operatorsFound)) {
        continue;
      }
      operatorsFound.emplace(static_cast<int>(candidate->type));
      children.emplace_back(candidate);
    }
  }

  // replace the members of the condition with the specialized ones
  while (node->numMembers() > 0) {
    node->removeMemberUnchecked(0);
  }
  for (auto& child : children) {
    node->addMember(child);
  }
  return node;
}

/// @brief checks whether an operator is redundant given the operators that
/// were already accepted for the same attribute
///
/// @param node            condition node whose operator type is checked
/// @param operatorsFound  operator types (as int) already accepted
/// @return true if the same operator, an == or IN, or the related bound
///         operator (< vs <=, > vs >=) has already been found
bool RocksDBVPackIndex::isDuplicateOperator(
    arangodb::aql::AstNode const* node,
    std::unordered_set<int> const& operatorsFound) const {
  auto const type = node->type;
  auto const seen = [&operatorsFound](decltype(node->type) t) -> bool {
    return operatorsFound.find(static_cast<int>(t)) != operatorsFound.end();
  };

  if (seen(type)) {
    // duplicate operator
    return true;
  }

  if (seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_EQ) ||
      seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_IN)) {
    // an equality lookup is already present, nothing else can be added.
    // this also covers == combined with IN, so no extra cases are needed
    // for these two operators below
    return true;
  }

  switch (type) {
    case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LT:
      return seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LE);
    case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LE:
      return seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_LT);
    case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GT:
      return seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GE);
    case arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GE:
      return seen(arangodb::aql::NODE_TYPE_OPERATOR_BINARY_GT);
    default:
      // ignore
      return false;
  }
}

/// @brief compacts the key range occupied by this index
///
/// Compaction is best effort: a failure is logged, but the function still
/// reports success.
///
/// @return always TRI_ERROR_NO_ERROR
int RocksDBVPackIndex::cleanup() {
  rocksdb::TransactionDB* db = rocksutils::globalRocksDB();
  rocksdb::CompactRangeOptions opts;
  RocksDBKeyBounds bounds = _unique ? RocksDBKeyBounds::UniqueIndex(_objectId)
                                    : RocksDBKeyBounds::IndexEntries(_objectId);
  rocksdb::Slice b = bounds.start(), e = bounds.end();
  LOG_TOPIC(DEBUG, Logger::FIXME) << "compacting index range " << bounds;
  // NOTE(review): this overload does not name a column family, while
  // recalculateEstimates() iterates RocksDBColumnFamily::index() - confirm
  // that the intended column family is compacted here
  rocksdb::Status status = db->CompactRange(opts, &b, &e);
  if (!status.ok()) {
    LOG_TOPIC(WARN, Logger::FIXME) << "compacting index range " << bounds
                                   << " failed: " << status.ToString();
  }
  return TRI_ERROR_NO_ERROR;
}

/// @brief appends the serialized selectivity estimator to `output`
///
/// Does nothing for unique indexes. Must not be called on a coordinator.
void RocksDBVPackIndex::serializeEstimate(std::string& output) const {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  if (!_unique) {
    TRI_ASSERT(_estimator != nullptr);
    _estimator->serialize(output);
  }
}

/// @brief takes over the selectivity estimator stored in the counter manager
///
/// Must not be called on a coordinator.
///
/// @param mgr  counter manager to take the estimator from (must not be
///             nullptr for non-unique indexes)
/// @return true for unique indexes or if a stored estimator was found;
///         false if none was stored and the estimate has to be recomputed
bool RocksDBVPackIndex::deserializeEstimate(RocksDBCounterManager* mgr) {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  if (_unique) {
    return true;
  }
  // We simply drop the current estimator and steal the one from recovery.
  // We are then safe from resizing issues in our _estimator format
  // and will use the old size.
  TRI_ASSERT(mgr != nullptr);
  auto tmp = mgr->stealIndexEstimator(_objectId);
  if (tmp == nullptr) {
    // We expected to receive a stored index estimate, however we got none.
    // We use the freshly created estimator but have to recompute it.
    return false;
  }
  _estimator.swap(tmp);
  TRI_ASSERT(_estimator != nullptr);
  return true;
}

/// @brief rebuilds the selectivity estimator from the stored index entries
///
/// Does nothing for unique indexes. Otherwise the estimator is cleared and
/// the hash of every key within the index bounds is inserted again.
void RocksDBVPackIndex::recalculateEstimates() {
  if (unique()) {
    return;
  }
  TRI_ASSERT(_estimator != nullptr);
  _estimator->clear();

  auto bounds = RocksDBKeyBounds::IndexEntries(_objectId);
  rocksutils::iterateBounds(bounds,
                            [&](rocksdb::Iterator* it) {
                              uint64_t hash =
                                  RocksDBVPackIndex::HashForKey(it->key());
                              _estimator->insert(hash);
                            },
                            arangodb::RocksDBColumnFamily::index());
}

/// @brief updates the selectivity estimator after an index entry was removed
///
/// @param trx    transaction the removal happened in (unused)
/// @param key    key of the removed index entry
/// @param value  value of the removed index entry (unused)
/// @return always a Result constructed from TRI_ERROR_NO_ERROR
Result RocksDBVPackIndex::postprocessRemove(transaction::Methods* trx,
                                            rocksdb::Slice const& key,
                                            rocksdb::Slice const& value) {
  if (!unique()) {
    TRI_ASSERT(_estimator != nullptr);
    uint64_t hash = RocksDBVPackIndex::HashForKey(key);
    _estimator->remove(hash);
  }
  return {TRI_ERROR_NO_ERROR};
}