////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBFulltextIndex.h"

#include "Basics/StaticStrings.h"
#include "Basics/StringRef.h"
#include "Basics/Utf8Helper.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/tri-strings.h"
#include "Logger/Logger.h"
#include "RocksDBEngine/RocksDBCollection.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBPrimaryIndex.h"
#include "RocksDBEngine/RocksDBToken.h"
#include "RocksDBEngine/RocksDBTransactionState.h"
#include "RocksDBEngine/RocksDBTypes.h"
#include "StorageEngine/DocumentIdentifierToken.h"

#include <rocksdb/utilities/transaction.h>
#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/utilities/write_batch_with_index.h>
#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb;

TRI_voc_rid_t RocksDBFulltextIndex::fromDocumentIdentifierToken(
    DocumentIdentifierToken const& token) {
  auto tkn = static_cast<RocksDBToken const*>(&token);
  return tkn->revisionId();
}

DocumentIdentifierToken RocksDBFulltextIndex::toDocumentIdentifierToken(
    TRI_voc_rid_t revisionId) {
  return RocksDBToken{revisionId};
}

RocksDBFulltextIndex::RocksDBFulltextIndex(
    TRI_idx_iid_t iid, arangodb::LogicalCollection* collection,
    VPackSlice const& info)
    : RocksDBIndex(iid, collection, info),
      _minWordLength(TRI_FULLTEXT_MIN_WORD_LENGTH_DEFAULT) {
  TRI_ASSERT(iid != 0);

  VPackSlice const value = info.get("minLength");

  if (value.isNumber()) {
    _minWordLength = value.getNumericValue<int>();
    if (_minWordLength <= 0) {
      // The min length cannot be negative or zero.
      _minWordLength = 1;
    }
  } else if (!value.isNone()) {
    // minLength is defined, but it is not a number
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
                                   "<minLength> must be a number");
  }
  _unique = false;
  _sparse = true;

  if (_fields.size() != 1) {
    // We need exactly one attribute
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_INTERNAL,
        "fulltext index definition should have exactly one attribute");
  }
  auto& attribute = _fields[0];
  _attr.reserve(attribute.size());
  for (auto& a : attribute) {
    _attr.emplace_back(a.name);
  }
}

RocksDBFulltextIndex::~RocksDBFulltextIndex() {}

/// @brief return the approximate on-disk size of the index data
size_t RocksDBFulltextIndex::memory() const {
  rocksdb::TransactionDB* db = rocksutils::globalRocksDB();
  RocksDBKeyBounds bounds =
      RocksDBKeyBounds::FulltextIndexPrefix(_objectId, StringRef());
  rocksdb::Range r(bounds.start(), bounds.end());
  uint64_t out;
  db->GetApproximateSizes(&r, 1, &out, true);
  return (size_t)out;
}

/// @brief return a VelocyPack representation of the index
void RocksDBFulltextIndex::toVelocyPack(VPackBuilder& builder, bool withFigures,
                                        bool forPersistence) const {
  builder.openObject();
  RocksDBIndex::toVelocyPack(builder, withFigures, forPersistence);
  builder.add("unique", VPackValue(false));
  builder.add("sparse", VPackValue(true));
  builder.add("minLength", VPackValue(_minWordLength));
  builder.close();
}

/// @brief Test if this index matches the definition
bool RocksDBFulltextIndex::matchesDefinition(VPackSlice const& info) const {
  TRI_ASSERT(info.isObject());
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
  VPackSlice typeSlice = info.get("type");
  TRI_ASSERT(typeSlice.isString());
  StringRef typeStr(typeSlice);
  TRI_ASSERT(typeStr == oldtypeName());
#endif
  auto value = info.get("id");
  if (!value.isNone()) {
    // We already have an id.
    if (!value.isString()) {
      // Invalid ID
      return false;
    }
    // Short circuit. If the id matches, the index is identical.
    StringRef idRef(value);
    return idRef == std::to_string(_iid);
  }

  value = info.get("minLength");
  if (value.isNumber()) {
    int cmp = value.getNumericValue<int>();
    if (cmp <= 0) {
      if (_minWordLength != 1) {
        return false;
      }
    } else {
      if (_minWordLength != cmp) {
        return false;
      }
    }
  } else if (!value.isNone()) {
    // Illegal minLength
    return false;
  }

  value = info.get("fields");
  if (!value.isArray()) {
    return false;
  }
  size_t const n = static_cast<size_t>(value.length());
  if (n != _fields.size()) {
    return false;
  }
  if (_unique != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, "unique", false)) {
    return false;
  }
  if (_sparse != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, "sparse", true)) {
    return false;
  }

  // This check takes the ordering of attributes into account.
  std::vector<arangodb::basics::AttributeName> translate;
  for (size_t i = 0; i < n; ++i) {
    translate.clear();
    VPackSlice f = value.at(i);
    if (!f.isString()) {
      // Invalid field definition!
      return false;
    }
    arangodb::StringRef in(f);
    TRI_ParseAttributeString(in, translate, true);
    if (!arangodb::basics::AttributeName::isIdentical(_fields[i], translate,
                                                      false)) {
      return false;
    }
  }
  return true;
}

/// @brief insert a document into the fulltext index
int RocksDBFulltextIndex::insert(transaction::Methods* trx,
                                 TRI_voc_rid_t revisionId,
                                 VPackSlice const& doc, bool isRollback) {
  std::vector<std::string> words = wordlist(doc);
  if (words.empty()) {
    return TRI_ERROR_NO_ERROR;
  }

  RocksDBTransactionState* state = rocksutils::toRocksTransactionState(trx);
  rocksdb::Transaction* rtrx = state->rocksTransaction();

  // now we are going to construct the value to insert into rocksdb
  // unique indexes have a different key structure
  StringRef docKey(doc.get(StaticStrings::KeyString));
  RocksDBValue value = RocksDBValue::IndexValue();

  int res = TRI_ERROR_NO_ERROR;
  size_t const count = words.size();
  size_t i = 0;
  for (; i < count; ++i) {
    std::string const& word = words[i];
    RocksDBKey key =
        RocksDBKey::FulltextIndexValue(_objectId, StringRef(word), docKey);

    rocksdb::Status s = rtrx->Put(key.string(), value.string());
    if (!s.ok()) {
      auto status = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
      res = status.errorNumber();
      break;
    }
  }
  if (res != TRI_ERROR_NO_ERROR) {
    // undo the part of the insert that already succeeded
    for (size_t j = 0; j < i; ++j) {
      std::string const& word = words[j];
      RocksDBKey key =
          RocksDBKey::FulltextIndexValue(_objectId, StringRef(word), docKey);
      rtrx->Delete(key.string());
    }
  }
  return res;
}

/// @brief insert a document, writing directly into a WriteBatchWithIndex
int RocksDBFulltextIndex::insertRaw(rocksdb::WriteBatchWithIndex* batch,
                                    TRI_voc_rid_t,
                                    arangodb::velocypack::Slice const& doc) {
  std::vector<std::string> words = wordlist(doc);
  if (words.empty()) {
    return TRI_ERROR_NO_ERROR;
  }

  // now we are going to construct the value to insert into rocksdb
  // unique indexes have a different key structure
  StringRef docKey(doc.get(StaticStrings::KeyString));
  RocksDBValue value = RocksDBValue::IndexValue();

  size_t const count = words.size();
  for (size_t i = 0; i < count; ++i) {
    std::string const& word = words[i];
    RocksDBKey key =
        RocksDBKey::FulltextIndexValue(_objectId, StringRef(word), docKey);
    batch->Put(key.string(), value.string());
  }
  return TRI_ERROR_NO_ERROR;
}

/// @brief remove a document from the fulltext index
int RocksDBFulltextIndex::remove(transaction::Methods* trx,
                                 TRI_voc_rid_t revisionId,
                                 VPackSlice const& doc, bool isRollback) {
  std::vector<std::string> words = wordlist(doc);
  if (words.empty()) {
    // TODO: distinguish the cases "empty wordlist" and "out of memory"
    // LOG_TOPIC(WARN, arangodb::Logger::FIXME) << "could not build wordlist";
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  RocksDBTransactionState* state = rocksutils::toRocksTransactionState(trx);
  rocksdb::Transaction* rtrx = state->rocksTransaction();

  // now we are going to construct the keys to remove from rocksdb
  // unique indexes have a different key structure
  StringRef docKey(doc.get(StaticStrings::KeyString));

  int res = TRI_ERROR_NO_ERROR;
  size_t const count = words.size();
  for (size_t i = 0; i < count; ++i) {
    std::string const& word = words[i];
    RocksDBKey key =
        RocksDBKey::FulltextIndexValue(_objectId, StringRef(word), docKey);

    rocksdb::Status s = rtrx->Delete(key.string());
    if (!s.ok()) {
      auto status = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
      res = status.errorNumber();
    }
  }
  return res;
}

/// @brief remove a document, writing directly into a WriteBatchWithIndex
int RocksDBFulltextIndex::removeRaw(rocksdb::WriteBatchWithIndex* batch,
                                    TRI_voc_rid_t,
                                    arangodb::velocypack::Slice const& doc) {
  std::vector<std::string> words = wordlist(doc);

  // now we are going to construct the keys to remove from rocksdb
  // unique indexes have a different key structure
  StringRef docKey(doc.get(StaticStrings::KeyString));

  size_t const count = words.size();
  for (size_t i = 0; i < count; ++i) {
    std::string const& word = words[i];
    RocksDBKey key =
        RocksDBKey::FulltextIndexValue(_objectId, StringRef(word), docKey);
    batch->Delete(key.string());
  }
  return TRI_ERROR_NO_ERROR;
}

/// @brief run a compaction over the index's key range
int RocksDBFulltextIndex::cleanup() {
  rocksdb::TransactionDB* db = rocksutils::globalRocksDB();
  rocksdb::CompactRangeOptions opts;
  RocksDBKeyBounds bounds =
      RocksDBKeyBounds::FulltextIndexPrefix(_objectId, StringRef());
  rocksdb::Slice b = bounds.start(), e = bounds.end();
  db->CompactRange(opts, &b, &e);
  return TRI_ERROR_NO_ERROR;
}

/// @brief walk over the attribute. Also extract sub-attributes and elements
/// in lists.
static void ExtractWords(std::vector<std::string>& words,
                         VPackSlice const value, size_t minWordLength,
                         int level) {
  if (value.isString()) {
    // extract the string value for the indexed attribute
    std::string text = value.copyString();

    // parse the document text
    arangodb::basics::Utf8Helper::DefaultUtf8Helper.getWords(
        words, text, minWordLength, TRI_FULLTEXT_MAX_WORD_LENGTH, true);
    // We don't care about the result. If it is false, words stays
    // unchanged and nothing is indexed
  } else if (value.isArray() && level == 0) {
    for (auto const& v : VPackArrayIterator(value)) {
      ExtractWords(words, v, minWordLength, level + 1);
    }
  } else if (value.isObject() && level == 0) {
    for (auto const& v : VPackObjectIterator(value)) {
      ExtractWords(words, v.value, minWordLength, level + 1);
    }
  }
}

/// @brief callback function called by the fulltext index to determine the
/// words to index for a specific document
std::vector<std::string> RocksDBFulltextIndex::wordlist(VPackSlice const& doc) {
  std::vector<std::string> words;
  try {
    VPackSlice const value = doc.get(_attr);

    if (!value.isString() && !value.isArray() && !value.isObject()) {
      // Invalid input
      return words;
    }
    ExtractWords(words, value, _minWordLength, 0);
  } catch (...) {
    // Backwards compatibility:
    // the pre-vpack implementation simply ignored all errors and returned
    // nullptr
    return words;
  }
  return words;
}

/// @brief parse a fulltext query string into a list of query tokens
Result RocksDBFulltextIndex::parseQueryString(std::string const& qstr,
                                              FulltextQuery& query) {
  if (qstr.empty()) {
    return Result(TRI_ERROR_BAD_PARAMETER);
  }

  const char* ptr = qstr.data();
  int i = 0;

  while (*ptr) {
    char c = *ptr;
    // ignore whitespace
    if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' ||
        c == '\b' || c == ',') {
      ++ptr;
      continue;
    }

    // defaults
    FulltextQueryToken::Operation operation = FulltextQueryToken::AND;
    FulltextQueryToken::MatchType matchType = FulltextQueryToken::COMPLETE;

    // word begin
    // get operation
    if (c == '+') {
      operation = FulltextQueryToken::AND;
      ++ptr;
    } else if (c == '|') {
      operation = FulltextQueryToken::OR;
      ++ptr;
    } else if (c == '-') {
      operation = FulltextQueryToken::EXCLUDE;
      ++ptr;
    }

    // find a word with ':' at the end, i.e. prefix: or complete:
    // set ptr to the end of the word
    char const* split = nullptr;
    char const* start = ptr;
    while (*ptr) {
      c = *ptr;
      if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' ||
          c == '\b' || c == ',') {
        // end of word
        break;
      } else if (split == nullptr && c == ':') {
        split = ptr + 1;
      }
      ++ptr;
    }
    char const* end = ptr;

    if ((end - start == 0) || (split != nullptr && split - start == 0) ||
        (split != nullptr && end - split == 0)) {
      // invalid string
      return Result(TRI_ERROR_BAD_PARAMETER);
    }

    // get command
    if (split != nullptr) {
      if (TRI_CaseEqualString(start, "prefix:", strlen("prefix:"))) {
        matchType = FulltextQueryToken::PREFIX;
      } else if (TRI_CaseEqualString(start, "complete:", strlen("complete:"))) {
        matchType = FulltextQueryToken::COMPLETE;
      }
      start = split;
    }

    // normalize a word for a fulltext search query. this will create a copy
    // of the word
    char const* word = start;
    size_t wordLength = (size_t)(end - start);

    TRI_ASSERT(end >= start);
    size_t outLength;
    char* normalized = TRI_normalize_utf8_to_NFC(TRI_UNKNOWN_MEM_ZONE, word,
                                                 wordLength, &outLength);
    if (normalized == nullptr) {
      return Result(TRI_ERROR_OUT_OF_MEMORY);
    }

    // lower case string
    int32_t outLength2;
    char* lowered = TRI_tolower_utf8(TRI_UNKNOWN_MEM_ZONE, normalized,
                                     (int32_t)outLength, &outLength2);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, normalized);
    if (lowered == nullptr) {
      return Result(TRI_ERROR_OUT_OF_MEMORY);
    }
    // emplace_back below may throw
    TRI_DEFER(TRI_Free(TRI_UNKNOWN_MEM_ZONE, lowered));

    // calculate the proper prefix
    char* prefixEnd =
        TRI_PrefixUtf8String(lowered, TRI_FULLTEXT_MAX_WORD_LENGTH);
    ptrdiff_t prefixLength = prefixEnd - lowered;

    query.emplace_back(std::string(lowered, (size_t)prefixLength), matchType,
                       operation);

    ++i;
    if (i >= TRI_FULLTEXT_SEARCH_MAX_WORDS) {
      break;
    }
  }

  if (!query.empty()) {
    query[0].operation = FulltextQueryToken::OR;
  }
  return Result(i == 0 ? TRI_ERROR_BAD_PARAMETER : TRI_ERROR_NO_ERROR);
}

/// @brief execute a parsed fulltext query and append up to maxResults
/// matching documents to the builder
Result RocksDBFulltextIndex::executeQuery(transaction::Methods* trx,
                                          FulltextQuery const& query,
                                          size_t maxResults,
                                          VPackBuilder& builder) {
  std::set<std::string> resultSet;
  for (FulltextQueryToken const& token : query) {
    applyQueryToken(trx, token, resultSet);
  }

  auto physical = static_cast<RocksDBCollection*>(_collection->getPhysical());
  auto idx = physical->primaryIndex();
  ManagedDocumentResult mmdr;

  if (maxResults == 0) {
    // 0 apparently means "all results"
    maxResults = SIZE_MAX;
  }

  builder.openArray();
  // get the first N results
  std::set<std::string>::const_iterator it = resultSet.cbegin();
  while (maxResults > 0 && it != resultSet.cend()) {
    RocksDBToken token = idx->lookupKey(trx, StringRef(*it));
    if (token.revisionId()) {
      if (physical->readDocument(trx, token, mmdr)) {
        mmdr.addToBuilder(builder, true);
        maxResults--;
      }
    }
    ++it;
  }
  builder.close();

  return Result();
}

static RocksDBKeyBounds MakeBounds(uint64_t oid,
                                   FulltextQueryToken const& token) {
  if (token.matchType == FulltextQueryToken::COMPLETE) {
    return RocksDBKeyBounds::FulltextIndexComplete(oid, StringRef(token.value));
  } else if (token.matchType == FulltextQueryToken::PREFIX) {
    return RocksDBKeyBounds::FulltextIndexPrefix(oid, StringRef(token.value));
  }
  THROW_ARANGO_EXCEPTION(TRI_ERROR_NOT_IMPLEMENTED);
}

/// @brief apply a single query token to the current result set
Result RocksDBFulltextIndex::applyQueryToken(
    transaction::Methods* trx, FulltextQueryToken const& token,
    std::set<std::string>& resultSet) {
  RocksDBTransactionState* state = rocksutils::toRocksTransactionState(trx);
  rocksdb::Transaction* rtrx = state->rocksTransaction();
  auto const& options = state->readOptions();
  TRI_ASSERT(options.snapshot != nullptr);

  // RocksDBKeyBounds has no assignment operator, so construct in place
  RocksDBKeyBounds bounds = MakeBounds(_objectId, token);

  std::unique_ptr<rocksdb::Iterator> iter(rtrx->GetIterator(options));
  iter->Seek(bounds.start());

  std::set<std::string> intersect;
  // TODO: set options.iterate_upper_bound and remove the compare?
  // apply left-to-right logic, merging all current results with ALL previous
  while (iter->Valid() && _cmp->Compare(iter->key(), bounds.end()) < 0) {
    rocksdb::Status s = iter->status();
    if (!s.ok()) {
      return rocksutils::convertStatus(s);
    }

    StringRef key = RocksDBKey::primaryKey(iter->key());
    if (token.operation == FulltextQueryToken::AND) {
      intersect.insert(key.toString());
    } else if (token.operation == FulltextQueryToken::OR) {
      resultSet.insert(key.toString());
    } else if (token.operation == FulltextQueryToken::EXCLUDE) {
      resultSet.erase(key.toString());
    }
    iter->Next();
  }
  if (token.operation == FulltextQueryToken::AND) {
    if (resultSet.empty() || intersect.empty()) {
      resultSet.clear();
    } else {
      std::set<std::string> output;
      std::set_intersection(resultSet.begin(), resultSet.end(),
                            intersect.begin(), intersect.end(),
                            std::inserter(output, output.begin()));
      resultSet = std::move(output);
    }
  }
  return Result();
}
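
// Illustrative sketch (comment only, example values are hypothetical): how a
// query string is interpreted by parseQueryString() and applyQueryToken()
// above. Words are separated by whitespace or ',', may be preceded by
// '+' (AND, the default), '|' (OR) or '-' (EXCLUDE), and may carry a
// "prefix:" or "complete:" match type; each word is NFC-normalized and
// lowercased. The first token's operation is always rewritten to OR so that
// it seeds the result set. For example, the query string
//
//   "brown,|quick,-prefix:fox"
//
// yields the tokens
//
//   { value: "brown", matchType: COMPLETE, operation: OR      }
//   { value: "quick", matchType: COMPLETE, operation: OR      }
//   { value: "fox",   matchType: PREFIX,   operation: EXCLUDE }
//
// which executeQuery() folds left to right: documents containing "brown" or
// "quick" are collected first, then every document whose indexed attribute
// contains a word starting with "fox" is removed from the result set.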