////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBFulltextIndex.h"

#include "Aql/Ast.h"
#include "Aql/AstNode.h"
#include "Basics/StaticStrings.h"
#include "Basics/Utf8Helper.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/tri-strings.h"
#include "Logger/Logger.h"
#include "RocksDBEngine/RocksDBCollection.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBMethods.h"
#include "RocksDBEngine/RocksDBTransactionState.h"
#include "RocksDBEngine/RocksDBTypes.h"

#include <rocksdb/iterator.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>
#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/utilities/write_batch_with_index.h>

#include <velocypack/Iterator.h>
#include <velocypack/StringRef.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb;

namespace arangodb {

/// El Cheapo index iterator
class RocksDBFulltextIndexIterator final : public IndexIterator {
 public:
  RocksDBFulltextIndexIterator(LogicalCollection* collection, transaction::Methods* trx,
                               std::set<LocalDocumentId>&& docs)
      : IndexIterator(collection, trx), _docs(std::move(docs)), _pos(_docs.begin()) {}

  char const* typeName() const override { return "fulltext-index-iterator"; }

  bool next(LocalDocumentIdCallback const& cb, size_t limit) override {
    TRI_ASSERT(limit > 0);
    while (_pos != _docs.end() && limit > 0) {
      cb(*_pos);
      ++_pos;
      limit--;
    }
    return _pos != _docs.end();
  }

  void reset() override { _pos = _docs.begin(); }

  void skip(uint64_t count, uint64_t& skipped) override {
    while (_pos != _docs.end() && skipped < count) {
      ++_pos;
      skipped++;
    }
  }

 private:
  std::set<LocalDocumentId> const _docs;
  std::set<LocalDocumentId>::iterator _pos;
};

}  // namespace
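/// @brief create the index from its VelocyPack definition. the definition
/// must contain exactly one indexed attribute and may carry an optional
/// "minLength" value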
RocksDBFulltextIndex::RocksDBFulltextIndex(TRI_idx_iid_t iid,
                                           arangodb::LogicalCollection& collection,
                                           arangodb::velocypack::Slice const& info)
    : RocksDBIndex(iid, collection, info, RocksDBColumnFamily::fulltext(), false),
      _minWordLength(FulltextIndexLimits::minWordLengthDefault) {
  TRI_ASSERT(iid != 0);
  TRI_ASSERT(_cf == RocksDBColumnFamily::fulltext());

  VPackSlice const value = info.get("minLength");

  if (value.isNumber()) {
    _minWordLength = value.getNumericValue<int>();
    if (_minWordLength <= 0) {
      // The min length cannot be negative.
      _minWordLength = 1;
    }
  } else if (!value.isNone()) {
    // minLength defined but no number
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
                                   "<minLength> must be a number");
  }

  _unique = false;
  _sparse = true;
  if (_fields.size() != 1) {
    // We need exactly 1 attribute
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_INTERNAL,
        "fulltext index definition should have exactly one attribute");
  }
  auto& attribute = _fields[0];
  _attr.reserve(attribute.size());
  for (auto& a : attribute) {
    _attr.emplace_back(a.name);
  }
}

/// @brief return a VelocyPack representation of the index
void RocksDBFulltextIndex::toVelocyPack(VPackBuilder& builder,
                                        std::underlying_type<Index::Serialize>::type flags) const {
  builder.openObject();
  RocksDBIndex::toVelocyPack(builder, flags);
  builder.add(arangodb::StaticStrings::IndexUnique, arangodb::velocypack::Value(false));
  builder.add(arangodb::StaticStrings::IndexSparse, arangodb::velocypack::Value(true));
  builder.add("minLength", VPackValue(_minWordLength));
  builder.close();
}

/// @brief Test if this index matches the definition
bool RocksDBFulltextIndex::matchesDefinition(VPackSlice const& info) const {
  TRI_ASSERT(info.isObject());
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
  auto typeSlice = info.get(arangodb::StaticStrings::IndexType);
  TRI_ASSERT(typeSlice.isString());
  arangodb::velocypack::StringRef typeStr(typeSlice);
  TRI_ASSERT(typeStr == oldtypeName());
#endif
  auto value = info.get(arangodb::StaticStrings::IndexId);

  if (!value.isNone()) {
    // We already have an id.
    if (!value.isString()) {
      // Invalid ID
      return false;
    }
    // Short circuit. If id is correct the index is identical.
    arangodb::velocypack::StringRef idRef(value);
    return idRef == std::to_string(_iid);
  }

  value = info.get("minLength");
  if (value.isNumber()) {
    int cmp = value.getNumericValue<int>();
    if (cmp <= 0) {
      if (_minWordLength != 1) {
        return false;
      }
    } else {
      if (_minWordLength != cmp) {
        return false;
      }
    }
  } else if (!value.isNone()) {
    // Illegal minLength
    return false;
  }

  value = info.get(arangodb::StaticStrings::IndexFields);

  if (!value.isArray()) {
    return false;
  }

  size_t const n = static_cast<size_t>(value.length());
  if (n != _fields.size()) {
    return false;
  }
  if (_unique != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, arangodb::StaticStrings::IndexUnique, false)) {
    return false;
  }
  if (_sparse != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, arangodb::StaticStrings::IndexSparse, true)) {
    return false;
  }

  // This check takes ordering of attributes into account.
  std::vector<arangodb::basics::AttributeName> translate;
  for (size_t i = 0; i < n; ++i) {
    translate.clear();
    VPackSlice f = value.at(i);
    if (!f.isString()) {
      // Invalid field definition!
      return false;
    }
    arangodb::velocypack::StringRef in(f);
    TRI_ParseAttributeString(in, translate, true);
    if (!arangodb::basics::AttributeName::isIdentical(_fields[i], translate, false)) {
      return false;
    }
  }
  return true;
}
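/// @brief insert a document into the index: stores one index entry per word
/// extracted from the indexed attribute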
Result RocksDBFulltextIndex::insert(transaction::Methods& trx, RocksDBMethods* mthd,
                                    LocalDocumentId const& documentId,
                                    velocypack::Slice const& doc, Index::OperationMode mode) {
  Result res;
  std::set<std::string> words = wordlist(doc);

  if (words.empty()) {
    return res;
  }

  // now we are going to construct the value to insert into rocksdb
  // unique indexes have a different key structure
  RocksDBValue value = RocksDBValue::VPackIndexValue();

  // size_t const count = words.size();
  for (std::string const& word : words) {
    RocksDBKeyLeaser key(&trx);
    key->constructFulltextIndexValue(_objectId, arangodb::velocypack::StringRef(word), documentId);
    TRI_ASSERT(key->containsLocalDocumentId(documentId));

    rocksdb::Status s = mthd->PutUntracked(_cf, key.ref(), value.string());
    if (!s.ok()) {
      res.reset(rocksutils::convertStatus(s, rocksutils::index));
      addErrorMsg(res);
      break;
    }
  }
  return res;
}

/// @brief remove a document from the index: deletes the index entry for each
/// word extracted from the indexed attribute
Result RocksDBFulltextIndex::remove(transaction::Methods& trx, RocksDBMethods* mthd,
                                    LocalDocumentId const& documentId,
                                    velocypack::Slice const& doc, Index::OperationMode mode) {
  Result res;
  std::set<std::string> words = wordlist(doc);

  if (words.empty()) {
    return res;
  }

  // now we are going to construct the value to insert into rocksdb
  // unique indexes have a different key structure
  for (std::string const& word : words) {
    RocksDBKeyLeaser key(&trx);
    key->constructFulltextIndexValue(_objectId, arangodb::velocypack::StringRef(word), documentId);

    rocksdb::Status s = mthd->Delete(_cf, key.ref());
    if (!s.ok()) {
      res.reset(rocksutils::convertStatus(s, rocksutils::index));
      addErrorMsg(res);
      break;
    }
  }
  return res;
}

/// @brief walk over the attribute. Also extract sub-attributes and elements in
/// lists.
static void ExtractWords(std::set<std::string>& words, VPackSlice const value,
                         size_t minWordLength, int level) {
  if (value.isString()) {
    // extract the string value for the indexed attribute
    // parse the document text
    arangodb::basics::Utf8Helper::DefaultUtf8Helper.tokenize(
        words, value.stringRef(), minWordLength, FulltextIndexLimits::maxWordLength, true);
    // We don't care for the result. If the result is false, words stays
    // unchanged and is not indexed
  } else if (value.isArray() && level == 0) {
    for (VPackSlice v : VPackArrayIterator(value)) {
      ExtractWords(words, v, minWordLength, level + 1);
    }
  } else if (value.isObject() && level == 0) {
    for (VPackObjectIterator::ObjectPair v : VPackObjectIterator(value)) {
      ExtractWords(words, v.value, minWordLength, level + 1);
    }
  }
}

/// @brief callback function called by the fulltext index to determine the
/// words to index for a specific document
std::set<std::string> RocksDBFulltextIndex::wordlist(VPackSlice const& doc) {
  std::set<std::string> words;
  VPackSlice const value = doc.get(_attr);

  if (!value.isString() && !value.isArray() && !value.isObject()) {
    // Invalid Input
    return words;
  }

  ExtractWords(words, value, _minWordLength, 0);

  return words;
}
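/// @brief parse a fulltext query string into a list of query tokens. each
/// token is normalized to NFC, lower-cased and capped at the maximum word
/// length, and carries a match type (complete or prefix) and an operation
/// (AND, OR or EXCLUDE)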
Result RocksDBFulltextIndex::parseQueryString(std::string const& qstr, FulltextQuery& query) {
  if (qstr.empty()) {
    return Result(TRI_ERROR_BAD_PARAMETER);
  }

  const char* ptr = qstr.data();
  int i = 0;

  while (*ptr) {
    char c = *ptr;
    // ignore whitespace
    if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' ||
        c == '\b' || c == ',') {
      ++ptr;
      continue;
    }

    // defaults
    FulltextQueryToken::Operation operation = FulltextQueryToken::AND;
    FulltextQueryToken::MatchType matchType = FulltextQueryToken::COMPLETE;

    // word begin
    // get operation
    if (c == '+') {
      operation = FulltextQueryToken::AND;
      ++ptr;
    } else if (c == '|') {
      operation = FulltextQueryToken::OR;
      ++ptr;
    } else if (c == '-') {
      operation = FulltextQueryToken::EXCLUDE;
      ++ptr;
    }

    // find a word with ':' at the end, i.e. prefix: or complete:
    // set ptr to the end of the word
    char const* split = nullptr;
    char const* start = ptr;
    while (*ptr) {
      c = *ptr;
      if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' ||
          c == '\b' || c == ',') {
        // end of word
        break;
      } else if (split == nullptr && c == ':') {
        split = ptr + 1;
      }
      ++ptr;
    }
    char const* end = ptr;

    if ((end - start == 0) || (split != nullptr && split - start == 0) ||
        (split != nullptr && end - split == 0)) {
      // invalid string
      return Result(TRI_ERROR_BAD_PARAMETER);
    }

    // get command
    if (split != nullptr) {
      if (TRI_CaseEqualString(start, "prefix:", strlen("prefix:"))) {
        matchType = FulltextQueryToken::PREFIX;
      } else if (TRI_CaseEqualString(start, "complete:", strlen("complete:"))) {
        matchType = FulltextQueryToken::COMPLETE;
      }
      start = split;
    }

    // normalize a word for a fulltext search query
    // this will create a copy of the word
    char const* word = start;
    size_t wordLength = (size_t)(end - start);

    TRI_ASSERT(end >= start);
    size_t outLength;
    char* normalized = TRI_normalize_utf8_to_NFC(word, wordLength, &outLength);
    if (normalized == nullptr) {
      return Result(TRI_ERROR_OUT_OF_MEMORY);
    }

    // lower case string
    int32_t outLength2;
    char* lowered = TRI_tolower_utf8(normalized, (int32_t)outLength, &outLength2);
    TRI_Free(normalized);
    if (lowered == nullptr) {
      return Result(TRI_ERROR_OUT_OF_MEMORY);
    }
    // emplace_back below may throw
    TRI_DEFER(TRI_Free(lowered));

    // calculate the proper prefix
    char* prefixEnd = TRI_PrefixUtf8String(lowered, FulltextIndexLimits::maxWordLength);
    ptrdiff_t prefixLength = prefixEnd - lowered;

    query.emplace_back(std::string(lowered, (size_t)prefixLength), matchType, operation);

    ++i;
    if (i >= FulltextIndexLimits::maxSearchWords) {
      break;
    }
  }

  if (!query.empty()) {
    query[0].operation = FulltextQueryToken::OR;
  }

  return Result(i == 0 ? TRI_ERROR_BAD_PARAMETER : TRI_ERROR_NO_ERROR);
}
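/// @brief evaluate a parsed fulltext query by applying its tokens from left
/// to right to the result set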
Result RocksDBFulltextIndex::executeQuery(transaction::Methods* trx, FulltextQuery const& query,
                                          std::set<LocalDocumentId>& resultSet) {
  for (size_t i = 0; i < query.size(); i++) {
    FulltextQueryToken const& token = query[i];
    if (i > 0 && token.operation != FulltextQueryToken::OR && resultSet.empty()) {
      // skip tokens which won't do anything
      continue;
    }
    Result res = applyQueryToken(trx, token, resultSet);
    if (res.fail()) {
      return res;
    }
  }
  return TRI_ERROR_NO_ERROR;
}

/// @brief build the RocksDB key range to scan for a single query token
static RocksDBKeyBounds MakeBounds(uint64_t oid, FulltextQueryToken const& token) {
  if (token.matchType == FulltextQueryToken::COMPLETE) {
    return RocksDBKeyBounds::FulltextIndexComplete(oid, arangodb::velocypack::StringRef(token.value));
  } else if (token.matchType == FulltextQueryToken::PREFIX) {
    return RocksDBKeyBounds::FulltextIndexPrefix(oid, arangodb::velocypack::StringRef(token.value));
  }
  THROW_ARANGO_EXCEPTION(TRI_ERROR_NOT_IMPLEMENTED);
}

/// @brief apply a single query token to the result set by scanning the
/// matching index range
Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
                                             FulltextQueryToken const& token,
                                             std::set<LocalDocumentId>& resultSet) {
  auto mthds = RocksDBTransactionState::toMethods(trx);
  // why can't I have an assignment operator when I want one
  RocksDBKeyBounds bounds = MakeBounds(_objectId, token);
  rocksdb::Slice end = bounds.end();
  rocksdb::Comparator const* cmp = this->comparator();

  rocksdb::ReadOptions ro = mthds->iteratorReadOptions();
  ro.iterate_upper_bound = &end;
  std::unique_ptr<rocksdb::Iterator> iter = mthds->NewIterator(ro, _cf);

  // set is used to perform an intersection with the result set
  std::set<LocalDocumentId> intersect;
  // apply left to right logic, merging all current results with ALL previous
  for (iter->Seek(bounds.start());
       iter->Valid() && cmp->Compare(iter->key(), end) < 0; iter->Next()) {
    TRI_ASSERT(_objectId == RocksDBKey::objectId(iter->key()));

    rocksdb::Status s = iter->status();
    if (!s.ok()) {
      return rocksutils::convertStatus(s);
    }

    LocalDocumentId documentId =
        RocksDBKey::indexDocumentId(RocksDBEntryType::FulltextIndexValue, iter->key());
    if (token.operation == FulltextQueryToken::AND) {
      intersect.insert(documentId);
    } else if (token.operation == FulltextQueryToken::OR) {
      resultSet.insert(documentId);
    } else if (token.operation == FulltextQueryToken::EXCLUDE) {
      resultSet.erase(documentId);
    }
  }

  if (token.operation == FulltextQueryToken::AND) {
    if (resultSet.empty() || intersect.empty()) {
      resultSet.clear();
    } else {
      std::set<LocalDocumentId> output;
      std::set_intersection(resultSet.begin(), resultSet.end(), intersect.begin(),
                            intersect.end(), std::inserter(output, output.begin()));
      resultSet = std::move(output);
    }
  }
  return Result();
}
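/// @brief create an index iterator for a FULLTEXT() function call in an AQL
/// condition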
std::unique_ptr<IndexIterator> RocksDBFulltextIndex::iteratorForCondition(
    transaction::Methods* trx, aql::AstNode const* condNode,
    aql::Variable const* var, IndexIteratorOptions const& opts) {
  TRI_ASSERT(!isSorted() || opts.sorted);
  TRI_ASSERT(condNode != nullptr);
  TRI_ASSERT(condNode->numMembers() == 1);  // should only be an FCALL

  aql::AstNode const* fcall = condNode->getMember(0);
  TRI_ASSERT(fcall->type == arangodb::aql::NODE_TYPE_FCALL);
  TRI_ASSERT(fcall->numMembers() == 1);
  aql::AstNode const* args = fcall->getMember(0);

  size_t numMembers = args->numMembers();
  TRI_ASSERT(numMembers == 3 || numMembers == 4);

  aql::AstNode const* queryNode = args->getMember(2);
  if (queryNode->type != aql::NODE_TYPE_VALUE ||
      queryNode->value.type != aql::VALUE_TYPE_STRING) {
    std::string message = basics::Exception::FillExceptionString(
        TRI_ERROR_QUERY_FUNCTION_ARGUMENT_TYPE_MISMATCH, "FULLTEXT");
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUERY_FUNCTION_ARGUMENT_TYPE_MISMATCH, message);
  }

  FulltextQuery parsedQuery;
  Result res = parseQueryString(queryNode->getString(), parsedQuery);
  if (res.fail()) {
    THROW_ARANGO_EXCEPTION(res);
  }

  std::set<LocalDocumentId> results;
  res = executeQuery(trx, parsedQuery, results);
  if (res.fail()) {
    THROW_ARANGO_EXCEPTION(res);
  }

  return std::make_unique<RocksDBFulltextIndexIterator>(&_collection, trx, std::move(results));
}