////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
////////////////////////////////////////////////////////////////////////////////

#include "MMFilesFulltextIndex.h"

#include "Basics/StringRef.h"
#include "Basics/Utf8Helper.h"
#include "Basics/VelocyPackHelper.h"
#include "Logger/Logger.h"
#include "MMFiles/fulltext-index.h"
#include "MMFiles/MMFilesToken.h"
#include "StorageEngine/DocumentIdentifierToken.h"
#include "StorageEngine/TransactionState.h"

#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb;

/// @brief walk over the attribute. Also extract sub-attributes and elements
/// in lists.
static void ExtractWords(std::vector<std::string>& words,
                         VPackSlice const value,
                         size_t minWordLength,
                         int level) {
  if (value.isString()) {
    // extract the string value for the indexed attribute
    std::string text = value.copyString();

    // parse the document text
    arangodb::basics::Utf8Helper::DefaultUtf8Helper.getWords(
        words, text, minWordLength, TRI_FULLTEXT_MAX_WORD_LENGTH, true);
    // We don't care for the result. If the result is false, words stays
    // unchanged and is not indexed
  } else if (value.isArray() && level == 0) {
    for (auto const& v : VPackArrayIterator(value)) {
      ExtractWords(words, v, minWordLength, level + 1);
    }
  } else if (value.isObject() && level == 0) {
    for (auto const& v : VPackObjectIterator(value)) {
      ExtractWords(words, v.value, minWordLength, level + 1);
    }
  }
}

TRI_voc_rid_t MMFilesFulltextIndex::fromDocumentIdentifierToken(
    DocumentIdentifierToken const& token) {
  auto tkn = static_cast<MMFilesToken const*>(&token);
  return tkn->revisionId();
}

DocumentIdentifierToken MMFilesFulltextIndex::toDocumentIdentifierToken(
    TRI_voc_rid_t revisionId) {
  return MMFilesToken{revisionId};
}

MMFilesFulltextIndex::MMFilesFulltextIndex(
    TRI_idx_iid_t iid, arangodb::LogicalCollection* collection,
    VPackSlice const& info)
    : Index(iid, collection, info),
      _fulltextIndex(nullptr),
      _minWordLength(TRI_FULLTEXT_MIN_WORD_LENGTH_DEFAULT) {
  TRI_ASSERT(iid != 0);

  VPackSlice const value = info.get("minLength");

  if (value.isNumber()) {
    _minWordLength = value.getNumericValue<int>();
    if (_minWordLength <= 0) {
      // The min length cannot be negative.
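      // Fall back to 1, the smallest usable word length.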
      _minWordLength = 1;
    }
  } else if (!value.isNone()) {
    // minLength defined but not a number
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER,
                                   "<minLength> must be a number");
  }

  _unique = false;
  _sparse = true;
  if (_fields.size() != 1) {
    // We need exactly 1 attribute
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_INTERNAL,
        "fulltext index definition should have exactly one attribute");
  }
  auto& attribute = _fields[0];
  _attr.reserve(attribute.size());
  for (auto& a : attribute) {
    _attr.emplace_back(a.name);
  }

  _fulltextIndex = TRI_CreateFtsIndex(2048, 1, 1);

  if (_fulltextIndex == nullptr) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY);
  }
}

MMFilesFulltextIndex::~MMFilesFulltextIndex() {
  if (_fulltextIndex != nullptr) {
    LOG_TOPIC(TRACE, arangodb::Logger::FIXME) << "destroying fulltext index";
    TRI_FreeFtsIndex(_fulltextIndex);
  }
}

size_t MMFilesFulltextIndex::memory() const {
  return TRI_MemoryMMFilesFulltextIndex(_fulltextIndex);
}

/// @brief return a VelocyPack representation of the index
void MMFilesFulltextIndex::toVelocyPack(VPackBuilder& builder,
                                        bool withFigures) const {
  Index::toVelocyPack(builder, withFigures);
  builder.add("unique", VPackValue(false));
  builder.add("sparse", VPackValue(true));
  builder.add("minLength", VPackValue(_minWordLength));
}

/// @brief Test if this index matches the definition
bool MMFilesFulltextIndex::matchesDefinition(VPackSlice const& info) const {
  TRI_ASSERT(info.isObject());
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
  VPackSlice typeSlice = info.get("type");
  TRI_ASSERT(typeSlice.isString());
  StringRef typeStr(typeSlice);
  TRI_ASSERT(typeStr == typeName());
#endif
  auto value = info.get("id");
  if (!value.isNone()) {
    // We already have an id.
    if (!value.isString()) {
      // Invalid ID
      return false;
    }
    // Short circuit. If id is correct the index is identical.
    StringRef idRef(value);
    return idRef == std::to_string(_iid);
  }

  value = info.get("minLength");
  if (value.isNumber()) {
    int cmp = value.getNumericValue<int>();
    if (cmp <= 0) {
      if (_minWordLength != 1) {
        return false;
      }
    } else {
      if (_minWordLength != cmp) {
        return false;
      }
    }
  } else if (!value.isNone()) {
    // Illegal minLength
    return false;
  }

  value = info.get("fields");
  if (!value.isArray()) {
    return false;
  }
  size_t const n = static_cast<size_t>(value.length());
  if (n != _fields.size()) {
    return false;
  }
  if (_unique != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, "unique", false)) {
    return false;
  }
  if (_sparse != arangodb::basics::VelocyPackHelper::getBooleanValue(
                     info, "sparse", true)) {
    return false;
  }

  // This check takes ordering of attributes into account.
  std::vector<arangodb::basics::AttributeName> translate;
  for (size_t i = 0; i < n; ++i) {
    translate.clear();
    VPackSlice f = value.at(i);
    if (!f.isString()) {
      // Invalid field definition!
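      // Each entry in "fields" must be an attribute path string.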
      return false;
    }
    arangodb::StringRef in(f);
    TRI_ParseAttributeString(in, translate, true);
    if (!arangodb::basics::AttributeName::isIdentical(_fields[i], translate,
                                                      false)) {
      return false;
    }
  }
  return true;
}

int MMFilesFulltextIndex::insert(TransactionMethods*, TRI_voc_rid_t revisionId,
                                 VPackSlice const& doc, bool isRollback) {
  int res = TRI_ERROR_NO_ERROR;

  std::vector<std::string> words = wordlist(doc);

  if (words.empty()) {
    // TODO: distinguish the cases "empty wordlist" and "out of memory"
    // LOG_TOPIC(WARN, arangodb::Logger::FIXME) << "could not build wordlist";
    return res;
  }

  // TODO: use status codes
  if (!TRI_InsertWordsMMFilesFulltextIndex(_fulltextIndex, revisionId, words)) {
    LOG_TOPIC(ERR, arangodb::Logger::FIXME)
        << "adding document to fulltext index failed";
    res = TRI_ERROR_INTERNAL;
  }
  return res;
}

int MMFilesFulltextIndex::remove(TransactionMethods*, TRI_voc_rid_t revisionId,
                                 VPackSlice const& doc, bool isRollback) {
  TRI_DeleteDocumentMMFilesFulltextIndex(_fulltextIndex, revisionId);

  return TRI_ERROR_NO_ERROR;
}

int MMFilesFulltextIndex::unload() {
  TRI_TruncateMMFilesFulltextIndex(_fulltextIndex);
  return TRI_ERROR_NO_ERROR;
}

int MMFilesFulltextIndex::cleanup() {
  LOG_TOPIC(TRACE, arangodb::Logger::FIXME) << "fulltext cleanup called";

  int res = TRI_ERROR_NO_ERROR;

  // check whether we should do a cleanup at all
  if (!TRI_CompactMMFilesFulltextIndex(_fulltextIndex)) {
    res = TRI_ERROR_INTERNAL;
  }
  return res;
}

/// @brief callback function called by the fulltext index to determine the
/// words to index for a specific document
std::vector<std::string> MMFilesFulltextIndex::wordlist(VPackSlice const& doc) {
  std::vector<std::string> words;
  try {
    VPackSlice const value = doc.get(_attr);

    if (!value.isString() && !value.isArray() && !value.isObject()) {
      // Invalid input
      return words;
    }

    ExtractWords(words, value, _minWordLength, 0);
  } catch (...) {
    // Backwards compatibility:
    // the pre-vpack implementation just ignored all errors and returned nullptr
    return words;
  }

  return words;
}
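// Example (hypothetical document, for illustration only): with the index
// defined on attribute "text" and minLength 3, a document such as
// { "text": "Quick brown fox" } is tokenized by wordlist()/ExtractWords()
// into the lower-cased words "quick", "brown" and "fox", which insert() then
// hands to TRI_InsertWordsMMFilesFulltextIndex() together with the document's
// revision id.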