//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Dr. Frank Celler //////////////////////////////////////////////////////////////////////////////// #include "FulltextIndex.h" #include "Logger/Logger.h" #include "Basics/Utf8Helper.h" #include "FulltextIndex/fulltext-index.h" #include "VocBase/document-collection.h" #include "VocBase/transaction.h" #include #include using namespace arangodb; //////////////////////////////////////////////////////////////////////////////// /// @brief walk over the attribute. Also Extract sub-attributes and elements in /// list. //////////////////////////////////////////////////////////////////////////////// static void ExtractWords(std::vector& words, VPackSlice const value, size_t minWordLength, int level) { if (value.isString()) { // extract the string value for the indexed attribute std::string text = value.copyString(); // parse the document text arangodb::basics::Utf8Helper::DefaultUtf8Helper.getWords( words, text, minWordLength, TRI_FULLTEXT_MAX_WORD_LENGTH, true); // We don't care for the result. If the result is false, words stays // unchanged and is not indexed } else if (value.isArray() && level == 0) { for (auto const& v : VPackArrayIterator(value)) { ExtractWords(words, v, minWordLength, level + 1); } } else if (value.isObject() && level == 0) { for (auto const& v : VPackObjectIterator(value)) { ExtractWords(words, v.value, minWordLength, level + 1); } } } FulltextIndex::FulltextIndex(TRI_idx_iid_t iid, TRI_document_collection_t* collection, std::string const& attribute, int minWordLength) : Index(iid, collection, std::vector>{ {{attribute, false}}}, false, true), _fulltextIndex(nullptr), _minWordLength(minWordLength > 0 ? minWordLength : 1) { TRI_ASSERT(iid != 0); _attr = arangodb::basics::StringUtils::split(attribute, "."); _fulltextIndex = TRI_CreateFtsIndex(2048, 1, 1); if (_fulltextIndex == nullptr) { THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } } FulltextIndex::~FulltextIndex() { if (_fulltextIndex != nullptr) { LOG(TRACE) << "destroying fulltext index"; TRI_FreeFtsIndex(_fulltextIndex); } } size_t FulltextIndex::memory() const { return TRI_MemoryFulltextIndex(_fulltextIndex); } //////////////////////////////////////////////////////////////////////////////// /// @brief return a VelocyPack representation of the index //////////////////////////////////////////////////////////////////////////////// void FulltextIndex::toVelocyPack(VPackBuilder& builder, bool withFigures) const { Index::toVelocyPack(builder, withFigures); builder.add("unique", VPackValue(false)); builder.add("sparse", VPackValue(true)); builder.add("minLength", VPackValue(_minWordLength)); } int FulltextIndex::insert(arangodb::Transaction*, TRI_doc_mptr_t const* doc, bool isRollback) { int res = TRI_ERROR_NO_ERROR; std::vector words = wordlist(doc); if (words.empty()) { // TODO: distinguish the cases "empty wordlist" and "out of memory" // LOG(WARN) << "could not build wordlist"; return res; } // TODO: use status codes if (!TRI_InsertWordsFulltextIndex( _fulltextIndex, (TRI_fulltext_doc_t)((uintptr_t)doc), words)) { LOG(ERR) << "adding document to fulltext index failed"; res = TRI_ERROR_INTERNAL; } return res; } int FulltextIndex::remove(arangodb::Transaction*, TRI_doc_mptr_t const* doc, bool) { TRI_DeleteDocumentFulltextIndex(_fulltextIndex, (TRI_fulltext_doc_t)((uintptr_t)doc)); return TRI_ERROR_NO_ERROR; } int FulltextIndex::cleanup() { LOG(TRACE) << "fulltext cleanup called"; int res = TRI_ERROR_NO_ERROR; // check whether we should do a cleanup at all if (!TRI_CompactFulltextIndex(_fulltextIndex)) { res = TRI_ERROR_INTERNAL; } return res; } //////////////////////////////////////////////////////////////////////////////// /// @brief callback function called by the fulltext index to determine the /// words to index for a specific document //////////////////////////////////////////////////////////////////////////////// std::vector FulltextIndex::wordlist( TRI_doc_mptr_t const* document) { std::vector words; try { VPackSlice const slice(document->vpack()); VPackSlice const value = slice.get(_attr); if (!value.isString() && !value.isArray() && !value.isObject()) { // Invalid Input return words; } ExtractWords(words, value, _minWordLength, 0); } catch (...) { // Backwards compatibility // The pre-vpack impl. did just ignore all errors and returned nulltpr return words; } return words; }