//////////////////////////////////////////////////////////////////////////////// /// @brief fulltext index /// /// @file /// /// DISCLAIMER /// /// Copyright 2014 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Dr. Frank Celler /// @author Copyright 2014, ArangoDB GmbH, Cologne, Germany /// @author Copyright 2011-2013, triAGENS GmbH, Cologne, Germany //////////////////////////////////////////////////////////////////////////////// #include "FulltextIndex.h" #include "Basics/logging.h" #include "Basics/Utf8Helper.h" #include "FulltextIndex/fulltext-index.h" #include "FulltextIndex/fulltext-wordlist.h" #include "VocBase/document-collection.h" #include "VocBase/transaction.h" #include "VocBase/voc-shaper.h" using namespace triagens::arango; // ----------------------------------------------------------------------------- // --SECTION-- private functions // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief extraction context //////////////////////////////////////////////////////////////////////////////// struct TextExtractorContext { std::vector>* _positions; TRI_shaper_t* _shaper; }; //////////////////////////////////////////////////////////////////////////////// /// @brief walk over an array shape and extract the string values //////////////////////////////////////////////////////////////////////////////// static bool ArrayTextExtractor (TRI_shaper_t* shaper, TRI_shape_t const* shape, char const*, char const* shapedJson, uint64_t length, void* data) { char* text; size_t textLength; bool ok = TRI_StringValueShapedJson(shape, shapedJson, &text, &textLength); if (ok) { // add string value found try { static_cast(data)->_positions->emplace_back(text, textLength); } catch (...) { } } return true; } //////////////////////////////////////////////////////////////////////////////// /// @brief walk over a list shape and extract the string values //////////////////////////////////////////////////////////////////////////////// static bool ListTextExtractor (TRI_shaper_t* shaper, TRI_shape_t const* shape, char const* shapedJson, uint64_t length, void* data) { if (shape->_type == TRI_SHAPE_ARRAY) { // a sub-object TRI_IterateShapeDataArray(static_cast(data)->_shaper, shape, shapedJson, ArrayTextExtractor, data); } else if (shape->_type == TRI_SHAPE_SHORT_STRING || shape->_type == TRI_SHAPE_LONG_STRING) { char* text; size_t textLength; bool ok = TRI_StringValueShapedJson(shape, shapedJson, &text, &textLength); if (ok) { // add string value found try { static_cast(data)->_positions->emplace_back(text, textLength); } catch (...) { } } } return true; } // ----------------------------------------------------------------------------- // --SECTION-- class FulltextIndex // ----------------------------------------------------------------------------- // ----------------------------------------------------------------------------- // --SECTION-- constructors and destructors // ----------------------------------------------------------------------------- FulltextIndex::FulltextIndex (TRI_idx_iid_t iid, TRI_document_collection_t* collection, std::string const& attribute, int minWordLength) : Index(iid, collection, std::vector{ attribute }), _pid(0), _fulltextIndex(nullptr), _minWordLength(minWordLength > 0 ? minWordLength : 1) { TRI_ASSERT(iid != 0); // look up the attribute TRI_shaper_t* shaper = _collection->getShaper(); // ONLY IN INDEX, PROTECTED by RUNTIME _pid = shaper->findOrCreateAttributePathByName(shaper, attribute.c_str()); if (_pid == 0) { THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } _fulltextIndex = TRI_CreateFtsIndex(2048, 1, 1); if (_fulltextIndex == nullptr) { THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY); } } FulltextIndex::~FulltextIndex () { if (_fulltextIndex != nullptr) { LOG_TRACE("destroying fulltext index"); TRI_FreeFtsIndex(_fulltextIndex); } } // ----------------------------------------------------------------------------- // --SECTION-- public methods // ----------------------------------------------------------------------------- size_t FulltextIndex::memory () const { return TRI_MemoryFulltextIndex(_fulltextIndex); } triagens::basics::Json FulltextIndex::toJson (TRI_memory_zone_t* zone) const { auto json = Index::toJson(zone); // hard-coded json("unique", triagens::basics::Json(false)) ("sparse", triagens::basics::Json(true)); json("minLength", triagens::basics::Json(zone, static_cast(_minWordLength))); return json; } int FulltextIndex::insert (TRI_doc_mptr_t const* doc, bool isRollback) { int res = TRI_ERROR_NO_ERROR; TRI_fulltext_wordlist_t* words = wordlist(doc); if (words == nullptr) { // TODO: distinguish the cases "empty wordlist" and "out of memory" // LOG_WARNING("could not build wordlist"); return res; } if (words->_numWords > 0) { // TODO: use status codes if (! TRI_InsertWordsFulltextIndex(_fulltextIndex, (TRI_fulltext_doc_t) ((uintptr_t) doc), words)) { LOG_ERROR("adding document to fulltext index failed"); res = TRI_ERROR_INTERNAL; } } TRI_FreeWordlistFulltextIndex(words); return res; } int FulltextIndex::remove (TRI_doc_mptr_t const* doc, bool) { TRI_DeleteDocumentFulltextIndex(_fulltextIndex, (TRI_fulltext_doc_t) ((uintptr_t) doc)); return TRI_ERROR_NO_ERROR; } int FulltextIndex::cleanup () { LOG_TRACE("fulltext cleanup called"); int res = TRI_ERROR_NO_ERROR; // check whether we should do a cleanup at all if (! TRI_CompactFulltextIndex(_fulltextIndex)) { res = TRI_ERROR_INTERNAL; } return res; } // ----------------------------------------------------------------------------- // --SECTION-- private methods // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief callback function called by the fulltext index to determine the /// words to index for a specific document //////////////////////////////////////////////////////////////////////////////// TRI_fulltext_wordlist_t* FulltextIndex::wordlist (TRI_doc_mptr_t const* document) { TRI_shaped_json_t shaped; TRI_shaped_json_t shapedJson; TRI_shape_t const* shape; // extract the shape TRI_shaper_t* shaper = _collection->getShaper(); TRI_EXTRACT_SHAPED_JSON_MARKER(shaped, document->getDataPtr()); // ONLY IN INDEX, PROTECTED by RUNTIME bool ok = TRI_ExtractShapedJsonVocShaper(shaper, &shaped, 0, _pid, &shapedJson, &shape); // ONLY IN INDEX, PROTECTED by RUNTIME if (! ok || shape == nullptr) { return nullptr; } TRI_vector_string_t* words; // extract the string value for the indexed attribute if (shape->_type == TRI_SHAPE_SHORT_STRING || shape->_type == TRI_SHAPE_LONG_STRING) { char* text; size_t textLength; ok = TRI_StringValueShapedJson(shape, shapedJson._data.data, &text, &textLength); if (! ok) { return nullptr; } // parse the document text words = TRI_get_words(text, textLength, (size_t) _minWordLength, (size_t) TRI_FULLTEXT_MAX_WORD_LENGTH, true); } else if (shape->_type == TRI_SHAPE_ARRAY) { std::vector> values; TextExtractorContext context{ &values, shaper }; TRI_IterateShapeDataArray(shaper, shape, shapedJson._data.data, ArrayTextExtractor, &context); words = nullptr; for (auto const& it : values) { if (! TRI_get_words(words, it.first, it.second, (size_t) _minWordLength, (size_t) TRI_FULLTEXT_MAX_WORD_LENGTH, true)) { if (words != nullptr) { TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); } return nullptr; } } } else if (shape->_type == TRI_SHAPE_LIST || shape->_type == TRI_SHAPE_HOMOGENEOUS_LIST || shape->_type == TRI_SHAPE_HOMOGENEOUS_SIZED_LIST) { std::vector> values; TextExtractorContext context{ &values, shaper }; TRI_IterateShapeDataList(shaper, shape, shapedJson._data.data, ListTextExtractor, &context); words = nullptr; for (auto const& it : values) { if (! TRI_get_words(words, it.first, it.second, (size_t) _minWordLength, (size_t) TRI_FULLTEXT_MAX_WORD_LENGTH, true)) { if (words != nullptr) { TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); } return nullptr; } } } else { words = nullptr; } if (words == nullptr) { return nullptr; } TRI_fulltext_wordlist_t* wordlist = TRI_CreateWordlistFulltextIndex(words->_buffer, words->_length); if (wordlist == nullptr) { TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); return nullptr; } // this really is a hack, but it works well: // make the word list vector think it's empty and free it // this does not free the word list, that we have already over the result words->_length = 0; words->_buffer = nullptr; TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); return wordlist; } // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE // ----------------------------------------------------------------------------- // Local Variables: // mode: outline-minor // outline-regexp: "/// @brief\\|/// {@inheritDoc}\\|/// @page\\|// --SECTION--\\|/// @\\}" // End: