// 1
// 0
// Fork 0
// arangodb/arangod/Indexes/FulltextIndex.cpp
//
// 319 lines
// 10 KiB
// C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
////////////////////////////////////////////////////////////////////////////////
#include "FulltextIndex.h"
#include "Basics/logging.h"
#include "Basics/Utf8Helper.h"
#include "FulltextIndex/fulltext-index.h"
#include "FulltextIndex/fulltext-wordlist.h"
#include "VocBase/document-collection.h"
#include "VocBase/transaction.h"
#include "VocBase/VocShaper.h"
using namespace triagens::arango;
////////////////////////////////////////////////////////////////////////////////
/// @brief extraction context handed, as an opaque void*, to the shape-walking
/// callbacks ArrayTextExtractor and ListTextExtractor
///
/// The struct only carries pointers; it has no constructor or destructor of
/// its own. wordlist() fills it with brace-initialization, so the order of the
/// members must not change.
////////////////////////////////////////////////////////////////////////////////
struct TextExtractorContext {
// output: one (pointer, length) pair per string value found by the callbacks
std::vector<std::pair<char const*, size_t>>* _positions;
// shaper that ListTextExtractor passes on when it walks a nested object
VocShaper* _shaper;
};
////////////////////////////////////////////////////////////////////////////////
/// @brief callback for TRI_IterateShapeDataArray: records the string value of
/// the visited attribute, if one can be extracted
///
/// @param shaper      not used by this callback
/// @param shape       shape of the visited value
/// @param shapedJson  shaped data of the visited value
/// @param length      not used by this callback
/// @param data        pointer to the TextExtractorContext collecting results
/// @return always true (presumably "continue iterating" - confirm against
///         TRI_IterateShapeDataArray)
///
/// The unnamed third argument is ignored. Values for which
/// TRI_StringValueShapedJson reports failure are skipped.
////////////////////////////////////////////////////////////////////////////////
static bool ArrayTextExtractor(VocShaper* shaper, TRI_shape_t const* shape,
                               char const*, char const* shapedJson,
                               uint64_t length, void* data) {
  char* value;
  size_t valueLength;

  if (!TRI_StringValueShapedJson(shape, shapedJson, &value, &valueLength)) {
    // no string value could be extracted, nothing to record
    return true;
  }

  auto* context = static_cast<TextExtractorContext*>(data);

  try {
    context->_positions->emplace_back(value, valueLength);
  } catch (...) {
    // NOTE(review): a failing emplace_back is swallowed on purpose here, so
    // the value is silently left out of the result
  }

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief callback for TRI_IterateShapeDataList: collects the string values
/// of one list element
///
/// @param shaper      not used directly; the shaper stored in the context is
///                    used for nested iteration instead
/// @param shape       shape of the visited list element
/// @param shapedJson  shaped data of the visited list element
/// @param length      not used by this callback
/// @param data        pointer to the TextExtractorContext collecting results
/// @return always true
///
/// A sub-object element is walked with ArrayTextExtractor, a short or long
/// string element is recorded directly, every other element type is ignored.
////////////////////////////////////////////////////////////////////////////////
static bool ListTextExtractor(VocShaper* shaper, TRI_shape_t const* shape,
                              char const* shapedJson, uint64_t length,
                              void* data) {
  auto* context = static_cast<TextExtractorContext*>(data);

  if (shape->_type == TRI_SHAPE_ARRAY) {
    // a sub-object: let the array extractor visit its attributes
    TRI_IterateShapeDataArray(context->_shaper, shape, shapedJson,
                              ArrayTextExtractor, data);
    return true;
  }

  bool const isString = (shape->_type == TRI_SHAPE_SHORT_STRING ||
                         shape->_type == TRI_SHAPE_LONG_STRING);
  if (!isString) {
    return true;
  }

  char* value;
  size_t valueLength;

  if (TRI_StringValueShapedJson(shape, shapedJson, &value, &valueLength)) {
    try {
      context->_positions->emplace_back(value, valueLength);
    } catch (...) {
      // NOTE(review): a failing emplace_back is swallowed on purpose here, so
      // the value is silently left out of the result
    }
  }

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief create a fulltext index over a single attribute
///
/// @param iid            index id, must not be 0 (asserted)
/// @param collection     collection the index belongs to
/// @param attribute      name of the indexed attribute
/// @param minWordLength  minimum word length; values <= 0 are replaced by 1
///
/// Throws TRI_ERROR_OUT_OF_MEMORY (via THROW_ARANGO_EXCEPTION) if the
/// attribute path cannot be found or created, or if the underlying index
/// structure cannot be created.
////////////////////////////////////////////////////////////////////////////////
FulltextIndex::FulltextIndex(TRI_idx_iid_t iid,
                             TRI_document_collection_t* collection,
                             std::string const& attribute, int minWordLength)
    : Index(iid, collection,
            std::vector<std::vector<triagens::basics::AttributeName>>{
                {{attribute, false}}},
            // presumably unique = false, sparse = true, matching what toJson()
            // reports - confirm against the Index constructor
            false, true),
      _pid(0),
      _fulltextIndex(nullptr),
      _minWordLength(minWordLength <= 0 ? 1 : minWordLength) {
  TRI_ASSERT(iid != 0);

  // resolve the attribute name to its attribute path id
  _pid = _collection->getShaper()  // ONLY IN INDEX, PROTECTED by RUNTIME
             ->findOrCreateAttributePathByName(attribute.c_str());

  if (_pid == 0) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY);
  }

  // NOTE(review): the meaning of the arguments (2048, 1, 1) is defined by
  // TRI_CreateFtsIndex and is not visible from this file
  _fulltextIndex = TRI_CreateFtsIndex(2048, 1, 1);

  if (_fulltextIndex == nullptr) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY);
  }
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroy the index and free the underlying fulltext index, if any
////////////////////////////////////////////////////////////////////////////////
FulltextIndex::~FulltextIndex() {
  if (_fulltextIndex == nullptr) {
    // nothing was created, nothing to free
    return;
  }

  LOG_TRACE("destroying fulltext index");
  TRI_FreeFtsIndex(_fulltextIndex);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief return the memory usage of the index, as reported by
/// TRI_MemoryFulltextIndex
////////////////////////////////////////////////////////////////////////////////
size_t FulltextIndex::memory() const {
  size_t const usage = TRI_MemoryFulltextIndex(_fulltextIndex);
  return usage;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief return a JSON representation of the index
///
/// @param zone         memory zone handed to the base class and used for the
///                     "minLength" value
/// @param withFigures  passed through to Index::toJson
/// @return the base class description extended by "unique" (always false),
///         "sparse" (always true) and "minLength"
////////////////////////////////////////////////////////////////////////////////
triagens::basics::Json FulltextIndex::toJson(TRI_memory_zone_t* zone,
                                             bool withFigures) const {
  using triagens::basics::Json;

  auto description = Index::toJson(zone, withFigures);

  // hard-coded: a fulltext index is reported as non-unique and sparse
  description("unique", Json(false))("sparse", Json(true));

  double const minLength = static_cast<double>(_minWordLength);
  description("minLength", Json(zone, minLength));

  return description;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief return a JSON representation of the index figures
///
/// @param zone  not used by this implementation
/// @return an object with the single member "memory", holding the value of
///         memory() as a double
////////////////////////////////////////////////////////////////////////////////
triagens::basics::Json FulltextIndex::toJsonFigures(
    TRI_memory_zone_t* zone) const {
  using triagens::basics::Json;

  Json figures(Json::Object);

  double const usage = static_cast<double>(memory());
  figures("memory", Json(usage));

  return figures;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief add the words of a document to the index
///
/// @param doc         master pointer of the document; its address, cast to
///                    TRI_fulltext_doc_t, identifies the document in the index
/// @param isRollback  not used by this implementation
/// @return TRI_ERROR_NO_ERROR on success or when no wordlist could be built,
///         TRI_ERROR_INTERNAL when the index rejects the words
///
/// The transaction argument is ignored.
////////////////////////////////////////////////////////////////////////////////
int FulltextIndex::insert(triagens::arango::Transaction*,
                          TRI_doc_mptr_t const* doc, bool isRollback) {
  TRI_fulltext_wordlist_t* extracted = wordlist(doc);

  if (extracted == nullptr) {
    // TODO: distinguish the cases "empty wordlist" and "out of memory"
    return TRI_ERROR_NO_ERROR;
  }

  int status = TRI_ERROR_NO_ERROR;
  bool const hasWords = extracted->_numWords > 0;

  // TODO: use status codes
  if (hasWords &&
      !TRI_InsertWordsFulltextIndex(
          _fulltextIndex, (TRI_fulltext_doc_t)((uintptr_t)doc), extracted)) {
    LOG_ERROR("adding document to fulltext index failed");
    status = TRI_ERROR_INTERNAL;
  }

  // the wordlist is freed here on both the success and the failure path
  TRI_FreeWordlistFulltextIndex(extracted);

  return status;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief remove a document from the index
///
/// @param doc  master pointer of the document; its address, cast to
///             TRI_fulltext_doc_t, is the handle used for the deletion (the
///             same handle insert() registers)
/// @return always TRI_ERROR_NO_ERROR
///
/// The transaction and the trailing bool argument are ignored.
////////////////////////////////////////////////////////////////////////////////
int FulltextIndex::remove(triagens::arango::Transaction*,
                          TRI_doc_mptr_t const* doc, bool) {
  auto const handle = (TRI_fulltext_doc_t)((uintptr_t)doc);

  TRI_DeleteDocumentFulltextIndex(_fulltextIndex, handle);

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief compact the underlying fulltext index
///
/// @return TRI_ERROR_NO_ERROR if TRI_CompactFulltextIndex reports success,
///         TRI_ERROR_INTERNAL otherwise
////////////////////////////////////////////////////////////////////////////////
int FulltextIndex::cleanup() {
  LOG_TRACE("fulltext cleanup called");

  // the compaction routine is always invoked; any decision to skip the work
  // would have to be made inside TRI_CompactFulltextIndex
  if (TRI_CompactFulltextIndex(_fulltextIndex)) {
    return TRI_ERROR_NO_ERROR;
  }

  return TRI_ERROR_INTERNAL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief callback function called by the fulltext index to determine the
/// words to index for a specific document
////////////////////////////////////////////////////////////////////////////////
TRI_fulltext_wordlist_t* FulltextIndex::wordlist(
TRI_doc_mptr_t const* document) {
TRI_shaped_json_t shaped;
TRI_shaped_json_t shapedJson;
TRI_shape_t const* shape;
// extract the shape
auto shaper = _collection->getShaper();
TRI_EXTRACT_SHAPED_JSON_MARKER(
shaped, document->getDataPtr()); // ONLY IN INDEX, PROTECTED by RUNTIME
bool ok =
shaper->extractShapedJson(&shaped, 0, _pid, &shapedJson,
&shape); // ONLY IN INDEX, PROTECTED by RUNTIME
if (!ok || shape == nullptr) {
return nullptr;
}
TRI_vector_string_t* words;
// extract the string value for the indexed attribute
if (shape->_type == TRI_SHAPE_SHORT_STRING ||
shape->_type == TRI_SHAPE_LONG_STRING) {
char* text;
size_t textLength;
ok = TRI_StringValueShapedJson(shape, shapedJson._data.data, &text,
&textLength);
if (!ok) {
return nullptr;
}
// parse the document text
words = TRI_get_words(text, textLength, (size_t)_minWordLength,
(size_t)TRI_FULLTEXT_MAX_WORD_LENGTH, true);
} else if (shape->_type == TRI_SHAPE_ARRAY) {
std::vector<std::pair<char const*, size_t>> values;
TextExtractorContext context{&values, shaper};
TRI_IterateShapeDataArray(shaper, shape, shapedJson._data.data,
ArrayTextExtractor, &context);
words = nullptr;
for (auto const& it : values) {
if (!TRI_get_words(words, it.first, it.second, (size_t)_minWordLength,
(size_t)TRI_FULLTEXT_MAX_WORD_LENGTH, true)) {
if (words != nullptr) {
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
}
return nullptr;
}
}
} else if (shape->_type == TRI_SHAPE_LIST ||
shape->_type == TRI_SHAPE_HOMOGENEOUS_LIST ||
shape->_type == TRI_SHAPE_HOMOGENEOUS_SIZED_LIST) {
std::vector<std::pair<char const*, size_t>> values;
TextExtractorContext context{&values, shaper};
TRI_IterateShapeDataList(shaper, shape, shapedJson._data.data,
ListTextExtractor, &context);
words = nullptr;
for (auto const& it : values) {
if (!TRI_get_words(words, it.first, it.second, (size_t)_minWordLength,
(size_t)TRI_FULLTEXT_MAX_WORD_LENGTH, true)) {
if (words != nullptr) {
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
}
return nullptr;
}
}
} else {
words = nullptr;
}
if (words == nullptr) {
return nullptr;
}
TRI_fulltext_wordlist_t* wordlist =
TRI_CreateWordlistFulltextIndex(words->_buffer, words->_length);
if (wordlist == nullptr) {
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
return nullptr;
}
// this really is a hack, but it works well:
// make the word list vector think it's empty and free it
// this does not free the word list, that we have already over the result
words->_length = 0;
words->_buffer = nullptr;
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
return wordlist;
}