From cbaa57a1bdf1e195d96f6723fc63ab4cc5eb9b22 Mon Sep 17 00:00:00 2001 From: Jan Steemann Date: Fri, 7 Dec 2012 18:00:01 +0100 Subject: [PATCH] cleanup of fulltext indexes, resizing etc. --- arangod/FulltextIndex/fulltext-index.c | 417 +++++++++++++++++++------ arangod/FulltextIndex/fulltext-index.h | 10 +- arangod/FulltextIndex/zstr.c | 3 +- arangod/V8Server/v8-vocbase.cpp | 16 +- arangod/VocBase/document-collection.c | 33 +- arangod/VocBase/document-collection.h | 4 +- arangod/VocBase/fulltext-query.h | 19 ++ arangod/VocBase/index.c | 81 +++-- arangod/VocBase/index.h | 4 +- js/server/tests/fulltext.js | 10 +- lib/Basics/Utf8Helper.cpp | 37 ++- 11 files changed, 480 insertions(+), 154 deletions(-) diff --git a/arangod/FulltextIndex/fulltext-index.c b/arangod/FulltextIndex/fulltext-index.c index b5cbe81e3e..b3bd34b3a9 100644 --- a/arangod/FulltextIndex/fulltext-index.c +++ b/arangod/FulltextIndex/fulltext-index.c @@ -75,19 +75,30 @@ extern ZCOD zcdh; /// @brief maximum number of Unicode characters for an indexed word //////////////////////////////////////////////////////////////////////////////// -#define MAX_WORD_LENGTH (40) +#define MAX_WORD_LENGTH (40) //////////////////////////////////////////////////////////////////////////////// /// @brief gap between two words in a temporary search buffer //////////////////////////////////////////////////////////////////////////////// -#define SPACING (10) +#define SPACING (10) //////////////////////////////////////////////////////////////////////////////// /// @brief maximum tolerable occupancy of the index (e.g. 60 %) //////////////////////////////////////////////////////////////////////////////// -#define HEALTH_THRESHOLD (75) +#define HEALTH_THRESHOLD (75) + +//////////////////////////////////////////////////////////////////////////////// +/// @brief index extra growth factor +/// if 1.0, the index will be resized to the values originally suggested. As +/// resizing is expensive, one might want to decrease the overall number of +/// resizings. This can be done by setting this number to a value bigger than +/// 1.0 +/// TODO: find a good default value for this +//////////////////////////////////////////////////////////////////////////////// + +#define EXTRA_GROWTH_FACTOR (1.0) //////////////////////////////////////////////////////////////////////////////// /// @brief the actual index struct used @@ -104,9 +115,10 @@ typedef struct { TUBER* _index1; TUBER* _index2; TUBER* _index3; + uint64_t _ix3KKey; - int64_t _maxDocuments; - int64_t _numDocuments; + uint64_t _maxDocuments; + uint64_t _numDocuments; FTS_texts_t* (*getTexts)(FTS_document_id_t, void*); void (*freeWordlist)(FTS_texts_t*); @@ -168,6 +180,20 @@ static uint64_t GetUnicode (uint8_t** ptr) { return 0; } +//////////////////////////////////////////////////////////////////////////////// +/// @brief translate zstr error code into TRI_error code +//////////////////////////////////////////////////////////////////////////////// + +static int TranslateZStrErrorCode (int zstrErrorCode) { + assert(zstrErrorCode != 0); + + if (zstrErrorCode == 2) { + return TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE; + } + + return TRI_ERROR_OUT_OF_MEMORY; +} + //////////////////////////////////////////////////////////////////////////////// /// @brief add a document to the index //////////////////////////////////////////////////////////////////////////////// @@ -195,6 +221,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw uint64_t bkey = 0; uint64_t docb, dock; int res; + int res2; ix = (FTS_real_index*) ftx; @@ -202,6 +229,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw handle = ix->_firstFree; if (handle == 0) { // no more document handles free + LOG_ERROR("fail on %d", __LINE__); return TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE; } @@ -227,7 +255,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw // check for out of memory if (zstrwl == NULL || zstr2a == NULL || zstr2b == NULL || x3zstr == NULL || x3zstrb == NULL) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } @@ -243,7 +271,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw unicode = GetUnicode(&utf); while (unicode != 0) { if (ZStrEnc(zstrwl, &zcutf, unicode) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } @@ -256,13 +284,13 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw // terminate the word and insert into STEX if (ZStrEnc(zstrwl, &zcutf, 0) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } ZStrNormalize(zstrwl); if (ZStrSTAppend(stex, zstrwl) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } @@ -279,7 +307,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw for (wdx = 0; wdx < nowords; wdx++) { // get it out as a word if (ZStrInsert(zstrwl, wpt, 2) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } @@ -307,7 +335,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw // get the Z-string for the index-2 entry before this letter i = ZStrTuberRead(ix->_index2, kkey[j], zstr2a); if (i == 1) { - res = TRI_ERROR_INTERNAL; // TODO: check with Richard + res = TRI_ERROR_INTERNAL; goto oom; } @@ -335,6 +363,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw if (newlet != tran) { // if not there, create a new index-2 entry for it bkey = ZStrTuberIns(ix->_index2, kkey[j], tran); + // TODO: check bkey for INSFAIL kkey[j + 1] = ZStrTuberK(ix->_index2, kkey[j], tran, bkey); // update old index-2 entry to insert new letter ZStrCxClear(&zcdelt, &ctx2a); @@ -344,14 +373,14 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw ZStrClear(zstr2b); x64 = ZStrBitsOut(zstr2a, 1); if (ZStrBitsIn(x64, 1, zstr2b) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (x64 == 1) { // copy over the B-key into index 3 docb = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b, &zcbky, docb) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } @@ -364,27 +393,27 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw break; } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } x64 = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b, &zcbky, x64) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (ZStrEnc(zstr2b, &zcbky, bkey) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (newlet == oldlet) { if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } @@ -392,24 +421,25 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw while (newlet != oldlet) { oldlet = newlet; if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } x64 = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b, &zcbky, x64) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } newlet = ZStrCxDec(zstr2a, &zcdelt, &ctx2a); } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } ZStrNormalize(zstr2b); - if (ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res2 = ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b); + if (res2 != 0) { + res = TranslateZStrErrorCode(res2); goto oom; } } @@ -424,7 +454,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw // so read the zstr from index2 i = ZStrTuberRead(ix->_index2, kkey[j], zstr2a); if (i == 1) { - res = TRI_ERROR_INTERNAL; // TODO: check with Richard + res = TRI_ERROR_INTERNAL; goto oom; } // is there already an index-3 entry available? @@ -435,6 +465,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw } else { docb = ZStrTuberIns(ix->_index3, kkey[j], 0); + // TODO: check docb // put it into index 2 ZStrCxClear(&zcdelt, &ctx2a); ZStrCxClear(&zcdelt, &ctx2b); @@ -443,12 +474,12 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw ZStrClear(zstr2b); x64 = ZStrBitsOut(zstr2a, 1); if (ZStrBitsIn(1, 1, zstr2b) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (ZStrEnc(zstr2b, &zcbky, docb) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } @@ -461,18 +492,19 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } x64 = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b,&zcbky, x64) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } ZStrNormalize(zstr2b); - if (ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res2 = ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b); + if (res2 != 0) { + res = TranslateZStrErrorCode(res2); goto oom; } } @@ -481,7 +513,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw i = ZStrTuberRead(ix->_index3, dock, x3zstr); ZStrClear(x3zstrb); if (i == 1) { - res = TRI_ERROR_INTERNAL; // TODO: check with Richard + res = TRI_ERROR_INTERNAL; goto oom; } @@ -496,37 +528,38 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw } if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, handle) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (newhan == oldhan) { if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, handle) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } else { if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } while (newhan != oldhan) { oldhan = newhan; newhan = ZStrCxDec(x3zstr, &zcdoc, &x3ctx); if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } } ZStrNormalize(x3zstrb); - if (ZStrTuberUpdate(ix->_index3, dock, x3zstrb) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res2 = ZStrTuberUpdate(ix->_index3, dock, x3zstrb); + if (res2 != 0) { + res = TranslateZStrErrorCode(res2); goto oom; } @@ -545,7 +578,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw tran = ZStrXlate(&zcutf, ixlet[j2]); i = ZStrTuberRead(ix->_index1, kkey1[j2 + 1], zstr2a); if (i == 1) { - res = TRI_ERROR_INTERNAL; // TODO: check with Richard + res = TRI_ERROR_INTERNAL; goto oom; } // look to see if the letter is there @@ -565,6 +598,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw if (newlet != tran) { // if not there, create a new index-1 entry for it bkey = ZStrTuberIns(ix->_index1, kkey1[j2 + 1], tran); + // TODO: check bkey kkey1[j2] = ZStrTuberK(ix->_index1, kkey1[j2 + 1], tran, bkey); // update old index-1 entry to insert new letter ZStrCxClear(&zcdelt, &ctx2a); @@ -580,26 +614,26 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw break; } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } x64 = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b, &zcbky, x64) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (ZStrEnc(zstr2b, &zcbky, bkey) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } if (newlet == oldlet) { if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } @@ -607,24 +641,25 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw while (newlet != oldlet) { oldlet = newlet; if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } x64 = ZStrDec(zstr2a, &zcbky); if (ZStrEnc(zstr2b, &zcbky, x64) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } newlet = ZStrCxDec(zstr2a, &zcdelt, &ctx2a); } if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res = TRI_ERROR_OUT_OF_MEMORY; goto oom; } } ZStrNormalize(zstr2b); - if (ZStrTuberUpdate(ix->_index1, kkey1[j2 + 1], zstr2b) != 0) { - LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY; + res2 = ZStrTuberUpdate(ix->_index1, kkey1[j2 + 1], zstr2b); + if (res2 != 0) { + res = TranslateZStrErrorCode(res2); goto oom; } } @@ -685,11 +720,15 @@ static int RealDeleteDocument (FTS_index_t* ftx, FTS_document_id_t docid) { } if (i > ix->_lastSlot) { + LOG_ERROR("fail on %d", __LINE__); return TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE; } ix->_handlesFree[i] = 1; - ix->_numDocuments--; + if (ix->_numDocuments > 0) { + // should never underflow + ix->_numDocuments--; + } return TRI_ERROR_NO_ERROR; } @@ -946,11 +985,17 @@ static void Ix2Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk2) { /// @brief index recursion, prefix matching //////////////////////////////////////////////////////////////////////////////// -static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t* wd) { +static int Ix1Recurs (STEX* dochan, + FTS_real_index* ix, + uint64_t kk1, + uint64_t* wd) { ZSTR* zstr; CTX ctx; uint64_t newlet; uint64_t kk2; + int res; + + res = TRI_ERROR_NO_ERROR; kk2 = FindKKey2(ix,wd); @@ -961,13 +1006,11 @@ static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t* // index 1 entry for this prefix zstr = ZStrCons(10); if (zstr == NULL) { - // TODO: out of memory + return TRI_ERROR_OUT_OF_MEMORY; } if (ZStrTuberRead(ix->_index1, kk1, zstr) == 1) { - // TODO: make this return an error instead - printf("recursion failed to read kk1\n"); - exit(1); + return TRI_ERROR_INTERNAL; } ZStrCxClear(&zcdelt, &ctx); @@ -986,10 +1029,16 @@ static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t* bkey = ZStrDec(zstr, &zcbky); newkk1 = ZStrTuberK(ix->_index1, kk1, newlet, bkey); *(wd - 1) = newlet; - Ix1Recurs(dochan, ix, newkk1, wd - 1); + + res = Ix1Recurs(dochan, ix, newkk1, wd - 1); + if (res != TRI_ERROR_NO_ERROR) { + return res; + } } ZStrDest(zstr); + + return res; } //////////////////////////////////////////////////////////////////////////////// @@ -1072,15 +1121,9 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) { ix = (FTS_real_index*) ftx; health = (ix->_numDocuments * 100) / ix->_maxDocuments; - stats[0] = (health * (ix->_numDocuments + 5)) / 50; - - if (stats[0] < 5) { - stats[0] = 5; - } if (ix->_options == FTS_INDEX_SUBSTRINGS) { ZStrTuberStats(ix->_index1, st); - // LOG_TRACE("index 1 health %d size %d", (int) st[0], (int) st[1]); stats[1] = st[1]; if (health < st[0]) { health = st[0]; @@ -1091,19 +1134,30 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) { } ZStrTuberStats(ix->_index2, st); - // LOG_TRACE("index 2 health %d size %d", (int) st[0], (int) st[1]); stats[2] = st[1]; if (health < st[0]) { health = st[0]; } ZStrTuberStats(ix->_index3, st); - // LOG_TRACE("index 3 health %d size %d", (int) st[0], (int) st[1]); stats[3] = st[1]; if (health < st[0]) { health = st[0]; } + stats[0] = (health * (ix->_numDocuments + 5)) / 50; + if (stats[0] < (ix->_numDocuments + 5)) { + stats[0] = (ix->_numDocuments + 5); + } + + if (EXTRA_GROWTH_FACTOR > 1.0) { + size_t i; + + for (i = 0; i < 4; ++i) { + stats[i] = (uint64_t) ((double) stats[i] * (double) EXTRA_GROWTH_FACTOR); + } + } + return (int) health; } @@ -1115,6 +1169,7 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) { //////////////////////////////////////////////////////////////////////////////// FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx, + FTS_document_id_t excludeDocument, uint64_t sizes[4]) { FTS_real_index* old; FTS_index_t* clone; @@ -1126,15 +1181,24 @@ FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx, if (clone != NULL) { // copy documents FTS_document_id_t i; + uint64_t count = 0; for (i = 1; i <= old->_lastSlot; i++) { + FTS_document_id_t found; int res; if (old->_handlesFree[i] == 1) { + // document is marked as deleted continue; } - res = FTS_AddDocument(clone, old->_handles[i]); + found = old->_handles[i]; + if (found == excludeDocument) { + // do not insert this document, because the caller will insert it later + continue; + } + + res = FTS_AddDocument(clone, found); if (res != TRI_ERROR_NO_ERROR && res != TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) { // if resize fails, everything's ruined LOG_ERROR("resizing the fulltext index failed with %d, sizes were: %llu %llu %llu %llu", @@ -1147,7 +1211,11 @@ FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx, FTS_FreeIndex(clone); return NULL; } + + ++count; } + + LOG_DEBUG("cloned %llu documents", (unsigned long long) count); } return clone; @@ -1170,11 +1238,11 @@ FTS_index_t* FTS_CreateIndex (void* context, FTS_real_index* ix; int i; - LOG_INFO("creating fulltext index with sizes %llu %llu %llu %llu", - (unsigned long long) sizes[0], - (unsigned long long) sizes[1], - (unsigned long long) sizes[2], - (unsigned long long) sizes[3]); + LOG_TRACE("creating fulltext index with sizes %llu %llu %llu %llu", + (unsigned long long) sizes[0], + (unsigned long long) sizes[1], + (unsigned long long) sizes[2], + (unsigned long long) sizes[3]); ix = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(FTS_real_index), false); if (ix == NULL) { @@ -1194,10 +1262,11 @@ FTS_index_t* FTS_CreateIndex (void* context, return NULL; } - ix->_maxDocuments = (int64_t) sizes[0]; + ix->_maxDocuments = sizes[0]; ix->_numDocuments = 0; ix->_context = context; ix->_options = options; + ix->_ix3KKey = 0; // wordlists retrieval function ix->getTexts = getTexts; @@ -1335,11 +1404,11 @@ int FTS_AddDocument (FTS_index_t* ftx, FTS_document_id_t docid) { health = FTS_HealthIndex(ftx, sizes); if (health > HEALTH_THRESHOLD || res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) { - LOG_INFO("Add document: health exceeds threshold. suggested sizes are: %llu %llu %llu %llu", - (unsigned long long) sizes[0], - (unsigned long long) sizes[1], - (unsigned long long) sizes[2], - (unsigned long long) sizes[3]); + LOG_TRACE("fulltext index health threshold exceeded. new suggested sizes are: %llu %llu %llu %llu", + (unsigned long long) sizes[0], + (unsigned long long) sizes[1], + (unsigned long long) sizes[2], + (unsigned long long) sizes[3]); res = TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE; } @@ -1392,13 +1461,110 @@ int FTS_UpdateDocument (FTS_index_t* ftx, FTS_document_id_t docid) { } //////////////////////////////////////////////////////////////////////////////// -/// @brief current not called. TODO: find out what its intention is +/// @brief read index3 and remove handles of unused documents. stop after docs +/// deletions. the scan & cleanup is incremental +/// the caller must have write-locked the index //////////////////////////////////////////////////////////////////////////////// -void FTS_BackgroundTask (FTS_index_t* ftx) { - /* obtain LOCKMAIN */ - /* remove deleted handles from index3 not done QQQ */ - /* release LOCKMAIN */ +int FTS_BackgroundTask (FTS_index_t * ftx, int docs) { + FTS_real_index * ix; + int dleft,i; + CTX cold, cnew; + int cd; + uint64_t newterm,oldhan,han; + ZSTR *zold, *znew; + ix = (FTS_real_index *)ftx; + dleft=docs; + cd=0; + + znew=ZStrCons(100); + if(znew==NULL) return 1; + zold=ZStrCons(100); + if(zold==NULL) + { + ZStrDest(znew); + return 1; + } + + while(dleft>0) + { + uint64_t numDeletions; + + assert(ix->_ix3KKey < (ix->_index3)->kmax); + + numDeletions = 0; + i=ZStrTuberRead(ix->_index3,ix->_ix3KKey,zold); + if(i==2) + { + cd=1; + break; + } + if(i==0) + { + ZStrCxClear(&zcdoc,&cold); + ZStrCxClear(&zcdoc,&cnew); + ZStrClear(znew); + oldhan=0; + newterm=0; + while(1) + { + han=ZStrCxDec(zold,&zcdoc,&cold); + if(han==oldhan) { + break; + } + oldhan=han; + dleft--; + if(ix->_handlesFree[han]==0) + { + i=ZStrCxEnc(znew,&zcdoc,&cnew,han); + if(i!=0) { + ix->_ix3KKey = 0; + ZStrDest(znew); + ZStrDest(zold); + return 1; + } + newterm=han; + } + else { + // something was deleted + ++numDeletions; + } + } + + if (numDeletions > 0) { + // update existing entry in tuber + // but only if there's something to update + + i=ZStrCxEnc(znew,&zcdoc,&cnew,newterm); + if(i!=0) { + ix->_ix3KKey = 0; + ZStrDest(znew); + ZStrDest(zold); + return 1; + } + ZStrNormalize(znew); + i=ZStrTuberUpdate(ix->_index3,ix->_ix3KKey,znew); + } + + if(i!=0) { + ix->_ix3KKey = 0; + ZStrDest(znew); + ZStrDest(zold); + return i; + } + } + ix->_ix3KKey++; + if(ix->_ix3KKey >= (ix->_index3)->kmax) + { + ix->_ix3KKey = 0; + cd=3; // finished iterating over all document handles + break; + } + } + + ZStrDest(znew); + ZStrDest(zold); + return cd; } //////////////////////////////////////////////////////////////////////////////// @@ -1423,34 +1589,53 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, uint64_t word[2 * (MAX_WORD_LENGTH + SPACING)]; uint64_t ndocs = 0; - zstr2 = ZStrCons(10); /* from index-2 tuber */ + // initialise + dc = NULL; + TRI_set_errno(TRI_ERROR_NO_ERROR); + + zstr2 = ZStrCons(10); /* from index-2 tuber */ if (zstr2 == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + return NULL; } - zstr3 = ZStrCons(10); /* from index-3 tuber */ + zstr3 = ZStrCons(10); /* from index-3 tuber */ if (zstr3 == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrDest(zstr2); + return NULL; } zstra1 = ZStrCons(10); /* current list of documents */ if (zstra1 == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrDest(zstr3); + ZStrDest(zstr2); + return NULL; } zstra2 = ZStrCons(10); /* new list of documents */ if (zstra2 == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrDest(zstra1); + ZStrDest(zstr3); + ZStrDest(zstr2); + return NULL; } - zstr = ZStrCons(4); /* work zstr from stex */ + zstr = ZStrCons(4); /* work zstr from stex */ if (zstr == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrDest(zstra2); + ZStrDest(zstra1); + ZStrDest(zstr3); + ZStrDest(zstr2); + return NULL; } - + ix = (FTS_real_index*) ftx; -/* - for each term in the query */ + // for each term in the query for (queryterm = 0; queryterm < query->_len; queryterm++) { if (query->_localOptions[queryterm] == FTS_MATCH_SUBSTRING && ix->_options != FTS_INDEX_SUBSTRINGS) { @@ -1512,7 +1697,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, } if (ix->_handlesFree[newhan] == 0) { if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; } lasthan = newhan; ndocs++; @@ -1541,7 +1727,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, if (newhan == nhand1) { if (ix->_handlesFree[newhan] == 0) { if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; } lasthan = newhan; ndocs++; @@ -1561,9 +1748,12 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, } } } + if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, lasthan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; } + ZStrNormalize(zstra2); ztemp = zstra1; zstra1 = zstra2; @@ -1579,7 +1769,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, // make STEX to contain new list of handles dochan = ZStrSTCons(2); if (dochan == NULL) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; } FillWordBuffer(&word[MAX_WORD_LENGTH + SPACING], query->_texts[queryterm]); @@ -1607,7 +1798,10 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, break; } // call routine to recursively put handles to STEX - Ix1Recurs(dochan, ix, kkey, word + MAX_WORD_LENGTH + SPACING); + if (Ix1Recurs(dochan, ix, kkey, word + MAX_WORD_LENGTH + SPACING) != TRI_ERROR_NO_ERROR) { + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; + } } ZStrSTSort(dochan); @@ -1625,13 +1819,17 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, uint64_t newhan; if (ZStrInsert(zstr, docpt, 2) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } newhan = ZStrDec(zstr, &zcdh); docpt += ZStrExtLen(docpt, 2); if (ix->_handlesFree[newhan] == 0) { if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } lasthan = newhan; ndocs++; @@ -1652,7 +1850,9 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, nhand1 = ZStrCxDec(zstra1, &zcdoc, &ctxa1); if (ZStrInsert(zstr, docpt, 2) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } newhan = ZStrDec(zstr, &zcdh); docpt += ZStrExtLen(docpt, 2); @@ -1667,8 +1867,11 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, if (newhan == nhand1) { if (ix->_handlesFree[newhan] == 0) { if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } + lasthan = newhan; ndocs++; } @@ -1676,8 +1879,11 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, break; } if (ZStrInsert(zstr, docpt, 2) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } + newhan = ZStrDec(zstr, &zcdh); docpt += ZStrExtLen(docpt, 2); odocs--; @@ -1693,7 +1899,9 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, break; } if (ZStrInsert(zstr, docpt, 2) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + ZStrSTDest(dochan); + goto oom; } newhan = ZStrDec(zstr, &zcdh); docpt += ZStrExtLen(docpt, 2); @@ -1702,7 +1910,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, } } if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, lasthan) != 0) { - // TODO: out of memory + TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY); + goto oom; } ZStrNormalize(zstra2); ztemp = zstra1; @@ -1737,6 +1946,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx, } } } + +oom: ZStrDest(zstra1); ZStrDest(zstra2); diff --git a/arangod/FulltextIndex/fulltext-index.h b/arangod/FulltextIndex/fulltext-index.h index 2f334d05b6..c58ddcd68d 100644 --- a/arangod/FulltextIndex/fulltext-index.h +++ b/arangod/FulltextIndex/fulltext-index.h @@ -116,13 +116,14 @@ FTS_texts_t; /// @brief determine the health of the index //////////////////////////////////////////////////////////////////////////////// -int FTS_HealthIndex (FTS_index_t*, uint64_t sizes[4]); +int FTS_HealthIndex (FTS_index_t*, uint64_t[4]); //////////////////////////////////////////////////////////////////////////////// /// @brief clone an existing index //////////////////////////////////////////////////////////////////////////////// FTS_index_t* FTS_CloneIndex (FTS_index_t*, + FTS_document_id_t, uint64_t[4]); //////////////////////////////////////////////////////////////////////////////// @@ -159,6 +160,13 @@ int FTS_DeleteDocument (FTS_index_t*, FTS_document_id_t); int FTS_UpdateDocument (FTS_index_t*, FTS_document_id_t); +//////////////////////////////////////////////////////////////////////////////// +/// @brief read index3 and remove handles of unused documents. stop after docs +/// deletions. the bc struct can be used to do an incremental scan & cleanup +//////////////////////////////////////////////////////////////////////////////// + +int FTS_BackgroundTask (FTS_index_t* ftx, int docs); + //////////////////////////////////////////////////////////////////////////////// /// @brief perform a search in the index //////////////////////////////////////////////////////////////////////////////// diff --git a/arangod/FulltextIndex/zstr.c b/arangod/FulltextIndex/zstr.c index a8b34c227c..63aa356753 100644 --- a/arangod/FulltextIndex/zstr.c +++ b/arangod/FulltextIndex/zstr.c @@ -952,10 +952,11 @@ void ZStrTuberStats(TUBER * t, uint64_t * stats) d2=100-d2; if(d2>d1) d1=d2; d2=((t->wct*8)*(d1+1))/50; + /* printf("fuse %d freebit %d freekey %d kmax %d wct %d lenlen %d\n", (int)t->fuses, (int)t->freebit, (int)t->freekey, (int)t->kmax, (int)t->wct, (int)t->lenlen); - +*/ if(d2<72*t->fuses)d2=72*t->fuses; stats[0]=d1; stats[1]=d2; diff --git a/arangod/V8Server/v8-vocbase.cpp b/arangod/V8Server/v8-vocbase.cpp index 787946946c..21f27c6ac8 100755 --- a/arangod/V8Server/v8-vocbase.cpp +++ b/arangod/V8Server/v8-vocbase.cpp @@ -61,6 +61,7 @@ #include "VocBase/general-cursor.h" #include "VocBase/document-collection.h" #include "VocBase/edge-collection.h" +#include "VocBase/fulltext-query.h" #include "VocBase/key-generator.h" #include "VocBase/voc-shaper.h" #include "v8.h" @@ -614,8 +615,8 @@ static v8::Handle EnsureFulltextIndex (v8::Arguments const& argv, const bool create) { v8::HandleScope scope; - if (argv.Length() != 1 && argv.Length() != 2) { - return scope.Close(v8::ThrowException(TRI_CreateErrorObject(TRI_ERROR_ILLEGAL_OPTION, "usage: ensureFulltext(, )"))); + if (argv.Length() < 1 || argv.Length() > 3) { + return scope.Close(v8::ThrowException(TRI_CreateErrorObject(TRI_ERROR_ILLEGAL_OPTION, "usage: ensureFulltext(, , )"))); } string attributeName = TRI_ObjectToString(argv[0]); @@ -624,10 +625,15 @@ static v8::Handle EnsureFulltextIndex (v8::Arguments const& argv, } bool indexSubstrings = false; - if (argv.Length() == 2) { + if (argv.Length() > 1) { indexSubstrings = TRI_ObjectToBoolean(argv[1]); } + int minWordLength = TRI_FULLTEXT_WORDLENGTH_DEFAULT; + if (argv.Length() == 3) { + minWordLength = (int) TRI_ObjectToInt64(argv[2]); + } + // ............................................................................. // Check that we have a valid collection // ............................................................................. @@ -661,14 +667,14 @@ static v8::Handle EnsureFulltextIndex (v8::Arguments const& argv, TRI_document_collection_t* document = (TRI_document_collection_t*) primary; if (create) { - idx = TRI_EnsureFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, &created); + idx = TRI_EnsureFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, minWordLength, &created); if (idx == 0) { res = TRI_errno(); } } else { - idx = TRI_LookupFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings); + idx = TRI_LookupFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, minWordLength); } if (idx == 0) { diff --git a/arangod/VocBase/document-collection.c b/arangod/VocBase/document-collection.c index 55df406981..0aceabb73c 100644 --- a/arangod/VocBase/document-collection.c +++ b/arangod/VocBase/document-collection.c @@ -34,6 +34,7 @@ #include "BasicsC/strings.h" #include "ShapedJson/shape-accessor.h" #include "VocBase/edge-collection.h" +#include "VocBase/fulltext-query.h" #include "VocBase/index.h" #include "VocBase/key-generator.h" #include "VocBase/voc-shaper.h" @@ -4091,7 +4092,8 @@ TRI_index_t* TRI_EnsureSkiplistIndexDocumentCollection (TRI_document_collection_ static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collection_t* document, const char* attributeName, - const bool indexSubstrings) { + const bool indexSubstrings, + int minWordLength) { size_t i; assert(attributeName); @@ -4107,6 +4109,10 @@ static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collecti continue; } + if (fulltext->_minWordLength != minWordLength) { + continue; + } + if (fulltext->base._fields._length != 1) { continue; } @@ -4129,6 +4135,7 @@ static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collecti static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collection_t* document, const char* attributeName, const bool indexSubstrings, + int minWordLength, TRI_idx_iid_t iid, bool* created) { TRI_index_t* idx; @@ -4140,7 +4147,7 @@ static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collecti // a new one. // ........................................................................... - idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings); + idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength); if (idx != NULL) { LOG_TRACE("fulltext-index already created"); @@ -4151,7 +4158,7 @@ static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collecti } // Create the fulltext index - idx = TRI_CreateFulltextIndex(&document->base, attributeName, indexSubstrings); + idx = TRI_CreateFulltextIndex(&document->base, attributeName, indexSubstrings, minWordLength); // If index id given, use it otherwise use the default. if (iid) { @@ -4187,9 +4194,11 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document, TRI_json_t* attribute; TRI_json_t* fld; TRI_json_t* indexSubstrings; + TRI_json_t* minWordLength; char* attributeName; size_t fieldCount; bool doIndexSubstrings; + int minWordLengthValue; // extract fields fld = ExtractFields(definition, &fieldCount, iid); @@ -4214,13 +4223,19 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document, if (indexSubstrings != NULL && indexSubstrings->_type == TRI_JSON_BOOLEAN) { doIndexSubstrings = indexSubstrings->_value._boolean; } + + minWordLength = TRI_LookupArrayJson(definition, "minLength"); + minWordLengthValue = TRI_FULLTEXT_WORDLENGTH_DEFAULT; + if (minWordLength != NULL && minWordLength->_type == TRI_JSON_NUMBER) { + minWordLengthValue = (int) minWordLength->_value._number; + } // create the index - idx = LookupFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings); + idx = LookupFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, minWordLengthValue); if (idx == NULL) { bool created; - idx = CreateFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, iid, &created); + idx = CreateFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, minWordLengthValue, iid, &created); } if (idx == NULL) { @@ -4250,7 +4265,8 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document, TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_t* document, const char* attributeName, - const bool indexSubstrings) { + const bool indexSubstrings, + int minWordLength) { TRI_index_t* idx; TRI_primary_collection_t* primary; @@ -4262,7 +4278,7 @@ TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_ TRI_READ_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary); - idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings); + idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength); TRI_READ_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary); @@ -4280,6 +4296,7 @@ TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_ TRI_index_t* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_t* document, const char* attributeName, const bool indexSubstrings, + int minWordLength, bool* created) { TRI_index_t* idx; TRI_primary_collection_t* primary; @@ -4292,7 +4309,7 @@ TRI_index_t* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_ TRI_WRITE_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary); - idx = CreateFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, 0, created); + idx = CreateFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength, 0, created); TRI_WRITE_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary); diff --git a/arangod/VocBase/document-collection.h b/arangod/VocBase/document-collection.h index 3096108b05..e58ff378ed 100644 --- a/arangod/VocBase/document-collection.h +++ b/arangod/VocBase/document-collection.h @@ -553,7 +553,8 @@ struct TRI_index_s* TRI_EnsureSkiplistIndexDocumentCollection (TRI_document_coll struct TRI_index_s* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_t*, const char*, - const bool); + const bool, + int); //////////////////////////////////////////////////////////////////////////////// /// @brief ensures that a fulltext index exists @@ -562,6 +563,7 @@ struct TRI_index_s* TRI_LookupFulltextIndexDocumentCollection (TRI_document_coll struct TRI_index_s* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_t*, const char*, const bool, + int, bool*); //////////////////////////////////////////////////////////////////////////////// diff --git a/arangod/VocBase/fulltext-query.h b/arangod/VocBase/fulltext-query.h index c89346a19e..e5482519f2 100644 --- a/arangod/VocBase/fulltext-query.h +++ b/arangod/VocBase/fulltext-query.h @@ -35,6 +35,25 @@ extern "C" { #endif +// ----------------------------------------------------------------------------- +// --SECTION-- public defines +// ----------------------------------------------------------------------------- + +//////////////////////////////////////////////////////////////////////////////// +/// @addtogroup VocBase +/// @{ +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +/// @brief default minimum word length for a fulltext index +//////////////////////////////////////////////////////////////////////////////// + +#define TRI_FULLTEXT_WORDLENGTH_DEFAULT (2) + +//////////////////////////////////////////////////////////////////////////////// +/// @} +//////////////////////////////////////////////////////////////////////////////// + // ----------------------------------------------------------------------------- // --SECTION-- public functions // ----------------------------------------------------------------------------- diff --git a/arangod/VocBase/index.c b/arangod/VocBase/index.c index 4d191ddfc8..948105317a 100644 --- a/arangod/VocBase/index.c +++ b/arangod/VocBase/index.c @@ -4115,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document, } // parse the document text - words = TRI_get_words(text, textLength, 2, true); + words = TRI_get_words(text, textLength, (uint8_t) fulltextIndex->_minWordLength, true); if (words == NULL) { return NULL; } @@ -4149,23 +4149,31 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document, } //////////////////////////////////////////////////////////////////////////////// -/// @brief rebuilds a fulltext index by resizing it and re-adding documents +/// @brief creates a new fulltext index with the properties of an existing one, +/// but with adjusted (potentially bigger) sizes. The documents from the old +/// index will be added into the new index. +/// doc will not be re-inserted into the new index. It's the caller's +/// responsibility to add it later. This prevents duplicate document entries +/// in case document insertion has failed at a certain place. In this case, doc +/// might have been in the old index already, and copying the old index and +/// inserting doc again will lead to duplicates. So we exclude doc when copying +/// the old documents and make it the caller's responsibility to add doc later /// the caller must have write-locked the index //////////////////////////////////////////////////////////////////////////////// -static int ResizeFulltextIndex (TRI_index_t* idx) { +static int ResizeFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) { TRI_fulltext_index_t* fulltextIndex; FTS_index_t* newIndex; uint64_t sizes[4]; - LOG_INFO("fulltext index resize was triggered"); + LOG_DEBUG("resizing fulltext index"); fulltextIndex = (TRI_fulltext_index_t*) idx; // this call will populate the sizes array FTS_HealthIndex(fulltextIndex->_fulltextIndex, sizes); - newIndex = FTS_CloneIndex(fulltextIndex->_fulltextIndex, sizes); + newIndex = FTS_CloneIndex(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc), sizes); if (newIndex == NULL) { return TRI_ERROR_OUT_OF_MEMORY; @@ -4191,16 +4199,22 @@ static int InsertFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) { LOG_WARNING("internal error in InsertFulltextIndex"); return TRI_ERROR_INTERNAL; } - + TRI_WriteLockReadWriteLock(&fulltextIndex->_lock); res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc)); - + if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) { - // rebuild the index with adjusted (bigger) size - res = ResizeFulltextIndex(idx); + // rebuild the index with adjusted (bigger) sizes + res = ResizeFulltextIndex(idx, doc); if (res == TRI_ERROR_NO_ERROR) { // insert the document again because previous insert failed res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc)); + if (res != TRI_ERROR_NO_ERROR) { + LOG_ERROR("adding document to fulltext index failed: %s", TRI_errno_string(res)); + } + } + else { + LOG_ERROR("resizing fulltext index failed: %s", TRI_errno_string(res)); } } @@ -4245,6 +4259,7 @@ static TRI_json_t* JsonFulltextIndex (TRI_index_t* idx, TRI_primary_collection_t TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "unique", TRI_CreateBooleanJson(TRI_UNKNOWN_MEM_ZONE, idx->_unique)); TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "type", TRI_CreateStringCopyJson(TRI_UNKNOWN_MEM_ZONE, "fulltext")); TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "indexSubstrings", TRI_CreateBooleanJson(TRI_UNKNOWN_MEM_ZONE, fulltextIndex->_indexSubstrings)); + TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "minWordLength", TRI_CreateNumberJson(TRI_UNKNOWN_MEM_ZONE, (double) fulltextIndex->_minWordLength)); TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "fields", fields); return json; @@ -4273,11 +4288,7 @@ static int RemoveFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) { if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) { // rebuild the index with adjusted (bigger) size - res = ResizeFulltextIndex(idx); - if (res == TRI_ERROR_NO_ERROR) { - // delete the document again because previous delete failed - res = FTS_DeleteDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc)); - } + res = ResizeFulltextIndex(idx, doc); } TRI_WriteUnlockReadWriteLock(&fulltextIndex->_lock); @@ -4304,10 +4315,10 @@ static int UpdateFulltextIndex (TRI_index_t* idx, if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) { // rebuild the index with adjusted (bigger) size - res = ResizeFulltextIndex(idx); + res = ResizeFulltextIndex(idx, newDoc); if (res == TRI_ERROR_NO_ERROR) { - // update the document again because previous update failed - res = FTS_UpdateDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) newDoc)); + // insert just the new version of the document + res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) newDoc)); } } @@ -4322,7 +4333,33 @@ static int UpdateFulltextIndex (TRI_index_t* idx, //////////////////////////////////////////////////////////////////////////////// static int CleanupFulltextIndex (TRI_index_t* idx) { - LOG_DEBUG("fulltext cleanup called"); + TRI_fulltext_index_t* fulltextIndex; + int res; + + LOG_TRACE("fulltext cleanup called"); + + fulltextIndex = (TRI_fulltext_index_t*) idx; + + TRI_WriteLockReadWriteLock(&fulltextIndex->_lock); + + while (1) { + // this will scan 100.000 document/word pairs at a time + // TODO: check if this number is reasonable + res = FTS_BackgroundTask(fulltextIndex->_fulltextIndex, 100000); + // 0 = ok, but unfinished + // 1 = oom + // 2 = needs resize + // 3 = finished + if (res == 3) { + // finished cleaning + break; + } + // TODO: maybe we want to clean more + break; + } + TRI_WriteUnlockReadWriteLock(&fulltextIndex->_lock); + + LOG_TRACE("finished cleaning up"); return TRI_ERROR_NO_ERROR; } @@ -4346,7 +4383,8 @@ static int CleanupFulltextIndex (TRI_index_t* idx) { TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collection, const char* attributeName, - const bool indexSubstrings) { + const bool indexSubstrings, + int minWordLength) { TRI_fulltext_index_t* fulltextIndex; FTS_index_t* fts; TRI_shaper_t* shaper; @@ -4354,8 +4392,8 @@ TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collectio TRI_shape_pid_t attribute; int options; // default sizes for index. TODO: adjust these - //uint64_t sizes[4] = { 20050, 100000, 570000, 10000000 }; - uint64_t sizes[4] = { 50, 1000, 5700, 10000 }; + //uint64_t sizes[4] = { 50, 100000, 5000, 1000 }; + uint64_t sizes[4] = { 500, 1000000, 50000, 10000 }; // look up the attribute shaper = collection->_shaper; @@ -4403,6 +4441,7 @@ TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collectio fulltextIndex->_fulltextIndex = fts; fulltextIndex->_indexSubstrings = indexSubstrings; fulltextIndex->_attribute = attribute; + fulltextIndex->_minWordLength = minWordLength; TRI_InitVectorString(&fulltextIndex->base._fields, TRI_UNKNOWN_MEM_ZONE); TRI_PushBackVectorString(&fulltextIndex->base._fields, copy); diff --git a/arangod/VocBase/index.h b/arangod/VocBase/index.h index 4326997403..3ac5a75d4e 100644 --- a/arangod/VocBase/index.h +++ b/arangod/VocBase/index.h @@ -235,6 +235,7 @@ typedef struct TRI_fulltext_index_s { FTS_index_t* _fulltextIndex; TRI_shape_pid_t _attribute; TRI_read_write_lock_t _lock; + int _minWordLength; bool _indexSubstrings; } @@ -683,7 +684,8 @@ struct TRI_doc_mptr_s** TRI_LookupFulltextIndex (TRI_index_t*, const char* query TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s*, const char*, - const bool); + const bool, + int); //////////////////////////////////////////////////////////////////////////////// /// @brief frees the memory allocated, but does not free the pointer diff --git a/js/server/tests/fulltext.js b/js/server/tests/fulltext.js index bbc0a1fe74..8f09c72dc2 100644 --- a/js/server/tests/fulltext.js +++ b/js/server/tests/fulltext.js @@ -384,7 +384,7 @@ function fulltextQuerySuite () { assertEqual(0, collection.FULLTEXT(idx, "no,cats,allowed").documents.length); assertEqual(0, collection.FULLTEXT(idx, "banana").documents.length); }, -/* + //////////////////////////////////////////////////////////////////////////////// /// @brief test duplicate entries //////////////////////////////////////////////////////////////////////////////// @@ -407,7 +407,7 @@ function fulltextQuerySuite () { //////////////////////////////////////////////////////////////////////////////// /// @brief test duplicate entries //////////////////////////////////////////////////////////////////////////////// - + testDuplicatesDocuments: function () { var text1 = "this is a short document text"; var text2 = "Some longer document text is put in here just to validate whats going on"; @@ -423,7 +423,7 @@ function fulltextQuerySuite () { assertEqual(10000, collection.FULLTEXT(idx, "some").documents.length); assertEqual(0, collection.FULLTEXT(idx, "banana").documents.length); }, -*/ + //////////////////////////////////////////////////////////////////////////////// /// @brief test similar entries //////////////////////////////////////////////////////////////////////////////// @@ -806,7 +806,6 @@ function fulltextQuerySuite () { } }, -/* //////////////////////////////////////////////////////////////////////////////// /// @brief 4 byte sequences //////////////////////////////////////////////////////////////////////////////// @@ -847,6 +846,7 @@ function fulltextQuerySuite () { assertEqual(1, collection.FULLTEXT(idx, "타이승려들은,수호사원으로서").documents.length); assertEqual(1, collection.FULLTEXT(idx, "prefix:타이승려,prefix:수호사원으").documents.length); assertEqual(1, collection.FULLTEXT(idx, "prefix:조상숭배로").documents.length); + /* require("console").log(4); assertEqual(1, collection.FULLTEXT(idx, "教材教辅考试").documents.length); // "图书简介 亚马逊图书,中国最大的网上书店。拥有文学,经济管理,少儿,人文社科,生活,艺术,科技,进口原版,期刊杂志等大类,教材教辅考试,历史,国学古籍,法律,军事,宗教,心理学,哲学,健康与养生,旅游与地图,娱乐,两性婚恋,时尚,家居休闲,孕产育儿,文学,小说,传记,青春与动漫绘本,家庭百科,外语,工具书,教育,心理励志,心灵读物,建筑,计算机与网络,科学与自然等数十小类共计300多万种中外图书 @@ -880,8 +880,8 @@ function fulltextQuerySuite () { assertEqual(1, collection.FULLTEXT(idx, "síðu,rættar,ritstjórni").documents.length); require("console").log(73); assertEqual(1, collection.FULLTEXT(idx, "prefix:læt").documents.length); + */ } - */ }; }; diff --git a/lib/Basics/Utf8Helper.cpp b/lib/Basics/Utf8Helper.cpp index 6a7ee0466c..8eb2f65c8a 100644 --- a/lib/Basics/Utf8Helper.cpp +++ b/lib/Basics/Utf8Helper.cpp @@ -250,6 +250,9 @@ char* Utf8Helper::tolower (TRI_memory_zone_t* zone, const char *src, int32_t src } else { utf8_dest = (char*) TRI_Allocate(zone, (srcLength+1) * sizeof(char), false); + if (utf8_dest == 0) { + return 0; + } dstLength = ucasemap_utf8ToLower(csm.getAlias(), utf8_dest, @@ -285,7 +288,9 @@ char* Utf8Helper::tolower (TRI_memory_zone_t* zone, const char *src, int32_t src #endif utf8_dest = TRI_LowerAsciiStringZ(zone, src); - dstLength = strlen(utf8_dest); + if (utf8_dest != 0) { + dstLength = strlen(utf8_dest); + } return utf8_dest; } @@ -371,14 +376,16 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src #endif utf8_dest = TRI_UpperAsciiStringZ(zone, src); - dstLength = strlen(utf8_dest); + if (utf8_dest != NULL) { + dstLength = strlen(utf8_dest); + } return utf8_dest; } TRI_vector_string_t* Utf8Helper::getWords (const char* const text, - const size_t textLength, - uint8_t minimalLength, - bool lowerCase) { + const size_t textLength, + uint8_t minimalLength, + bool lowerCase) { TRI_vector_string_t* words; UErrorCode status = U_ZERO_ERROR; UnicodeString word; @@ -401,17 +408,29 @@ TRI_vector_string_t* Utf8Helper::getWords (const char* const text, // lower case string int32_t lowerLength = 0; char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength); - - if (lowerLength == 0) { + + if (lower == NULL) { + // out of memory TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); return NULL; } + + if (lowerLength == 0) { + TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower); + TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words); + return NULL; + } + textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length); TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower); } else { textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length); } + + if (textUtf16 == NULL) { + return NULL; + } ULocDataLocaleType type = ULOC_VALID_LOCALE; const Locale& locale = _coll->getLocale(type, status); @@ -437,7 +456,9 @@ TRI_vector_string_t* Utf8Helper::getWords (const char* const text, if (tempUtf16Length >= minimalLength) { utext.extractBetween(start, end, tempUtf16, 0); utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, tempUtf16Length, &utf8WordLength); - TRI_PushBackVectorString(words, utf8Word); + if (utf8Word != 0) { + TRI_PushBackVectorString(words, utf8Word); + } } }