1
0
Fork 0

cleanup of fulltext indexes, resizing etc.

This commit is contained in:
Jan Steemann 2012-12-07 18:00:01 +01:00
parent 7261465139
commit cbaa57a1bd
11 changed files with 480 additions and 154 deletions

View File

@ -75,19 +75,30 @@ extern ZCOD zcdh;
/// @brief maximum number of Unicode characters for an indexed word
////////////////////////////////////////////////////////////////////////////////
#define MAX_WORD_LENGTH (40)
#define MAX_WORD_LENGTH (40)
////////////////////////////////////////////////////////////////////////////////
/// @brief gap between two words in a temporary search buffer
////////////////////////////////////////////////////////////////////////////////
#define SPACING (10)
#define SPACING (10)
////////////////////////////////////////////////////////////////////////////////
/// @brief maximum tolerable occupancy of the index in percent (e.g. 75 %)
////////////////////////////////////////////////////////////////////////////////
#define HEALTH_THRESHOLD (75)
#define HEALTH_THRESHOLD (75)
////////////////////////////////////////////////////////////////////////////////
/// @brief index extra growth factor
/// if 1.0, the index will be resized to the values originally suggested. As
/// resizing is expensive, one might want to decrease the overall number of
/// resizings. This can be done by setting this number to a value bigger than
/// 1.0
/// TODO: find a good default value for this
////////////////////////////////////////////////////////////////////////////////
#define EXTRA_GROWTH_FACTOR (1.0)
////////////////////////////////////////////////////////////////////////////////
/// @brief the actual index struct used
@ -104,9 +115,10 @@ typedef struct {
TUBER* _index1;
TUBER* _index2;
TUBER* _index3;
uint64_t _ix3KKey;
int64_t _maxDocuments;
int64_t _numDocuments;
uint64_t _maxDocuments;
uint64_t _numDocuments;
FTS_texts_t* (*getTexts)(FTS_document_id_t, void*);
void (*freeWordlist)(FTS_texts_t*);
@ -168,6 +180,20 @@ static uint64_t GetUnicode (uint8_t** ptr) {
return 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief map a non-zero zstr status code to a TRI_ERROR_* code
///
/// the zstr value 2 is reported as TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE, every
/// other value is reported as TRI_ERROR_OUT_OF_MEMORY. must not be called with
/// a zstr status of 0
////////////////////////////////////////////////////////////////////////////////
static int TranslateZStrErrorCode (int zstrErrorCode) {
  // out of memory is the answer for everything except the one special value
  int translated = TRI_ERROR_OUT_OF_MEMORY;

  // callers are expected to pass failures only
  assert(zstrErrorCode != 0);

  if (zstrErrorCode == 2) {
    translated = TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE;
  }

  return translated;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief add a document to the index
////////////////////////////////////////////////////////////////////////////////
@ -195,6 +221,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
uint64_t bkey = 0;
uint64_t docb, dock;
int res;
int res2;
ix = (FTS_real_index*) ftx;
@ -202,6 +229,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
handle = ix->_firstFree;
if (handle == 0) {
// no more document handles free
LOG_ERROR("fail on %d", __LINE__);
return TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE;
}
@ -227,7 +255,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
// check for out of memory
if (zstrwl == NULL || zstr2a == NULL || zstr2b == NULL || x3zstr == NULL || x3zstrb == NULL) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
@ -243,7 +271,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
unicode = GetUnicode(&utf);
while (unicode != 0) {
if (ZStrEnc(zstrwl, &zcutf, unicode) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
@ -256,13 +284,13 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
// terminate the word and insert into STEX
if (ZStrEnc(zstrwl, &zcutf, 0) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
ZStrNormalize(zstrwl);
if (ZStrSTAppend(stex, zstrwl) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
@ -279,7 +307,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
for (wdx = 0; wdx < nowords; wdx++) {
// get it out as a word
if (ZStrInsert(zstrwl, wpt, 2) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
@ -307,7 +335,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
// get the Z-string for the index-2 entry before this letter
i = ZStrTuberRead(ix->_index2, kkey[j], zstr2a);
if (i == 1) {
res = TRI_ERROR_INTERNAL; // TODO: check with Richard
res = TRI_ERROR_INTERNAL;
goto oom;
}
@ -335,6 +363,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
if (newlet != tran) {
// if not there, create a new index-2 entry for it
bkey = ZStrTuberIns(ix->_index2, kkey[j], tran);
// TODO: check bkey for INSFAIL
kkey[j + 1] = ZStrTuberK(ix->_index2, kkey[j], tran, bkey);
// update old index-2 entry to insert new letter
ZStrCxClear(&zcdelt, &ctx2a);
@ -344,14 +373,14 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
ZStrClear(zstr2b);
x64 = ZStrBitsOut(zstr2a, 1);
if (ZStrBitsIn(x64, 1, zstr2b) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (x64 == 1) {
// copy over the B-key into index 3
docb = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b, &zcbky, docb) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
@ -364,27 +393,27 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
break;
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
x64 = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b, &zcbky, x64) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (ZStrEnc(zstr2b, &zcbky, bkey) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (newlet == oldlet) {
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
@ -392,24 +421,25 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
while (newlet != oldlet) {
oldlet = newlet;
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
x64 = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b, &zcbky, x64) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
newlet = ZStrCxDec(zstr2a, &zcdelt, &ctx2a);
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
ZStrNormalize(zstr2b);
if (ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res2 = ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b);
if (res2 != 0) {
res = TranslateZStrErrorCode(res2);
goto oom;
}
}
@ -424,7 +454,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
// so read the zstr from index2
i = ZStrTuberRead(ix->_index2, kkey[j], zstr2a);
if (i == 1) {
res = TRI_ERROR_INTERNAL; // TODO: check with Richard
res = TRI_ERROR_INTERNAL;
goto oom;
}
// is there already an index-3 entry available?
@ -435,6 +465,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
}
else {
docb = ZStrTuberIns(ix->_index3, kkey[j], 0);
// TODO: check docb
// put it into index 2
ZStrCxClear(&zcdelt, &ctx2a);
ZStrCxClear(&zcdelt, &ctx2b);
@ -443,12 +474,12 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
ZStrClear(zstr2b);
x64 = ZStrBitsOut(zstr2a, 1);
if (ZStrBitsIn(1, 1, zstr2b) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (ZStrEnc(zstr2b, &zcbky, docb) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
@ -461,18 +492,19 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
x64 = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b,&zcbky, x64) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
ZStrNormalize(zstr2b);
if (ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res2 = ZStrTuberUpdate(ix->_index2, kkey[j], zstr2b);
if (res2 != 0) {
res = TranslateZStrErrorCode(res2);
goto oom;
}
}
@ -481,7 +513,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
i = ZStrTuberRead(ix->_index3, dock, x3zstr);
ZStrClear(x3zstrb);
if (i == 1) {
res = TRI_ERROR_INTERNAL; // TODO: check with Richard
res = TRI_ERROR_INTERNAL;
goto oom;
}
@ -496,37 +528,38 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
}
if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, handle) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (newhan == oldhan) {
if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, handle) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
else {
if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
while (newhan != oldhan) {
oldhan = newhan;
newhan = ZStrCxDec(x3zstr, &zcdoc, &x3ctx);
if (ZStrCxEnc(x3zstrb, &zcdoc, &x3ctxb, newhan) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
}
ZStrNormalize(x3zstrb);
if (ZStrTuberUpdate(ix->_index3, dock, x3zstrb) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res2 = ZStrTuberUpdate(ix->_index3, dock, x3zstrb);
if (res2 != 0) {
res = TranslateZStrErrorCode(res2);
goto oom;
}
@ -545,7 +578,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
tran = ZStrXlate(&zcutf, ixlet[j2]);
i = ZStrTuberRead(ix->_index1, kkey1[j2 + 1], zstr2a);
if (i == 1) {
res = TRI_ERROR_INTERNAL; // TODO: check with Richard
res = TRI_ERROR_INTERNAL;
goto oom;
}
// look to see if the letter is there
@ -565,6 +598,7 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
if (newlet != tran) {
// if not there, create a new index-1 entry for it
bkey = ZStrTuberIns(ix->_index1, kkey1[j2 + 1], tran);
// TODO: check bkey
kkey1[j2] = ZStrTuberK(ix->_index1, kkey1[j2 + 1], tran, bkey);
// update old index-1 entry to insert new letter
ZStrCxClear(&zcdelt, &ctx2a);
@ -580,26 +614,26 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
break;
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
x64 = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b, &zcbky, x64) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (ZStrEnc(zstr2b, &zcbky, bkey) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
if (newlet == oldlet) {
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, tran) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
@ -607,24 +641,25 @@ int RealAddDocument (FTS_index_t* ftx, FTS_document_id_t docid, FTS_texts_t* raw
while (newlet != oldlet) {
oldlet = newlet;
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
x64 = ZStrDec(zstr2a, &zcbky);
if (ZStrEnc(zstr2b, &zcbky, x64) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
newlet = ZStrCxDec(zstr2a, &zcdelt, &ctx2a);
}
if (ZStrCxEnc(zstr2b, &zcdelt, &ctx2b, newlet) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res = TRI_ERROR_OUT_OF_MEMORY;
goto oom;
}
}
ZStrNormalize(zstr2b);
if (ZStrTuberUpdate(ix->_index1, kkey1[j2 + 1], zstr2b) != 0) {
LOG_INFO("oom triggered in %d", __LINE__); res = TRI_ERROR_OUT_OF_MEMORY;
res2 = ZStrTuberUpdate(ix->_index1, kkey1[j2 + 1], zstr2b);
if (res2 != 0) {
res = TranslateZStrErrorCode(res2);
goto oom;
}
}
@ -685,11 +720,15 @@ static int RealDeleteDocument (FTS_index_t* ftx, FTS_document_id_t docid) {
}
if (i > ix->_lastSlot) {
LOG_ERROR("fail on %d", __LINE__);
return TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE;
}
ix->_handlesFree[i] = 1;
ix->_numDocuments--;
if (ix->_numDocuments > 0) {
// should never underflow
ix->_numDocuments--;
}
return TRI_ERROR_NO_ERROR;
}
@ -946,11 +985,17 @@ static void Ix2Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk2) {
/// @brief index recursion, prefix matching
////////////////////////////////////////////////////////////////////////////////
static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t* wd) {
static int Ix1Recurs (STEX* dochan,
FTS_real_index* ix,
uint64_t kk1,
uint64_t* wd) {
ZSTR* zstr;
CTX ctx;
uint64_t newlet;
uint64_t kk2;
int res;
res = TRI_ERROR_NO_ERROR;
kk2 = FindKKey2(ix,wd);
@ -961,13 +1006,11 @@ static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t*
// index 1 entry for this prefix
zstr = ZStrCons(10);
if (zstr == NULL) {
// TODO: out of memory
return TRI_ERROR_OUT_OF_MEMORY;
}
if (ZStrTuberRead(ix->_index1, kk1, zstr) == 1) {
// TODO: make this return an error instead
printf("recursion failed to read kk1\n");
exit(1);
return TRI_ERROR_INTERNAL;
}
ZStrCxClear(&zcdelt, &ctx);
@ -986,10 +1029,16 @@ static void Ix1Recurs (STEX* dochan, FTS_real_index* ix, uint64_t kk1, uint64_t*
bkey = ZStrDec(zstr, &zcbky);
newkk1 = ZStrTuberK(ix->_index1, kk1, newlet, bkey);
*(wd - 1) = newlet;
Ix1Recurs(dochan, ix, newkk1, wd - 1);
res = Ix1Recurs(dochan, ix, newkk1, wd - 1);
if (res != TRI_ERROR_NO_ERROR) {
return res;
}
}
ZStrDest(zstr);
return res;
}
////////////////////////////////////////////////////////////////////////////////
@ -1072,15 +1121,9 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) {
ix = (FTS_real_index*) ftx;
health = (ix->_numDocuments * 100) / ix->_maxDocuments;
stats[0] = (health * (ix->_numDocuments + 5)) / 50;
if (stats[0] < 5) {
stats[0] = 5;
}
if (ix->_options == FTS_INDEX_SUBSTRINGS) {
ZStrTuberStats(ix->_index1, st);
// LOG_TRACE("index 1 health %d size %d", (int) st[0], (int) st[1]);
stats[1] = st[1];
if (health < st[0]) {
health = st[0];
@ -1091,19 +1134,30 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) {
}
ZStrTuberStats(ix->_index2, st);
// LOG_TRACE("index 2 health %d size %d", (int) st[0], (int) st[1]);
stats[2] = st[1];
if (health < st[0]) {
health = st[0];
}
ZStrTuberStats(ix->_index3, st);
// LOG_TRACE("index 3 health %d size %d", (int) st[0], (int) st[1]);
stats[3] = st[1];
if (health < st[0]) {
health = st[0];
}
stats[0] = (health * (ix->_numDocuments + 5)) / 50;
if (stats[0] < (ix->_numDocuments + 5)) {
stats[0] = (ix->_numDocuments + 5);
}
if (EXTRA_GROWTH_FACTOR > 1.0) {
size_t i;
for (i = 0; i < 4; ++i) {
stats[i] = (uint64_t) ((double) stats[i] * (double) EXTRA_GROWTH_FACTOR);
}
}
return (int) health;
}
@ -1115,6 +1169,7 @@ int FTS_HealthIndex (FTS_index_t* ftx, uint64_t* stats) {
////////////////////////////////////////////////////////////////////////////////
FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx,
FTS_document_id_t excludeDocument,
uint64_t sizes[4]) {
FTS_real_index* old;
FTS_index_t* clone;
@ -1126,15 +1181,24 @@ FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx,
if (clone != NULL) {
// copy documents
FTS_document_id_t i;
uint64_t count = 0;
for (i = 1; i <= old->_lastSlot; i++) {
FTS_document_id_t found;
int res;
if (old->_handlesFree[i] == 1) {
// document is marked as deleted
continue;
}
res = FTS_AddDocument(clone, old->_handles[i]);
found = old->_handles[i];
if (found == excludeDocument) {
// do not insert this document, because the caller will insert it later
continue;
}
res = FTS_AddDocument(clone, found);
if (res != TRI_ERROR_NO_ERROR && res != TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) {
// if resize fails, everything's ruined
LOG_ERROR("resizing the fulltext index failed with %d, sizes were: %llu %llu %llu %llu",
@ -1147,7 +1211,11 @@ FTS_index_t* FTS_CloneIndex (FTS_index_t* ftx,
FTS_FreeIndex(clone);
return NULL;
}
++count;
}
LOG_DEBUG("cloned %llu documents", (unsigned long long) count);
}
return clone;
@ -1170,11 +1238,11 @@ FTS_index_t* FTS_CreateIndex (void* context,
FTS_real_index* ix;
int i;
LOG_INFO("creating fulltext index with sizes %llu %llu %llu %llu",
(unsigned long long) sizes[0],
(unsigned long long) sizes[1],
(unsigned long long) sizes[2],
(unsigned long long) sizes[3]);
LOG_TRACE("creating fulltext index with sizes %llu %llu %llu %llu",
(unsigned long long) sizes[0],
(unsigned long long) sizes[1],
(unsigned long long) sizes[2],
(unsigned long long) sizes[3]);
ix = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(FTS_real_index), false);
if (ix == NULL) {
@ -1194,10 +1262,11 @@ FTS_index_t* FTS_CreateIndex (void* context,
return NULL;
}
ix->_maxDocuments = (int64_t) sizes[0];
ix->_maxDocuments = sizes[0];
ix->_numDocuments = 0;
ix->_context = context;
ix->_options = options;
ix->_ix3KKey = 0;
// wordlists retrieval function
ix->getTexts = getTexts;
@ -1335,11 +1404,11 @@ int FTS_AddDocument (FTS_index_t* ftx, FTS_document_id_t docid) {
health = FTS_HealthIndex(ftx, sizes);
if (health > HEALTH_THRESHOLD || res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) {
LOG_INFO("Add document: health exceeds threshold. suggested sizes are: %llu %llu %llu %llu",
(unsigned long long) sizes[0],
(unsigned long long) sizes[1],
(unsigned long long) sizes[2],
(unsigned long long) sizes[3]);
LOG_TRACE("fulltext index health threshold exceeded. new suggested sizes are: %llu %llu %llu %llu",
(unsigned long long) sizes[0],
(unsigned long long) sizes[1],
(unsigned long long) sizes[2],
(unsigned long long) sizes[3]);
res = TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE;
}
@ -1392,13 +1461,110 @@ int FTS_UpdateDocument (FTS_index_t* ftx, FTS_document_id_t docid) {
}
////////////////////////////////////////////////////////////////////////////////
/// @brief current not called. TODO: find out what its intention is
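/// @brief read index3 and remove handles of unused documents. stops once about
/// docs document handles have been examined (the budget is only checked between
/// index3 entries). the scan & cleanup is incremental: the scan position is
/// kept in the index between calls
/// the caller must have write-locked the index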
////////////////////////////////////////////////////////////////////////////////
void FTS_BackgroundTask (FTS_index_t* ftx) {
/* obtain LOCKMAIN */
/* remove deleted handles from index3 not done QQQ */
/* release LOCKMAIN */
int FTS_BackgroundTask (FTS_index_t * ftx, int docs) {
// Incremental cleanup pass over index3.
//
// Starting at the key stored in ix->_ix3KKey, each index3 entry is read, its
// list of document handles is decoded, and a new list is built that contains
// only the handles whose _handlesFree flag is 0. The entry is written back only
// if at least one handle was dropped.
//
// docs is a work budget: it is decremented once for every handle decoded and
// is checked only between entries, so slightly more than docs handles may be
// examined.
//
// Return values as produced by the code below:
//   0  the budget was used up before the end of index3 was reached
//   1  a work string could not be allocated, an encode call failed, or
//      ZStrTuberRead returned 2
//   3  the key cursor reached kmax and was reset to 0 (full pass finished)
//   a non-zero result of ZStrTuberUpdate is returned unchanged
// On every error return taken after the scan has started, the cursor
// ix->_ix3KKey is reset to 0.
FTS_real_index * ix;
int dleft,i;
CTX cold, cnew;
int cd;
uint64_t newterm,oldhan,han;
ZSTR *zold, *znew;
ix = (FTS_real_index *)ftx;
// remaining handle budget for this call
dleft=docs;
// result code returned when the loop ends without an early return
cd=0;
// znew receives the filtered handle list, zold the entry as currently stored
znew=ZStrCons(100);
if(znew==NULL) return 1;
zold=ZStrCons(100);
if(zold==NULL)
{
ZStrDest(znew);
return 1;
}
while(dleft>0)
{
uint64_t numDeletions;
assert(ix->_ix3KKey < (ix->_index3)->kmax);
numDeletions = 0;
i=ZStrTuberRead(ix->_index3,ix->_ix3KKey,zold);
// NOTE(review): the meaning of the ZStrTuberRead result codes is not visible
// here; 2 aborts the pass, 0 processes the entry, anything else skips the key
if(i==2)
{
cd=1;
break;
}
if(i==0)
{
ZStrCxClear(&zcdoc,&cold);
ZStrCxClear(&zcdoc,&cnew);
ZStrClear(znew);
oldhan=0;
// newterm tracks the last handle kept; it stays 0 if no handle is kept
newterm=0;
while(1)
{
han=ZStrCxDec(zold,&zcdoc,&cold);
// a decoded value equal to the previous one marks the end of the list
if(han==oldhan) {
break;
}
oldhan=han;
dleft--;
if(ix->_handlesFree[han]==0)
{
// handle is still in use: carry it over into the new list
i=ZStrCxEnc(znew,&zcdoc,&cnew,han);
if(i!=0) {
ix->_ix3KKey = 0;
ZStrDest(znew);
ZStrDest(zold);
return 1;
}
newterm=han;
}
else {
// something was deleted
++numDeletions;
}
}
if (numDeletions > 0) {
// update existing entry in tuber
// but only if there's something to update
// the last kept handle is encoded a second time to terminate the new list
i=ZStrCxEnc(znew,&zcdoc,&cnew,newterm);
if(i!=0) {
ix->_ix3KKey = 0;
ZStrDest(znew);
ZStrDest(zold);
return 1;
}
ZStrNormalize(znew);
i=ZStrTuberUpdate(ix->_index3,ix->_ix3KKey,znew);
}
// i can only be non-zero here if ZStrTuberUpdate failed; failed encodes
// have already returned above
if(i!=0) {
ix->_ix3KKey = 0;
ZStrDest(znew);
ZStrDest(zold);
return i;
}
}
// advance the persistent cursor; wrap around when the last key was handled
ix->_ix3KKey++;
if(ix->_ix3KKey >= (ix->_index3)->kmax)
{
ix->_ix3KKey = 0;
cd=3; // finished iterating over all document handles
break;
}
}
ZStrDest(znew);
ZStrDest(zold);
return cd;
}
////////////////////////////////////////////////////////////////////////////////
@ -1423,34 +1589,53 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
uint64_t word[2 * (MAX_WORD_LENGTH + SPACING)];
uint64_t ndocs = 0;
zstr2 = ZStrCons(10); /* from index-2 tuber */
// initialise
dc = NULL;
TRI_set_errno(TRI_ERROR_NO_ERROR);
zstr2 = ZStrCons(10); /* from index-2 tuber */
if (zstr2 == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
return NULL;
}
zstr3 = ZStrCons(10); /* from index-3 tuber */
zstr3 = ZStrCons(10); /* from index-3 tuber */
if (zstr3 == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrDest(zstr2);
return NULL;
}
zstra1 = ZStrCons(10); /* current list of documents */
if (zstra1 == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrDest(zstr3);
ZStrDest(zstr2);
return NULL;
}
zstra2 = ZStrCons(10); /* new list of documents */
if (zstra2 == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrDest(zstra1);
ZStrDest(zstr3);
ZStrDest(zstr2);
return NULL;
}
zstr = ZStrCons(4); /* work zstr from stex */
zstr = ZStrCons(4); /* work zstr from stex */
if (zstr == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrDest(zstra2);
ZStrDest(zstra1);
ZStrDest(zstr3);
ZStrDest(zstr2);
return NULL;
}
ix = (FTS_real_index*) ftx;
/* - for each term in the query */
// for each term in the query
for (queryterm = 0; queryterm < query->_len; queryterm++) {
if (query->_localOptions[queryterm] == FTS_MATCH_SUBSTRING &&
ix->_options != FTS_INDEX_SUBSTRINGS) {
@ -1512,7 +1697,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
}
if (ix->_handlesFree[newhan] == 0) {
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
lasthan = newhan;
ndocs++;
@ -1541,7 +1727,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
if (newhan == nhand1) {
if (ix->_handlesFree[newhan] == 0) {
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
lasthan = newhan;
ndocs++;
@ -1561,9 +1748,12 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
}
}
}
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, lasthan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
ZStrNormalize(zstra2);
ztemp = zstra1;
zstra1 = zstra2;
@ -1579,7 +1769,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
// make STEX to contain new list of handles
dochan = ZStrSTCons(2);
if (dochan == NULL) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
FillWordBuffer(&word[MAX_WORD_LENGTH + SPACING], query->_texts[queryterm]);
@ -1607,7 +1798,10 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
break;
}
// call routine to recursively put handles to STEX
Ix1Recurs(dochan, ix, kkey, word + MAX_WORD_LENGTH + SPACING);
if (Ix1Recurs(dochan, ix, kkey, word + MAX_WORD_LENGTH + SPACING) != TRI_ERROR_NO_ERROR) {
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
}
ZStrSTSort(dochan);
@ -1625,13 +1819,17 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
uint64_t newhan;
if (ZStrInsert(zstr, docpt, 2) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
newhan = ZStrDec(zstr, &zcdh);
docpt += ZStrExtLen(docpt, 2);
if (ix->_handlesFree[newhan] == 0) {
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
lasthan = newhan;
ndocs++;
@ -1652,7 +1850,9 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
nhand1 = ZStrCxDec(zstra1, &zcdoc, &ctxa1);
if (ZStrInsert(zstr, docpt, 2) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
newhan = ZStrDec(zstr, &zcdh);
docpt += ZStrExtLen(docpt, 2);
@ -1667,8 +1867,11 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
if (newhan == nhand1) {
if (ix->_handlesFree[newhan] == 0) {
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, newhan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
lasthan = newhan;
ndocs++;
}
@ -1676,8 +1879,11 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
break;
}
if (ZStrInsert(zstr, docpt, 2) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
newhan = ZStrDec(zstr, &zcdh);
docpt += ZStrExtLen(docpt, 2);
odocs--;
@ -1693,7 +1899,9 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
break;
}
if (ZStrInsert(zstr, docpt, 2) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
ZStrSTDest(dochan);
goto oom;
}
newhan = ZStrDec(zstr, &zcdh);
docpt += ZStrExtLen(docpt, 2);
@ -1702,7 +1910,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
}
}
if (ZStrCxEnc(zstra2, &zcdoc, &ctxa2, lasthan) != 0) {
// TODO: out of memory
TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY);
goto oom;
}
ZStrNormalize(zstra2);
ztemp = zstra1;
@ -1737,6 +1946,8 @@ FTS_document_ids_t* FTS_FindDocuments (FTS_index_t* ftx,
}
}
}
oom:
ZStrDest(zstra1);
ZStrDest(zstra2);

View File

@ -116,13 +116,14 @@ FTS_texts_t;
/// @brief determine the health of the index
////////////////////////////////////////////////////////////////////////////////
int FTS_HealthIndex (FTS_index_t*, uint64_t sizes[4]);
int FTS_HealthIndex (FTS_index_t*, uint64_t[4]);
////////////////////////////////////////////////////////////////////////////////
/// @brief clone an existing index
////////////////////////////////////////////////////////////////////////////////
FTS_index_t* FTS_CloneIndex (FTS_index_t*,
FTS_document_id_t,
uint64_t[4]);
////////////////////////////////////////////////////////////////////////////////
@ -159,6 +160,13 @@ int FTS_DeleteDocument (FTS_index_t*, FTS_document_id_t);
int FTS_UpdateDocument (FTS_index_t*, FTS_document_id_t);
////////////////////////////////////////////////////////////////////////////////
/// @brief read index3 and remove handles of unused documents. stops once about
/// docs document handles have been examined. the scan & cleanup is incremental
////////////////////////////////////////////////////////////////////////////////
int FTS_BackgroundTask (FTS_index_t* ftx, int docs);
////////////////////////////////////////////////////////////////////////////////
/// @brief perform a search in the index
////////////////////////////////////////////////////////////////////////////////

View File

@ -952,10 +952,11 @@ void ZStrTuberStats(TUBER * t, uint64_t * stats)
d2=100-d2;
if(d2>d1) d1=d2;
d2=((t->wct*8)*(d1+1))/50;
/*
printf("fuse %d freebit %d freekey %d kmax %d wct %d lenlen %d\n",
(int)t->fuses, (int)t->freebit, (int)t->freekey, (int)t->kmax,
(int)t->wct, (int)t->lenlen);
*/
if(d2<72*t->fuses)d2=72*t->fuses;
stats[0]=d1;
stats[1]=d2;

View File

@ -61,6 +61,7 @@
#include "VocBase/general-cursor.h"
#include "VocBase/document-collection.h"
#include "VocBase/edge-collection.h"
#include "VocBase/fulltext-query.h"
#include "VocBase/key-generator.h"
#include "VocBase/voc-shaper.h"
#include "v8.h"
@ -614,8 +615,8 @@ static v8::Handle<v8::Value> EnsureFulltextIndex (v8::Arguments const& argv,
const bool create) {
v8::HandleScope scope;
if (argv.Length() != 1 && argv.Length() != 2) {
return scope.Close(v8::ThrowException(TRI_CreateErrorObject(TRI_ERROR_ILLEGAL_OPTION, "usage: ensureFulltext(<attribute>, <indexSubstrings>)")));
if (argv.Length() < 1 || argv.Length() > 3) {
return scope.Close(v8::ThrowException(TRI_CreateErrorObject(TRI_ERROR_ILLEGAL_OPTION, "usage: ensureFulltext(<attribute>, <indexSubstrings>, <minWordLength>)")));
}
string attributeName = TRI_ObjectToString(argv[0]);
@ -624,10 +625,15 @@ static v8::Handle<v8::Value> EnsureFulltextIndex (v8::Arguments const& argv,
}
bool indexSubstrings = false;
if (argv.Length() == 2) {
if (argv.Length() > 1) {
indexSubstrings = TRI_ObjectToBoolean(argv[1]);
}
int minWordLength = TRI_FULLTEXT_WORDLENGTH_DEFAULT;
if (argv.Length() == 3) {
minWordLength = (int) TRI_ObjectToInt64(argv[2]);
}
// .............................................................................
// Check that we have a valid collection
// .............................................................................
@ -661,14 +667,14 @@ static v8::Handle<v8::Value> EnsureFulltextIndex (v8::Arguments const& argv,
TRI_document_collection_t* document = (TRI_document_collection_t*) primary;
if (create) {
idx = TRI_EnsureFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, &created);
idx = TRI_EnsureFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, minWordLength, &created);
if (idx == 0) {
res = TRI_errno();
}
}
else {
idx = TRI_LookupFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings);
idx = TRI_LookupFulltextIndexDocumentCollection(document, attributeName.c_str(), indexSubstrings, minWordLength);
}
if (idx == 0) {

View File

@ -34,6 +34,7 @@
#include "BasicsC/strings.h"
#include "ShapedJson/shape-accessor.h"
#include "VocBase/edge-collection.h"
#include "VocBase/fulltext-query.h"
#include "VocBase/index.h"
#include "VocBase/key-generator.h"
#include "VocBase/voc-shaper.h"
@ -4091,7 +4092,8 @@ TRI_index_t* TRI_EnsureSkiplistIndexDocumentCollection (TRI_document_collection_
static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collection_t* document,
const char* attributeName,
const bool indexSubstrings) {
const bool indexSubstrings,
int minWordLength) {
size_t i;
assert(attributeName);
@ -4107,6 +4109,10 @@ static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collecti
continue;
}
if (fulltext->_minWordLength != minWordLength) {
continue;
}
if (fulltext->base._fields._length != 1) {
continue;
}
@ -4129,6 +4135,7 @@ static TRI_index_t* LookupFulltextIndexDocumentCollection (TRI_document_collecti
static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collection_t* document,
const char* attributeName,
const bool indexSubstrings,
int minWordLength,
TRI_idx_iid_t iid,
bool* created) {
TRI_index_t* idx;
@ -4140,7 +4147,7 @@ static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collecti
// a new one.
// ...........................................................................
idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings);
idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength);
if (idx != NULL) {
LOG_TRACE("fulltext-index already created");
@ -4151,7 +4158,7 @@ static TRI_index_t* CreateFulltextIndexDocumentCollection (TRI_document_collecti
}
// Create the fulltext index
idx = TRI_CreateFulltextIndex(&document->base, attributeName, indexSubstrings);
idx = TRI_CreateFulltextIndex(&document->base, attributeName, indexSubstrings, minWordLength);
// If index id given, use it otherwise use the default.
if (iid) {
@ -4187,9 +4194,11 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document,
TRI_json_t* attribute;
TRI_json_t* fld;
TRI_json_t* indexSubstrings;
TRI_json_t* minWordLength;
char* attributeName;
size_t fieldCount;
bool doIndexSubstrings;
int minWordLengthValue;
// extract fields
fld = ExtractFields(definition, &fieldCount, iid);
@ -4214,13 +4223,19 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document,
if (indexSubstrings != NULL && indexSubstrings->_type == TRI_JSON_BOOLEAN) {
doIndexSubstrings = indexSubstrings->_value._boolean;
}
// JsonFulltextIndex() serializes this setting under the key "minWordLength",
// so that key must be read here; otherwise a persisted value is never found
// and the index silently falls back to the default when it is restored.
// "minLength" is still accepted as a fallback so that definitions using the
// previously read key keep working.
minWordLength = TRI_LookupArrayJson(definition, "minWordLength");
if (minWordLength == NULL) {
  minWordLength = TRI_LookupArrayJson(definition, "minLength");
}
// use the default unless the definition carries a numeric value
minWordLengthValue = TRI_FULLTEXT_WORDLENGTH_DEFAULT;
if (minWordLength != NULL && minWordLength->_type == TRI_JSON_NUMBER) {
  minWordLengthValue = (int) minWordLength->_value._number;
}
// create the index
idx = LookupFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings);
idx = LookupFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, minWordLengthValue);
if (idx == NULL) {
bool created;
idx = CreateFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, iid, &created);
idx = CreateFulltextIndexDocumentCollection(document, attributeName, doIndexSubstrings, minWordLengthValue, iid, &created);
}
if (idx == NULL) {
@ -4250,7 +4265,8 @@ static int FulltextIndexFromJson (TRI_document_collection_t* document,
TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_t* document,
const char* attributeName,
const bool indexSubstrings) {
const bool indexSubstrings,
int minWordLength) {
TRI_index_t* idx;
TRI_primary_collection_t* primary;
@ -4262,7 +4278,7 @@ TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_
TRI_READ_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary);
idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings);
idx = LookupFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength);
TRI_READ_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary);
@ -4280,6 +4296,7 @@ TRI_index_t* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_
TRI_index_t* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_t* document,
const char* attributeName,
const bool indexSubstrings,
int minWordLength,
bool* created) {
TRI_index_t* idx;
TRI_primary_collection_t* primary;
@ -4292,7 +4309,7 @@ TRI_index_t* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_
TRI_WRITE_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary);
idx = CreateFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, 0, created);
idx = CreateFulltextIndexDocumentCollection(document, attributeName, indexSubstrings, minWordLength, 0, created);
TRI_WRITE_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(primary);

View File

@ -553,7 +553,8 @@ struct TRI_index_s* TRI_EnsureSkiplistIndexDocumentCollection (TRI_document_coll
struct TRI_index_s* TRI_LookupFulltextIndexDocumentCollection (TRI_document_collection_t*,
const char*,
const bool);
const bool,
int);
////////////////////////////////////////////////////////////////////////////////
/// @brief ensures that a fulltext index exists
@ -562,6 +563,7 @@ struct TRI_index_s* TRI_LookupFulltextIndexDocumentCollection (TRI_document_coll
struct TRI_index_s* TRI_EnsureFulltextIndexDocumentCollection (TRI_document_collection_t*,
const char*,
const bool,
int,
bool*);
////////////////////////////////////////////////////////////////////////////////

View File

@ -35,6 +35,25 @@
extern "C" {
#endif
// -----------------------------------------------------------------------------
// --SECTION-- public defines
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup VocBase
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief default minimum word length for a fulltext index
////////////////////////////////////////////////////////////////////////////////
#define TRI_FULLTEXT_WORDLENGTH_DEFAULT (2)
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
// -----------------------------------------------------------------------------
// --SECTION-- public functions
// -----------------------------------------------------------------------------

View File

@ -4115,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
}
// parse the document text
words = TRI_get_words(text, textLength, 2, true);
words = TRI_get_words(text, textLength, (uint8_t) fulltextIndex->_minWordLength, true);
if (words == NULL) {
return NULL;
}
@ -4149,23 +4149,31 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
}
////////////////////////////////////////////////////////////////////////////////
/// @brief rebuilds a fulltext index by resizing it and re-adding documents
/// @brief creates a new fulltext index with the properties of an existing one,
/// but with adjusted (potentially bigger) sizes. The documents from the old
/// index will be added to the new index.
/// The document passed as doc will not be re-inserted into the new index; it
/// is the caller's responsibility to add it later. This prevents duplicate
/// document entries in case the insertion of doc failed partway through. In
/// that case, doc might already have been in the old index, and copying the
/// old index and then inserting doc again would lead to duplicates. So we
/// exclude doc when copying the old documents and make it the caller's
/// responsibility to add doc later.
/// The caller must have write-locked the index.
////////////////////////////////////////////////////////////////////////////////
static int ResizeFulltextIndex (TRI_index_t* idx) {
static int ResizeFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) {
TRI_fulltext_index_t* fulltextIndex;
FTS_index_t* newIndex;
uint64_t sizes[4];
LOG_INFO("fulltext index resize was triggered");
LOG_DEBUG("resizing fulltext index");
fulltextIndex = (TRI_fulltext_index_t*) idx;
// this call will populate the sizes array
FTS_HealthIndex(fulltextIndex->_fulltextIndex, sizes);
newIndex = FTS_CloneIndex(fulltextIndex->_fulltextIndex, sizes);
newIndex = FTS_CloneIndex(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc), sizes);
if (newIndex == NULL) {
return TRI_ERROR_OUT_OF_MEMORY;
@ -4191,16 +4199,22 @@ static int InsertFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) {
LOG_WARNING("internal error in InsertFulltextIndex");
return TRI_ERROR_INTERNAL;
}
TRI_WriteLockReadWriteLock(&fulltextIndex->_lock);
res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc));
if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) {
// rebuild the index with adjusted (bigger) size
res = ResizeFulltextIndex(idx);
// rebuild the index with adjusted (bigger) sizes
res = ResizeFulltextIndex(idx, doc);
if (res == TRI_ERROR_NO_ERROR) {
// insert the document again because previous insert failed
res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc));
if (res != TRI_ERROR_NO_ERROR) {
LOG_ERROR("adding document to fulltext index failed: %s", TRI_errno_string(res));
}
}
else {
LOG_ERROR("resizing fulltext index failed: %s", TRI_errno_string(res));
}
}
@ -4245,6 +4259,7 @@ static TRI_json_t* JsonFulltextIndex (TRI_index_t* idx, TRI_primary_collection_t
TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "unique", TRI_CreateBooleanJson(TRI_UNKNOWN_MEM_ZONE, idx->_unique));
TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "type", TRI_CreateStringCopyJson(TRI_UNKNOWN_MEM_ZONE, "fulltext"));
TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "indexSubstrings", TRI_CreateBooleanJson(TRI_UNKNOWN_MEM_ZONE, fulltextIndex->_indexSubstrings));
TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "minWordLength", TRI_CreateNumberJson(TRI_UNKNOWN_MEM_ZONE, (double) fulltextIndex->_minWordLength));
TRI_Insert3ArrayJson(TRI_UNKNOWN_MEM_ZONE, json, "fields", fields);
return json;
@ -4273,11 +4288,7 @@ static int RemoveFulltextIndex (TRI_index_t* idx, TRI_doc_mptr_t const* doc) {
if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) {
// rebuild the index with adjusted (bigger) size
res = ResizeFulltextIndex(idx);
if (res == TRI_ERROR_NO_ERROR) {
// delete the document again because previous delete failed
res = FTS_DeleteDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) doc));
}
res = ResizeFulltextIndex(idx, doc);
}
TRI_WriteUnlockReadWriteLock(&fulltextIndex->_lock);
@ -4304,10 +4315,10 @@ static int UpdateFulltextIndex (TRI_index_t* idx,
if (res == TRI_ERROR_ARANGO_INDEX_NEEDS_RESIZE) {
// rebuild the index with adjusted (bigger) size
res = ResizeFulltextIndex(idx);
res = ResizeFulltextIndex(idx, newDoc);
if (res == TRI_ERROR_NO_ERROR) {
// update the document again because previous update failed
res = FTS_UpdateDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) newDoc));
// insert just the new version of the document
res = FTS_AddDocument(fulltextIndex->_fulltextIndex, (FTS_document_id_t) ((intptr_t) newDoc));
}
}
@ -4322,7 +4333,33 @@ static int UpdateFulltextIndex (TRI_index_t* idx,
////////////////////////////////////////////////////////////////////////////////
static int CleanupFulltextIndex (TRI_index_t* idx) {
// Runs one cleanup pass on the fulltext index behind idx while holding the
// index's write lock. Always returns TRI_ERROR_NO_ERROR, regardless of the
// result reported by FTS_BackgroundTask().
LOG_DEBUG("fulltext cleanup called");
TRI_fulltext_index_t* fulltextIndex;
int res;
LOG_TRACE("fulltext cleanup called");
fulltextIndex = (TRI_fulltext_index_t*) idx;
// cleanup is performed under the write lock of the index
TRI_WriteLockReadWriteLock(&fulltextIndex->_lock);
// NOTE: both paths through the loop body end in a break, so the loop
// currently executes exactly once (a single FTS_BackgroundTask() call)
while (1) {
// this will scan 100,000 document/word pairs at a time
// TODO: check if this number is reasonable
res = FTS_BackgroundTask(fulltextIndex->_fulltextIndex, 100000);
// result codes of FTS_BackgroundTask():
// 0 = ok, but unfinished
// 1 = oom
// 2 = needs resize
// 3 = finished
// NOTE(review): codes 1 (oom) and 2 (needs resize) are not acted upon
// here and are not propagated to the caller
if (res == 3) {
// finished cleaning
break;
}
// TODO: maybe we want to clean more
// (for now, stop after one batch even if cleaning is unfinished)
break;
}
TRI_WriteUnlockReadWriteLock(&fulltextIndex->_lock);
LOG_TRACE("finished cleaning up");
return TRI_ERROR_NO_ERROR;
}
@ -4346,7 +4383,8 @@ static int CleanupFulltextIndex (TRI_index_t* idx) {
TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collection,
const char* attributeName,
const bool indexSubstrings) {
const bool indexSubstrings,
int minWordLength) {
TRI_fulltext_index_t* fulltextIndex;
FTS_index_t* fts;
TRI_shaper_t* shaper;
@ -4354,8 +4392,8 @@ TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collectio
TRI_shape_pid_t attribute;
int options;
// default sizes for index. TODO: adjust these
//uint64_t sizes[4] = { 20050, 100000, 570000, 10000000 };
uint64_t sizes[4] = { 50, 1000, 5700, 10000 };
//uint64_t sizes[4] = { 50, 100000, 5000, 1000 };
uint64_t sizes[4] = { 500, 1000000, 50000, 10000 };
// look up the attribute
shaper = collection->_shaper;
@ -4403,6 +4441,7 @@ TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s* collectio
fulltextIndex->_fulltextIndex = fts;
fulltextIndex->_indexSubstrings = indexSubstrings;
fulltextIndex->_attribute = attribute;
fulltextIndex->_minWordLength = minWordLength;
TRI_InitVectorString(&fulltextIndex->base._fields, TRI_UNKNOWN_MEM_ZONE);
TRI_PushBackVectorString(&fulltextIndex->base._fields, copy);

View File

@ -235,6 +235,7 @@ typedef struct TRI_fulltext_index_s {
FTS_index_t* _fulltextIndex;
TRI_shape_pid_t _attribute;
TRI_read_write_lock_t _lock;
int _minWordLength;
bool _indexSubstrings;
}
@ -683,7 +684,8 @@ struct TRI_doc_mptr_s** TRI_LookupFulltextIndex (TRI_index_t*, const char* query
TRI_index_t* TRI_CreateFulltextIndex (struct TRI_primary_collection_s*,
const char*,
const bool);
const bool,
int);
////////////////////////////////////////////////////////////////////////////////
/// @brief frees the memory allocated, but does not free the pointer

View File

@ -384,7 +384,7 @@ function fulltextQuerySuite () {
assertEqual(0, collection.FULLTEXT(idx, "no,cats,allowed").documents.length);
assertEqual(0, collection.FULLTEXT(idx, "banana").documents.length);
},
/*
////////////////////////////////////////////////////////////////////////////////
/// @brief test duplicate entries
////////////////////////////////////////////////////////////////////////////////
@ -407,7 +407,7 @@ function fulltextQuerySuite () {
////////////////////////////////////////////////////////////////////////////////
/// @brief test duplicate entries
////////////////////////////////////////////////////////////////////////////////
testDuplicatesDocuments: function () {
var text1 = "this is a short document text";
var text2 = "Some longer document text is put in here just to validate whats going on";
@ -423,7 +423,7 @@ function fulltextQuerySuite () {
assertEqual(10000, collection.FULLTEXT(idx, "some").documents.length);
assertEqual(0, collection.FULLTEXT(idx, "banana").documents.length);
},
*/
////////////////////////////////////////////////////////////////////////////////
/// @brief test similar entries
////////////////////////////////////////////////////////////////////////////////
@ -806,7 +806,6 @@ function fulltextQuerySuite () {
}
},
/*
////////////////////////////////////////////////////////////////////////////////
/// @brief 4 byte sequences
////////////////////////////////////////////////////////////////////////////////
@ -847,6 +846,7 @@ function fulltextQuerySuite () {
assertEqual(1, collection.FULLTEXT(idx, "타이승려들은,수호사원으로서").documents.length);
assertEqual(1, collection.FULLTEXT(idx, "prefix:타이승려,prefix:수호사원으").documents.length);
assertEqual(1, collection.FULLTEXT(idx, "prefix:조상숭배로").documents.length);
/*
require("console").log(4);
assertEqual(1, collection.FULLTEXT(idx, "教材教辅考试").documents.length);
// "图书简介 亚马逊图书,中国最大的网上书店。拥有文学,经济管理,少儿,人文社科,生活,艺术,科技,进口原版,期刊杂志等大类,教材教辅考试,历史国学古籍法律军事宗教心理学哲学健康与养生旅游与地图娱乐两性婚恋时尚家居休闲孕产育儿文学小说传记青春与动漫绘本家庭百科外语工具书教育心理励志心灵读物建筑计算机与网络科学与自然等数十小类共计300多万种中外图书
@ -880,8 +880,8 @@ function fulltextQuerySuite () {
assertEqual(1, collection.FULLTEXT(idx, "síðu,rættar,ritstjórni").documents.length);
require("console").log(73);
assertEqual(1, collection.FULLTEXT(idx, "prefix:læt").documents.length);
*/
}
*/
};
};

View File

@ -250,6 +250,9 @@ char* Utf8Helper::tolower (TRI_memory_zone_t* zone, const char *src, int32_t src
}
else {
utf8_dest = (char*) TRI_Allocate(zone, (srcLength+1) * sizeof(char), false);
if (utf8_dest == 0) {
return 0;
}
dstLength = ucasemap_utf8ToLower(csm.getAlias(),
utf8_dest,
@ -285,7 +288,9 @@ char* Utf8Helper::tolower (TRI_memory_zone_t* zone, const char *src, int32_t src
#endif
utf8_dest = TRI_LowerAsciiStringZ(zone, src);
dstLength = strlen(utf8_dest);
if (utf8_dest != 0) {
dstLength = strlen(utf8_dest);
}
return utf8_dest;
}
@ -371,14 +376,16 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src
#endif
utf8_dest = TRI_UpperAsciiStringZ(zone, src);
dstLength = strlen(utf8_dest);
if (utf8_dest != NULL) {
dstLength = strlen(utf8_dest);
}
return utf8_dest;
}
TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
const size_t textLength,
uint8_t minimalLength,
bool lowerCase) {
const size_t textLength,
uint8_t minimalLength,
bool lowerCase) {
TRI_vector_string_t* words;
UErrorCode status = U_ZERO_ERROR;
UnicodeString word;
@ -401,17 +408,29 @@ TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
// lower case string
int32_t lowerLength = 0;
char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);
if (lowerLength == 0) {
if (lower == NULL) {
// out of memory
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
return NULL;
}
if (lowerLength == 0) {
TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
return NULL;
}
textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
}
else {
textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
}
if (textUtf16 == NULL) {
  // UTF-8 to UTF-16 conversion failed: release the result vector before
  // bailing out, as the other early-exit paths above do, so it is not leaked
  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
  return NULL;
}
ULocDataLocaleType type = ULOC_VALID_LOCALE;
const Locale& locale = _coll->getLocale(type, status);
@ -437,7 +456,9 @@ TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
if (tempUtf16Length >= minimalLength) {
utext.extractBetween(start, end, tempUtf16, 0);
utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, tempUtf16Length, &utf8WordLength);
TRI_PushBackVectorString(words, utf8Word);
if (utf8Word != 0) {
TRI_PushBackVectorString(words, utf8Word);
}
}
}