diff --git a/arangod/FulltextIndex/FTS_index.c b/arangod/FulltextIndex/FTS_index.c new file mode 100644 index 0000000000..b016c03aca --- /dev/null +++ b/arangod/FulltextIndex/FTS_index.c @@ -0,0 +1,1080 @@ +/* ftsindex.c - The Full Text Search */ +/* R. A. Parker 24.10.2012 */ + +#include +#include +#include +#include "avodoc.h" +#include "zstr.h" +#include "FTS_index.h" + +/* codes - in zcode.c so need externs here */ +extern ZCOD zcutf; +extern ZCOD zcbky; +extern ZCOD zcdelt; +extern ZCOD zcdoc; +extern ZCOD zckk; +extern ZCOD zcdh; + +typedef struct +{ +/* first the read/write lock for the index - needs to go here */ + int readwritelock; /* certainly NOT an int! */ + FTS_collection_id_t colid; /* collection ID for this index */ + FTS_document_id_t *handles; /* array converting handles to docid */ + uint8_t * handsfree; + FTS_document_id_t firstfree; /* Start of handle free chain. */ + FTS_document_id_t lastslot; + int options; + TUBER * index1; + TUBER * index2; + TUBER * index3; +} FTS_real_index; + +/* Get a unicode character from utf-8 */ + +uint64_t getunicode(uint8_t ** ptr) +{ + uint64_t c1; + c1=**ptr; + if(c1<128) + { + (*ptr)++; + return c1; + } + if(c1<224) + { + c1=((c1-192)<<6)+(*((*ptr)+1)-128); + (*ptr)+=2; + return c1; + } + if(c1<240) + { + c1=((c1-224)<<12)+((*((*ptr)+1)-128)<<6) + +(*((*ptr)+2)-128); + (*ptr)+=3; + return c1; + } + if(c1<248) + { + c1=((c1-240)<<18)+((*((*ptr)+1)-128)<<12) + +((*((*ptr)+2)-128)<<6) + +(*((*ptr)+3)-128); + (*ptr)+=4; + return c1; + } + return 0; +} + +FTS_index_t * FTS_CreateIndex(FTS_collection_id_t coll, + uint64_t options, uint64_t sizes[10]) +/* sizes[0] = size of handles table to start with */ +/* sizes[1] = number of bytes for index 1 */ +/* sizes[2] = number of bytes for index 2 */ +/* sizes[3] = number of bytes for index 3 */ + +{ + FTS_real_index * ix; + uint64_t bk; + int i; + ix=malloc(sizeof(FTS_real_index)); + if(ix==NULL) return NULL; + ix->colid=coll; +/* TBD initialize readwritelock */ + ix->handles=malloc((sizes[0]+2)*sizeof(FTS_document_id_t)); + ix->handsfree=malloc((sizes[0]+2)*sizeof(uint8_t)); +/* set up free chain of document handles */ + for(i=1;ihandles[i]=i+1; + ix->handsfree[i]=1; + } + ix->handles[sizes[0]]=0; /* end of free chain */ + ix->handsfree[sizes[0]]=1; + ix->firstfree=1; + ix->lastslot=sizes[0]; +/* create index 2 */ + ix->index2 = ZStrTuberCons(sizes[2],TUBER_BITS_8); + bk=ZStrTuberIns(ix->index2,0,0); + if(bk!=0) printf("Help - Can't insert root of index 2\n"); +/* create index 3 */ + ix->index3 = ZStrTuberCons(sizes[3],TUBER_BITS_32); + ix->options=options; +/* create index 1 if needed */ + if(options==FTS_INDEX_SUBSTRINGS) + { + ix->index1 = ZStrTuberCons(sizes[1],TUBER_BITS_8); + bk=ZStrTuberIns(ix->index1,0,0); + if(bk!=0) printf("Help - Can't insert root of index 1\n"); + } + return (FTS_index_t *) ix; +} + +void FTS_FreeIndex ( FTS_index_t * ftx) +{ + FTS_real_index * ix; + ix = (FTS_real_index *) ftx; + if(ix->options==FTS_INDEX_SUBSTRINGS) ZStrTuberDest(ix->index1); + ZStrTuberDest(ix->index2); + ZStrTuberDest(ix->index3); + free(ix->handsfree); + free(ix->handles); + free(ix); +} +int xxlet[100]; +void index2dump(FTS_real_index * ix, uint64_t kkey, int lev) +{ + CTX ctx, dctx,x3ctx; + ZSTR *zstr, *x3zstr; + int i,temp,md; + uint64_t x64,oldlet,newlet,bkey,newkkey; + uint64_t docb,dock,han,oldhan; + zstr=ZStrCons(30); + x3zstr=ZStrCons(35); + ZStrCxClear(&zcutf,&ctx); + ZStrCxClear(&zcdelt,&dctx); + ZStrCxClear(&zcdoc,&x3ctx); + for(i=1;iindex2,kkey,zstr); + temp=kkey; + if(i!=0) + { + printf("cannot read kkey = %d from TUBER\n",temp); + return; + } + md=ZStrBitsOut(zstr,1); + temp=kkey; + printf("...kkey %d ",temp); + temp=md; + printf("Md=%d ",temp); + temp=zstr->dat[0]; + printf(" zstr %x",temp); + if(md==1) + { + docb=ZStrCxDec(zstr,&zcbky,&ctx); + temp=docb; + printf(" doc-b = %d",temp); + dock=ZStrTuberK(ix->index3,kkey,0,docb); + temp=dock; + printf(" doc-k = %d",temp); + } + oldlet=0; + + while(1) + { + newlet=ZStrCxDec(zstr,&zcdelt,&dctx); + if(newlet==oldlet) break; + bkey=ZStrCxDec(zstr,&zcbky,&ctx); + x64=ZStrUnXl(&zcutf,newlet); + temp=x64; + if(temp<128) + printf(" %c",temp); + else + printf(" %x",temp); + temp=bkey; + printf(" %d",temp); + oldlet=newlet; + } + if(md==1) + { + printf("\n --- Docs ---"); + i=ZStrTuberRead(ix->index3,dock,x3zstr); + oldhan=0; + while(1) + { + han=ZStrCxDec(x3zstr,&zcdoc,&x3ctx); + if(han==oldhan) break; + temp=han; + printf("h= %d ",temp); + temp=ix->handles[han]; + printf("id= %d; ",temp); + oldhan=han; + } + } + printf("\n"); + i=ZStrTuberRead(ix->index2,kkey,zstr); + x64=ZStrBitsOut(zstr,1); + if(x64==1) + bkey=ZStrCxDec(zstr,&zcbky,&ctx); + oldlet=0; + ZStrCxClear(&zcdelt,&dctx); + while(1) + { + newlet=ZStrCxDec(zstr,&zcdelt,&dctx); + if(newlet==oldlet) return; + bkey=ZStrCxDec(zstr,&zcbky,&ctx); + newkkey=ZStrTuberK(ix->index2,kkey,newlet,bkey); + xxlet[lev]=ZStrUnXl(&zcutf,newlet); + index2dump(ix,newkkey,lev+1); + oldlet=newlet; + } +} + +void indexd(FTS_index_t * ftx) +{ + FTS_real_index * ix; + int i; + uint64_t kroot; +int temp; + ix = (FTS_real_index *)ftx; + printf("\n\nDump of Index\n"); +temp=ix->firstfree; + printf("Free-chain starts at handle %d\n",temp); + printf("======= First ten handles======\n"); + for(i=1;i<11;i++) + { +temp=ix->handles[i]; + printf("Handle %d is docid %d\n", i,temp); + } + printf("======= Index 2 ===============\n"); + kroot=ZStrTuberK(ix->index2,0,0,0); + index2dump(ix,kroot,1); +} + +void RealAddDocument(FTS_index_t * ftx, FTS_document_id_t docid) +{ + FTS_real_index * ix; + FTS_texts_t *rawwords; + CTX ctx2a, ctx2b, x3ctx, x3ctxb; + STEX * stex; + ZSTR *zstrwl, *zstr2a, *zstr2b, *x3zstr, *x3zstrb; + uint64_t letters[42]; + uint64_t ixlet[42]; + uint64_t kkey[42]; /* for word *without* this letter */ + uint64_t kkey1[42]; /* ix1 word whose last letter is this */ + int ixlen; + uint16_t * wpt; + uint64_t handle, newhan, oldhan; + uint64_t kroot,kroot1; + int nowords,wdx; + int i,j,len,j1,j2; + uint8_t * utf; + uint64_t unicode; + uint64_t tran,x64,oldlet, newlet, bkey; + uint64_t docb,dock; + + ix = (FTS_real_index *)ftx; + kroot=ZStrTuberK(ix->index2,0,0,0); + if(ix->options==FTS_INDEX_SUBSTRINGS) + kroot1=ZStrTuberK(ix->index1,0,0,0); + kkey[0]=kroot; /* origin of index 2 */ + +/* allocate the document handle */ + handle = ix->firstfree; +/* TBD what to do if no more handles */ + if(handle==0) + { + printf("Run out of document handles!\n"); + return; + } + ix->firstfree = ix->handles[handle]; + ix->handles[handle]=docid; + ix->handsfree[handle]=0; + +/* Get the actual words from the caller */ + rawwords = FTS_GetTexts(ix->colid, docid); + nowords=rawwords->_len; +/* Put the words into a STEX */ + + stex=ZStrSTCons(2); /* format 2=uint16 is all that there is! */ + zstrwl=ZStrCons(25); /* 25 enough for word list */ + zstr2a=ZStrCons(30); /* 30 uint64's is always enough for ix2 */ + zstr2b=ZStrCons(30); + x3zstr=ZStrCons(35); + x3zstrb=ZStrCons(35); + for(i=0;i_texts[i]; + j=0; + ZStrClear(zstrwl); + unicode=getunicode(&utf); + while(unicode!=0) + { + ZStrEnc(zstrwl,&zcutf,unicode); + unicode=getunicode(&utf); + j++; + if(j>40) break; + } +/* terminate the word and insert into STEX */ + ZStrEnc(zstrwl,&zcutf,0); + ZStrNormalize(zstrwl); + ZStrSTAppend(stex,zstrwl); + } +/* Sort them */ + ZStrSTSort(stex); +/* Set current length of word = 0 */ + ixlen=0; +/* For each word in the STEX */ + nowords=stex->cnt; + wpt=(uint16_t *) stex->list; + for(wdx=0;wdxindex2,kkey[j],zstr2a); + if(i==1) + { + printf("Kkey not found - we're buggered\n"); + } + + x64=ZStrBitsOut(zstr2a,1); + if(x64==1) + { +/* skip over the B-key into index 3 */ + docb=ZStrDec(zstr2a,&zcbky); + } +/* look to see if the letter is there */ + ZStrCxClear(&zcdelt, &ctx2a); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + if(newlet==oldlet) break; + bkey=ZStrDec(zstr2a,&zcbky); + if(newlet>=tran) break; + } + if(newlet != tran) + { +/* if not there, create a new index-2 entry for it */ + bkey=ZStrTuberIns(ix->index2,kkey[j],tran); + kkey[j+1]=ZStrTuberK(ix->index2,kkey[j],tran,bkey); +/* update old index-2 entry to insert new letter */ + ZStrCxClear(&zcdelt, &ctx2a); + ZStrCxClear(&zcdelt, &ctx2b); + i=ZStrTuberRead(ix->index2,kkey[j],zstr2a); + ZStrClear(zstr2b); + x64=ZStrBitsOut(zstr2a,1); + ZStrBitsIn(x64,1,zstr2b); + if(x64==1) + { +/* copy over the B-key into index 3 */ + docb=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,docb); + } + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + if(newlet==oldlet) break; + if(newlet>tran) break; + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + x64=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,x64); + } + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,tran); + ZStrEnc(zstr2b,&zcbky,bkey); + if(newlet==oldlet) + { + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,tran); + } + else + { + while(newlet!=oldlet) + { + oldlet=newlet; + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + x64=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,x64); + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + } + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + } + ZStrNormalize(zstr2b); + ZStrTuberUpdate(ix->index2,kkey[j],zstr2b); + } + else + { +/* - if it is, get its KKey and put in (next) slot */ + kkey[j+1]=ZStrTuberK(ix->index2,kkey[j],tran,bkey); + } + j++; + } +/* kkey[j] is kkey of whole word. */ +/* so read the zstr from index2 */ + i=ZStrTuberRead(ix->index2,kkey[j],zstr2a); + if(i==1) + { + printf("Kkey not found - we're running for cover\n"); + } +/* is there already an index-3 entry available? */ + x64=ZStrBitsOut(zstr2a,1); +/* If so, get its b-key */ + if(x64==1) + { + docb=ZStrDec(zstr2a,&zcbky); + } + else + { + docb=ZStrTuberIns(ix->index3,kkey[j],0); +/* put it into index 2 */ + ZStrCxClear(&zcdelt, &ctx2a); + ZStrCxClear(&zcdelt, &ctx2b); + i=ZStrTuberRead(ix->index2,kkey[j],zstr2a); + ZStrClear(zstr2b); + x64=ZStrBitsOut(zstr2a,1); + ZStrBitsIn(1,1,zstr2b); + ZStrEnc(zstr2b,&zcbky,docb); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + if(newlet==oldlet) break; + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + x64=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,x64); + } + ZStrNormalize(zstr2b); + ZStrTuberUpdate(ix->index2,kkey[j],zstr2b); + } + dock=ZStrTuberK(ix->index3,kkey[j],0,docb); +/* insert doc handle into index 3 */ + i=ZStrTuberRead(ix->index3,dock,x3zstr); + ZStrClear(x3zstrb); + if(i==1) + { + printf("Kkey not found in ix3 - we're doomed\n"); + } + ZStrCxClear(&zcdoc, &x3ctx); + ZStrCxClear(&zcdoc, &x3ctxb); + newhan=0; + while(1) + { + oldhan=newhan; + newhan=ZStrCxDec(x3zstr,&zcdoc,&x3ctx); + if(newhan==oldhan) break; + if(newhan>handle) break; + ZStrCxEnc(x3zstrb,&zcdoc,&x3ctxb,newhan); + } + ZStrCxEnc(x3zstrb,&zcdoc,&x3ctxb,handle); + if(newhan==oldhan) + ZStrCxEnc(x3zstrb,&zcdoc,&x3ctxb,handle); + else + { + ZStrCxEnc(x3zstrb,&zcdoc,&x3ctxb,newhan); + while(newhan!=oldhan) + { + oldhan=newhan; + newhan=ZStrCxDec(x3zstr,&zcdoc,&x3ctx); + ZStrCxEnc(x3zstrb,&zcdoc,&x3ctxb,newhan); + } + } + ZStrNormalize(x3zstrb); + ZStrTuberUpdate(ix->index3,dock,x3zstrb); +/* copy the word into ix */ + ixlen=len; + for(j=0;joptions==FTS_INDEX_SUBSTRINGS) + { + for(j1=0;j1=0;j2--) + { + tran=ZStrXlate(&zcutf,ixlet[j2]); + i=ZStrTuberRead(ix->index1,kkey1[j2+1],zstr2a); + if(i==1) + { + printf("Kkey not found - we're in trouble!\n"); + } +/* look to see if the letter is there */ + ZStrCxClear(&zcdelt, &ctx2a); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + if(newlet==oldlet) break; + bkey=ZStrDec(zstr2a,&zcbky); + if(newlet>=tran) break; + } + if(newlet != tran) + { + +/* if not there, create a new index-1 entry for it */ + bkey=ZStrTuberIns(ix->index1,kkey1[j2+1],tran); + kkey1[j2]=ZStrTuberK(ix->index1,kkey1[j2+1],tran,bkey); +/* update old index-1 entry to insert new letter */ + ZStrCxClear(&zcdelt, &ctx2a); + ZStrCxClear(&zcdelt, &ctx2b); + i=ZStrTuberRead(ix->index1,kkey1[j2+1],zstr2a); + ZStrClear(zstr2b); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + if(newlet==oldlet) break; + if(newlet>tran) break; + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + x64=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,x64); + } + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,tran); + ZStrEnc(zstr2b,&zcbky,bkey); + if(newlet==oldlet) + { + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,tran); + } + else + { + while(newlet!=oldlet) + { + oldlet=newlet; + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + x64=ZStrDec(zstr2a,&zcbky); + ZStrEnc(zstr2b,&zcbky,x64); + newlet=ZStrCxDec(zstr2a,&zcdelt,&ctx2a); + } + ZStrCxEnc(zstr2b,&zcdelt,&ctx2b,newlet); + } + ZStrNormalize(zstr2b); + ZStrTuberUpdate(ix->index1,kkey1[j2+1],zstr2b); + } + else + { + kkey1[j2]=ZStrTuberK(ix->index1,kkey1[j2+1],tran,bkey); + } + } + } + } + } + ZStrSTDest(stex); + ZStrDest(zstrwl); + ZStrDest(zstr2a); + ZStrDest(zstr2b); + ZStrDest(x3zstr); + ZStrDest(x3zstrb); +} + +void RealDeleteDocument(FTS_index_t * ftx, FTS_document_id_t docid) +{ + FTS_real_index * ix; + FTS_document_id_t i; + ix=(FTS_real_index *) ftx; + for(i=0;i<=ix->lastslot;i++) + { + if(ix->handsfree[i]==1) continue; + if(ix->handles[i]==docid) break; + } + if(i>ix->lastslot) + { +/* TBD - what to do if a document is deleted that isn't there? */ + printf("tried to delete nonexistent document\n"); + } + ix->handsfree[i]=1; +} + +/* now the customer-facing routines */ +/* These are needed so that the lock is held in Update */ +/* preventing a query getting a result with neither the */ +/* old version nor the new one */ + +void FTS_AddDocument(FTS_index_t * ftx, FTS_document_id_t docid) +{ +/* TBD obtain write lock */ + RealAddDocument(ftx,docid); +/* TBD release write lock */ +} + +void FTS_DeleteDocument(FTS_index_t * ftx, FTS_document_id_t docid) +{ +/* TBD obtain write lock */ + RealDeleteDocument(ftx,docid); +/* TBD release write lock */ +} + +void FTS_UpdateDocument(FTS_index_t * ftx, FTS_document_id_t docid) +{ +/* TBD obtain write lock */ + RealDeleteDocument(ftx,docid); + RealAddDocument(ftx,docid); +/* TBD release write lock */ +} + +void FTS_BackgroundTask(FTS_index_t * ftx) +{ +/* obtain LOCKMAIN */ +/* remove deleted handles from index3 not done QQQ */ +/* release LOCKMAIN */ +} +/* not a valid kkey - 52 bits long!*/ +#define NOTFOUND 0xF777777777777 + + +uint64_t findkkey1(FTS_real_index * ix, uint64_t * word) +{ + ZSTR *zstr; + CTX ctx; + uint64_t * wd; + uint64_t tran,newlet,oldlet,bkey,kk1; + int j; + zstr = ZStrCons(10); + wd=word; + while(*wd != 0) wd++; + kk1=ZStrTuberK(ix->index2,0,0,0); + while(1) + { + if(wd==word) break; + tran=*(--wd); +/* Get the Z-string for the index-1 entry of this key */ + j=ZStrTuberRead(ix->index1,kk1,zstr); + if(j==1) + { + kk1=NOTFOUND; + break; + } + ZStrCxClear(&zcdelt, &ctx); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr,&zcdelt,&ctx); + if(newlet==oldlet) + { + kk1=NOTFOUND; + break; + } + bkey=ZStrDec(zstr,&zcbky); + if(newlet>tran) + { + kk1=NOTFOUND; + break; + } + if(newlet==tran) break; + } + if(kk1==NOTFOUND) break; + kk1=ZStrTuberK(ix->index1,kk1,tran,bkey); + } + ZStrDest(zstr); + return kk1; +} + +uint64_t findkkey2(FTS_real_index * ix, uint64_t * word) +{ + ZSTR *zstr; + CTX ctx; + uint64_t tran,x64,docb,newlet,oldlet,bkey,kk2; + int j; + zstr = ZStrCons(10); + kk2=ZStrTuberK(ix->index2,0,0,0); + while(1) + { + tran=*(word++); + if(tran==0) break; +/* Get the Z-string for the index-2 entry of this key */ + j=ZStrTuberRead(ix->index2,kk2,zstr); + if(j==1) + { + kk2=NOTFOUND; + break; + } + x64=ZStrBitsOut(zstr,1); + if(x64==1) + { +/* skip over the B-key into index 3 */ + docb=ZStrDec(zstr,&zcbky); + } +/* silly use of docb to get rid of compiler warning */ + if(docb==0xffffff) printf("impossible\n"); + ZStrCxClear(&zcdelt, &ctx); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr,&zcdelt,&ctx); + if(newlet==oldlet) + { + kk2=NOTFOUND; + break; + } + bkey=ZStrDec(zstr,&zcbky); + if(newlet>tran) + { + kk2=NOTFOUND; + break; + } + if(newlet==tran) break; + } + if(kk2==NOTFOUND) break; + kk2=ZStrTuberK(ix->index2,kk2,tran,bkey); + } + ZStrDest(zstr); + return kk2; +} +/* QUERY */ +/* for each query term */ +/* update zstra2 to only contain handles matching that also */ + + +/* recursive index 2 handles kk2 to dochan STEX using zcdh */ + +void ix2recurs(STEX * dochan, FTS_real_index * ix, uint64_t kk2) +{ + ZSTR *zstr2, *zstr3, *zstr; + CTX ctx2, ctx3; + uint64_t docb,newlet,oldlet,newkk2,bkey; + uint64_t x64, dock, oldhan,newhan; + int i,j; + zstr2=ZStrCons(10); /* index 2 entry for this prefix */ + zstr3=ZStrCons(10); /* index 3 entry for this prefix */ + /* if any */ + zstr = ZStrCons(2); /* single doc handle work area */ + j=ZStrTuberRead(ix->index2,kk2,zstr2); + if(j==1) + { + printf("recursion failed to read kk2\n"); + exit(1); + } + x64=ZStrBitsOut(zstr2,1); + if(x64==1) + { +/* process the documents into the STEX */ +/* uses zcdh not LastEnc because it must sort into */ +/* numerical order */ + docb=ZStrDec(zstr2,&zcbky); + dock=ZStrTuberK(ix->index3,kk2,0,docb); + i=ZStrTuberRead(ix->index3,dock,zstr3); + if(i==1) + { + printf("Kkey not in ix3 - we're doomed\n"); + } + ZStrCxClear(&zcdoc, &ctx3); + newhan=0; + while(1) + { + oldhan=newhan; + newhan=ZStrCxDec(zstr3,&zcdoc,&ctx3); + if(newhan==oldhan) break; + if(ix->handsfree[newhan]==0) + { + ZStrClear(zstr); + ZStrEnc(zstr,&zcdh,newhan); + ZStrSTAppend(dochan,zstr); + } + } + } + ZStrCxClear(&zcdelt, &ctx2); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr2,&zcdelt,&ctx2); + if(newlet==oldlet) break; + bkey=ZStrDec(zstr2,&zcbky); + newkk2=ZStrTuberK(ix->index2,kk2,newlet,bkey); + ix2recurs(dochan,ix,newkk2); + } + ZStrDest(zstr2); + ZStrDest(zstr3); + ZStrDest(zstr); + return; +} + +void ix1recurs(STEX * dochan, FTS_real_index * ix, uint64_t kk1, uint64_t * wd) +{ + + ZSTR *zstr; + CTX ctx; + int j; + uint64_t newlet,oldlet,bkey,newkk1,kk2; + kk2=findkkey2(ix,wd); + if(kk2!=NOTFOUND) ix2recurs(dochan,ix,kk2); + zstr=ZStrCons(10); /* index 1 entry for this prefix */ + j=ZStrTuberRead(ix->index1,kk1,zstr); + if(j==1) + { + printf("recursion failed to read kk1\n"); + exit(1); + } + ZStrCxClear(&zcdelt, &ctx); + newlet=0; + while(1) + { + oldlet=newlet; + newlet=ZStrCxDec(zstr,&zcdelt,&ctx); + if(newlet==oldlet) break; + bkey=ZStrDec(zstr,&zcbky); + newkk1=ZStrTuberK(ix->index1,kk1,newlet,bkey); + *(wd-1)=newlet; + ix1recurs(dochan,ix,newkk1,wd-1); + } + ZStrDest(zstr); + return; +} + +FTS_document_ids_t * FTS_FindDocuments (FTS_index_t * ftx, + FTS_query_t * query) +{ + FTS_document_ids_t * dc; + FTS_real_index * ix; + size_t queryterm; + ZSTR *zstr2,*zstr3; + ZSTR *zstra1, *zstra2, *ztemp; + ZSTR *zstr; + STEX * dochan; + CTX ctxa1, ctxa2; + CTX ctx3; + uint64_t word1[100]; + int i,j; + uint64_t kk2,kk1,x64,docb,dock; + uint64_t oldhan,newhan,ndocs,lasthan,odocs; + uint64_t nhand1,ohand1; + uint8_t * utf; + uint64_t unicode; + uint16_t *docpt; +/* TBD obtain read lock */ + + ix=(FTS_real_index *) ftx; + dc=malloc(sizeof(FTS_document_ids_t *)); + dc->_len=0; /* no docids so far */ + dc->_docs=NULL; + zstr2=ZStrCons(10); /* from index-2 tuber */ + zstr3=ZStrCons(10); /* from index-3 tuber */ + zstra1=ZStrCons(10); /* current list of documents */ + zstra2=ZStrCons(10); /* new list of documents */ + zstr =ZStrCons(4); /* work zstr from stex */ +/* - for each term in the query */ + for(queryterm=0;queryterm_len;queryterm++) + { +/* Depending on the query type, the objective is do */ +/* populate or "and" zstra1 with the sorted list */ +/* of document handles that match that term */ +/* TBD - what to do if it is not a legal option? */ +/* TBD combine this with otheer options - no need to use zstring */ + if(query->_localOptions[queryterm] == FTS_MATCH_COMPLETE) + { + j=0; + utf= query->_texts[queryterm]; + while(1) + { + unicode=getunicode(&utf); + word1[j++]=ZStrXlate(&zcutf,unicode); + if(unicode==0) break; + } + kk2=findkkey2(ix,word1); + if(kk2==NOTFOUND) break; + j=ZStrTuberRead(ix->index2,kk2,zstr2); + x64=ZStrBitsOut(zstr2,1); + if(x64!=1) break; + docb=ZStrDec(zstr2,&zcbky); + dock=ZStrTuberK(ix->index3,kk2,0,docb); + i=ZStrTuberRead(ix->index3,dock,zstr3); + if(i==1) + { + printf("Kkey not in ix3 - we're terrified\n"); + } + ZStrCxClear(&zcdoc, &ctx3); + ZStrCxClear(&zcdoc, &ctxa2); + ZStrClear(zstra2); + newhan=0; + lasthan=0; + ndocs=0; + if(queryterm==0) + { + while(1) + { + oldhan=newhan; + newhan=ZStrCxDec(zstr3,&zcdoc,&ctx3); + if(newhan==oldhan) break; + if(ix->handsfree[newhan]==0) + { + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,newhan); + lasthan=newhan; + ndocs++; + } + } + + } + else + { + ZStrCxClear(&zcdoc, &ctxa1); + ohand1=0; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + oldhan=0; + newhan=ZStrCxDec(zstr3,&zcdoc,&ctx3); +/* zstra1 = zstra1 & zstra2 */ + while(1) + { + if(nhand1==ohand1) break; + if(oldhan==newhan) break; + if(newhan==nhand1) + { + if(ix->handsfree[newhan]==0) + { + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,newhan); + lasthan=newhan; + ndocs++; + } + oldhan=newhan; + newhan=ZStrCxDec(zstr3,&zcdoc,&ctx3); + ohand1=nhand1; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + } + else if(newhan>nhand1) + { + ohand1=nhand1; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + } + else + { + oldhan=newhan; + newhan=ZStrCxDec(zstr3,&zcdoc,&ctx3); + } + } + } + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,lasthan); + ZStrNormalize(zstra2); + ztemp=zstra1; + zstra1=zstra2; + zstra2=ztemp; + } /* end of match-complete code */ + if ( (query->_localOptions[queryterm] == FTS_MATCH_PREFIX) || + (query->_localOptions[queryterm] == FTS_MATCH_SUBSTRING) ) + { +/* Make STEX to contain new list of handles */ + dochan=ZStrSTCons(2); + j=50; + utf= query->_texts[queryterm]; +/* TBD protect against query string greater than 40? */ + while(1) + { + unicode=getunicode(&utf); + word1[j++]=ZStrXlate(&zcutf,unicode); + if(unicode==0) break; + } + if (query->_localOptions[queryterm] == FTS_MATCH_PREFIX) + { + kk2=findkkey2(ix,word1+50); + if(kk2==NOTFOUND) break; +/* call routine to recursively put handles to STEX */ + ix2recurs(dochan,ix,kk2); + } + if (query->_localOptions[queryterm] == FTS_MATCH_SUBSTRING) + { + kk1=findkkey1(ix,word1+50); + if(kk1==NOTFOUND) break; +/* call routine to recursively put handles to STEX */ + ix1recurs(dochan,ix,kk1,word1+50); + } + ZStrSTSort(dochan); + odocs=dochan->cnt; + docpt=dochan->list; + ZStrCxClear(&zcdoc, &ctxa2); + ZStrClear(zstra2); + lasthan=0; + if(queryterm==0) + { + for(i=0;ihandsfree[newhan]==0) + { + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,newhan); + lasthan=newhan; + } + } + } + else + { +/* merge prefix stex with zstra1 */ + ZStrCxClear(&zcdoc, &ctxa1); + ohand1=0; + if(odocs==0) continue; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + ZStrInsert(zstr,docpt,2); + newhan=ZStrDec(zstr,&zcdh); + docpt+=ZStrExtLen(docpt,2); +/* zstra1 = zstra1 & zstra2 */ + while(1) + { + if(nhand1==ohand1) break; + if(newhan==nhand1) + { + if(ix->handsfree[newhan]==0) + { + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,newhan); + lasthan=newhan; + ndocs++; + } + ZStrInsert(zstr,docpt,2); + newhan=ZStrDec(zstr,&zcdh); + docpt+=ZStrExtLen(docpt,2); + odocs--; + if(odocs==0) break; + ohand1=nhand1; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + } + else if(newhan>nhand1) + { + ohand1=nhand1; + nhand1=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + } + else + { + ZStrInsert(zstr,docpt,2); + newhan=ZStrDec(zstr,&zcdh); + docpt+=ZStrExtLen(docpt,2); + odocs--; + if(odocs==0) break; + } + } + } + ZStrCxEnc(zstra2,&zcdoc,&ctxa2,lasthan); + ZStrNormalize(zstra2); + ztemp=zstra1; + zstra1=zstra2; + zstra2=ztemp; + } /* end of match-prefix code */ + } + ZStrCxClear(&zcdoc, &ctxa1); + newhan=0; + dc->_docs=malloc(ndocs*sizeof(FTS_document_id_t)); + ndocs=0; + while(1) + { + oldhan=newhan; + newhan=ZStrCxDec(zstra1,&zcdoc,&ctxa1); + if(newhan==oldhan) break; + if(ix->handsfree[newhan]==0) + dc->_docs[ndocs++]=ix->handles[newhan]; + } + dc->_len=ndocs; + ZStrDest(zstra1); + ZStrDest(zstra2); +/* TBD relinquish read lock */ + return dc; +} + +void FTS_Free_Documents(FTS_document_ids_t * doclist) +{ + if(doclist->_docs!=NULL) free (doclist->_docs); + free(doclist); +} + +/* end of ftsindex.c */ diff --git a/arangod/FulltextIndex/FTS_index.h b/arangod/FulltextIndex/FTS_index.h new file mode 100644 index 0000000000..1ad8c48b3c --- /dev/null +++ b/arangod/FulltextIndex/FTS_index.h @@ -0,0 +1,51 @@ +/* ftsindex.h - The Full Text Search header file */ +/* R. A. Parker 6.6.2012 */ + +typedef struct FTS_REAL_index FTS_index_t; + +typedef struct +{ + uint64_t _globalOptions; + size_t _len; + uint64_t * _localOptions; + uint8_t * * _texts; +} FTS_query_t; + +/* local Options value (one for each word) */ + +#define FTS_MATCH_COMPLETE 1 +#define FTS_MATCH_PREFIX 2 +#define FTS_MATCH_SUBSTRING 4 +#define FTS_INDEX_SUBSTRINGS 1 + +typedef struct +{ + size_t _len; + FTS_document_id_t * _docs; +} FTS_document_ids_t; + +#define FTS_SIZES_DEFAULT {10,1000,57,100,0,0,0,0,0,0} + +FTS_index_t * FTS_CreateIndex(FTS_collection_id_t coll, + uint64_t options, uint64_t sizes[10]); + +void FTS_FreeIndex ( FTS_index_t * ftx); + +void FTS_BackgroundTask (FTS_index_t * ftx); + +void FTS_AddDocument(FTS_index_t * ftx, FTS_document_id_t docid); + +void FTS_DeleteDocument(FTS_index_t * ftx, FTS_document_id_t docid); + +void FTS_UpdateDocument(FTS_index_t * ftx, FTS_document_id_t docid); + +FTS_document_ids_t * FTS_FindDocuments (FTS_index_t * ftx, + FTS_query_t * query); + +void FTS_Free_Documents(FTS_document_ids_t *); + +void indexd(FTS_index_t * ftx); + +/* end of ftsindex.h */ + + diff --git a/arangod/FulltextIndex/avodoc.c b/arangod/FulltextIndex/avodoc.c new file mode 100644 index 0000000000..2ed73d9be5 --- /dev/null +++ b/arangod/FulltextIndex/avodoc.c @@ -0,0 +1,438 @@ +/* avodoc.c - My imitation of Avocado */ +/* R. A. Parker 26.11.2012 */ + +#include +#include +#include +#include "avodoc.h" +#include "FTS_index.h" + + +FTS_texts_t * cons() +{ + FTS_texts_t * tx; + tx=malloc(sizeof(FTS_texts_t)); + tx->_texts=malloc(10*sizeof(uint8_t *)); + return tx; +} + +uint8_t w1[]="trinket"; +uint8_t w2[]="fred"; +uint8_t w3[]="zebra"; +uint8_t w4[]="aardvark"; +uint8_t w5[]="freed"; +uint8_t w6[]="fredp"; +uint8_t w7[]="fredq"; +uint8_t w8[]="fredr"; +uint8_t wp[]="fre"; +uint8_t wf[]="red"; + +void freg(void * doc) +{ + printf("tried to free the document!\n"); +} + +FTS_texts_t * FTS_GetTexts + (FTS_collection_id_t colid, FTS_document_id_t docid) + +{ + FTS_texts_t * tx; + tx=cons(); + if( (colid==2) && (docid==2) ) + { + tx->_len=9; + tx->_texts[0]=w1; + tx->_texts[1]=w2; + tx->_texts[2]=w3; + tx->_texts[3]=w4; + tx->_texts[4]=w5; + tx->_texts[5]=w6; + tx->_texts[6]=w1; + tx->_texts[7]=w2; + tx->_texts[8]=w7; + } + if( (colid==2) && (docid==3) ) + { + tx->_len=7; + tx->_texts[0]=w4; + tx->_texts[1]=w4; + tx->_texts[2]=w4; + tx->_texts[3]=w4; + tx->_texts[4]=w5; + tx->_texts[5]=w6; + tx->_texts[6]=w4; + } + if( (colid==2) && (docid==5) ) + { + tx->_len=8; + tx->_texts[0]=w1; + tx->_texts[1]=w1; + tx->_texts[2]=w3; + tx->_texts[3]=w5; + tx->_texts[4]=w5; + tx->_texts[5]=w7; + tx->_texts[6]=w7; + tx->_texts[7]=w1; + } + if( (colid==2) && (docid==8) ) + { + tx->_len=10; + tx->_texts[0]=w1; + tx->_texts[1]=w2; + tx->_texts[2]=w3; + tx->_texts[3]=w4; + tx->_texts[4]=w1; + tx->_texts[5]=w2; + tx->_texts[6]=w3; + tx->_texts[7]=w4; + tx->_texts[8]=w1; + tx->_texts[9]=w2; + } + if( (colid==2) && (docid==11) ) + { + tx->_len=6; + tx->_texts[0]=w2; + tx->_texts[1]=w3; + tx->_texts[2]=w4; + tx->_texts[3]=w4; + tx->_texts[4]=w7; + tx->_texts[5]=w4; + } + if( (colid==1) && (docid==2) ) + { + tx->_len=9; + tx->_texts[0]=w1; + tx->_texts[1]=w2; + tx->_texts[2]=w3; + tx->_texts[3]=w4; + tx->_texts[4]=w5; + tx->_texts[5]=w6; + tx->_texts[6]=w1; + tx->_texts[7]=w2; + tx->_texts[8]=w7; + } + if( (colid==1) && (docid==3) ) + { + tx->_len=7; + tx->_texts[0]=w4; + tx->_texts[1]=w4; + tx->_texts[2]=w4; + tx->_texts[3]=w4; + tx->_texts[4]=w5; + tx->_texts[5]=w6; + tx->_texts[6]=w4; + } + if( (colid==1) && (docid==5) ) + { + tx->_len=8; + tx->_texts[0]=w1; + tx->_texts[1]=w1; + tx->_texts[2]=w3; + tx->_texts[3]=w5; + tx->_texts[4]=w5; + tx->_texts[5]=w7; + tx->_texts[6]=w7; + tx->_texts[7]=w1; + } + if( (colid==1) && (docid==8) ) + { + tx->_len=10; + tx->_texts[0]=w1; + tx->_texts[1]=w2; + tx->_texts[2]=w3; + tx->_texts[3]=w4; + tx->_texts[4]=w1; + tx->_texts[5]=w2; + tx->_texts[6]=w3; + tx->_texts[7]=w4; + tx->_texts[8]=w1; + tx->_texts[9]=w2; + } + if( (colid==1) && (docid==11) ) + { + tx->_len=6; + tx->_texts[0]=w2; + tx->_texts[1]=w3; + tx->_texts[2]=w4; + tx->_texts[3]=w4; + tx->_texts[4]=w7; + tx->_texts[5]=w4; + } + tx->free=freg; + return tx; +} + +int main(int argc, char ** argv) +{ + long long x1; + int i; +int temp; + FTS_collection_id_t colid1; + FTS_document_id_t docid; + FTS_index_t * ftx, *ftx2; + FTS_query_t query; + FTS_document_ids_t * queryres; + uint64_t def[10]=FTS_SIZES_DEFAULT; + printf("Minature FTS-test program started\n"); + query._localOptions = malloc(5*sizeof(uint64_t)); + query._texts = malloc(5*sizeof(uint8_t *)); + colid1=1; + + ftx=FTS_CreateIndex(colid1,0,def); + if(ftx==NULL) + { + printf("Create returned NULL, so giving up\n"); + return 1; + } + printf("Managed to create an index . . . so far so good\n"); + docid=11; + FTS_AddDocument(ftx,docid); + printf("Added document 11\n"); + docid=2; + FTS_AddDocument(ftx,docid); + printf("Added document 2\n"); + docid=3; + FTS_AddDocument(ftx,docid); + printf("Added document 3\n"); + docid=5; + FTS_AddDocument(ftx,docid); + printf("Added document 5\n"); + docid=8; + FTS_AddDocument(ftx,docid); + printf("Added document 8\n"); + FTS_BackgroundTask(ftx); + printf("Came out of background task\n"); + FTS_BackgroundTask(ftx); + printf("Came out of background task again\n"); +/* indexd(ftx); */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w1; + queryres = FTS_FindDocuments(ftx,&query); + x1=queryres->_len; + printf("Resulted in %lld documents\n",x1); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + + query._globalOptions = 0; + query._len = 2; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w4; + query._localOptions[1]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[1] = w2; + queryres = FTS_FindDocuments(ftx,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + + docid=8; + FTS_DeleteDocument(ftx,docid); + printf("Deleted document 8\n"); +/* first query */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w1; + queryres = FTS_FindDocuments(ftx,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); +/* second query */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_PREFIX; /* whole word */ + query._texts[0] = wp; + queryres = FTS_FindDocuments(ftx,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); +/* third query */ + query._globalOptions = 0; + query._len = 2; + query._localOptions[0]=FTS_MATCH_COMPLETE; + query._localOptions[1]=FTS_MATCH_PREFIX; /* whole word */ + query._texts[0] = w1; + query._texts[1] = wp; + queryres = FTS_FindDocuments(ftx,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); +/* end of queries */ +/* now create an index with partial words allowed */ + + colid1=2; + ftx2=FTS_CreateIndex(colid1,FTS_INDEX_SUBSTRINGS,def); + if(ftx2==NULL) + { + printf("Create returned NULL, so giving up\n"); + return 1; + } + printf("Managed to create an index . . . so far so good\n"); + docid=11; + FTS_AddDocument(ftx2,docid); + printf("Added document 11\n"); + docid=2; + FTS_AddDocument(ftx2,docid); + printf("Added document 2\n"); + docid=3; + FTS_AddDocument(ftx2,docid); + printf("Added document 3\n"); + docid=5; + FTS_AddDocument(ftx2,docid); + printf("Added document 5\n"); + docid=8; + FTS_AddDocument(ftx2,docid); + printf("Added document 8\n"); + FTS_BackgroundTask(ftx2); + printf("Came out of background task\n"); + FTS_BackgroundTask(ftx2); + printf("Came out of background task again\n"); +/* indexd(ftx2); */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w1; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; + printf("Resulted in %lld documents\n",x1); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + + query._globalOptions = 0; + query._len = 2; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w4; + query._localOptions[1]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[1] = w2; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + + docid=2; + FTS_DeleteDocument(ftx2,docid); + printf("Deleted document 2\n"); + docid=8; + FTS_DeleteDocument(ftx2,docid); + printf("Deleted document 8\n"); +/* first query */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_COMPLETE; /* whole word */ + query._texts[0] = w1; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); +/* second query */ + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_PREFIX; /* whole word */ + query._texts[0] = wp; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); +/* third query */ + query._globalOptions = 0; + query._len = 2; + query._localOptions[0]=FTS_MATCH_COMPLETE; + query._localOptions[1]=FTS_MATCH_PREFIX; /* whole word */ + query._texts[0] = w1; + query._texts[1] = wp; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; +temp=x1; + printf("Resulted in %d documents\n",temp); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + query._globalOptions = 0; + query._len = 1; + query._localOptions[0]=FTS_MATCH_SUBSTRING; /* whole word */ + query._texts[0] = wf; + queryres = FTS_FindDocuments(ftx2,&query); + x1=queryres->_len; + printf("Substring - Resulted in %lld documents\n",x1); + for(i=0;i_docs[i]; + printf(" %d",temp); + } + printf("\n"); + FTS_Free_Documents(queryres); + +/* end of queries */ + FTS_FreeIndex(ftx2); + FTS_FreeIndex(ftx); + printf("First simple test completed - free'd the index again\n"); + return 0; +} + +/* end of avodoc.c */ + + diff --git a/arangod/FulltextIndex/avodoc.h b/arangod/FulltextIndex/avodoc.h new file mode 100644 index 0000000000..77b7d788db --- /dev/null +++ b/arangod/FulltextIndex/avodoc.h @@ -0,0 +1,19 @@ +/* avodoc.h - header file for FTS access to documents */ +/* R. A. Parker 16.7.2012 */ + +typedef uint64_t FTS_collection_id_t; +typedef uint64_t FTS_document_id_t; + +typedef struct +{ + size_t _len; + uint8_t * * _texts; + void (*free)(void *); +} FTS_texts_t; + +FTS_texts_t * FTS_GetTexts + (FTS_collection_id_t colid, FTS_document_id_t docid); + +/* end of avodoc.h */ + + diff --git a/arangod/FulltextIndex/zcode.c b/arangod/FulltextIndex/zcode.c new file mode 100644 index 0000000000..f4991239e6 --- /dev/null +++ b/arangod/FulltextIndex/zcode.c @@ -0,0 +1,98 @@ +/* zcode.c - the Z-string code and hash module */ +/* R. A. Parker 13.11.2012 */ + +#include +#include +#include +#include "zstr.h" + +/* zcutf code for storing letters in words */ +uint64_t zcutfX[]={0,1,2,3,4,5,6,7,8,9,10,12,16,24,88,65624}; +uint64_t zcutfC[]={0x0,0x8,0x4,0xC,0x2,0x6,0xA,0xE, + 0x1,0x3,0xA,0x1C,0x48,0x2C0,0xD0000,0xF00000000}; +uint8_t zcutfL[]={4,4,4,4,4,4,4,4,4,4,5,6,7,10,20,36}; +uint8_t zcutfS[]={0,8,4,9,2,10,5,11,1,12,6,13,3,14,7,15}; +uint8_t zcutfTX[]={0x00,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B, + 0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43, + 0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B, + 0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0x53, + 0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B, + 0x5C,0x5D,0x5E,0x5F,0x60,0x61,0x62,0x63, + 0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B, + 0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0x73, + 0x74,0x1D,0x2E,0x27,0x24,0x1B,0x29,0x2F, + 0x22,0x1F,0x32,0x30,0x25,0x28,0x20,0x1E, + 0x2C,0x31,0x23,0x21,0x1C,0x26,0x2D,0x2A, + 0x33,0x2B,0x34,0x75,0x76,0x77,0x78,0x79, + 0x7A,0x03,0x14,0x0D,0x0A,0x01,0x0F,0x15, + 0x08,0x05,0x18,0x16,0x0B,0x0E,0x06,0x04, + 0x12,0x17,0x09,0x07,0x02,0x0C,0x13,0x10, + 0x19,0x11,0x1A,0x7B,0x7C,0x7D,0x7E,0x7F}; +uint8_t zcutfUX[]={0x00,0x65,0x74,0x61,0x6F,0x69,0x6E,0x73, + 0x68,0x72,0x64,0x6C,0x75,0x63,0x6D,0x66, + 0x77,0x79,0x70,0x76,0x62,0x67,0x6B,0x71, + 0x6A,0x78,0x7A,0x45,0x54,0x41,0x4F,0x49, + 0x4E,0x53,0x48,0x52,0x44,0x4C,0x55,0x43, + 0x4D,0x46,0x57,0x59,0x50,0x56,0x42,0x47, + 0x4B,0x51,0x4A,0x58,0x5A,0x01,0x02,0x03, + 0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B, + 0x0C,0x0D,0x0E,0x0F,0x10,0x11,0x12,0x13, + 0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B, + 0x1C,0x1D,0x1E,0x1F,0x20,0x21,0x22,0x23, + 0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B, + 0x2C,0x2D,0x2E,0x2F,0x30,0x31,0x32,0x33, + 0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B, + 0x3C,0x3D,0x3E,0x3F,0x40,0x5B,0x5C,0x5D, + 0x5E,0x5F,0x60,0x7B,0x7C,0x7D,0x7E,0x7F}; +ZCOD zcutf = {2,16,127,4,zcutfX,zcutfC,zcutfL,zcutfS,zcutfTX,zcutfUX}; + +/* zcbky code for storing a B_KEY */ +uint64_t zcbkyX[]={0,1,2,3,4,20,276}; +uint64_t zcbkyC[]={0,4,6,10,0xB0,0xE00,0xF0000}; +uint8_t zcbkyL[]={1,3,3,4,8,12,20}; +uint8_t zcbkyS[]={0,0,0,0, 0,0,0,0, 1,1,3,4, 2,2,5,6}; +ZCOD zcbky = {1,7,0,4,zcbkyX,zcbkyC,zcbkyL,zcbkyS,NULL,NULL}; + +/* zcdelt code for storing UTF-8 deltas */ +uint64_t zcdeltX[]={0,1,2,3,4,5,6,7,8,10,14,22,86,65622}; +uint64_t zcdeltC[]={0,0x4,0x4,0xC,0x2,0x6,0xA,0xE,0x6,0x14, + 0x38,0x2C0, 0xD0000,0xF00000000}; +uint8_t zcdeltL[]={3,3,4,4,4,4,4,4,5,6,7,10,20,36}; +uint8_t zcdeltS[]={0,0,4,8,2,9,5,10,1,1,6,11,3,12,7,13}; +ZCOD zcdelt = {3,14,0,4,zcdeltX,zcdeltC,zcdeltL,zcdeltS,NULL,NULL}; + +/* zcdoc code for storing document handle deltas */ +uint64_t zcdocX[]={0,1,3,11,43,171,1195,1049771}; +uint64_t zcdocC[]={0,0x8,0x10,0xC0,0x80,0xC00,0x500000,0x38000000000}; +uint8_t zcdocL[]={3,4,6,8,10,13,23,42}; +uint8_t zcdocS[]={0,4,2,5,1,6,3,7}; +ZCOD zcdoc = {3,8,0,3,zcdocX,zcdocC,zcdocL,zcdocS,NULL,NULL}; + +/* zckk code for storing direct K-KEY values */ +uint64_t zckkX[]={0,65536,1114112,17891328}; +uint64_t zckkC[]={0,0x200000,0x1000000,0x18000000000}; +uint8_t zckkL[]={18,22,26,41}; +uint8_t zckkS[]={0,2,1,3}; +ZCOD zckk = {1,4,0,2,zckkX,zckkC,zckkL,zckkS,NULL,NULL}; + +/* zcdh code for putting doc handles into a stex */ +uint64_t zcdhX[]={0,8192,134225920}; +uint64_t zcdhC[]={0,0x10000000,0xC0000000000}; +uint8_t zcdhL[]={14,29,44}; +uint8_t zcdhS[]={0,0,1,2}; +ZCOD zcdh = {1,4,0,2,zcdhX,zcdhC,zcdhL,zcdhS,NULL,NULL}; + +uint64_t ZStrTuberK(TUBER * t, uint64_t d1, + uint64_t d2, uint64_t keyb) +{ + uint64_t keya; + if(d2<3) keya= (d1+5*d2) % t->kmax; + else keya = ( d1*(d1+d2) + 2*d2*d2 ) % t->kmax; + if(keyb==0) return keya; + if(keyb==1) return (keya+19)%t->kmax; + if(keyb==2) return (keya+43)%t->kmax; + if(keyb<47) return ((keya+3)*keyb)%t->kmax; + return ZStrTuberK(t,ZStrTuberK(t,d1,d2,keyb%47),0,keyb/47); +} + +/* end of zcode.c */ diff --git a/arangod/FulltextIndex/zstr.c b/arangod/FulltextIndex/zstr.c new file mode 100644 index 0000000000..217235107e --- /dev/null +++ b/arangod/FulltextIndex/zstr.c @@ -0,0 +1,1235 @@ +/* zstr.c - the Z-string module */ +/* R. A. Parker 14.11.2012 */ +/* bugfixed in merge - adjtop call added */ +/* bugfixed in tuber - wraparound */ +/* bugfix shift of 64 not happening */ + +#include +#include +#include +#include "zstr.h" + +ZSTR * ZStrCons(int elts) +{ + ZSTR * z; + z=malloc(sizeof(ZSTR)); + if(z==NULL) + { + printf("malloc failed on ZSTR structure\n"); + exit(1); + } + z->dat=malloc(elts*sizeof(uint64_t)); + if(z->dat==NULL) + { + printf("malloc failed on ZSTR structure (dat[%d])\n",elts); + exit(2); + } + z->dlen=malloc(elts*sizeof(long)); + if(z->dlen==NULL) + { + printf("malloc failed on ZSTR structure (dlen[%d])\n",elts); + exit(3); + } + z->alloc=elts; + z->firstix=0; + z->lastix=0; + z->dat[0]=0; + z->dlen[0]=0; + return z; +} + +void ZStrDest(ZSTR * z) +{ + free(z->dat); + free(z->dlen); + free(z); +} + +void ZStrClear(ZSTR * z) +{ + z->firstix=0; + z->lastix=0; + z->dat[0]=0; + z->dlen[0]=0; +} + +void ZStrBitsIn(uint64_t a, long bits, ZSTR * z) +{ + long clen; + clen=z->dlen[z->lastix]; + if(clen+bits <= 64) + { + z->dat[z->lastix]=(z->dat[z->lastix]<dlen[z->lastix]=clen+bits; + } + else + { + if(z->lastix+1 >= z->alloc) + { + z->alloc=(z->alloc + z->alloc/4 + 2); + z->dat=realloc(z->dat,z->alloc*sizeof(uint64_t)); + if(z->dat==NULL) + { + printf("realloc on ZSTR failed (dat[%d])\n",z->alloc); + exit(4); + } + z->dlen=realloc(z->dlen,z->alloc*sizeof(long)); + if(z->dlen==NULL) + { + printf("realloc on ZSTR failed (dlen[%d])\n",z->alloc); + exit(5); + } + } + z->lastix++; + z->dat[z->lastix]=a; + z->dlen[z->lastix]=bits; + } +} + +uint64_t ZStrBitsOut(ZSTR * z, long bits) +{ + uint64_t s,t; + long slen,wlen; + s=0; + slen=0; + while( slen+z->dlen[z->firstix] <= bits) + { + s<<=z->dlen[z->firstix]; + s+=z->dat[z->firstix]; + slen+=z->dlen[z->firstix]; + if(z->firstix==z->lastix) + { + z->dlen[z->firstix]=0; + z->dat[z->firstix]=0; + return s<<(bits-slen); + } + z->firstix++; + } + wlen=bits-slen; + if(wlen==0) return s; + s<<=wlen; + t=z->dat[z->firstix]>>(z->dlen[z->firstix]-wlen); + s+=t; + z->dat[z->firstix]^=(t<<(z->dlen[z->firstix]-wlen)); + z->dlen[z->firstix]-=wlen; + return s; +} + +uint64_t ZStrBitsPeek(ZSTR * z, long bits) +{ + uint64_t s; + int firstix; + long slen,wlen; + s=0; + slen=0; + firstix=z->firstix; + while( slen+z->dlen[firstix] <= bits) + { + s<<=z->dlen[firstix]; + s+=z->dat[firstix]; + slen+=z->dlen[firstix]; + if(firstix==z->lastix); + return s<<(bits-slen); + firstix++; + } + wlen=bits-slen; + if(wlen==0) return s; + s<<=wlen; + s+=z->dat[z->firstix]>>(z->dlen[z->firstix]-wlen); + return s; +} + +long ZStrLen(ZSTR * z) +{ + long tot; + int i; + tot=0; + for(i=z->firstix;i<=z->lastix;i++) tot+=z->dlen[i]; + return tot; +} + +void ZStrNormalize(ZSTR * z) +{ + while(z->lastix>z->firstix) + { + if(z->dat[z->lastix]!=0) break; + z->lastix--; + } + if(z->dat[z->lastix]==0) + { + z->dlen[z->lastix]=0; + return; + } + while( (z->dat[z->lastix]&1)==0 ) + { + z->dat[z->lastix]>>=1; + z->dlen[z->lastix]--; + } +} + +void ZStrEnc(ZSTR * z, ZCOD * zc, uint64_t a) +{ + int seg; + switch (zc->t) + { + case 1: + for(seg=1;seg<=zc->s;seg++) + if(aX[seg]) break; + seg--; + ZStrBitsIn(a-zc->X[seg]+zc->C[seg],zc->L[seg],z); + return; + case 2: + if(a<=zc->tmax) a=zc->TX[a]; + for(seg=1;seg<=zc->s;seg++) + if(aX[seg]) break; + seg--; + ZStrBitsIn(a-zc->X[seg]+zc->C[seg],zc->L[seg],z); + return; + default: + printf("invalid ZCOD type %d\n",zc->t); + exit(16); + } +} + +uint64_t ZStrDec(ZSTR * z, ZCOD * zc) +{ + int seg; + uint64_t topbit,s; + switch (zc->t) + { + case 1: + topbit=ZStrBitsPeek(z,zc->bits); + seg=zc->SG[topbit]; + s=ZStrBitsOut(z,zc->L[seg]); + return (s-zc->C[seg])+zc->X[seg]; + case 2: + topbit=ZStrBitsPeek(z,zc->bits); + seg=zc->SG[topbit]; + s=ZStrBitsOut(z,zc->L[seg]); + s = (s-zc->C[seg])+zc->X[seg]; + if(s<=zc->tmax) s=zc->UX[s]; + return s; + default: + printf("invalid ZCOD type %d\n",zc->t); + exit(18); + } +} + +uint64_t ZStrXlate(ZCOD * zc, uint64_t a) +{ + if(a<=zc->tmax) return zc->TX[a]; + return a; +} + +uint64_t ZStrUnXl(ZCOD * zc, uint64_t a) +{ + if(a<=zc->tmax) return zc->UX[a]; + return a; +} + +void ZStrLastEnc(ZSTR * z, uint64_t a) +{ + uint64_t b; + long len; + if(a==0) return; + b=a; + len=1; + while(b>1) + { + len++; + b>>=1; + } + a-=b<<(len-1); + ZStrBitsIn(1+(a<<1),len,z); +} + +uint64_t ZStrLastDec(ZSTR * z) +{ + long len; + uint64_t num,x; + len=ZStrLen(z); + + if(len==0) num=0; + else + { + num=ZStrBitsOut(z,len); + x=1; + x<<=len; + num+=x; + } + return (num>>1); +} + +void ZStrCxClear(ZCOD * zc, CTX * ctx) +{ + ctx->x1=0; +} + +void ZStrCxEnc(ZSTR * z, ZCOD * zc, CTX * ctx, uint64_t a) +{ + int seg; + uint64_t b; + switch (zc->t) + { + case 1: + case 2: + ZStrEnc(z,zc,a); + return; + case 3: + b=a-ctx->x1; + ctx->x1=a; + for(seg=1;seg<=zc->s;seg++) + if(bX[seg]) break; + seg--; + ZStrBitsIn(b-zc->X[seg]+zc->C[seg],zc->L[seg],z); + return; + default: + printf("invalid ZCOD type %d\n",zc->t); + exit(17); + } +} + +uint64_t ZStrCxDec(ZSTR * z, ZCOD * zc, CTX * ctx) +{ + int seg; + uint64_t topbit,s; + switch (zc->t) + { + case 1: + case 2: + return ZStrDec(z,zc); + case 3: + topbit=ZStrBitsPeek(z,zc->bits); + seg=zc->SG[topbit]; + s=ZStrBitsOut(z,zc->L[seg]); + s = (s-zc->C[seg])+zc->X[seg]; + ctx->x1+=s; + return ctx->x1; + default: + printf("invalid ZCOD type %d\n",zc->t); + exit(17); + } +} + + +int ZStrMaxLen(ZSTR * z, int fmt) +{ + uint64_t x; + if(fmt==2) x=15; + else + { + printf("unknown format %d in ZStrMaxLen\n",fmt); + exit(33); + } + return 1+(ZStrLen(z)/x); +} + +int ZStrExtract(ZSTR * z, void * x, int fmt) +{ + uint16_t * x2; + uint64_t s; + int len; + int words; + words=1; + if(fmt==2) + { + x2=(uint16_t *)x; + ZStrNormalize(z); + len=ZStrLen(z); + while(len>14) + { + words++; + s=ZStrBitsPeek(z,15); + if( (s&1)==1 ) + { + s=ZStrBitsOut(z,15); + *(x2++)=1+(s<<1); + len-=15; + } + else + { + s=ZStrBitsOut(z,16); + *(x2++)=1+s; +/* next line looks unsafe, but all non-zero z-strings have */ +/* their last bit 1, so if length is 15, previous case applies */ + len-=16; + } + } + s=ZStrBitsOut(z,14); + *x2 = s<<2; + return words; + } + printf("Format %d not known in ZStrExtract\n",fmt); + return 0; +} + +void ZStrInsert(ZSTR * z, void * x, int fmt) +{ + uint16_t * x2; + uint64_t s; + if(fmt==2) + { + x2=(uint16_t *)x; + ZStrClear(z); + while(1) + { + s=*(x2++); + if( (s&3)==0 ) + { + ZStrBitsIn(s>>2,14,z); + ZStrNormalize(z); + return; + } + if( (s&3)==3 ) + ZStrBitsIn(s>>1,15,z); + else + ZStrBitsIn(s-1,16,z); + } + } + printf("Format %d not known in ZStrExtract\n",fmt); +} + +int ZStrExtLen(void * x, int fmt) +{ + uint16_t * w; + int len; + w = (uint16_t *) x; + len=1; + while(((*(w++))&3)!=0) len++; + return len; +} + +STEX * ZStrSTCons(int fmt) +{ + STEX * st; + int i; + st=malloc(sizeof(STEX)); + if(st==NULL) + { + printf("malloc failed on STEX structure\n"); + exit(51); + } + st->pst=malloc(1281*sizeof(uint16_t *)); + if(st->pst==NULL) + { + printf("malloc failed on STEX pst\n"); + exit(52); + } + st->ptp=malloc(1281*sizeof(uint16_t *)); + if(st->ptp==NULL) + { + printf("malloc failed on STEX ptp\n"); + exit(53); + } + st->mal=malloc(1281*sizeof(uint64_t)); + if(st->mal==NULL) + { + printf("malloc failed on STEX mal\n"); + exit(55); + } + st->stcnt=malloc(1281*sizeof(uint64_t)); + if(st->stcnt==NULL) + { + printf("malloc failed on STEX stcnt\n"); + exit(56); + } + for(i=0;i<1281;i++) + st->mal[i]=0; + for(i=0;i<6;i++) st->inuse[i]=0; + st->listm=0; + return st; +} + +void ZStrSTDest(STEX * st) +{ + int i; + for(i=0;i<1281;i++) + if(st->mal[i]!=0) free(st->pst[i]); + if(st->listm!=0) free(st->list); + free(st->pst); + free(st->ptp); + free(st->mal); + free(st->stcnt); + free(st); +} + +int ZStrExtCompare(void * a, void * b, int fmt) +{ + uint16_t *a1, *b1; + a1=(uint16_t *) a; + b1=(uint16_t *) b; + while(1) + { + if((*a1) < (*b1)) return -1; + if((*a1) > (*b1)) return 1; + if(((*a1)&3)==0) + { + if(((*b1)&3)==0) + return 0; + return -1; + } + if(((*b1)&3)==0) + return 1; + a1++; + b1++; + } +} + +typedef struct +{ + STEX * st; + uint16_t pq[256]; + uint16_t ch[128]; +} SICH; +#define DEBUG + +#ifdef DEBUG + +void dumpheap(SICH * si) +{ + STEX * st; + int i,dat,ch; + st=si->st; + for(i=1;i<=50;i++) + { + dat=0xABCD; + if(i<128) ch=si->ch[i]; + else ch=-1; + if(si->pq[i]<1280) dat=*(st->pst[si->pq[i]]); + printf("nd %3d pq %3d ch %3d dt %x\n", + i,si->pq[i],ch,dat); + } +} + +#endif + +/* the first letter of variables are used . . . */ + +/* h int 1-255 index si (pq,ch) heap numbers */ +/* si look them up in SICH pq and you get */ + +/* s uint16_t 0-1278 index st. slot numbers */ +/* st look them up in STEX st (pst,etc) */ + +#define EXPIRED 10000 + +static void pqadvance(SICH * si, int htop) +{ + uint16_t snode; + STEX * st; + st=si->st; + snode=si->pq[htop]; + st->stcnt[snode]--; + if(st->stcnt[snode]==0) + { + si->pq[htop]=EXPIRED; + return; + } + while((*(st->ptp[snode])&3)!=0) st->ptp[snode]++; + st->ptp[snode]++; + return; +} + +static int heapcomp(SICH * si, int ha, int hb) +{ + STEX * st; + int r; + uint16_t *wa,*wb; + st=si->st; + if(si->pq[hb]==EXPIRED) return -1; + if(si->pq[ha]==EXPIRED) return 1; + wa=st->ptp[si->pq[ha]]; + wb=st->ptp[si->pq[hb]]; + r= ZStrExtCompare((void*)wa,(void*)wb,2); + return r; +} + +/* v int 0-7 index spath. level of operation */ +/* spath look them up in spath to get an h */ + +static void adjtop(SICH * si, int htop) +{ + int spath[8]; /* h = spath(v) */ + int vlev; + int hcur,hpar,hsib; /* 1-255 heap points */ + int r; + uint16_t temp; + vlev=0; + hcur=htop; + while(1) /* loop over all strings to insert */ + { +/* populate the special path */ + while(1) + { + spath[vlev]=hcur; + if(hcur>=128) break; + if( (si->pq[hcur]==EXPIRED) && (hcur!=htop) ) break; + hcur=2*hcur+si->ch[hcur]; + vlev++; + } + while(1) /* find the correct place to put hcur */ + { + if(vlev==0) return; + r = heapcomp(si,htop,hcur); + if(r!=-1) break; + vlev--; + hcur=spath[vlev]; + } + if(r==1) while(1) /* bump up */ + { + if(vlev==0) return; + hpar=spath[vlev-1]; + hsib=hcur^1; + r = heapcomp(si,htop,hsib); + if(r==0) break; + if(r==1) si->ch[hpar]^=1; + temp=si->pq[hcur]; + si->pq[hcur]=si->pq[htop]; + si->pq[htop]=temp; + vlev--; + hcur=spath[vlev]; + } + pqadvance(si, htop); + } +} + +/* Return pointer to last string <= x */ + +void * ZStrSTFind(STEX * st, void * x) +{ + uint16_t *wx, *w3, *w1, *w2; + int i; + if(st->listw==0) return NULL; /* list is empty */ + wx = (uint16_t *) x; + w1=st->list; /* very first word */ + w3=w1+st->listw-2; /* just before last word */ + i=ZStrExtCompare( (void*)w1, (void*)wx,2); + if(i>0) return NULL; /* first word bigger */ + while(w3>=w1) + { + if(((*w3)&3)==0) break; + w3--; + } + w3++; /* first word of last string */ +/* x1 and x3 point to first and last string */ + while(w1!=w3) + { + w2=w1+(w3-w1)/2; + while(w2>=w1) + { + if(((*w2)&3)==0) break; + w2--; + } + w2++; + if(w2==w1) /* no earlier start - try later */ + { + w2=w1+(w3-w1)/2; + while(w2=w3) return w1; + } + i=ZStrExtCompare( (void*)w2, (void*)wx,2); + if(i>0) w3=w2; + else w1=w2; + } + return w1; +} + +static void merge(STEX * st, int layer) +{ + uint16_t sfst,slst,snpl,ssc,i; + uint16_t *wout, *w1; + SICH si; + size_t mem; + int hcur,r; + if(st->inuse[layer]==0) return; + si.st=st; + sfst=256*layer; + slst=sfst+st->inuse[layer]; /* one more than last */ + snpl=256*(layer+1)+st->inuse[layer+1]; /* new place */ + hcur=1; + mem=0; + for(i=sfst;iptp[i]-st->pst[i])*sizeof(uint16_t); + st->ptp[i]=st->pst[i]; + si.pq[hcur++]=i; + } + while(hcur<256) si.pq[hcur++]=EXPIRED; + + if(mem>st->mal[snpl]) + { + if(st->mal[snpl]!=0) free(st->pst[snpl]); + st->pst[snpl]=malloc(mem); + if(st->pst[snpl]==NULL) + { + printf("malloc in merge failed\n"); + exit(44); + } + st->mal[snpl]=mem; + } + st->stcnt[snpl]=0; + hcur=127; + while(hcur>=1) + { + r=0; + while(r==0) + { + r=heapcomp(&si,2*hcur,2*hcur+1); + if(r!=0) break; + pqadvance(&si,2*hcur); + adjtop(&si,2*hcur); /* bugfix added */ + } + if(r==-1) si.ch[hcur]=0; + else si.ch[hcur]=1; + adjtop(&si,hcur); + hcur--; + } + + wout=st->pst[snpl]; + while(si.pq[1]!=EXPIRED) + { + ssc=si.pq[1]; + w1=st->ptp[ssc]; + while(((*w1)&3)!=0) *(wout++) =*(w1++); + *(wout++) =*(w1++); + st->ptp[ssc]=w1; + st->stcnt[ssc]--; + if(st->stcnt[ssc]==0) + si.pq[1]=EXPIRED; + adjtop(&si,1); + st->stcnt[snpl]++; + } + st->ptp[snpl]=wout; + st->inuse[layer]=0; + st->inuse[layer+1]++; + if(st->inuse[layer+1] == 255) merge(st,layer+1); +} + +void ZStrSTAppend(STEX * st, ZSTR * z) +{ + size_t len; + int sno; + len=ZStrMaxLen(z,2)*sizeof(uint16_t); + sno=st->inuse[0]; + if(len>st->mal[sno]) + { + if(st->mal[sno]!=0) free(st->pst[sno]); + st->pst[sno]=malloc(len); + if(st->pst[sno]==NULL) + { + printf("malloc in Append failed\n"); + exit(45); + } + st->mal[sno]=len; + } + len=ZStrExtract(z,st->pst[sno],2); + st->ptp[sno]=st->pst[sno]+len; + st->stcnt[sno]=1; + st->inuse[0]++; + if(st->inuse[0]>=255) merge(st,0); +} + +void ZStrSTSort(STEX * st) +{ + int lev,lev2,mxlev; + uint16_t sans; + lev=0; + mxlev=0; + while(lev<6) + { +/* check to find maximum level */ + for(lev2=0;lev2<6;lev2++) + if(st->inuse[lev2]!=0) mxlev=lev2; + if( (lev==mxlev) && (st->inuse[lev]==1) ) break; + merge(st,lev); + lev++; + continue; + } + if(st->listm!=0) free (st->list); + if(st->inuse[lev]==0) /* nothing there at all! */ + { + st->listw=0; + st->listm=0; + return; + } + sans=256*lev; + st->list=st->pst[sans]; + st->listw=st->ptp[sans]-st->pst[sans]; + st->listm=st->mal[sans]; + st->cnt=st->stcnt[sans]; + st->mal[sans]=0; +} + +TUBER * ZStrTuberCons(size_t size, int options) +{ + TUBER * t; + int i; + t=malloc(sizeof(TUBER)); + if(t==NULL) + { + printf("failed to malloc TUBER struct\n"); + exit(34); + } +/* compute number of K-keys per word from options */ + i=options&7; + t->kperw=0; + if(i==1) t->kperw=8; + if(i==2) t->kperw=4; + if(i==3) t->kperw=2; + if(i==4) t->kperw=1; + if(t->kperw == 0) + { + printf("Invalid options field in ZStrTuberCons\n"); + exit(35); + } +/* compute maximum K-key from suggested size */ + t->kmax=(size*8)/t->kperw; + t->kmax--; + if( (t->kmax%2) == 0) t->kmax--; + while(1) + { + t->kmax+=2; + for(i=3;i<47;i++) + if( (t->kmax%i)==0) break; + if(i==47) break; + } + t->wct = (t->kmax+t->kperw-1)/t->kperw; + t->tiptop=t->wct*t->kperw; + t->tub = malloc(8*t->wct); + if(t->tub == NULL) + { + printf("Unable to malloc tuber data\n"); + exit(36); + } + for(i=0;iwct;i++) t->tub[i]=0x8000000000000000ll; + t->lenlen=3; + t->mult=8; + if(t->kperw==2) + { + t->lenlen=4; + t->mult=16; + } + if(t->kperw==1) + { + t->lenlen=5; + t->mult=32; + } + return t; +} + +void ZStrTuberDest(TUBER * t) +{ + free(t->tub); + free(t); +} + +typedef struct +{ + TUBER * tub; + uint64_t curw; /* up on tub->tub */ + long curb; /* 0-62 */ + long hdrlen; +} CuR; + +static void copycur(CuR * c1, CuR * c2) +{ + c2->tub=c1->tub; + c2->curw=c1->curw; + c2->curb=c1->curb; +} + +static uint64_t getbits(CuR * cur, long bits) +{ + uint64_t got,got1; + uint64_t one; + long newbits; + uint64_t x; + TUBER * tub; + + one=1; + tub=cur->tub; + if(bits+cur->curb < 63) + { + got=tub->tub[cur->curw]; + cur->curb+=bits; + got>>=(63-cur->curb); + } + else + { + got=tub->tub[cur->curw]; + newbits=bits+cur->curb-63; + cur->curb=newbits; + cur->curw++; + if(cur->curw>=tub->wct) cur->curw=0; + got1=tub->tub[cur->curw]; + got1<<=1; +/* bugfix shift of 64 not happening */ + if(newbits!=0) + got=(got<>(64-newbits)); + } + x = got&((one<tub; + cur->curw+=(bits/63); + cur->curb+=(bits%63); + if(cur->curb>62) + { + cur->curw++; + cur->curb-=63; + } + while(cur->curw>=t->wct) cur->curw-=t->wct; +} + +static void putbits(CuR * cur, uint64_t data, long bits) +{ + TUBER * tub; + uint64_t x1,x2; + uint64_t one; + long newbits; + tub=cur->tub; + one=1; + x2=(one<<(63-cur->curb))-one; /* mask for ~old bits */ + if(bits+cur->curb < 63) + { + x1=(one<<(63-cur->curb-bits))-one; + x2=x1^x2; /* new bits mask */ + x1=~x2; /* old bits mask */ + x1=x1&tub->tub[cur->curw]; /*old bits (inc. top one) */ + tub->tub[cur->curw]=x1+((data<<(63-cur->curb-bits))&x2); + cur->curb+=bits; + return; + } + x1=~x2; + x1=x1&tub->tub[cur->curw]; /* old bits */ + newbits=cur->curb+bits-63; + tub->tub[cur->curw]=x1+(data>>(newbits)); + cur->curw++; + if(cur->curw>=tub->wct) cur->curw=0; + cur->curb=newbits; + x1=((one<<(63-newbits))-one)|0x8000000000000000; /* keep these */ + x2=tub->tub[cur->curw]&x1; + tub->tub[cur->curw]=x2+((data<<(63-newbits))&(~x1)); + return; +} + +static long gethdr(CuR * cur) +{ + TUBER * t; + long dlen; + uint64_t h; + t=cur->tub; + h=getbits(cur,t->lenlen+1); + cur->hdrlen=t->lenlen+1; + dlen=h; + dlen-=2; + if(h<3) return dlen; + dlen=0; + while( (h>>t->lenlen)!=0 ) + { + h-=t->mult; + h=(h<<1)+getbits(cur,1); + cur->hdrlen++; + dlen+=t->mult; + } + dlen+=h; + dlen-=2; + return dlen; +} + +void ZStrTuberStats(TUBER * t, int query, uint64_t * stats) +{ + stats[0]=t->kmax; + stats[1]=t->wct*8; + stats[2]=200; +} + +typedef struct +{ + TUBER * tub; + uint64_t first; + uint64_t last; + uint64_t words; +} BlK; + +/* Set cur to point to the wanted string */ +static void locate(TUBER * t, uint64_t kkey, BlK * blk, CuR * cur) +{ + uint64_t curkkey; + long dlen; +/* fill in the BlK structure with first, last and number of words */ + blk->last = blk->first = kkey/t->kperw; + blk->words=1; + blk->tub=t; + if(blk->first>0) blk->first--; + else blk->first=t->wct-1; + while( (t->tub[blk->first]>>63)==0) + { + if(blk->first>0) blk->first--; + else blk->first=t->wct-1; + blk->words++; + } + blk->first++; + if(blk->first >= t->wct) blk->first=0; + while( (t->tub[blk->last]>>63)==0) + { + blk->last++; + if(blk->last >= t->wct) blk->last=0; + blk->words++; + } +/* set the CuR structure to point to the required string */ + cur->tub=t; + cur->curw=blk->first; + cur->curb=0; + curkkey=blk->first*t->kperw; + while(curkkey!=kkey) + { + dlen=gethdr(cur); + if(dlen>0)skipbits(cur,dlen); + curkkey++; +/* bugfixed in tuber - wraparound */ + if(curkkey==t->kperw*t->wct) curkkey=0; + } +} + +/* grabs specified number of kkeys from cur*/ +long grabrest(CuR * cur, BlK * blk, uint64_t kkeys, ZSTR * z) +{ + uint64_t i,b; + long j,k,freeb; + TUBER * t; + CuR cur1; + t = blk->tub; + for(i=0;i63) + { + b=getbits(cur,63); + ZStrBitsIn(b,63,z); + k-=63; + } + b=getbits(cur,k); + ZStrBitsIn(b,k,z); + while(j>63) + { + b=getbits(cur,63); + ZStrBitsIn(b,63,z); + j-=63; + } + if(j>0) + { + b=getbits(cur,j); + ZStrBitsIn(b,j,z); + }; + } + freeb=63-cur->curb; + while(cur->curw!=blk->last) + { + freeb+=63; + cur->curw++; + if(cur->curw>=t->wct) cur->curw=0; + } + return freeb; +} + +static long blkfuse(BlK * blk, CuR * cur, ZSTR * z) +{ + TUBER * t; + uint64_t kkeys; + t=blk->tub; + blk->last++; + if(blk->last >= t->wct) blk->last=0; + cur->curw=blk->last; + cur->curb=0; + blk->words++; + kkeys=t->kperw; + while( (t->tub[blk->last]>>63)==0) + { + blk->last++; + if(blk->last >= t->wct) blk->last=0; + blk->words++; + kkeys+=t->kperw; + } + return grabrest(cur,blk,kkeys,z); +} + +void movebits(ZSTR * z, long bits, CuR * cur) +{ + uint64_t j; + long bt; + bt=bits; + while(bt>60) + { + j=ZStrBitsOut(z,60); + bt-=60; + putbits(cur,j,60); + } + j=ZStrBitsOut(z,bt); + putbits(cur,j,bt); +} + + +int ZStrTuberRead(TUBER * t, uint64_t kkey, ZSTR * z) +{ + long i; + uint64_t j; + BlK blk; + CuR cur; + locate(t,kkey,&blk,&cur); + i=gethdr(&cur); + if(i==-2) return 1; + ZStrClear(z); + if(i==-1) return 0; + while(i>60) + { + j=getbits(&cur,60); + ZStrBitsIn(j,60,z); + i-=60; + } + if(i>0) + { + j=getbits(&cur,i); + ZStrBitsIn(j,i,z); + } + ZStrBitsIn(1,1,z); + return 0; +} + +uint64_t ZStrTuberIns(TUBER * t, uint64_t d1, uint64_t d2) +{ + BlK blk; + CuR cur,cur1; + uint64_t kkey,keyb; + int i; +/* first find a keyb that works */ + for(keyb=0;keyb<65536;keyb++) + { + kkey=ZStrTuberK(t,d1,d2,keyb); + locate(t,kkey,&blk, &cur); + copycur(&cur,&cur1); + i=gethdr(&cur); + if(i==-2) break; + } + if(keyb==65536) return INSFAIL; +/* equal size so change from key-not-found to zero */ + putbits(&cur1,1,(cur.tub)->lenlen+1); + return keyb; +} + +int ZStrTuberUpdate(TUBER * t, uint64_t kkey, ZSTR * z) +{ + BlK blk; + CuR cur; + CuR cur1; + ZSTR * z1; + long i1,i2,i3,j,k,b1,sparebits,bitlen; + int i; + uint64_t kkeys; + uint64_t w,m1,m2; + int fuseflag; + locate(t,kkey,&blk, &cur); + copycur(&cur,&cur1); + i1=gethdr(&cur1); + if(i1<0) + i1=0; + i3=i1; + i1+=cur1.hdrlen; /* current total length in tuber */ + j=ZStrLen(z); + k=j+1; + b1=0; + while(k>=(t->mult)) + { + b1++; + k-=t->mult; + } +/* so b1 is the number of 1-bits in the header */ +/* and k is the value of the remainder of the header bits */ +/* and j is the length of the z-string part (inc. last 1) */ + i2=b1+j+t->lenlen; + if(j==0) i2++; +/* so now i2 is the new length */ + if(i2==i1) /* same length case */ + { + for(i=0;ilenlen); + if(j>1)movebits(z,j-1,&cur); + return 0; + } + skipbits(&cur1,i3); + kkeys=((blk.last+1)*t->kperw)-1; + if(kkeys>=kkey) kkeys=kkeys-kkey; + else kkeys=t->tiptop+kkeys-kkey; + z1=ZStrCons(kkeys/t->wct+7); /* first shot */ + sparebits=grabrest(&cur1,&blk,kkeys,z1); + fuseflag=0; + while(sparebits+i1 (t->wct/3)) return 1; + } + sparebits=sparebits+i1-i2; + if(fuseflag==1) + { + m1=0x7fffffffffffffffull; + m2=0x8000000000000000ull; + w=blk.first; + while(w!=blk.last) + { + t->tub[w]&=m1; + w++; + if(w>=t->wct) w=0; + } + t->tub[w]|=m2; + } + bitlen=ZStrLen(z); + for(i=0;ilenlen); + if(j>1) movebits(z,j-1,&cur); + bitlen=ZStrLen(z1); + movebits(z1,bitlen,&cur); + ZStrClear(z1); + movebits(z1,sparebits,&cur); + return 0; +} + +void ZStrTuberDelete(TUBER * t, uint64_t kkey) +{ + BlK blk; + CuR cur; + CuR cur1; + ZSTR * z; + long i1,bitlen; + uint64_t kkeys; + + locate(t,kkey,&blk, &cur); + copycur(&cur,&cur1); + i1=gethdr(&cur1); + + skipbits(&cur1,i1); + kkeys=((blk.last+1)*t->kperw)-1; + if(kkeys>=kkey) kkeys=kkeys-kkey; + else kkeys=t->tiptop+kkeys-kkey; + z=ZStrCons(kkeys/t->wct+7); /* about right */ + grabrest(&cur1,&blk,kkeys,z); + bitlen=ZStrLen(z); /* probably should compute in grabrest */ + putbits(&cur,0,t->lenlen+1); /* put in key-not-present */ + movebits(z,bitlen,&cur); +} + +/* end of zstr.c */ diff --git a/arangod/FulltextIndex/zstr.h b/arangod/FulltextIndex/zstr.h new file mode 100644 index 0000000000..de57938633 --- /dev/null +++ b/arangod/FulltextIndex/zstr.h @@ -0,0 +1,107 @@ +/* zstr.h - header file for the z-string module */ +/* R. A. Parker 3.5.2012 */ + +typedef struct +{ + uint64_t * dat; + long * dlen; + int alloc; + int firstix; + int lastix; +} ZSTR; + +ZSTR * ZStrCons(int elts); +void ZStrDest(ZSTR * z); +void ZStrClear(ZSTR * z); +void ZStrBitsIn(uint64_t a, long bits, ZSTR * z); +uint64_t ZStrBitsOut(ZSTR * z, long bits); +uint64_t ZStrBitsPeek(ZSTR * z, long bits); +long ZStrLen(ZSTR * z); +void ZStrNormalize(ZSTR * z); + +typedef struct +{ + int t; /* code type */ + int s; /* segments */ + int tmax; /* Top to translate */ + int bits; /* that determine len */ + uint64_t * X; /* first of segment */ + uint64_t * C; /* code added */ + uint8_t * L; /* length in bits */ + uint8_t * SG; /* segment for top bits */ + uint8_t * TX; /* translate table */ + uint8_t * UX; /* untranslate table */ +} ZCOD; + +void ZStrEnc(ZSTR * z, ZCOD * zc, uint64_t a); +uint64_t ZStrDec(ZSTR * z, ZCOD * zc); +uint64_t ZStrXlate(ZCOD * zc, uint64_t a); +uint64_t ZStrUnXl(ZCOD * zc, uint64_t a); +void ZStrLastEnc(ZSTR * z, uint64_t a); +uint64_t ZStrLastDec(ZSTR * z); + +typedef struct +{ + uint64_t x1; +} CTX; + +void ZStrCxClear(ZCOD * zc, CTX * ctx); +void ZStrCxEnc(ZSTR * z, ZCOD * zc, CTX * ctx, uint64_t a); +uint64_t ZStrCxDec(ZSTR * z, ZCOD * zc, CTX * ctx); + + +int ZStrMaxLen(ZSTR * z, int fmt); +int ZStrExtract(ZSTR * z, void * x, int fmt); +void ZStrInsert(ZSTR * z, void * x, int fmt); +int ZStrExtCompare(void * x, void * y, int fmt); +int ZStrExtLen(void * x, int fmt); + +typedef struct +{ + uint16_t ** pst; /* 1281 pointers to start */ + uint16_t ** ptp; /* 1281 pointers to top */ + uint64_t * mal; /* 1281 number of bytes allocated */ + uint64_t * stcnt; /* 1281 number of strings in clump */ + uint16_t inuse[6]; + uint16_t * list; /* final list */ + uint64_t listw; /* number of uint16s in final list */ + uint64_t listm; /* number of uint16's malloc'd */ + uint64_t cnt; /* number if strings in list */ +} STEX; + +STEX * ZStrSTCons(int fmt); +void ZStrSTDest(STEX * st); +void ZStrSTAppend(STEX * st, ZSTR * z); +void ZStrSTSort(STEX * st); +void * ZStrSTFind(STEX * st, void * x); + +typedef struct +{ + uint64_t kperw; /* K keys per word */ + uint64_t kmax; /* (prime) number of keys */ + uint64_t tiptop; /* number of spaces in tuber */ + uint64_t wct; /* number of 64-bit words */ + long lenlen; /* length of length string */ + uint64_t mult; /* length bits per initial 1-bit */ + uint64_t * tub; /* tuber data pointer */ +} TUBER; + +#define TUBER_BITS_8 1 +#define TUBER_BITS_16 2 +#define TUBER_BITS_32 3 +#define TUBER_BITS_64 4 + +TUBER * ZStrTuberCons(size_t size, int options); +void ZStrTuberDest(TUBER * t); +void ZStrTuberStats(TUBER * t, int query, uint64_t * stats); +int ZStrTuberRead(TUBER * t, uint64_t kkey, ZSTR * z); +int ZStrTuberUpdate(TUBER * t, uint64_t kkey, ZSTR * z); +void ZStrTuberDelete(TUBER * t, uint64_t kkey); +#define INSFAIL 128000 +uint64_t ZStrTuberIns(TUBER * t, uint64_t d1, uint64_t d2); +uint64_t ZStrTuberK(TUBER * t, uint64_t d1, + uint64_t d2, uint64_t keyb); + +/* end of zstr.h */ + + diff --git a/arangod/FulltextIndex/zstrreg.c b/arangod/FulltextIndex/zstrreg.c new file mode 100644 index 0000000000..ca31f8d59c --- /dev/null +++ b/arangod/FulltextIndex/zstrreg.c @@ -0,0 +1,488 @@ +/* zstring regression program */ +/* R. A. Parker 15.11.2012 */ + +#include +#include +#include "zstr.h" + +int err; + +void ckint(int x, int was, int shdbe) +{ + if(was==shdbe) return; + err++; + printf("Error %d, was %x (%d), should be %x\n",x,was,was,shdbe); +} + +void ZDUMP(ZSTR * z) +{ + int i; + printf("alloc %d firstix %d lastix %d\n", + z->alloc,z->firstix,z->lastix); + for(i=z->firstix;i<=z->lastix;i++) + printf("ix %d, val %16llx length %d\n", + i,(unsigned long long)z->dat[i],(int)z->dlen[i]); +} + +void TUBDUMP(TUBER * t) +{ + long i1,i2,i3,i4,i5,i6; + int i; + long long ff; + i1=t->kperw; + i2=t->kmax; + i3=t->wct; + i4=t->tiptop; + i5=t->lenlen; + i6=t->mult; + printf("kperw %ld, kmax %ld, wct %ld, ", + i1,i2,i3); + printf("tiptop %ld, lenlen %ld, mult %ld\n", + i4,i5,i6); + for(i=0;itub[i]; + printf("%16llx ",ff); + if(i%5==4) printf("\n"); + } + if((i%5)!=0) printf("\n"); +} + +int main(int argc, char ** argv) +{ + uint16_t y[10]; +/* first test code 0xx 10xxx 11xxxx */ +/* 0-3 4-11 12-27 */ + + uint64_t tx1[]={0,4,12,28}; + uint64_t tc1[]={0,0x10,0x30}; + uint8_t tl1[]={3,5,6}; + uint8_t tsg1[]={0,0,1,2}; + ZCOD zc1 = {1,3,0,2,tx1,tc1,tl1,tsg1,tl1,tl1}; + +/* second test code 0xx 10xxx 11xxxx */ +/* 0-3 4-11 12-27 */ +/* after translation 0 1 2 3 4 5 6 */ +/* goes to 4 5 0 2 1 6 3 */ + + uint64_t tx2[]={0,4,12,28}; + uint64_t tc2[]={0,0x10,0x30}; + uint8_t tl2[]={3,5,6}; + uint8_t tsg2[]={0,0,1,2}; + uint8_t ttx2[]={4,5,0,2,1,6,3}; + uint8_t tux2[]={2,4,3,6,0,1,5}; + ZCOD zc2 = {2,3,6,2,tx2,tc2,tl2,tsg2,ttx2,tux2}; + +/* third test code 0xx 10xxx 11xxxx */ +/* with delta 0-3 4-11 12-27 */ + + uint64_t tx3[]={0,4,12,28}; + uint64_t tc3[]={0,0x10,0x30}; + uint8_t tl3[]={3,5,6}; + uint8_t tsg3[]={0,0,1,2}; + ZCOD zc3 = {3,3,0,2,tx3,tc3,tl3,tsg3,tl3,tl3}; + + STEX * st; + uint16_t sw0[]={0x0000}; + uint16_t sw2[]={0xFFFC}; + + TUBER * t1; + uint64_t stats[20]; + + ZSTR * z1; + CTX ctx; + uint64_t i,j,k; + uint64_t nokeys; + long len; + uint64_t d1,d2; + uint64_t b0,b1,b2,b3,b4,b5,b6,b7; + uint64_t k0,k1,k2,k3,k4,k5,k6,k7; + uint16_t * fw1; + int q; + err=0; + +/* */ +/* 001 - 020 First batch to just exercise the simple */ +/* bit handling routines a little */ + + z1=ZStrCons(3); + ckint(1,z1->alloc,3); /* did it allocate OK */ + len=ZStrLen(z1); + ckint(2,len,0); /* len=0 at start */ + ZStrBitsIn(0x05A792,24,z1); + len=ZStrLen(z1); + ckint(3,len,24); /* len=24 now */ + ZStrBitsIn(0xF,4,z1); + len=ZStrLen(z1); + ckint(4,len,28); /* len=28 now */ + j=ZStrBitsPeek(z1,16); + ckint(5,j,0x05A7); /* first 16 bits */ + len=ZStrLen(z1); + ckint(6,len,28); /* length the same */ + j=ZStrBitsOut(z1,8); + ckint(7,j,0x05); /* first 8 bits */ + len=ZStrLen(z1); + ckint(8,len,20); /* length 20 now */ + j=ZStrBitsPeek(z1,28); + ckint(9,j,0xA792F00); /* first 28 bits! */ + j=ZStrBitsOut(z1,12); + ckint(10,j,0xA79); /* last 12 bits */ + len=ZStrLen(z1); + ckint(11,len,8); /* length 8 now */ + ZStrBitsIn(0xC0,8,z1); + len=ZStrLen(z1); + ckint(12,len,16); /* length 16 */ + j=ZStrBitsPeek(z1,16); + ckint(13,j,0x2FC0); /* 0x2FC0 -> Normalize */ + ZStrNormalize(z1); + j=ZStrBitsPeek(z1,16); + ckint(14,j,0x2FC0); /* still 0x2FC0 */ + len=ZStrLen(z1); + ckint(15,len,10); /* length 11 now */ + ZStrClear(z1); + len=ZStrLen(z1); + ckint(16,len,0); /* length 0 now */ + j=ZStrBitsPeek(z1,28); + ckint(17,j,0); /* last 28 bits all 0 */ + j=ZStrBitsOut(z1,12); + ckint(18,j,0); /* last 12 bits all 0 */ + ZStrDest(z1); + +/* */ +/* 0021 - 039 Next batch to test basic Enc/Decode */ +/* */ +/* test code 0xx 10xxx 11xxxx */ +/* 0-3 4-11 12-27 (28+ illegal) */ + + z1=ZStrCons(3); + ZStrCxEnc(z1,&zc1,&ctx,3); + len=ZStrLen(z1); + ckint(21,len,3); /* length 3 now */ + j=ZStrBitsPeek(z1,5); + ckint(22,j,0xC); /* 011 00 */ + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(23,j,3); + len=ZStrLen(z1); + ckint(24,len,0); /* length 0 now */ + ZStrClear(z1); + ZStrCxEnc(z1,&zc1,&ctx,27); /* put in limit values */ + ZStrCxEnc(z1,&zc1,&ctx,4); + ZStrCxEnc(z1,&zc1,&ctx,3); + ZStrCxEnc(z1,&zc1,&ctx,12); + ZStrCxEnc(z1,&zc1,&ctx,11); + ZStrCxEnc(z1,&zc1,&ctx,0); + len=ZStrLen(z1); + ckint(25,len,28); /* length should be 28 */ + ZStrNormalize(z1); + len=ZStrLen(z1); + ckint(26,len,25); /* length should be 25 */ + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(27,j,27); + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(28,j,4); + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(29,j,3); + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(30,j,12); + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(31,j,11); + j=ZStrCxDec(z1,&zc1,&ctx); + ckint(32,j,0); + ZStrClear(z1); + j=0; + for(i=0;i<1000;i++) + { + j+=11; + if(j>27) j-=28; + ZStrCxEnc(z1,&zc1,&ctx,j); + } + ZStrNormalize(z1); + j=0; + for(i=0;i<1000;i++) + { + j+=11; + if(j>27) j-=28; + k=ZStrCxDec(z1,&zc1,&ctx); + ckint(33,k,j); + } + len=ZStrLen(z1); + ckint(34,len,0); + ZStrNormalize(z1); + ZStrDest(z1); + +/* */ +/* 0041 - 059 Next batch to test type 2 Enc/Decode */ + +/* second test code 0xx 10xxx 11xxxx */ +/* 0-3 4-11 12-27 */ +/* after translation 0 1 2 3 4 5 6 */ +/* goes to 4 5 0 2 1 6 3 */ + + z1=ZStrCons(3); + ZStrCxEnc(z1,&zc2,&ctx,6); + len=ZStrLen(z1); + ckint(41,len,3); /* length 3 now */ + j=ZStrBitsPeek(z1,5); + ckint(42,j,0xC); /* 011 00 */ + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(43,j,6); + len=ZStrLen(z1); + ckint(44,len,0); /* length 0 now */ + ZStrClear(z1); + ZStrCxEnc(z1,&zc2,&ctx,27); /* put in limit values */ + ZStrCxEnc(z1,&zc2,&ctx,0); /* 4 */ + ZStrCxEnc(z1,&zc2,&ctx,6); /* 3 */ + ZStrCxEnc(z1,&zc2,&ctx,12); + ZStrCxEnc(z1,&zc2,&ctx,11); + ZStrCxEnc(z1,&zc2,&ctx,2); /* 0 */ + len=ZStrLen(z1); + ckint(45,len,28); /* length should be 28 */ + ZStrNormalize(z1); + len=ZStrLen(z1); + ckint(46,len,25); /* length should be 25 */ + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(47,j,27); + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(48,j,0); + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(49,j,6); + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(50,j,12); + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(51,j,11); + j=ZStrCxDec(z1,&zc2,&ctx); + ckint(52,j,2); + ZStrClear(z1); + j=0; + for(i=0;i<1000;i++) + { + j+=11; + if(j>27) j-=28; + ZStrCxEnc(z1,&zc2,&ctx,j); + } + ZStrNormalize(z1); + j=0; + for(i=0;i<1000;i++) + { + j+=11; + if(j>27) j-=28; + k=ZStrCxDec(z1,&zc2,&ctx); + ckint(53,k,j); + } + len=ZStrLen(z1); + ckint(54,len,0); + ZStrNormalize(z1); + ZStrDest(z1); + +/* */ +/* 0060 - 079 Test Xlate and UnXl */ +/* after translation 0 1 2 3 4 5 6 */ +/* goes to 4 5 0 2 1 6 3 */ + + k=ZStrXlate(&zc2,0); + ckint(60,k,4); + k=ZStrXlate(&zc2,1); + ckint(61,k,5); + k=ZStrXlate(&zc2,2); + ckint(62,k,0); + k=ZStrXlate(&zc2,3); + ckint(63,k,2); + k=ZStrXlate(&zc2,4); + ckint(64,k,1); + k=ZStrXlate(&zc2,5); + ckint(65,k,6); + k=ZStrXlate(&zc2,6); + ckint(66,k,3); + k=ZStrXlate(&zc2,7); + ckint(67,k,7); + k=ZStrXlate(&zc2,17); + ckint(68,k,17); + k=ZStrXlate(&zc2,77777); + ckint(69,k,77777); + + k=ZStrUnXl(&zc2,0); + ckint(70,k,2); + k=ZStrUnXl(&zc2,1); + ckint(71,k,4); + k=ZStrUnXl(&zc2,2); + ckint(72,k,3); + k=ZStrUnXl(&zc2,3); + ckint(73,k,6); + k=ZStrUnXl(&zc2,4); + ckint(74,k,0); + k=ZStrUnXl(&zc2,5); + ckint(75,k,1); + k=ZStrUnXl(&zc2,6); + ckint(76,k,5); + k=ZStrUnXl(&zc2,7); + ckint(77,k,7); + k=ZStrUnXl(&zc2,17); + ckint(78,k,17); + k=ZStrUnXl(&zc2,7777); + ckint(79,k,7777); + +/* */ +/* 0080 - 099 Test Enc/Decode of type 3 (delta) code */ +/* */ +/* test code 0xx 10xxx 11xxxx DELTA */ +/* 0-3 4-11 12-27 (28+ illegal) */ + + z1=ZStrCons(3); + ZStrCxClear(&zc3,&ctx); + ZStrCxEnc(z1,&zc3,&ctx,3); + ZStrCxEnc(z1,&zc3,&ctx,5); + ZStrCxEnc(z1,&zc3,&ctx,9); /* 011 010 10000 */ + len=ZStrLen(z1); + ckint(80,len,11); /* length 11 now */ + j=ZStrBitsPeek(z1,10); + ckint(81,j,0x1A8); /* 011 010 1000 */ + ZStrNormalize(z1); + j=ZStrBitsPeek(z1,10); + ckint(82,j,0x1A8); /* 011 010 1000 */ + ZStrCxClear(&zc3,&ctx); + j=ZStrCxDec(z1,&zc3,&ctx); + ckint(83,j,3); + j=ZStrCxDec(z1,&zc3,&ctx); + ckint(84,j,5); + j=ZStrCxDec(z1,&zc3,&ctx); + ckint(85,j,9); + len=ZStrLen(z1); + ckint(86,len,0); /* length 0 now */ + j=ZStrCxDec(z1,&zc3,&ctx); + ckint(87,j,9); + j=ZStrCxDec(z1,&zc3,&ctx); + ckint(88,j,9); + + ZStrClear(z1); + ZStrCxClear(&zc3,&ctx); + j=0; + for(i=0;i<1000;i++) + { + j+=4; + ZStrCxEnc(z1,&zc3,&ctx,j); + } + ZStrNormalize(z1); + ZStrCxClear(&zc3,&ctx); + j=0; + for(i=0;i<1000;i++) + { + j+=4; + k=ZStrCxDec(z1,&zc3,&ctx); + ckint(89,k,j); + } + len=ZStrLen(z1); + ckint(90,len,0); + ZStrDest(z1); +/* */ +/* 100 - 119 Test Extract, Insert and ExtLen */ + z1=ZStrCons(3); + ZStrBitsIn(0xDEADBEEF,32,z1); + len=ZStrLen(z1); + ckint(100,len,32); /* len=32 now */ + len=ZStrMaxLen(z1,2); + ckint(101,len,3); + len=ZStrExtract(z1,(void *)y,2); + ckint(102,len,3); + ckint(103,y[0],0xDEAE); + ckint(104,y[1],0xBEEF); + ckint(105,y[2],0x8000); + ZStrDest(z1); + z1=ZStrCons(5); + ZStrInsert(z1,(void *)y,2); + len=ZStrLen(z1); + ckint(106,len,32); + j=ZStrBitsOut(z1,32); + ckint(107,j,0xDEADBEEF); + ZStrDest(z1); + +/* 200 - 299 - test the codes and hashes */ +/* not yet written */ + +/* 300 - 399 test the tuber things */ + + z1=ZStrCons(3); + for(q=0;q<400;q+=100) + { + if(q==0) t1 = ZStrTuberCons(152,TUBER_BITS_8); + if(q==100) t1 = ZStrTuberCons(152,TUBER_BITS_16); + if(q==200) t1 = ZStrTuberCons(152,TUBER_BITS_32); + if(q==300) t1 = ZStrTuberCons(152,TUBER_BITS_64); + + nokeys=t1->kmax; + d1=nokeys/2; + d2=0; +/* try inserting three items with same keya */ +/* should get keybe as 0, 1 and 2 respectively. */ +/* this relies on three inserts working with keyb*/ +/* coming out as 0, 1 and 2. Change construction*/ +/* size if this doesn't work. */ + b0=ZStrTuberIns(t1,d1,d2); + ckint(300+q,b0,0); + b1=ZStrTuberIns(t1,d1,d2); + ckint(301+q,b1,1); + b2=ZStrTuberIns(t1,d1,d2); + ckint(302+q,b2,2); + k0=ZStrTuberK(t1,d1,d2,b0); + k1=ZStrTuberK(t1,d1,d2,b1); + k2=ZStrTuberK(t1,d1,d2,b2); + ZStrTuberDelete(t1,k0); + + ZStrBitsIn(0xDEAD,16,z1); + ZStrNormalize(z1); + ZStrTuberUpdate(t1,k1,z1); + ZStrClear(z1); + j=ZStrTuberRead(t1,k1,z1); + ckint(303+q,j,0); + len=ZStrLen(z1); + ckint(304+q,len,16); + j=ZStrBitsOut(z1,16); + ckint(305+q,j,0xDEAD); /* get our data back */ + ZStrTuberDest(t1); + } + ZStrDest(z1); +/* */ +/* 700 - 799 STEX testing - sorting the words */ +/* */ + + z1=ZStrCons(3); + st=ZStrSTCons(2); + ZStrBitsIn(0xDB,8,z1); + ZStrSTAppend(st,z1); + ZStrSTSort(st); + fw1=ZStrSTFind(st,(void*) sw0); + if(fw1!=NULL) ckint(704,*fw1,7777); + fw1=ZStrSTFind(st,(void*) sw2); + ckint(705,*fw1,0xDB00); + ZStrSTDest(st); + st=ZStrSTCons(2); + for(i=1;i<100;i++) + { + j=(17*i)%97; + ZStrClear(z1); + ZStrBitsIn(j,8,z1); + ZStrSTAppend(st,z1); + } + ZStrSTSort(st); + ZStrSTDest(st); + ZStrDest(z1); + +/* */ +/* 800 - 810 LastEnc and LastDec testing */ +/* */ + + z1=ZStrCons(5); + for(i=0;i<10000;i++) + { + ZStrClear(z1); + ZStrLastEnc(z1,i); + j=ZStrLastDec(z1); + ckint(800,j,i); + } + ZStrDest(z1); + + + printf("End of z-string regression - %d errors\n",err); + return 0; +} + +/* end of zstring regression module */