Delta-encode terms in interior nodes. While experiments have shown
that this is of marginal utility when encoding terms resulting from
regular English text, it turns out to be very useful when encoding
inputs with very large terms. (CVS 3520)

FossilOrigin-Name: c8151a998ec2423b417566823dc9957c7d5d782c
This commit is contained in:
shess 2006-11-29 01:02:03 +00:00
parent 3590f15775
commit 7e3d0c2d2f
3 changed files with 110 additions and 32 deletions

View File

@ -138,13 +138,19 @@
**
** varint iHeight; (height from leaf level, always >0)
** varint iBlockid; (block id of node's leftmost subtree)
** array {
** varint nTerm; (length of term)
** char pTerm[nTerm]; (content of term)
** optional {
** varint nTerm; (length of first term)
** char pTerm[nTerm]; (content of first term)
** array {
** (further terms are delta-encoded)
** varint nPrefix; (length of shared prefix with previous term)
** varint nSuffix; (length of unshared suffix)
** char pTermSuffix[nSuffix]; (unshared suffix of next term)
** }
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees. The
** subtree blocks are contiguous, so only the first subtree's blockid
@ -3690,7 +3696,8 @@ static void interiorBlockValidate(InteriorBlock *pBlock){
nData -= n;
/* Zero or more terms of positive length */
while( nData!=0 ){
if( nData!=0 ){
/* First term is not delta-encoded. */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
@ -3698,6 +3705,26 @@ static void interiorBlockValidate(InteriorBlock *pBlock){
assert( n+iDummy<=nData );
pData += n+iDummy;
nData -= n+iDummy;
/* Following terms delta-encoded. */
while( nData!=0 ){
/* Length of shared prefix. */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>=0 );
assert( n<nData );
pData += n;
nData -= n;
/* Length and data of distinct suffix. */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0);
assert( n+iDummy<=nData );
pData += n+iDummy;
nData -= n+iDummy;
}
}
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
@ -3710,6 +3737,7 @@ typedef struct InteriorWriter {
InteriorBlock *first, *last;
struct InteriorWriter *parentWriter;
DataBuffer term; /* Last term written to block "last". */
sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
sqlite_int64 iLastChildBlock; /* for consistency checks. */
@ -3735,6 +3763,7 @@ static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
pWriter->last = pWriter->first = block;
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
dataBufferInit(&pWriter->term, 0);
}
/* Append the child node rooted at iChildBlock to the interior node,
@ -3744,10 +3773,28 @@ static void interiorWriterAppend(InteriorWriter *pWriter,
const char *pTerm, int nTerm,
sqlite_int64 iChildBlock){
char c[VARINT_MAX+VARINT_MAX];
int n = putVarint(c, nTerm);
int n, nPrefix = 0;
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
/* The first term written into an interior node is actually
** associated with the second child added (the first child was added
** in interiorWriterInit, or in the if clause at the bottom of this
** function). That term gets encoded straight up, with nPrefix left
** at 0.
*/
if( pWriter->term.nData==0 ){
n = putVarint(c, nTerm);
}else{
while( nPrefix<pWriter->term.nData &&
pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
nPrefix++;
}
n = putVarint(c, nPrefix);
n += putVarint(c+n, nTerm-nPrefix);
}
#ifndef NDEBUG
pWriter->iLastChildBlock++;
#endif
@ -3756,14 +3803,17 @@ static void interiorWriterAppend(InteriorWriter *pWriter,
/* Overflow to a new block if the new term makes the current block
** too big, and the current block already has enough terms.
*/
if( pWriter->last->data.nData+n+nTerm>INTERIOR_MAX &&
if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
pTerm, nTerm);
pWriter->last = pWriter->last->next;
pWriter->iOpeningChildBlock = iChildBlock;
dataBufferReset(&pWriter->term);
}else{
dataBufferAppend2(&pWriter->last->data, c, n, pTerm, nTerm);
dataBufferAppend2(&pWriter->last->data, c, n,
pTerm+nPrefix, nTerm-nPrefix);
dataBufferReplace(&pWriter->term, pTerm, nTerm);
}
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
@ -3785,6 +3835,7 @@ static int interiorWriterDestroy(InteriorWriter *pWriter){
interiorWriterDestroy(pWriter->parentWriter);
free(pWriter->parentWriter);
}
dataBufferDestroy(&pWriter->term);
SCRAMBLE(pWriter);
return SQLITE_OK;
}
@ -3841,13 +3892,14 @@ static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
/****************************************************************/
/* InteriorReader is used to read off the data from an interior node
** (see comment at top of file for the format). InteriorReader does
** not own its data, so interiorReaderDestroy() is a formality.
** (see comment at top of file for the format).
*/
typedef struct InteriorReader {
const char *pData;
int nData;
DataBuffer term; /* previous term, for decoding term delta. */
sqlite_int64 iBlockid;
} InteriorReader;
@ -3857,7 +3909,7 @@ static void interiorReaderDestroy(InteriorReader *pReader){
static void interiorReaderInit(const char *pData, int nData,
InteriorReader *pReader){
int n;
int n, nTerm;
/* Require at least the leading flag byte */
assert( nData>0 );
@ -3870,10 +3922,25 @@ static void interiorReaderInit(const char *pData, int nData,
assert( 1+n<=nData );
pReader->pData = pData+1+n;
pReader->nData = nData-(1+n);
/* A single-child interior node (such as when a leaf node was too
** large for the segment directory) won't have any terms.
** Otherwise, decode the first term.
*/
if( pReader->nData==0 ){
dataBufferInit(&pReader->term, 0);
}else{
n = getVarint32(pReader->pData, &nTerm);
dataBufferInit(&pReader->term, nTerm);
dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
assert( n+nTerm<=pReader->nData );
pReader->pData += n+nTerm;
pReader->nData -= n+nTerm;
}
}
static int interiorReaderAtEnd(InteriorReader *pReader){
return pReader->nData<=0;
return pReader->term.nData==0;
}
static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
@ -3881,26 +3948,37 @@ static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
}
static int interiorReaderTermBytes(InteriorReader *pReader){
int nTerm;
assert( !interiorReaderAtEnd(pReader) );
getVarint32(pReader->pData, &nTerm);
return nTerm;
return pReader->term.nData;
}
static const char *interiorReaderTerm(InteriorReader *pReader){
int n, nTerm;
assert( !interiorReaderAtEnd(pReader) );
n = getVarint32(pReader->pData, &nTerm);
return pReader->pData+n;
return pReader->term.pData;
}
/* Step forward to the next term in the node. */
static void interiorReaderStep(InteriorReader *pReader){
int n, nTerm;
assert( !interiorReaderAtEnd(pReader) );
n = getVarint32(pReader->pData, &nTerm);
assert( n+nTerm<=pReader->nData );
pReader->pData += n+nTerm;
pReader->nData -= n+nTerm;
/* If the last term has been read, signal eof, else construct the
** next term.
*/
if( pReader->nData==0 ){
dataBufferReset(&pReader->term);
}else{
int n, nPrefix, nSuffix;
n = getVarint32(pReader->pData, &nPrefix);
n += getVarint32(pReader->pData+n, &nSuffix);
/* Truncate the current term and append suffix data. */
pReader->term.nData = nPrefix;
dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
assert( n+nSuffix<=pReader->nData );
pReader->pData += n+nSuffix;
pReader->nData -= n+nSuffix;
}
pReader->iBlockid++;
}

View File

@ -1,5 +1,5 @@
C Improvements\sto\sthe\sspeed\stests\srecently\sadded\sto\sthe\stest\ssuite.\s(CVS\s3519)
D 2006-11-23T21:09:11
C Delta-encode\sterms\sin\sinterior\snodes.\s\sWhile\sexperiments\shave\sshown\nthat\sthis\sis\sof\smarginal\sutility\swhen\sencoding\sterms\sresulting\sfrom\nregular\sEnglish\stext,\sit\sturns\sout\sto\sbe\svery\suseful\swhen\sencoding\ninputs\swith\svery\slarge\sterms.\s(CVS\s3520)
D 2006-11-29T01:02:03
F Makefile.in 8e14898d41a53033ecb687d93c9cd5d109fb9ae3
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -33,7 +33,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts2/fts2.c 74a5db3f7f8e49dfa2a5d40e5fdece09bf23e5a8
F ext/fts2/fts2.c 6065a73ad89e7fb0dcfc41d1b110f856dea98dc8
F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@ -421,7 +421,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
P 3dea7fbefdadb269e58ae76bb9a7281c96d8b15b
R e1c7cce5166a5c272088fe6b51d8b51c
U drh
Z ccf2319a3485025d8ece0a253ded8d83
P 272c1a6e61d053121b5412564948dad4366b5727
R 64f64a706f1c764008e348249e53ac19
U shess
Z 613a143cab46a942074b48bd6bce5a4c

View File

@ -1 +1 @@
272c1a6e61d053121b5412564948dad4366b5727
c8151a998ec2423b417566823dc9957c7d5d782c