Refactoring groundwork for coming work on interior nodes. Change

LeafWriter to use empty data buffer (instead of empty term) to detect
an empty block.  Code to validate interior nodes.  Moderate revisions
to leaf-node and doclist validation.  Recast leafWriterStep() in terms
of LeafWriterStepMerge(). (CVS 3512)

FossilOrigin-Name: f30771d5c7ef2b502af95d81a18796b75271ada4
This commit is contained in:
shess 2006-11-17 21:12:15 +00:00
parent de163af26e
commit 9e6a561554
3 changed files with 138 additions and 93 deletions

View File

@ -82,10 +82,6 @@
** the type. Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
**
** TODO(shess) Delta-encode docids. This provides a 10% win versus
** DL_POSITIONS_OFFSETS on the first 100,000 documents of the Enron
** corpus, greater versus DL_POSITIONS.
**
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term. Leaf
@ -403,7 +399,6 @@ static int getVarint32(const char *p, int *pi){
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferAppendLenData - append a varint-encoded length plus data.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
@ -453,12 +448,6 @@ static void dataBufferAppend2(DataBuffer *pBuffer,
memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
pBuffer->nData += nSource1+nSource2;
}
static void dataBufferAppendLenData(DataBuffer *pBuffer,
const char *pSource, int nSource){
char c[VARINT_MAX];
int n = putVarint(c, nSource);
dataBufferAppend2(pBuffer, c, n, pSource, nSource);
}
static void dataBufferReplace(DataBuffer *pBuffer,
const char *pSource, int nSource){
dataBufferReset(pBuffer);
@ -649,11 +638,12 @@ static void dlrDestroy(DLReader *pReader){
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static int docListValidate(DocListType iType, const char *pData, int nData,
static void docListValidate(DocListType iType, const char *pData, int nData,
sqlite_int64 *pLastDocid){
sqlite_int64 iPrevDocid = 0;
assert( nData>0 );
assert( pData!=0 );
assert( nData!=0 );
assert( pData+nData>pData );
while( nData!=0 ){
sqlite_int64 iDocidDelta;
int n = getVarint(pData, &iDocidDelta);
@ -677,8 +667,10 @@ static int docListValidate(DocListType iType, const char *pData, int nData,
nData -= n;
}
if( pLastDocid ) *pLastDocid = iPrevDocid;
return 1;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif
/*******************************************************************/
@ -736,7 +728,7 @@ static void dlwAppend(DLWriter *pWriter,
** the expected docid. This is essential because we'll trust this
** docid in future delta-encoding.
*/
assert( docListValidate(pWriter->iType, pData, nData, &iLastDocidDelta) );
ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
/* Append recoded initial docid and everything else. Rest of docids
@ -3667,6 +3659,49 @@ static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
return block;
}
#ifndef NDEBUG
/* Verify that the data is readable as an interior node. */
static void interiorBlockValidate(InteriorBlock *pBlock){
const char *pData = pBlock->data.pData;
int nData = pBlock->data.nData;
int n, iDummy;
sqlite_int64 iBlockid;
assert( nData>0 );
assert( pData!=0 );
assert( pData+nData>pData );
/* Must lead with height of node as a varint(n), n>0 */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n<nData );
pData += n;
nData -= n;
/* Must contain iBlockid. */
n = getVarint(pData, &iBlockid);
assert( n>0 );
assert( n<=nData );
pData += n;
nData -= n;
/* Zero or more terms of positive length */
while( nData!=0 ){
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0);
assert( n+iDummy<=nData );
pData += n+iDummy;
nData -= n+iDummy;
}
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
typedef struct InteriorWriter {
int iHeight; /* from 0 at leaves. */
InteriorBlock *first, *last;
@ -3696,6 +3731,7 @@ static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
#endif
block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
pWriter->last = pWriter->first = block;
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
/* Append the child node rooted at iChildBlock to the interior node,
@ -3707,6 +3743,8 @@ static void interiorWriterAppend(InteriorWriter *pWriter,
char c[VARINT_MAX+VARINT_MAX];
int n = putVarint(c, nTerm);
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
#ifndef NDEBUG
pWriter->iLastChildBlock++;
#endif
@ -3724,6 +3762,7 @@ static void interiorWriterAppend(InteriorWriter *pWriter,
}else{
dataBufferAppend2(&pWriter->last->data, c, n, pTerm, nTerm);
}
ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
/* Free the space used by pWriter, including the linked-list of
@ -3769,6 +3808,7 @@ static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
/* Flush the first block to %_segments, and create a new level of
** interior node.
*/
ASSERT_VALID_INTERIOR_BLOCK(block);
rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
if( rc!=SQLITE_OK ) return rc;
*piEndBlockid = iBlockid;
@ -3782,6 +3822,7 @@ static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
** node.
*/
for(block=block->next; block!=NULL; block=block->next){
ASSERT_VALID_INTERIOR_BLOCK(block);
rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
if( rc!=SQLITE_OK ) return rc;
*piEndBlockid = iBlockid;
@ -3925,9 +3966,6 @@ typedef struct LeafWriter {
} LeafWriter;
static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
char c[VARINT_MAX];
int n;
CLEAR(pWriter);
pWriter->iLevel = iLevel;
pWriter->idx = idx;
@ -3936,54 +3974,74 @@ static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
/* Start out with a reasonably sized block, though it can grow. */
dataBufferInit(&pWriter->data, LEAF_MAX);
n = putVarint(c, 0);
dataBufferReplace(&pWriter->data, c, n);
}
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node. */
static int leafNodeValidate(const char *pData, int nData){
static void leafNodeValidate(const char *pData, int nData){
int n, iDummy;
if( nData==0 ) return;
assert( nData>0 );
assert( pData!=0 );
assert( nData!=0 );
assert( pData+nData>pData );
/* Must lead with a varint(0) */
n = getVarint32(pData, &iDummy);
assert( iDummy==0 );
if( nData==n ) return 1;
assert( n>0 );
assert( n<nData );
pData += n;
nData -= n;
/* Leading term length and data must fit in buffer. */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0 );
assert( n+iDummy<nData );
pData += n+iDummy;
nData -= n+iDummy;
/* Leading term's doclist length and data must fit. */
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0 );
assert( n+iDummy<=nData );
assert( docListValidate(DL_DEFAULT, pData+n, iDummy, NULL) );
ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
pData += n+iDummy;
nData -= n+iDummy;
/* Verify that trailing terms and doclists also are readable. */
while( nData!=0 ){
n = getVarint32(pData, &iDummy);
n += getVarint32(pData+n, &iDummy);
assert( n>0 );
assert( iDummy>=0 );
assert( n<nData );
pData += n;
nData -= n;
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0 );
assert( n+iDummy<nData );
pData += n+iDummy;
nData -= n+iDummy;
n = getVarint32(pData, &iDummy);
assert( n>0 );
assert( iDummy>0 );
assert( n+iDummy>0 );
assert( n+iDummy<=nData );
assert( docListValidate(DL_DEFAULT, pData+n, iDummy, NULL) );
ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
pData += n+iDummy;
nData -= n+iDummy;
}
return 1;
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
/* Flush the current leaf node to %_segments, and adding the resulting
@ -4002,7 +4060,7 @@ static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
assert( nData>2 );
assert( iData>=0 );
assert( iData+nData<=pWriter->data.nData );
assert( leafNodeValidate(pWriter->data.pData+iData, nData) );
ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
if( rc!=SQLITE_OK ) return rc;
@ -4039,8 +4097,7 @@ static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
if( rc!=SQLITE_OK ) return rc;
/* Re-initialize the output buffer. */
pWriter->data.nData = putVarint(pWriter->data.pData, 0);
dataBufferReset(&pWriter->term);
dataBufferReset(&pWriter->data);
return SQLITE_OK;
}
@ -4064,7 +4121,7 @@ static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
}
/* Flush remaining leaf data. */
if( pWriter->data.nData>1 ){
if( pWriter->data.nData>0 ){
int rc = leafWriterFlush(v, pWriter);
if( rc!=SQLITE_OK ) return rc;
}
@ -4096,7 +4153,7 @@ static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
if( rc!=SQLITE_OK ) return rc;
/* Don't bother storing an entirely empty segment. */
if( iEndBlockid==0 && nRootInfo==1 ) return SQLITE_OK;
if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
return segdir_set(v, pWriter->iLevel, pWriter->idx,
pWriter->iStartBlockid, pWriter->iEndBlockid,
@ -4112,25 +4169,32 @@ static void leafWriterDestroy(LeafWriter *pWriter){
/* Encode a term into the leafWriter, delta-encoding as appropriate. */
static void leafWriterEncodeTerm(LeafWriter *pWriter,
const char *pTerm, int nTerm){
if( pWriter->term.nData==0 ){
/* Encode the entire leading term as:
char c[VARINT_MAX+VARINT_MAX];
int n;
if( pWriter->data.nData==0 ){
/* Encode the node header and leading term as:
** varint(0)
** varint(nTerm)
** char pTerm[nTerm]
*/
assert( pWriter->data.nData==1 );
dataBufferAppendLenData(&pWriter->data, pTerm, nTerm);
n = putVarint(c, '\0');
n += putVarint(c+n, nTerm);
dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
}else{
/* Delta-encode the term as:
** varint(nPrefix)
** varint(nSuffix)
** char pTermSuffix[nSuffix]
*/
char c[VARINT_MAX+VARINT_MAX];
int n, nPrefix = 0;
int nPrefix = 0;
while( nPrefix<nTerm && nPrefix<pWriter->term.nData &&
assert( nTerm>0 );
while( nPrefix<pWriter->term.nData &&
pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
nPrefix++;
/* Failing this implies that the terms weren't in order. */
assert( nPrefix<nTerm );
}
n = putVarint(c, nPrefix);
@ -4140,44 +4204,6 @@ static void leafWriterEncodeTerm(LeafWriter *pWriter,
dataBufferReplace(&pWriter->term, pTerm, nTerm);
}
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
/* TODO(shess) Revise writeZeroSegment() so that doclists are
** constructed directly in pWriter->data. That implies refactoring
** leafWriterStep() and leafWriterStepMerge() to share more code.
*/
static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
const char *pTerm, int nTerm,
const char *pData, int nData){
int rc;
/* Flush existing data if this item won't fit well. */
if( pWriter->data.nData>1 &&
(nData+nTerm>STANDALONE_MIN ||
pWriter->data.nData+nData+nTerm>LEAF_MAX) ){
rc = leafWriterFlush(v, pWriter);
if( rc!=SQLITE_OK ) return rc;
}
leafWriterEncodeTerm(pWriter, pTerm, nTerm);
/* Encode the doclist as:
** varint(nDoclist)
** char pDoclist[nDoclist]
*/
dataBufferAppendLenData(&pWriter->data, pData, nData);
/* Flush standalone blocks right out */
if( nData+nTerm>STANDALONE_MIN ){
rc = leafWriterFlush(v, pWriter);
if( rc!=SQLITE_OK ) return rc;
}
assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
return SQLITE_OK;
}
/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer. This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
@ -4214,7 +4240,7 @@ static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
int iTermData = pWriter->data.nData, iDoclistData;
int i, nData, n, nActualData, nActual, rc;
assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
leafWriterEncodeTerm(pWriter, pTerm, nTerm);
iDoclistData = pWriter->data.nData;
@ -4229,9 +4255,9 @@ static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
dataBufferAppend(&pWriter->data, c, n);
docListMerge(&pWriter->data, pReaders, nReaders);
assert( docListValidate(DL_DEFAULT,
ASSERT_VALID_DOCLIST(DL_DEFAULT,
pWriter->data.pData+iDoclistData+n,
pWriter->data.nData-iDoclistData-n, NULL) );
pWriter->data.nData-iDoclistData-n, NULL);
/* The actual amount of doclist data at this point could be smaller
** than the length we encoded. Additionally, the space required to
@ -4254,7 +4280,7 @@ static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
*/
if( nTerm+nActualData>STANDALONE_MIN ){
/* Push leaf node from before this term. */
if( iTermData>1 ){
if( iTermData>0 ){
rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
if( rc!=SQLITE_OK ) return rc;
}
@ -4268,8 +4294,8 @@ static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
if( rc!=SQLITE_OK ) return rc;
/* Leave the node empty. */
pWriter->data.nData = putVarint(pWriter->data.pData, 0);
dataBufferReset(&pWriter->term);
dataBufferReset(&pWriter->data);
return rc;
}
@ -4317,11 +4343,30 @@ static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
pWriter->data.nData-iDoclistData);
pWriter->data.nData -= iDoclistData-n;
}
assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) );
ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
return SQLITE_OK;
}
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
/* TODO(shess) Revise writeZeroSegment() so that doclists are
** constructed directly in pWriter->data.
*/
static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
const char *pTerm, int nTerm,
const char *pData, int nData){
int rc;
DLReader reader;
dlrInit(&reader, DL_DEFAULT, pData, nData);
rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
dlrDestroy(&reader);
return rc;
}
/****************************************************************/
/* LeafReader is used to iterate over an individual leaf node. */

View File

@ -1,5 +1,5 @@
C Delta-encode\sdocids.\s\sThis\sis\sgood\sfor\saround\s22%\sreduction\sin\sindex\nsize\swith\sDL_POSITIONS.\s\sIt\simproves\sperformance\sabout\s5%-6%.\s(CVS\s3511)
D 2006-11-13T21:09:25
C Refactoring\sgroundwork\sfor\scoming\swork\son\sinterior\snodes.\s\sChange\nLeafWriter\sto\suse\sempty\sdata\sbuffer\s(instead\sof\sempty\sterm)\sto\sdetect\nan\sempty\sblock.\s\sCode\sto\svalidate\sinterior\snodes.\s\sModerate\srevisions\nto\sleaf-node\sand\sdoclist\svalidation.\s\sRecast\sleafWriterStep()\sin\sterms\nof\sLeafWriterStepMerge().\s(CVS\s3512)
D 2006-11-17T21:12:16
F Makefile.in 8e14898d41a53033ecb687d93c9cd5d109fb9ae3
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -33,7 +33,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts2/fts2.c 7909381760660b3da9918ff3e618e2c83315234b
F ext/fts2/fts2.c 57d8cd57ce18c3ce7b194b4810fe7e119ec7e6a3
F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@ -421,7 +421,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
P 64b7e3406134ac4891113b9bb432ad97504268bb
R 5cca903a493ab0c4e72312813e09cd62
P 9b6d413d751d962b67cb4e3a208efe61581cb822
R ff81ed1c8b4721212823c87e00f2e6b9
U shess
Z 6c02cb52391a3d0abd67d903b02caa78
Z 82e324f504b7a8a8d9f0a515f8d329aa

View File

@ -1 +1 @@
9b6d413d751d962b67cb4e3a208efe61581cb822
f30771d5c7ef2b502af95d81a18796b75271ada4