Refactor PLWriter to remove owned buffer.

DLCollector (Document List Collector) now handles the case where
PLWriter (Position List Writer) needed a local buffer.  PLWriter now
writes to the associated DLWriter (Document List Writer) buffer, which
reduces the number of memory copies needed in doclist processing and
brings PLWriter operation in line with DLWriter operation. (CVS 3707)

FossilOrigin-Name: d04fa3a13a84f49074c673b8ee2fb6541da061b5
This commit is contained in:
shess 2007-03-22 00:14:28 +00:00
parent 4607fc06f6
commit 13ee81fe96
3 changed files with 121 additions and 81 deletions

View File

@ -690,6 +690,7 @@ static void docListValidate(DocListType iType, const char *pData, int nData,
** dlwDestroy - clear the writer's memory. Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwAdd - construct doclist element and append to buffer.
** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
DocListType iType;
@ -751,24 +752,14 @@ static void dlwAppend(DLWriter *pWriter,
}
pWriter->iPrevDocid = iLastDocid;
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid,
const char *pPosList, int nPosList){
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
char c[VARINT_MAX];
int n = putVarint(c, iDocid-pWriter->iPrevDocid);
assert( pWriter->iPrevDocid<iDocid );
assert( pPosList==0 || pWriter->iType>DL_DOCIDS );
assert( pWriter->iType==DL_DOCIDS );
dataBufferAppend(pWriter->b, c, n);
if( pWriter->iType>DL_DOCIDS ){
n = putVarint(c, 0);
if( nPosList>0 ){
dataBufferAppend2(pWriter->b, pPosList, nPosList, c, n);
}else{
dataBufferAppend(pWriter->b, c, n);
}
}
pWriter->iPrevDocid = iDocid;
}
@ -854,11 +845,10 @@ static void plrStep(PLReader *pReader){
pReader->nData -= n;
}
static void plrInit(PLReader *pReader, DocListType iType,
const char *pData, int nData){
pReader->pData = pData;
pReader->nData = nData;
pReader->iType = iType;
static void plrInit(PLReader *pReader, DLReader *pDLReader){
pReader->pData = dlrPosData(pDLReader);
pReader->nData = dlrPosDataLen(pDLReader);
pReader->iType = pDLReader->iType;
pReader->iColumn = 0;
pReader->iPosition = 0;
pReader->iStartOffset = 0;
@ -872,34 +862,38 @@ static void plrDestroy(PLReader *pReader){
/*******************************************************************/
/* PLWriter is used in constructing a document's position list. As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwReset - reset the writer for a new document.
** plwDestroy - clear a writer.
** plwNew - malloc storage and initialize it.
** plwDelete - clear and free storage.
** plwDlwAdd - append the docid and poslist to a doclist writer.
** plwAdd - append position and offset information.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) PLWriter is used in two ways. fulltextUpdate() uses it
** in construction of a new doclist. docListTrim() and mergePosList()
** use it when trimming. In the former case, it wants to own the
** DataBuffer, in the latter it's possible it could encode into a
** pre-existing DataBuffer.
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information. Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
DataBuffer b;
DLWriter *dlw;
sqlite_int64 iDocid;
DocListType iType;
int iColumn; /* the last column written */
int iPos; /* the last position written */
int iOffset; /* the last start offset written */
} PLWriter;
static void plwDlwAdd(PLWriter *pWriter, DLWriter *dlWriter){
dlwAdd(dlWriter, pWriter->iDocid, pWriter->b.pData, pWriter->b.nData);
}
/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
int iStartOffset, int iEndOffset){
/* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
@ -908,7 +902,10 @@ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
char c[5*VARINT_MAX];
int n = 0;
if( pWriter->iType==DL_DOCIDS ) return;
/* Ban plwAdd() after plwTerminate(). */
assert( pWriter->iPos!=-1 );
if( pWriter->dlw->iType==DL_DOCIDS ) return;
if( iColumn!=pWriter->iColumn ){
n += putVarint(c+n, POS_COLUMN);
@ -920,30 +917,50 @@ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
assert( iPos>=pWriter->iPos );
n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
pWriter->iPos = iPos;
if( pWriter->iType==DL_POSITIONS_OFFSETS ){
if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
assert( iStartOffset>=pWriter->iOffset );
n += putVarint(c+n, iStartOffset-pWriter->iOffset);
pWriter->iOffset = iStartOffset;
assert( iEndOffset>=iStartOffset );
n += putVarint(c+n, iEndOffset-iStartOffset);
}
dataBufferAppend(&pWriter->b, c, n);
dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwReset(PLWriter *pWriter,
sqlite_int64 iDocid, DocListType iType){
dataBufferReset(&pWriter->b);
pWriter->iDocid = iDocid;
pWriter->iType = iType;
/* Start a position list for document iDocid on top of an existing
** doclist writer.
**
** pWriter - position-list writer to initialize.
** dlw     - doclist writer whose buffer receives all output.
** iDocid  - docid of the new document; asserted to be greater than
**           dlw->iPrevDocid.
**
** The docid is written immediately, as a varint holding the delta
** from dlw->iPrevDocid, and dlw->iPrevDocid is advanced to iDocid.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
char c[VARINT_MAX];
int n;
pWriter->dlw = dlw;
assert( iDocid>pWriter->dlw->iPrevDocid );
/* Docids are delta-encoded against the previous docid in the list. */
n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
dataBufferAppend(pWriter->dlw->b, c, n);
pWriter->dlw->iPrevDocid = iDocid;
/* Reset the "last written" trackers for the new document. */
pWriter->iColumn = 0;
pWriter->iPos = 0;
pWriter->iOffset = 0;
}
static void plwInit(PLWriter *pWriter, sqlite_int64 iDocid, DocListType iType){
dataBufferInit(&pWriter->b, 0);
plwReset(pWriter, iDocid, iType);
/* TODO(shess) Should plwDestroy() also terminate the doclist? But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct. But that would
** add incremental work to the common case with the only benefit being
** API elegance. Punt for now.
*/
/* Finish the current document's position list.  When the associated
** DLWriter's type carries position data (iType>DL_DOCIDS), append a
** POS_END varint to that writer's buffer; for DL_DOCIDS doclists
** nothing is written.
*/
static void plwTerminate(PLWriter *pWriter){
if( pWriter->dlw->iType>DL_DOCIDS ){
char c[VARINT_MAX];
int n = putVarint(c, POS_END);
dataBufferAppend(pWriter->dlw->b, c, n);
}
#ifndef NDEBUG
/* Mark as terminated for assert in plwAdd(). */
pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
dataBufferDestroy(&pWriter->b);
SCRAMBLE(pWriter);
}
@ -957,14 +974,27 @@ static void plwDestroy(PLWriter *pWriter){
** dlcAddDoclist - add the collected doclist to the given buffer.
*/
typedef struct DLCollector {
DataBuffer b;   /* storage handed to dlw by dlcNew() */
DLWriter dlw;   /* doclist writer, initialized over b in dlcNew() */
PLWriter plw;   /* position-list writer, attached to dlw in dlcNew() */
} DLCollector;
/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend(). I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way. I suspect it's the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
DLWriter dlw;
dlwInit(&dlw, pCollector->plw.iType, b);
plwDlwAdd(&pCollector->plw, &dlw);
dlwDestroy(&dlw);
if( pCollector->dlw.iType>DL_DOCIDS ){
char c[VARINT_MAX];
int n = putVarint(c, POS_END);
dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
}else{
dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
}
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
int iStartOffset, int iEndOffset){
@ -973,11 +1003,15 @@ static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
DLCollector *pCollector = malloc(sizeof(DLCollector));
plwInit(&pCollector->plw, iDocid, iType);
dataBufferInit(&pCollector->b, 0);
dlwInit(&pCollector->dlw, iType, &pCollector->b);
plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
return pCollector;
}
/* Tear down and free a collector: destroy both writers, then the
** buffer member, then release the struct itself.
*/
static void dlcDelete(DLCollector *pCollector){
plwDestroy(&pCollector->plw);
dlwDestroy(&pCollector->dlw);
dataBufferDestroy(&pCollector->b);
/* NOTE(review): SCRAMBLE is defined outside this view; presumably it
** overwrites the struct to expose use-after-free -- confirm. */
SCRAMBLE(pCollector);
free(pCollector);
}
@ -985,43 +1019,50 @@ static void dlcDelete(DLCollector *pCollector){
/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go. Only columns matching iColumn are
** copied, all columns copied if iColimn is -1. Elements with no
** copied, all columns copied if iColumn is -1. Elements with no
** matching columns are dropped. The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
int iColumn, DocListType iOutType, DataBuffer *out){
DLReader dlReader;
DLWriter dlWriter;
PLWriter plWriter;
assert( iOutType<=iType );
dlrInit(&dlReader, iType, pData, nData);
dlwInit(&dlWriter, iOutType, out);
plwInit(&plWriter, 0, iOutType);
while( !dlrAtEnd(&dlReader) ){
PLReader plReader;
PLWriter plWriter;
int match = 0;
plrInit(&plReader, dlReader.iType,
dlrPosData(&dlReader), dlrPosDataLen(&dlReader));
plwReset(&plWriter, dlrDocid(&dlReader), iOutType);
plrInit(&plReader, &dlReader);
while( !plrAtEnd(&plReader) ){
if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
match = 1;
if( !match ){
plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
match = 1;
}
plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
plrStartOffset(&plReader), plrEndOffset(&plReader));
}
plrStep(&plReader);
}
if( match ) plwDlwAdd(&plWriter, &dlWriter);
if( match ){
plwTerminate(&plWriter);
plwDestroy(&plWriter);
}
plrDestroy(&plReader);
dlrStep(&dlReader);
}
plwDestroy(&plWriter);
dlwDestroy(&dlWriter);
dlrDestroy(&dlReader);
}
@ -1172,9 +1213,8 @@ static void mergePosList(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
assert( dlrDocid(pLeft)==dlrDocid(pRight) );
assert( pOut->iType!=DL_POSITIONS_OFFSETS );
plrInit(&left, pLeft->iType, dlrPosData(pLeft), dlrPosDataLen(pLeft));
plrInit(&right, pRight->iType, dlrPosData(pRight), dlrPosDataLen(pRight));
plwInit(&writer, dlrDocid(pLeft), pOut->iType);
plrInit(&left, pLeft);
plrInit(&right, pRight);
while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
if( plrColumn(&left)<plrColumn(&right) ){
@ -1186,23 +1226,23 @@ static void mergePosList(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
}else if( plrPosition(&left)+1>plrPosition(&right) ){
plrStep(&right);
}else{
match = 1;
if( !match ){
plwInit(&writer, pOut, dlrDocid(pLeft));
match = 1;
}
plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
plrStep(&left);
plrStep(&right);
}
}
/* TODO(shess) We could remember the output position, encode the
** docid, then encode the poslist directly into the output. If no
** match, we back out to the stored output position. This would
** also reduce the malloc count.
*/
if( match ) plwDlwAdd(&writer, pOut);
if( match ){
plwTerminate(&writer);
plwDestroy(&writer);
}
plrDestroy(&left);
plrDestroy(&right);
plwDestroy(&writer);
}
/* We have two doclists with positions: pLeft and pRight.
@ -1272,7 +1312,7 @@ static void docListAndMerge(
}else if( dlrDocid(&right)<dlrDocid(&left) ){
dlrStep(&right);
}else{
dlwAdd(&writer, dlrDocid(&left), 0, 0);
dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
dlrStep(&right);
}
@ -1310,13 +1350,13 @@ static void docListOrMerge(
while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
dlwAdd(&writer, dlrDocid(&left), 0, 0);
dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
}else if( dlrAtEnd(&left) || dlrDocid(&right)<dlrDocid(&left) ){
dlwAdd(&writer, dlrDocid(&right), 0, 0);
dlwAdd(&writer, dlrDocid(&right));
dlrStep(&right);
}else{
dlwAdd(&writer, dlrDocid(&left), 0, 0);
dlwAdd(&writer, dlrDocid(&left));
dlrStep(&left);
dlrStep(&right);
}
@ -1354,7 +1394,7 @@ static void docListExceptMerge(
dlrStep(&right);
}
if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
dlwAdd(&writer, dlrDocid(&left), 0, 0);
dlwAdd(&writer, dlrDocid(&left));
}
dlrStep(&left);
}

View File

@ -1,5 +1,5 @@
C Refactor\sPLWriter\sin\spreparation\sfor\sbuffered-document\schange.\nCurrently,\sPLWriter\s(Position\sList\sWriter)\screates\sa\slocally-owned\nDataBuffer\sto\swrite\sinto.\s\sThis\sis\snecessary\sto\ssupport\sdoclist\ncollection\sduring\stokenization,\swhere\sthere\sis\sno\sobvious\sbuffer\sto\nwrite\soutput\sto,\sbut\sis\snot\snecessary\sfor\sthe\sother\susers\sof\sPLWriter.\n\sThis\schange\sadds\sa\sDLCollector\s(Doc\sList\sCollector)\sstructure\sto\nhandle\sthe\stokenization\scase.\n\nAlso\sfix\sa\spotential\smemory\sleak\sin\swriteZeroSegment().\s\sIn\scase\sof\nerror\sfrom\sleafWriterStep(),\sthe\sDataBuffer\sdl\swas\sbeing\sleaked.\s(CVS\s3706)
D 2007-03-20T23:52:38
C Refactor\sPLWriter\sto\sremove\sowned\sbuffer.\s\sDLCollector\s(Document\sList\nCollector)\snow\shandles\sthe\scase\swhere\sPLWriter\s(Position\sList\sWriter)\nneeded\sa\slocal\sbuffer.\s\sChange\sto\susing\sthe\sassociated\sDLWriter\n(Document\sList\sWriter)\sbuffer,\swhich\sreduces\sthe\snumber\sof\smemory\ncopies\sneeded\sin\sdoclist\sprocessing,\sand\sbrings\sPLWriter\soperation\sin\nline\swith\sDLWriter\soperation.\s(CVS\s3707)
D 2007-03-22T00:14:29
F Makefile.in 1fe3d0b46e40fd684e1e61f8e8056cefed16de9f
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -34,7 +34,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts2/fts2.c aba63e7f4892a2e7cf50054181cda3d246c3ba0a
F ext/fts2/fts2.c de8321a2ad1edea1f0dd223cb86cf008451784a4
F ext/fts2/fts2.h bbdab26d34f91974d5b9ade8b7836c140a7c4ce1
F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@ -437,7 +437,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
P 7dc7658887046f066b564a5994578074a99756ba
R 28415623e14534daa33e7418f28a0adb
P 1b9918e20767aebc9c1e7523027139e5fbc12688
R 86ecbb6dcb3fabbb334fec798aed3031
U shess
Z d8903aa3843e1c017cd54e70c455deff
Z f6bd67aa8facf9e71ae06b9f1a1aa4bb

View File

@ -1 +1 @@
1b9918e20767aebc9c1e7523027139e5fbc12688
d04fa3a13a84f49074c673b8ee2fb6541da061b5