Update comments in fts3.c to more accurately describe the doclist format.

FossilOrigin-Name: e424a0307359fee6875424c10ecad1a10acfba0e
This commit is contained in:
drh 2010-01-08 23:01:32 +00:00
parent 0a7905295c
commit 819443e506
3 changed files with 62 additions and 27 deletions

View File

@ -23,9 +23,6 @@
** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/ */
/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like) /* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists. The ** structures called segments which map terms to doclists. The
** structures are like b+trees in layout, but are constructed from the ** structures are like b+trees in layout, but are constructed from the
@ -48,13 +45,27 @@
** 21 bits - BBA ** 21 bits - BBA
** and so on. ** and so on.
** **
** This is identical to how sqlite encodes varints (see util.c). ** This is similar in concept to how sqlite encodes "varints" but
** the encoding is not the same. SQLite varints are big-endian
** and are limited to 9 bytes in length whereas FTS3 varints are
** little-endian and can be up to 10 bytes in length (in theory).
**
** Example encodings:
**
** 1: 0x01
** 127: 0x7f
** 128: 0x80 0x01
** **
** **
**** Document lists **** **** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a ** A doclist (document list) holds a docid-sorted list of hits for a
** given term. Doclists hold docids, and can optionally associate ** given term. Doclists hold docids, and can optionally associate
** token positions and offsets with docids. ** token positions and offsets with docids. A position is the index
** of a word within the document. The first word of the document has
** a position of 0.
**
** FTS3 used to optionally store character offsets using a compile-time
** option. But that functionality is no longer supported.
** **
** A DL_POSITIONS_OFFSETS doclist is stored like this: ** A DL_POSITIONS_OFFSETS doclist is stored like this:
** **
@ -62,16 +73,12 @@
** varint docid; ** varint docid;
** array { (position list for column 0) ** array { (position list for column 0)
** varint position; (delta from previous position plus POS_BASE) ** varint position; (delta from previous position plus POS_BASE)
** varint startOffset; (delta from previous startOffset)
** varint endOffset; (delta from startOffset)
** } ** }
** array { ** array {
** varint POS_COLUMN; (marks start of position list for new column) ** varint POS_COLUMN; (marks start of position list for new column)
** varint column; (index of new column) ** varint column; (index of new column)
** array { ** array {
** varint position; (delta from previous position plus POS_BASE) ** varint position; (delta from previous position plus POS_BASE)
** varint startOffset;(delta from previous startOffset)
** varint endOffset; (delta from startOffset)
** } ** }
** } ** }
** varint POS_END; (marks end of positions for this document. ** varint POS_END; (marks end of positions for this document.
@ -79,10 +86,23 @@
** **
** Here, array { X } means zero or more occurrences of X, adjacent in ** Here, array { X } means zero or more occurrences of X, adjacent in
** memory. A "position" is an index of a token in the token stream ** memory. A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset, ** generated by the tokenizer. Note that POS_END and POS_COLUMN occur
** both based at 0. Note that POS_END and POS_COLUMN occur in the ** in the same logical place as the position element, and act as sentinels
** same logical place as the position element, and act as sentinels ** ending a position list array. POS_END is 0. POS_COLUMN is 1.
** ending a position list array. ** The position numbers are not stored literally but rather as two more
** than the difference from the prior position, or just the position plus
** 2 for the first position. Example:
**
** label: A B C D E F G H I J K
** value: 123 5 9 1 1 14 35 0 234 72 0
**
** The 123 value is the first docid. For column zero in this document
** there are two matches at positions 3 and 10 (5-2 and 9-2+3). The 1
** at D signals the start of a new column; the 1 at E indicates that the
** new column is column number 1. There are two positions at 12 and 45
** (14-2 and 35-2+12). The 0 at H indicates the end-of-document. The
** 234 at I is the next docid. It has one position 70 (72-2) and then
** terminates with the 0 at K.
** **
** A DL_POSITIONS doclist omits the startOffset and endOffset ** A DL_POSITIONS doclist omits the startOffset and endOffset
** information. A DL_DOCIDS doclist omits both the position and ** information. A DL_DOCIDS doclist omits both the position and
@ -388,12 +408,23 @@ void sqlite3Fts3Dequote(char *z){
} }
} }
/*
** Read a single varint from the doclist at *pp and advance *pp to point
** to the next element of the doclist. Add the value of the varint
** to *pVal.
*/
static void fts3GetDeltaVarint(char **pp, sqlite3_int64 *pVal){ static void fts3GetDeltaVarint(char **pp, sqlite3_int64 *pVal){
sqlite3_int64 iVal; sqlite3_int64 iVal;
*pp += sqlite3Fts3GetVarint(*pp, &iVal); *pp += sqlite3Fts3GetVarint(*pp, &iVal);
*pVal += iVal; *pVal += iVal;
} }
/*
** As long as *pp has not reached its end (pEnd), then do the same
** as fts3GetDeltaVarint(): read a single varint and add it to *pVal.
** But if *pp has already reached pEnd, just set *pp=0 and
** leave *pVal unchanged.
*/
static void fts3GetDeltaVarint2(char **pp, char *pEnd, sqlite3_int64 *pVal){ static void fts3GetDeltaVarint2(char **pp, char *pEnd, sqlite3_int64 *pVal){
if( *pp>=pEnd ){ if( *pp>=pEnd ){
*pp = 0; *pp = 0;
@ -782,12 +813,6 @@ static int fts3OpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
return SQLITE_OK; return SQLITE_OK;
} }
/****************************************************************/
/****************************************************************/
/****************************************************************/
/****************************************************************/
/* /*
** Close the cursor. For additional information see the documentation ** Close the cursor. For additional information see the documentation
** on the xClose method of the virtual table interface. ** on the xClose method of the virtual table interface.

View File

@ -1,5 +1,8 @@
C Added\soption\sto\sdump\spages. -----BEGIN PGP SIGNED MESSAGE-----
D 2010-01-08T04:50:22 Hash: SHA1
C Update\scomments\sin\sfts3.c\sto\smore\saccurately\sdescribe\sthe\sdoclist\sformat.
D 2010-01-08T23:01:33
F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0 F Makefile.arm-wince-mingw32ce-gcc fcd5e9cd67fe88836360bb4f9ef4cb7f8e2fb5a0
F Makefile.in c5827ead754ab32b9585487177c93bb00b9497b3 F Makefile.in c5827ead754ab32b9585487177c93bb00b9497b3
F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654 F Makefile.linux-gcc d53183f4aa6a9192d249731c90dbdffbd2c68654
@ -56,7 +59,7 @@ F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0
F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a
F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9 F ext/fts3/README.tokenizers 998756696647400de63d5ba60e9655036cb966e9
F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d F ext/fts3/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts3/fts3.c 7b1969f6b958059ab7c6c8450fa4f27cf88681c7 F ext/fts3/fts3.c 451eb6554f3fce20e39ad6e3aea8b73e570582eb
F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe F ext/fts3/fts3.h 3a10a0af180d502cecc50df77b1b22df142817fe
F ext/fts3/fts3Int.h 45bc7e284806042119722c8f4127ee944b77f0dd F ext/fts3/fts3Int.h 45bc7e284806042119722c8f4127ee944b77f0dd
F ext/fts3/fts3_expr.c f4ff02ebe854e97ac03ff00b38b728a9ab57fd4b F ext/fts3/fts3_expr.c f4ff02ebe854e97ac03ff00b38b728a9ab57fd4b
@ -785,7 +788,14 @@ F tool/speedtest2.tcl ee2149167303ba8e95af97873c575c3e0fab58ff
F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224 F tool/speedtest8.c 2902c46588c40b55661e471d7a86e4dd71a18224
F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e F tool/speedtest8inst1.c 293327bc76823f473684d589a8160bde1f52c14e
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
P b97aca1200d959a1e7c08dd4e9dbce4724342119 P 08c545f03082421166a21274b39e07bb348c17e6
R 954d1ee5bc4e8dbfdb8e25f58c6f3247 R fd480c683a62029e9ec671a9327220e6
U shaneh U drh
Z 6999a75f8e62879e0ec4ab88e2a317f7 Z fc689ecb2cdb2cbb899bfa1ef8dbe1b0
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.6 (GNU/Linux)
iD8DBQFLR7lQoxKgR168RlERAiFaAJ0baFwo7PSUJOJhCvizsqF8gfeh8QCfSRxZ
sHuj1+mD+8zQC/K58Uto6eY=
=pm4s
-----END PGP SIGNATURE-----

View File

@ -1 +1 @@
08c545f03082421166a21274b39e07bb348c17e6 e424a0307359fee6875424c10ecad1a10acfba0e