Add the "offsets=0" option to fts5, to create a smaller index without term offset information. A few things are currently broken on this branch.

FossilOrigin-Name: 40b5bbf02a824ca73b33aa4ae1c7d5f65b7cda10
This commit is contained in:
dan 2015-12-17 20:36:13 +00:00
parent 1db0a72be2
commit b12dc84fbb
9 changed files with 272 additions and 45 deletions

View File

@ -151,6 +151,7 @@ struct Fts5Config {
char *zContent; /* content table */
char *zContentRowid; /* "content_rowid=" option value */
int bColumnsize; /* "columnsize=" option value (dflt==1) */
int bOffsets; /* "offsets=" option value (dflt==1) */
char *zContentExprlist;
Fts5Tokenizer *pTok;
fts5_tokenizer *pTokApi;
@ -292,6 +293,13 @@ char *sqlite3Fts5Strndup(int *pRc, const char *pIn, int nIn);
/* Character set tests (like isspace(), isalpha() etc.) */
int sqlite3Fts5IsBareword(char t);
/* Bucket of terms object used by the integrity-check in offsets=0 mode. */
typedef struct Fts5Termset Fts5Termset;
int sqlite3Fts5TermsetNew(Fts5Termset**);
int sqlite3Fts5TermsetAdd(Fts5Termset*, const char*, int, int *pbPresent);
void sqlite3Fts5TermsetFree(Fts5Termset*);
/*
** End of interface to code in fts5_buffer.c.
**************************************************************************/
@ -492,7 +500,7 @@ typedef struct Fts5Hash Fts5Hash;
/*
** Create a hash table, free a hash table.
*/
int sqlite3Fts5HashNew(Fts5Hash**, int *pnSize);
int sqlite3Fts5HashNew(Fts5Config*, Fts5Hash**, int *pnSize);
void sqlite3Fts5HashFree(Fts5Hash*);
int sqlite3Fts5HashWrite(

View File

@ -292,3 +292,77 @@ int sqlite3Fts5IsBareword(char t){
}
/*************************************************************************
*/
/*
** One term stored in an Fts5Termset. Entries whose terms hash to the same
** bucket are linked together through pNext. sqlite3Fts5TermsetAdd()
** allocates the term bytes in the same allocation as the entry itself, so
** a single sqlite3_free() of the entry releases both.
*/
typedef struct Fts5TermsetEntry Fts5TermsetEntry;
struct Fts5TermsetEntry {
char *pTerm;                      /* Term bytes; points just past this struct */
int nTerm;                        /* Size of pTerm[] in bytes */
Fts5TermsetEntry *pNext;          /* Next entry in the same hash bucket */
};
/*
** A set of terms, implemented as a fixed-size hash table. Each element of
** apHash[] is the head of a singly linked list of Fts5TermsetEntry objects
** (or NULL if the bucket is empty).
*/
struct Fts5Termset {
Fts5TermsetEntry *apHash[512];    /* Hash bucket list heads */
};
/*
** Allocate a new termset object using sqlite3Fts5MallocZero() and store a
** pointer to it in *pp. The value returned is the status code left in the
** local rc variable by sqlite3Fts5MallocZero() (SQLITE_OK unless that
** function changed it). The caller should eventually release the object
** with sqlite3Fts5TermsetFree().
*/
int sqlite3Fts5TermsetNew(Fts5Termset **pp){
  int rcAlloc = SQLITE_OK;
  Fts5Termset *pNew;

  pNew = sqlite3Fts5MallocZero(&rcAlloc, sizeof(Fts5Termset));
  *pp = pNew;
  return rcAlloc;
}
/*
** Add the nTerm byte term pTerm to termset p, unless an identical term is
** already present.
**
** Before returning, *pbPresent is set to 1 if the term was already part of
** the set when this function was called, or to 0 if it was not. In the
** latter case a private copy of the term is made and linked into the hash
** table.
**
** The return value is SQLITE_OK unless the allocation of the new entry
** fails, in which case it is whatever error code sqlite3Fts5MallocZero()
** stored in rc (the new term is not added in that case).
*/
int sqlite3Fts5TermsetAdd(
  Fts5Termset *p,
  const char *pTerm, int nTerm,
  int *pbPresent
){
  int rc = SQLITE_OK;
  int i;
  unsigned int hash = 13;
  Fts5TermsetEntry *pEntry;

  /* Calculate a hash value for this term. The arithmetic is done on an
  ** unsigned value, and each byte is read as an unsigned char, so that
  ** wrap-around is well defined and the resulting bucket index can never
  ** be negative (a signed hash could overflow or go negative for long
  ** terms or bytes >= 0x80, indexing outside of apHash[]). */
  for(i=0; i<nTerm; i++){
    hash += (hash << 3) + (unsigned char)pTerm[i];
  }
  hash = hash % (unsigned int)ArraySize(p->apHash);

  /* Search the bucket for an existing copy of the term */
  *pbPresent = 0;
  for(pEntry=p->apHash[hash]; pEntry; pEntry=pEntry->pNext){
    if( pEntry->nTerm==nTerm && memcmp(pEntry->pTerm, pTerm, nTerm)==0 ){
      *pbPresent = 1;
      break;
    }
  }

  /* Not found. Allocate the entry and its term buffer as one block and
  ** push it onto the front of the bucket's list. */
  if( pEntry==0 ){
    pEntry = sqlite3Fts5MallocZero(&rc, sizeof(Fts5TermsetEntry) + nTerm);
    if( pEntry ){
      pEntry->pTerm = (char*)&pEntry[1];
      pEntry->nTerm = nTerm;
      memcpy(pEntry->pTerm, pTerm, nTerm);
      pEntry->pNext = p->apHash[hash];
      p->apHash[hash] = pEntry;
    }
  }

  return rc;
}
/*
** Free termset object p along with every entry linked into its hash
** table. Passing a NULL pointer is a harmless no-op.
*/
void sqlite3Fts5TermsetFree(Fts5Termset *p){
  int iSlot;

  if( p==0 ) return;

  for(iSlot=0; iSlot<ArraySize(p->apHash); iSlot++){
    Fts5TermsetEntry *pCur;
    Fts5TermsetEntry *pFollow;
    /* Read the link before freeing the entry that contains it */
    for(pCur=p->apHash[iSlot]; pCur; pCur=pFollow){
      pFollow = pCur->pNext;
      sqlite3_free(pCur);
    }
  }
  sqlite3_free(p);
}

View File

@ -14,7 +14,6 @@
*/
#include "fts5Int.h"
#define FTS5_DEFAULT_PAGE_SIZE 4050
@ -345,6 +344,16 @@ static int fts5ConfigParseSpecial(
return rc;
}
if( sqlite3_strnicmp("offsets", zCmd, nCmd)==0 ){
if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1]!='\0' ){
*pzErr = sqlite3_mprintf("malformed offsets=... directive");
rc = SQLITE_ERROR;
}else{
pConfig->bOffsets = (zArg[0]=='1');
}
return rc;
}
*pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
return SQLITE_ERROR;
}
@ -500,6 +509,7 @@ int sqlite3Fts5ConfigParse(
pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
pRet->bColumnsize = 1;
pRet->bOffsets = 1;
#ifdef SQLITE_DEBUG
pRet->bPrefixIndex = 1;
#endif

View File

@ -26,6 +26,7 @@ typedef struct Fts5HashEntry Fts5HashEntry;
struct Fts5Hash {
int bOffsets; /* Copy of Fts5Config.bOffsets */
int *pnByte; /* Pointer to bytes counter */
int nEntry; /* Number of entries currently in hash */
int nSlot; /* Size of aSlot[] array */
@ -79,7 +80,7 @@ struct Fts5HashEntry {
/*
** Allocate a new hash table.
*/
int sqlite3Fts5HashNew(Fts5Hash **ppNew, int *pnByte){
int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
int rc = SQLITE_OK;
Fts5Hash *pNew;
@ -90,6 +91,7 @@ int sqlite3Fts5HashNew(Fts5Hash **ppNew, int *pnByte){
int nByte;
memset(pNew, 0, sizeof(Fts5Hash));
pNew->pnByte = pnByte;
pNew->bOffsets = pConfig->bOffsets;
pNew->nSlot = 1024;
nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
@ -214,6 +216,7 @@ int sqlite3Fts5HashWrite(
Fts5HashEntry *p;
u8 *pPtr;
int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */
int bNew = pHash->bOffsets; /* If non-delete entry should be written */
/* Attempt to locate an existing hash entry */
iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
@ -250,6 +253,7 @@ int sqlite3Fts5HashWrite(
p->iSzPoslist = p->nData;
p->nData += 1;
p->iRowid = iRowid;
p->iCol = (pHash->bOffsets-1);
p->pHashNext = pHash->aSlot[iHash];
pHash->aSlot[iHash] = p;
pHash->nEntry++;
@ -286,24 +290,32 @@ int sqlite3Fts5HashWrite(
p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iRowid - p->iRowid);
p->iSzPoslist = p->nData;
p->nData += 1;
p->iCol = 0;
p->iCol = (pHash->bOffsets-1);
p->iPos = 0;
p->iRowid = iRowid;
bNew = 1;
}
if( iCol>=0 ){
/* Append a new column value, if necessary */
assert( iCol>=p->iCol );
if( iCol!=p->iCol ){
pPtr[p->nData++] = 0x01;
p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
p->iCol = iCol;
p->iPos = 0;
if( pHash->bOffsets==0 ){
bNew = 1;
p->iCol = iPos = iCol;
}else{
pPtr[p->nData++] = 0x01;
p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
p->iCol = iCol;
p->iPos = 0;
}
}
/* Append the new position offset */
p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
p->iPos = iPos;
/* Append the new position offset, if necessary */
if( bNew ){
p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
p->iPos = iPos;
}
}else{
/* This is a delete. Set the delete flag. */
p->bDel = 1;

View File

@ -4001,6 +4001,14 @@ struct PoslistCallbackCtx {
int eState; /* See above */
};
/*
** Context object passed to fts5PoslistOffsetsCallback(). That callback
** decodes a delta-encoded list of values from its input and re-encodes the
** subset accepted by fts5IndexColsetTest() into pBuf. The caller zeroes
** the whole structure before use, so both running values start at 0.
*/
typedef struct PoslistOffsetsCtx PoslistOffsetsCtx;
struct PoslistOffsetsCtx {
Fts5Buffer *pBuf; /* Append to this buffer */
Fts5Colset *pColset; /* Only values accepted by this colset are copied */
int iRead; /* Most recent value decoded from the input */
int iWrite; /* Most recent value appended to pBuf */
};
/*
** TODO: Make this more efficient!
*/
@ -4012,6 +4020,28 @@ static int fts5IndexColsetTest(Fts5Colset *pColset, int iCol){
return 0;
}
/*
** Chunk callback used when filtering by column set on an offsets=0 table.
**
** pContext points to a PoslistOffsetsCtx. The nChunk bytes at pChunk are
** read as a series of varints, each storing (value - previous value + 2).
** Every decoded value is tested with fts5IndexColsetTest(); those that
** pass are appended to the context's output buffer using the same delta
** encoding, relative to the last value written. The running read/write
** state lives in the context object, not in locals of this function.
*/
static void fts5PoslistOffsetsCallback(
  Fts5Index *p,
  void *pContext,
  const u8 *pChunk, int nChunk
){
  PoslistOffsetsCtx *pOut = (PoslistOffsetsCtx*)pContext;
  int iOff;

  assert_nc( nChunk>=0 );
  for(iOff=0; iOff<nChunk; ){
    int iVal;

    /* Decode the next delta and convert it to an absolute value */
    iOff += fts5GetVarint32(&pChunk[iOff], iVal);
    iVal += pOut->iRead - 2;
    pOut->iRead = iVal;

    /* Skip values that are not part of the requested column set */
    if( fts5IndexColsetTest(pOut->pColset, iVal)==0 ) continue;

    fts5BufferSafeAppendVarint(pOut->pBuf, iVal + 2 - pOut->iWrite);
    pOut->iWrite = iVal;
  }
}
static void fts5PoslistFilterCallback(
Fts5Index *p,
void *pContext,
@ -4079,12 +4109,20 @@ static void fts5SegiterPoslist(
if( pColset==0 ){
fts5ChunkIterate(p, pSeg, (void*)pBuf, fts5PoslistCallback);
}else{
PoslistCallbackCtx sCtx;
sCtx.pBuf = pBuf;
sCtx.pColset = pColset;
sCtx.eState = fts5IndexColsetTest(pColset, 0);
assert( sCtx.eState==0 || sCtx.eState==1 );
fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistFilterCallback);
if( p->pConfig->bOffsets==0 ){
PoslistOffsetsCtx sCtx;
memset(&sCtx, 0, sizeof(sCtx));
sCtx.pBuf = pBuf;
sCtx.pColset = pColset;
fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistOffsetsCallback);
}else{
PoslistCallbackCtx sCtx;
sCtx.pBuf = pBuf;
sCtx.pColset = pColset;
/* eState must be assigned before it is checked: sCtx is an automatic
** variable that is not zeroed, so asserting on it first would read an
** uninitialized value. */
sCtx.eState = fts5IndexColsetTest(pColset, 0);
assert( sCtx.eState==0 || sCtx.eState==1 );
fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistFilterCallback);
}
}
}
}
@ -4446,7 +4484,7 @@ int sqlite3Fts5IndexBeginWrite(Fts5Index *p, int bDelete, i64 iRowid){
/* Allocate the hash table if it has not already been allocated */
if( p->pHash==0 ){
p->rc = sqlite3Fts5HashNew(&p->pHash, &p->nPendingData);
p->rc = sqlite3Fts5HashNew(p->pConfig, &p->pHash, &p->nPendingData);
}
/* Flush the hash table to disk if required */
@ -4804,7 +4842,9 @@ int sqlite3Fts5IterPoslist(
Fts5SegIter *pSeg = &pIter->aSeg[ pIter->aFirst[1].iFirst ];
assert( pIter->pIndex->rc==SQLITE_OK );
*piRowid = pSeg->iRowid;
if( pSeg->iLeafOffset+pSeg->nPos<=pSeg->pLeaf->szLeaf ){
if( pIter->pIndex->pConfig->bOffsets
&& pSeg->iLeafOffset+pSeg->nPos<=pSeg->pLeaf->szLeaf
){
u8 *pPos = &pSeg->pLeaf->p[pSeg->iLeafOffset];
if( pColset==0 || pIter->bFiltered ){
*pn = pSeg->nPos;

View File

@ -825,6 +825,7 @@ struct Fts5IntegrityCtx {
int iCol;
int szCol;
u64 cksum;
Fts5Termset *pTermset;
Fts5Config *pConfig;
};
@ -832,21 +833,33 @@ struct Fts5IntegrityCtx {
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
void *pContext, /* Pointer to Fts5InsertCtx object */
void *pContext, /* Pointer to Fts5IntegrityCtx object */
int tflags,
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
){
int rc = SQLITE_OK;
Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
pCtx->szCol++;
}
pCtx->cksum ^= sqlite3Fts5IndexCksum(
pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
);
return SQLITE_OK;
if( pCtx->pTermset ){
int bPresent = 0;
rc = sqlite3Fts5TermsetAdd(pCtx->pTermset, pToken, nToken, &bPresent);
if( rc==SQLITE_OK && bPresent==0 ){
pCtx->cksum ^= sqlite3Fts5IndexCksum(
pCtx->pConfig, pCtx->iRowid, 0, pCtx->iCol, pToken, nToken
);
}
}else{
pCtx->cksum ^= sqlite3Fts5IndexCksum(
pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
);
}
return rc;
}
/*
@ -886,17 +899,24 @@ int sqlite3Fts5StorageIntegrity(Fts5Storage *p){
if( pConfig->abUnindexed[i] ) continue;
ctx.iCol = i;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, i+1),
sqlite3_column_bytes(pScan, i+1),
(void*)&ctx,
fts5StorageIntegrityCallback
);
if( pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){
if( pConfig->bOffsets==0 ){
rc = sqlite3Fts5TermsetNew(&ctx.pTermset);
}
if( rc==SQLITE_OK ){
rc = sqlite3Fts5Tokenize(pConfig,
FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, i+1),
sqlite3_column_bytes(pScan, i+1),
(void*)&ctx,
fts5StorageIntegrityCallback
);
}
if( rc==SQLITE_OK && pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){
rc = FTS5_CORRUPT;
}
aTotalSize[i] += ctx.szCol;
sqlite3Fts5TermsetFree(ctx.pTermset);
ctx.pTermset = 0;
}
if( rc!=SQLITE_OK ) break;
}

View File

@ -0,0 +1,59 @@
# 2015 December 18
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The
# focus of this script is testing the FTS5 module.
#
source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5offsets
# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
finish_test
return
}
do_execsql_test 1.0 {
CREATE VIRTUAL TABLE t1 USING fts5(a, b, c, offsets=0);
INSERT INTO t1 VALUES('h d g', 'j b b g b', 'i e i d h g g'); -- 1
INSERT INTO t1 VALUES('h j d', 'j h d a h', 'f d d g g f b'); -- 2
INSERT INTO t1 VALUES('j c i', 'f f h e f', 'c j i j c h f'); -- 3
INSERT INTO t1 VALUES('e g g', 'g e d h i', 'e d b e g d c'); -- 4
INSERT INTO t1 VALUES('b c c', 'd i h a f', 'd i j f a b c'); -- 5
INSERT INTO t1 VALUES('e d e', 'b c j g d', 'a i f d h b d'); -- 6
INSERT INTO t1 VALUES('g h e', 'b c d i d', 'e f c i f i c'); -- 7
INSERT INTO t1 VALUES('c f j', 'j j i e a', 'h a c f d h e'); -- 8
INSERT INTO t1 VALUES('a h i', 'c i a f a', 'c f d h g d g'); -- 9
INSERT INTO t1 VALUES('j g g', 'e f e f f', 'h j b i c g e'); -- 10
}
do_execsql_test 1.1 {
INSERT INTO t1(t1) VALUES('integrity-check');
}
foreach {tn match res} {
1 "a:a" {9}
2 "b:g" {1 4 6}
3 "c:h" {1 3 6 8 9 10}
} {
do_execsql_test 1.2.$tn.1 {
SELECT rowid FROM t1($match);
} $res
do_execsql_test 1.2.$tn.2 {
SELECT rowid FROM t1($match || '*');
} $res
}
finish_test

View File

@ -1,5 +1,5 @@
C Fix\sthe\sspellfix1_scriptcode()\sfunction\sto\signore\swhitespace\sand\spunctuation,\nand\sto\srecognize\shebrew\sand\sarabic\sscripts.
D 2015-12-17T14:18:21.904
C Add\sthe\s"offsets=0"\soption\sto\sfts5,\sto\screate\sa\ssmaller\sindex\swithout\sterm\soffset\sinformation.\sA\sfew\sthings\sare\scurrently\sbroken\son\sthis\sbranch.
D 2015-12-17T20:36:13.853
F Makefile.in 28bcd6149e050dff35d4dcfd97e890cd387a499d
F Makefile.linux-gcc 7bc79876b875010e8c8f9502eb935ca92aa3c434
F Makefile.msc 5fff077fcc46de7714ed6eebb6159a4c00eab751
@ -97,15 +97,15 @@ F ext/fts3/unicode/mkunicode.tcl 95cf7ec186e48d4985e433ff8a1c89090a774252
F ext/fts3/unicode/parseunicode.tcl da577d1384810fb4e2b209bf3313074353193e95
F ext/fts5/extract_api_docs.tcl a36e54ec777172ddd3f9a88daf593b00848368e0
F ext/fts5/fts5.h 8b9a13b309b180e9fb88ea5666c0d8d73c6102d9
F ext/fts5/fts5Int.h acf968e43d57b6b1caf7554d34ec35d6ed3b4fe8
F ext/fts5/fts5Int.h 4e1bb66d8e607bf38e881eb455cdf36cc3fa9e42
F ext/fts5/fts5_aux.c 1f384972d606375b8fa078319f25ab4b5feb1b35
F ext/fts5/fts5_buffer.c 1e49512a535045e621246dc7f4f65f3593fa0fc2
F ext/fts5/fts5_config.c 0ee66188609a62342e9f9aeefa3c3e44518a4dd6
F ext/fts5/fts5_buffer.c 389d377d04f6e622644c3343ab5e511f6646de36
F ext/fts5/fts5_config.c ba5248a05c28ec6a6fdf2599a86e9fd67e5c61e2
F ext/fts5/fts5_expr.c 80075fa45091bad42100c4a5c4f2efc83e43e3af
F ext/fts5/fts5_hash.c 25838d525e97f8662ff3504be94d0bad24f9a37e
F ext/fts5/fts5_index.c 578f46697080f11a1e26cd45a1c039c043a3111d
F ext/fts5/fts5_hash.c d4a6b52faca0134cc7bcc880f03a257a0dec2636
F ext/fts5/fts5_index.c 53b3a8f1c9c1f6e5e896b6dc0a7ad26c2eea23a2
F ext/fts5/fts5_main.c ef04699949ab8e42d590ae30188afef7ad58776e
F ext/fts5/fts5_storage.c 9ea3d92178743758b6c54d9fe8836bbbdcc92e3b
F ext/fts5/fts5_storage.c 0dc37a6183e1061e255f23971198d8878159d4ef
F ext/fts5/fts5_tcl.c 3bf445e66de32137d4693694ff7b1fd6074e32bd
F ext/fts5/fts5_test_mi.c e96be827aa8f571031e65e481251dc1981d608bf
F ext/fts5/fts5_tokenize.c 618efe033bceb80c521b1e9ddfd9fee85fb5946e
@ -156,6 +156,7 @@ F ext/fts5/test/fts5integrity.test 87db5d4e7da0ce04a1dcba5ba91658673c997a65
F ext/fts5/test/fts5matchinfo.test 2163b0013e824bba65499da9e34ea4da41349cc2
F ext/fts5/test/fts5merge.test 8f3cdba2ec9c5e7e568246e81b700ad37f764367
F ext/fts5/test/fts5near.test b214cddb1c1f1bddf45c75af768f20145f7e71cc
F ext/fts5/test/fts5offsets.test 09fc61d553ae4e985afc0146ec77f3439503fc6b
F ext/fts5/test/fts5onepass.test 7ed9608e258132cb8d55e7c479b08676ad68810c
F ext/fts5/test/fts5optimize.test 42741e7c085ee0a1276140a752d4407d97c2c9f5
F ext/fts5/test/fts5phrase.test f6d1d464da5beb25dc56277aa4f1d6102f0d9a2f
@ -1405,7 +1406,10 @@ F tool/vdbe_profile.tcl 246d0da094856d72d2c12efec03250d71639d19f
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh 48bd54594752d5be3337f12c72f28d2080cb630b
F tool/win/sqlite.vsix deb315d026cc8400325c5863eef847784a219a2f
P 85ebd46c701e0a094a4690cd8f1d0cbae9aa257c
R 70cdc580d7d9b92b032d33f5e0d50f2e
U drh
Z 6d4b5ed9cd1870281d7d851922f323d8
P 7adfa4a5794e47f97491c08abeaaac90e826b331
R a68b4412544bee6f6bc95a23674c55d4
T *branch * fts5-offsets
T *sym-fts5-offsets *
T -sym-trunk *
U dan
Z a51f39853c3a5371cd0ec3358f50f2d7

View File

@ -1 +1 @@
7adfa4a5794e47f97491c08abeaaac90e826b331
40b5bbf02a824ca73b33aa4ae1c7d5f65b7cda10