Add a rudimentary tokenizer and parser to FTS1 for parsing the module
arguments during initialization. Recognized arguments include a
tokenizer selector and a list of virtual table columns. (CVS 3403)

FossilOrigin-Name: 227dc3feb537e6efd5b0c1d2dad40193db07d5aa
This commit is contained in:
drh 2006-09-11 00:34:22 +00:00
parent 4ca8aac2b4
commit e410296021
9 changed files with 285 additions and 57 deletions

View File

@ -806,6 +806,8 @@ typedef struct fulltext_vtab {
sqlite3 *db;
const char *zName; /* virtual table name */
sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
int nColumn; /* Number of columns */
char **azColumn; /* Names of all columns */
/* Precompiled statements which we keep as long as the table is
** open.
@ -1081,6 +1083,9 @@ static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
}
/*
** Free the memory used to contain a fulltext_vtab structure.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
int iStmt;
@ -1096,53 +1101,268 @@ static void fulltext_vtab_destroy(fulltext_vtab *v){
v->pTokenizer->pModule->xDestroy(v->pTokenizer);
v->pTokenizer = NULL;
}
free(v->azColumn);
free((void *) v->zName);
free(v);
}
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** Any character with the high-order bit set is allowed in an
** identifier.  Characters below 0x20 are never allowed.  For the
** remaining 7-bit characters, isIdChar[X-0x20] must be 1.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
**
** NOTE: the IdChar() macro stores its argument in a variable named
** "c", so an integer variable of that name must be in scope wherever
** the macro is used.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define IdChar(C)  (((c=(C))&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))

/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
**
** The tokens recognized are:
**
**   TOKEN_EOF     The '\000' terminator.  The returned length is 0.
**   TOKEN_SPACE   A run of whitespace characters.
**   TOKEN_STRING  Text enclosed in '...' or "...".  A doubled delimiter
**                 inside the string stands for the delimiter itself.
**                 An unterminated string runs to the end of the input.
**   TOKEN_ID      Either text enclosed in [...] (running to the end of
**                 the input if the ']' is missing) or a run of
**                 identifier characters as defined by IdChar().
**   TOKEN_PUNCT   Any other single character.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* The argument to isspace() must be representable as an
      ** unsigned char.  Passing a plain char is undefined for bytes
      ** with the high bit set on platforms where char is signed. */
      for(i=1; isspace((unsigned char)z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;          /* Doubled delimiter: skip the escaped quote */
          }else{
            break;        /* Closing delimiter */
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Include the closing delimiter in the length when one was found */
      return i + (c!=0);
    }
    case '[': {
      /* Scan through the closing ']' or up to the end of the input */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  *tokenType = TOKEN_PUNCT;
  return 1;
}
/*
** A token extracted from a string is an instance of the following
** structure.
**
** The length is held in an int, the same type that getToken() returns,
** so that the length of a very long token (more than 32767 bytes) is
** not truncated when it is stored here.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  int n;               /* Length of the token text in bytes. */
} Token;
/*
** Given a input string (which is really one of the argv[] parameters
** passed into xConnect or xCreate) split the string up into tokens.
** Return an array of pointers to '\000' terminated strings, one string
** for each non-whitespace token.
**
** The returned array is terminated by a single NULL pointer.  The
** number of tokens, not counting the NULL terminator, is written
** into *pnToken.
**
** Space to hold the returned array is obtained from a single
** malloc and should be freed by passing the return value to free().
** The individual strings within the token list are all a part of
** the single memory allocation and will all be freed at once.
**
** If a memory allocation fails, NULL is returned and *pnToken is
** set to 0.
*/
static char **tokenizeString(const char *z, int *pnToken){
  int nToken = 0;       /* Entries used in aToken[], including the EOF token */
  Token *aToken;        /* Scratch list of token boundaries within z */
  int n = 1;            /* Length of the most recently scanned token */
  int e, i;
  int totalSize = 0;    /* Bytes needed for all token texts and terminators */
  char **azToken;
  char *zCopy;

  *pnToken = 0;

  /* Every token other than the final EOF token is at least one byte
  ** long, so there can be as many as strlen(z) tokens plus one more
  ** entry for the EOF token that the loop below also records. */
  aToken = malloc( (strlen(z)+1) * sizeof(aToken[0]) );
  if( aToken==0 ) return 0;

  while( n>0 ){
    n = getToken(z, &e);
    if( e!=TOKEN_SPACE ){
      aToken[nToken].z = z;
      aToken[nToken].n = n;
      nToken++;
      totalSize += n+1;
    }
    z += n;
  }

  /* One pointer per recorded token.  The slot that corresponds to the
  ** EOF token becomes the NULL terminator.  The string data follows
  ** the pointer array within the same allocation. */
  azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
  if( azToken==0 ){
    free(aToken);
    return 0;
  }
  zCopy = (char*)&azToken[nToken];
  nToken--;             /* Do not count the EOF token */
  for(i=0; i<nToken; i++){
    azToken[i] = zCopy;
    n = aToken[i].n;
    memcpy(zCopy, aToken[i].z, n);
    zCopy[n] = 0;
    zCopy += n+1;
  }
  azToken[nToken] = 0;
  free(aToken);
  *pnToken = nToken;
  return azToken;
}
/*
** Remove the first nSkip tokens from a token list as well
** as all "(", ",", and ")" tokens from a token list.
**
** azTokens is a NULL-terminated array of token strings.  The surviving
** tokens are moved to the front of the array, the array is
** NULL-terminated again, and the number of surviving tokens is
** written into *pnToken.
**
** If nSkip is larger than the number of tokens in the list, the
** whole list is removed.  The scan never goes past the NULL
** terminator.
**
** The memory for a token list comes from a single malloc().
** This routine just rearranges the pointers within that allocation.
** The token list is still freed by a single free().
*/
static void removeDelimiterTokens(char **azTokens, int nSkip, int *pnToken){
  int i, j, c;

  /* Step over the leading tokens, but stop at the end of the list */
  for(i=0; i<nSkip && azTokens[i]; i++){}

  for(j=0; azTokens[i]; i++){
    /* Tokens are classified by their first character only */
    c = azTokens[i][0];
    if( c=='(' || c==',' || c==')' ) continue;
    azTokens[j++] = azTokens[i];
  }
  azTokens[j] = 0;
  *pnToken = j;
}
/* Current interface:
** argv[0] - module name
** argv[1] - database name
** argv[2] - table name
** argv[0] - module name
** argv[1] - database name
** argv[2...] - arguments.
**
** Arguments:
**
** tokenizer NAME(ARG1,ARG2,...)
** columns(C1,C2,C3,...)
** argv[3] - tokenizer name (optional, a sensible default is provided)
** argv[4..] - passed to tokenizer (optional based on tokenizer)
**/
static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv,
sqlite3_vtab **ppVTab, char **pzErr){
int rc;
fulltext_vtab *v;
static int fulltextConnect(
sqlite3 *db,
void *pAux,
int argc, const char *const*argv,
sqlite3_vtab **ppVTab,
char **pzErr
){
int rc, i;
fulltext_vtab *v = 0;
const sqlite3_tokenizer_module *m = NULL;
char **azToken = 0;
int seen_tokenizer = 0;
int seen_columns = 0;
assert( argc>=3 );
v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
if( v==0 ) goto connect_failed;
memset(v, 0, sizeof(*v));
v->db = db;
v->zName = string_dup(argv[2]);
v->pTokenizer = NULL;
if( argc==3 ){
sqlite3Fts1SimpleTokenizerModule(&m);
} else {
/* TODO(shess) For now, add new tokenizers as else if clauses. */
if( !strcmp(argv[3], "simple") ){
sqlite3Fts1SimpleTokenizerModule(&m);
} else {
*pzErr = sqlite3_mprintf("unknown tokenizer: %s", argv[3]);
assert( "unrecognized tokenizer"==NULL );
/* Process arguments to the module */
for(i=3; i<argc; i++){
int nToken;
azToken = tokenizeString(argv[i], &nToken);
if( azToken==0 ) goto connect_failed;
removeDelimiterTokens(azToken, 0, &nToken);
if( nToken>=2 && strcmp(azToken[0],"tokenizer")==0 ){
if( seen_tokenizer ){
*pzErr = sqlite3_mprintf("multiple tokenizer definitions");
goto connect_failed;
}
seen_tokenizer = 1;
if( !strcmp(azToken[1], "simple") ){
sqlite3Fts1SimpleTokenizerModule(&m);
} else {
*pzErr = sqlite3_mprintf("unknown tokenizer: %s", azToken[1]);
goto connect_failed;
}
rc = m->xCreate(nToken-2, (const char *const*)&azToken[2],&v->pTokenizer);
v->pTokenizer->pModule = m;
m = 0;
if( rc ){
goto connect_failed;
}
}else if( nToken>=2 && strcmp(azToken[0], "columns")==0 ){
if( seen_columns ){
*pzErr = sqlite3_mprintf("multiple column definitions");
goto connect_failed;
}
removeDelimiterTokens(azToken, 1, &nToken);
v->nColumn = nToken;
v->azColumn = azToken;
azToken = 0;
seen_columns = 1;
}else{
*pzErr = sqlite3_mprintf("bad argument: %s", argv[i]);
goto connect_failed;
}
free(azToken);
azToken = 0;
}
/* TODO(shess) Since tokenization impacts the index, the parameters
** to the tokenizer need to be identical when a persistent virtual
** table is re-created. One solution would be a meta-table to track
** such information in the database. Then we could verify that the
** information is identical on subsequent creates.
/* Put in default values for the column names and the tokenizer if
** none is specified in the arguments.
*/
/* TODO(shess) Why isn't argv already (const char **)? */
rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
if( rc!=SQLITE_OK ) return rc;
v->pTokenizer->pModule = m;
if( !seen_tokenizer ){
sqlite3Fts1SimpleTokenizerModule(&m);
rc = m->xCreate(0, 0, &v->pTokenizer);
v->pTokenizer->pModule = m;
if( rc!=SQLITE_OK ){
goto connect_failed;
}
m = 0;
}
if( !seen_columns ){
v->nColumn = 1;
v->azColumn = malloc( sizeof(char*)*2 );
if( v->azColumn==0 ) goto connect_failed;
v->azColumn[0] = "content";
v->azColumn[1] = 0;
}
/* TODO: verify the existence of backing tables foo_content, foo_term */
@ -1154,9 +1374,17 @@ static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv,
*ppVTab = &v->base;
TRACE(("FTS1 Connect %p\n", v));
return SQLITE_OK;
connect_failed:
if( v ){
fulltext_vtab_destroy(v);
}
free(azToken);
return SQLITE_ERROR;
}
static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv,
static int fulltextCreate(sqlite3 *db, void *pAux,
int argc, const char *const*argv,
sqlite3_vtab **ppVTab, char **pzErr){
int rc;
assert( argc>=3 );

View File

@ -40,7 +40,7 @@ struct sqlite3_tokenizer_module {
** Create and destroy a tokenizer. argc/argv are passed down from
** the fulltext virtual table creation to allow customization.
*/
int (*xCreate)(int argc, const char **argv,
int (*xCreate)(int argc, const char *const*argv,
sqlite3_tokenizer **ppTokenizer);
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

View File

@ -1,5 +1,5 @@
C Add\spzErr\sparameters\sto\sthe\sxConnect\sand\sxCreate\smethods\sof\svirtual\stables\nin\sorder\sto\sprovide\sbetter\serror\sreporting.\s\sThis\sis\san\sinterface\schange\nfor\svirtual\stables.\s\sPrior\svirtual\stable\simplementations\swill\sneed\sto\sbe\nmodified\sand\srecompiled.\s(CVS\s3402)
D 2006-09-10T17:31:59
C Add\sa\srudimentary\stokenizer\sand\sparser\sto\sFTS1\sfor\sparsing\sthe\smodule\narguments\sduring\sinitialization.\s\s\sRecognized\sarguments\sinclude\sa\ntokenizer\sselector\sand\sa\slist\sof\svirtual\stable\scolumns.\s(CVS\s3403)
D 2006-09-11T00:34:22
F Makefile.in cabd42d34340f49260bc2a7668c38eba8d4cfd99
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -21,11 +21,11 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5
F ext/fts1/fts1.c a0f9600c5d3fedaf0002247b554c0570c431bf9e
F ext/fts1/fts1.c 022a985bafaecdd6d245ddfeba68f9d268fccd9d
F ext/fts1/fts1.h fe8e8f38dd6d2d2645b9b0d6972e80985249575f
F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089
F ext/fts1/fts1_tokenizer.h a90c4d022d1c5e50ca931a9b6415bc8bce12b76e
F ext/fts1/fts1_tokenizer.h 12c0e7ad83120aff1f86ca848149f96f61da738b
F ext/fts1/fts1_tokenizer1.c 1155942be01e8b191b13ac2ea4604b301f77e73e
F ext/fts1/fulltext.c d935e600d87bc86b7d64f55c7520ea41d6034c5c
F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
@ -86,7 +86,7 @@ F src/random.c d40f8d356cecbd351ccfab6eaedd7ec1b54f5261
F src/select.c 0d4724930a1f34c747105ed1802fa4af0d8eb519
F src/server.c 087b92a39d883e3fa113cae259d64e4c7438bc96
F src/shell.c 233f7766e532a204bed465249ffc584424ed1757
F src/sqlite.h.in c76f7a4609631606f657fbe976e3bc901d39c2d3
F src/sqlite.h.in 19f5390cce182242b309a053aa1ee2b902bee147
F src/sqlite3ext.h 11a046b3519c4b9b7709e6d6a95c3a36366f684a
F src/sqliteInt.h 259adce944cc3b28da1fa3df9beb9ba86017a45d
F src/table.c d8817f43a6c6bf139487db161760b9e1e02da3f1
@ -98,14 +98,14 @@ F src/test4.c 8b784cd82de158a2317cb4ac4bc86f91ad315e25
F src/test5.c 7162f8526affb771c4ed256826eee7bb9eca265f
F src/test6.c 60a02961ceb7b3edc25f5dc5c1ac2556622a76de
F src/test7.c 03fa8d787f6aebc6d1f72504d52f33013ad2c8e3
F src/test8.c f86da05e9611275a8ea8bbd679ebe89e9dddc4f1
F src/test8.c cdde31e45651081a88845d5e66eeed450a7e2a3e
F src/test_async.c e3deaedd4d86a56391b81808fde9e44fbd92f1d3
F src/test_autoext.c bbb70bc1c83bd273cf59908ca9b486ae5df55a59
F src/test_loadext.c 22065d601a18878e5542191001f0eaa5d77c0ed8
F src/test_md5.c 6c42bc0a3c0b54be34623ff77a0eec32b2fa96e3
F src/test_schema.c 01a3bdd6005bffe6212468bf8e232fe31086d235
F src/test_schema.c ced72140a3a25c148975428e170ec1850d3c3a7d
F src/test_server.c a6460daed0b92ecbc2531b6dc73717470e7a648c
F src/test_tclvar.c be4e54ce56d612a90907e5190d8142875cdc778c
F src/test_tclvar.c 315e77c17f128ff8c06b38c08617fd07c825a95b
F src/tokenize.c dfdff21768fbedd40e8d3ca84fc5d0d7af2b46dd
F src/trigger.c 0fc40125820409a6274834a6e04ad804d96e2793
F src/update.c 951f95ef044cf6d28557c48dc35cb0711a0b9129
@ -119,7 +119,7 @@ F src/vdbeapi.c 81f531d7dc5c898131b02ef85f6c6144ab2892cf
F src/vdbeaux.c 9fab61427a0741c9c123e8ff16e349b1f90397be
F src/vdbefifo.c 9efb94c8c3f4c979ebd0028219483f88e57584f5
F src/vdbemem.c 26623176bf1c616aa478da958fac49502491a921
F src/vtab.c 430513b5e2b3cfe72f960be2d1dff41ce8ac0f9d
F src/vtab.c 4d360f2222c6c9a4b779d733fbfb8ddf61be9eb4
F src/where.c 75a89957fcb8c068bec55caa4e9d2ed5fa0b0724
F tclinstaller.tcl 046e3624671962dc50f0481d7c25b38ef803eb42
F test/aggerror.test a867e273ef9e3d7919f03ef4f0e8c0d2767944f2
@ -397,7 +397,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513
P 36693a5cb72b4363010f9ab0866e1f7865f65275
R bc62906dea603a74fb4a9c89628cc681
P f44b8bae97b6872524580009c96d07391578c388
R 9998640c1fac069b758db49fd22e886b
U drh
Z c46929b76e373fecb1fd2b6b3f4e1308
Z 2b1ea65d3e0e9f0fb73f4523c49b6c09

View File

@ -1 +1 @@
f44b8bae97b6872524580009c96d07391578c388
227dc3feb537e6efd5b0c1d2dad40193db07d5aa

View File

@ -12,7 +12,7 @@
** This header file defines the interface that the SQLite library
** presents to client programs.
**
** @(#) $Id: sqlite.h.in,v 1.191 2006/09/10 17:31:59 drh Exp $
** @(#) $Id: sqlite.h.in,v 1.192 2006/09/11 00:34:22 drh Exp $
*/
#ifndef _SQLITE3_H_
#define _SQLITE3_H_
@ -1581,10 +1581,10 @@ typedef struct sqlite3_module sqlite3_module;
struct sqlite3_module {
int iVersion;
int (*xCreate)(sqlite3*, void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVTab, char**);
int (*xConnect)(sqlite3*, void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVTab, char**);
int (*xBestIndex)(sqlite3_vtab *pVTab, sqlite3_index_info*);
int (*xDisconnect)(sqlite3_vtab *pVTab);

View File

@ -13,7 +13,7 @@
** is not included in the SQLite library. It is used for automated
** testing of the SQLite library.
**
** $Id: test8.c,v 1.41 2006/09/10 17:32:00 drh Exp $
** $Id: test8.c,v 1.42 2006/09/11 00:34:22 drh Exp $
*/
#include "sqliteInt.h"
#include "tcl.h"
@ -257,7 +257,7 @@ static int echoDeclareVtab(
echo_vtab *pVtab,
sqlite3 *db,
int argc,
char **argv
const char *const*argv
){
int rc = SQLITE_OK;
@ -311,7 +311,7 @@ static int echoDestructor(sqlite3_vtab *pVtab){
static int echoConstructor(
sqlite3 *db,
void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){
@ -358,7 +358,7 @@ static int echoConstructor(
static int echoCreate(
sqlite3 *db,
void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){
@ -394,7 +394,7 @@ static int echoCreate(
static int echoConnect(
sqlite3 *db,
void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){

View File

@ -13,7 +13,7 @@
** is not included in the SQLite library. It is used for automated
** testing of the SQLite library.
**
** $Id: test_schema.c,v 1.10 2006/09/10 17:32:00 drh Exp $
** $Id: test_schema.c,v 1.11 2006/09/11 00:34:22 drh Exp $
*/
/* The code in this file defines a sqlite3 virtual-table module that
@ -84,7 +84,7 @@ static int schemaDestroy(sqlite3_vtab *pVtab){
static int schemaCreate(
sqlite3 *db,
void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){

View File

@ -16,7 +16,7 @@
** The emphasis of this file is a virtual table that provides
** access to TCL variables.
**
** $Id: test_tclvar.c,v 1.9 2006/09/10 17:32:00 drh Exp $
** $Id: test_tclvar.c,v 1.10 2006/09/11 00:34:22 drh Exp $
*/
#include "sqliteInt.h"
#include "tcl.h"
@ -51,7 +51,7 @@ struct tclvar_cursor {
static int tclvarConnect(
sqlite3 *db,
void *pAux,
int argc, char **argv,
int argc, const char *const*argv,
sqlite3_vtab **ppVtab,
char **pzErr
){

View File

@ -11,7 +11,7 @@
*************************************************************************
** This file contains code used to help implement virtual tables.
**
** $Id: vtab.c,v 1.33 2006/09/10 17:32:00 drh Exp $
** $Id: vtab.c,v 1.34 2006/09/11 00:34:22 drh Exp $
*/
#ifndef SQLITE_OMIT_VIRTUALTABLE
#include "sqliteInt.h"
@ -286,13 +286,13 @@ static int vtabCallConstructor(
sqlite3 *db,
Table *pTab,
Module *pMod,
int (*xConstruct)(sqlite3*,void*,int,char**,sqlite3_vtab**,char**),
int (*xConstruct)(sqlite3*,void*,int,const char*const*,sqlite3_vtab**,char**),
char **pzErr
){
int rc;
int rc2;
sqlite3_vtab *pVtab;
char **azArg = pTab->azModuleArg;
const char *const*azArg = (const char *const*)pTab->azModuleArg;
int nArg = pTab->nModuleArg;
char *zErr = 0;
@ -314,7 +314,7 @@ static int vtabCallConstructor(
if( zErr==0 ){
*pzErr = sqlite3MPrintf("vtable constructor failed: %s", pTab->zName);
}else {
*pzErr = sqlite3_mprintf("%s", zErr);
*pzErr = sqlite3MPrintf("%s", zErr);
sqlite3_free(zErr);
}
}else if( db->pVTab ){