Omit the long-disused FTS1 and FTS2 implements from the active source tree.

The code will persist forever in the source repository, but there is no point
in carrying it around in the latest tarballs where it is never used.

FossilOrigin-Name: 2bb50d5aedef0fd216d94058f477a58d88aa3a68bbadc94fa67998b7c391a8ff
This commit is contained in:
drh 2023-01-14 19:53:42 +00:00
parent 3f7b9944b8
commit 57003792df
31 changed files with 11 additions and 16560 deletions

View File

@ -315,24 +315,6 @@ SRC = \
# Source code for extensions
#
SRC += \
$(TOP)/ext/fts1/fts1.c \
$(TOP)/ext/fts1/fts1.h \
$(TOP)/ext/fts1/fts1_hash.c \
$(TOP)/ext/fts1/fts1_hash.h \
$(TOP)/ext/fts1/fts1_porter.c \
$(TOP)/ext/fts1/fts1_tokenizer.h \
$(TOP)/ext/fts1/fts1_tokenizer1.c
SRC += \
$(TOP)/ext/fts2/fts2.c \
$(TOP)/ext/fts2/fts2.h \
$(TOP)/ext/fts2/fts2_hash.c \
$(TOP)/ext/fts2/fts2_hash.h \
$(TOP)/ext/fts2/fts2_icu.c \
$(TOP)/ext/fts2/fts2_porter.c \
$(TOP)/ext/fts2/fts2_tokenizer.h \
$(TOP)/ext/fts2/fts2_tokenizer.c \
$(TOP)/ext/fts2/fts2_tokenizer1.c
SRC += \
$(TOP)/ext/fts3/fts3.c \
$(TOP)/ext/fts3/fts3.h \
@ -565,14 +547,6 @@ HDR = \
# Header files used by extensions
#
EXTHDR += \
$(TOP)/ext/fts1/fts1.h \
$(TOP)/ext/fts1/fts1_hash.h \
$(TOP)/ext/fts1/fts1_tokenizer.h
EXTHDR += \
$(TOP)/ext/fts2/fts2.h \
$(TOP)/ext/fts2/fts2_hash.h \
$(TOP)/ext/fts2/fts2_tokenizer.h
EXTHDR += \
$(TOP)/ext/fts3/fts3.h \
$(TOP)/ext/fts3/fts3Int.h \
@ -1148,24 +1122,6 @@ shell.c: $(SHELL_SRC) $(TOP)/tool/mkshellc.tcl
icu.lo: $(TOP)/ext/icu/icu.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/icu/icu.c
fts2.lo: $(TOP)/ext/fts2/fts2.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2.c
fts2_hash.lo: $(TOP)/ext/fts2/fts2_hash.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_hash.c
fts2_icu.lo: $(TOP)/ext/fts2/fts2_icu.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_icu.c
fts2_porter.lo: $(TOP)/ext/fts2/fts2_porter.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_porter.c
fts2_tokenizer.lo: $(TOP)/ext/fts2/fts2_tokenizer.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_tokenizer.c
fts2_tokenizer1.lo: $(TOP)/ext/fts2/fts2_tokenizer1.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_tokenizer1.c
fts3.lo: $(TOP)/ext/fts3/fts3.c $(HDR) $(EXTHDR)
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3.c

View File

@ -1412,20 +1412,6 @@ SRC05 = \
$(TOP)\src\wal.h \
$(TOP)\src\whereInt.h
# Extension source code files, part 1.
#
SRC06 = \
$(TOP)\ext\fts1\fts1.c \
$(TOP)\ext\fts1\fts1_hash.c \
$(TOP)\ext\fts1\fts1_porter.c \
$(TOP)\ext\fts1\fts1_tokenizer1.c \
$(TOP)\ext\fts2\fts2.c \
$(TOP)\ext\fts2\fts2_hash.c \
$(TOP)\ext\fts2\fts2_icu.c \
$(TOP)\ext\fts2\fts2_porter.c \
$(TOP)\ext\fts2\fts2_tokenizer.c \
$(TOP)\ext\fts2\fts2_tokenizer1.c
# Extension source code files, part 2.
#
SRC07 = \
@ -1448,16 +1434,6 @@ SRC07 = \
$(TOP)\ext\rbu\sqlite3rbu.c \
$(TOP)\ext\misc\stmt.c
# Extension header files, part 1.
#
SRC08 = \
$(TOP)\ext\fts1\fts1.h \
$(TOP)\ext\fts1\fts1_hash.h \
$(TOP)\ext\fts1\fts1_tokenizer.h \
$(TOP)\ext\fts2\fts2.h \
$(TOP)\ext\fts2\fts2_hash.h \
$(TOP)\ext\fts2\fts2_tokenizer.h
# Extension header files, part 2.
#
SRC09 = \
@ -1498,7 +1474,7 @@ SRC12 =
# All source code files.
#
SRC = $(SRC00) $(SRC01) $(SRC03) $(SRC04) $(SRC05) $(SRC06) $(SRC07) $(SRC08) $(SRC09) $(SRC10) $(SRC11) $(SRC12)
SRC = $(SRC00) $(SRC01) $(SRC03) $(SRC04) $(SRC05) $(SRC07) $(SRC09) $(SRC10) $(SRC11) $(SRC12)
# Source code to the test files.
#
@ -1606,7 +1582,6 @@ TESTEXT = $(TESTEXT) $(TOP)\ext\misc\zipfile.c
TESTSRC2 = \
$(SRC00) \
$(SRC01) \
$(SRC06) \
$(SRC07) \
$(SRC10) \
$(TOP)\ext\async\sqlite3async.c
@ -1641,14 +1616,6 @@ HDR = \
# Header files used by extensions
#
EXTHDR = $(EXTHDR) \
$(TOP)\ext\fts1\fts1.h \
$(TOP)\ext\fts1\fts1_hash.h \
$(TOP)\ext\fts1\fts1_tokenizer.h
EXTHDR = $(EXTHDR) \
$(TOP)\ext\fts2\fts2.h \
$(TOP)\ext\fts2\fts2_hash.h \
$(TOP)\ext\fts2\fts2_tokenizer.h
EXTHDR = $(EXTHDR) \
$(TOP)\ext\fts3\fts3.h \
$(TOP)\ext\fts3\fts3Int.h \
@ -1860,9 +1827,7 @@ mptest: mptester.exe
for %i in ($(SRC03)) do copy /Y %i tsrc
for %i in ($(SRC04)) do copy /Y %i tsrc
for %i in ($(SRC05)) do copy /Y %i tsrc
for %i in ($(SRC06)) do copy /Y %i tsrc
for %i in ($(SRC07)) do copy /Y %i tsrc
for %i in ($(SRC08)) do copy /Y %i tsrc
for %i in ($(SRC09)) do copy /Y %i tsrc
for %i in ($(SRC10)) do copy /Y %i tsrc
for %i in ($(SRC11)) do copy /Y %i tsrc
@ -2267,24 +2232,6 @@ zlib:
icu.lo: $(TOP)\ext\icu\icu.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\icu\icu.c
fts2.lo: $(TOP)\ext\fts2\fts2.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2.c
fts2_hash.lo: $(TOP)\ext\fts2\fts2_hash.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2_hash.c
fts2_icu.lo: $(TOP)\ext\fts2\fts2_icu.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2_icu.c
fts2_porter.lo: $(TOP)\ext\fts2\fts2_porter.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2_porter.c
fts2_tokenizer.lo: $(TOP)\ext\fts2\fts2_tokenizer.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2_tokenizer.c
fts2_tokenizer1.lo: $(TOP)\ext\fts2\fts2_tokenizer1.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts2\fts2_tokenizer1.c
fts3.lo: $(TOP)\ext\fts3\fts3.c $(HDR) $(EXTHDR)
$(LTCOMPILE) $(CORE_COMPILE_OPTS) $(NO_WARN) -DSQLITE_CORE -c $(TOP)\ext\fts3\fts3.c

View File

@ -1,2 +0,0 @@
This folder contains source code to the first full-text search
extension for SQLite.

View File

@ -1,404 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the implementation of generic hash-tables used in SQLite.
** We've modified it slightly to serve as a standalone hash table
** implementation for the full-text indexing module.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "ft_hash.h"
void *malloc_and_zero(int n){
void *p = malloc(n);
if( p ){
memset(p, 0, n);
}
return p;
}
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants HASH_INT, HASH_POINTER,
** HASH_BINARY, or HASH_STRING. The value of keyClass
** determines what kind of key the hash table will use. "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer. CopyKey only makes
** sense for HASH_STRING and HASH_BINARY and is ignored
** for other key classes.
*/
void HashInit(Hash *pNew, int keyClass, int copyKey){
assert( pNew!=0 );
assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY );
pNew->keyClass = keyClass;
#if 0
if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0;
#endif
pNew->copyKey = copyKey;
pNew->first = 0;
pNew->count = 0;
pNew->htsize = 0;
pNew->ht = 0;
pNew->xMalloc = malloc_and_zero;
pNew->xFree = free;
}
/* Remove all entries from a hash table. Reclaim all memory.
** Call this routine to delete a hash table or to reset a hash table
** to the empty state.
*/
void HashClear(Hash *pH){
HashElem *elem; /* For looping over all elements of the table */
assert( pH!=0 );
elem = pH->first;
pH->first = 0;
if( pH->ht ) pH->xFree(pH->ht);
pH->ht = 0;
pH->htsize = 0;
while( elem ){
HashElem *next_elem = elem->next;
if( pH->copyKey && elem->pKey ){
pH->xFree(elem->pKey);
}
pH->xFree(elem);
elem = next_elem;
}
pH->count = 0;
}
#if 0 /* NOT USED */
/*
** Hash and comparison functions when the mode is HASH_INT
*/
static int intHash(const void *pKey, int nKey){
return nKey ^ (nKey<<8) ^ (nKey>>8);
}
static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){
return n2 - n1;
}
#endif
#if 0 /* NOT USED */
/*
** Hash and comparison functions when the mode is HASH_POINTER
*/
static int ptrHash(const void *pKey, int nKey){
uptr x = Addr(pKey);
return x ^ (x<<8) ^ (x>>8);
}
static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( pKey1==pKey2 ) return 0;
if( pKey1<pKey2 ) return -1;
return 1;
}
#endif
/*
** Hash and comparison functions when the mode is HASH_STRING
*/
static int strHash(const void *pKey, int nKey){
const char *z = (const char *)pKey;
int h = 0;
if( nKey<=0 ) nKey = (int) strlen(z);
while( nKey > 0 ){
h = (h<<3) ^ h ^ *z++;
nKey--;
}
return h & 0x7fffffff;
}
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
}
/*
** Hash and comparison functions when the mode is HASH_BINARY
*/
static int binHash(const void *pKey, int nKey){
int h = 0;
const char *z = (const char *)pKey;
while( nKey-- > 0 ){
h = (h<<3) ^ h ^ *(z++);
}
return h & 0x7fffffff;
}
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return memcmp(pKey1,pKey2,n1);
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** The C syntax in this function definition may be unfamilar to some
** programmers, so we provide the following additional explanation:
**
** The name of the function is "hashFunction". The function takes a
** single parameter "keyClass". The return value of hashFunction()
** is a pointer to another function. Specifically, the return value
** of hashFunction() is a pointer to a function that takes two parameters
** with types "const void*" and "int" and returns an "int".
*/
static int (*hashFunction(int keyClass))(const void*,int){
#if 0 /* HASH_INT and HASH_POINTER are never used */
switch( keyClass ){
case HASH_INT: return &intHash;
case HASH_POINTER: return &ptrHash;
case HASH_STRING: return &strHash;
case HASH_BINARY: return &binHash;;
default: break;
}
return 0;
#else
if( keyClass==HASH_STRING ){
return &strHash;
}else{
assert( keyClass==HASH_BINARY );
return &binHash;
}
#endif
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** For help in interpreted the obscure C code in the function definition,
** see the header comment on the previous function.
*/
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
#if 0 /* HASH_INT and HASH_POINTER are never used */
switch( keyClass ){
case HASH_INT: return &intCompare;
case HASH_POINTER: return &ptrCompare;
case HASH_STRING: return &strCompare;
case HASH_BINARY: return &binCompare;
default: break;
}
return 0;
#else
if( keyClass==HASH_STRING ){
return &strCompare;
}else{
assert( keyClass==HASH_BINARY );
return &binCompare;
}
#endif
}
/* Link an element into the hash table
*/
static void insertElement(
Hash *pH, /* The complete hash table */
struct _ht *pEntry, /* The entry into which pNew is inserted */
HashElem *pNew /* The element to be inserted */
){
HashElem *pHead; /* First element already in pEntry */
pHead = pEntry->chain;
if( pHead ){
pNew->next = pHead;
pNew->prev = pHead->prev;
if( pHead->prev ){ pHead->prev->next = pNew; }
else { pH->first = pNew; }
pHead->prev = pNew;
}else{
pNew->next = pH->first;
if( pH->first ){ pH->first->prev = pNew; }
pNew->prev = 0;
pH->first = pNew;
}
pEntry->count++;
pEntry->chain = pNew;
}
/* Resize the hash table so that it cantains "new_size" buckets.
** "new_size" must be a power of 2. The hash table might fail
** to resize if sqliteMalloc() fails.
*/
static void rehash(Hash *pH, int new_size){
struct _ht *new_ht; /* The new hash table */
HashElem *elem, *next_elem; /* For looping over existing elements */
int (*xHash)(const void*,int); /* The hash function */
assert( (new_size & (new_size-1))==0 );
new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) );
if( new_ht==0 ) return;
if( pH->ht ) pH->xFree(pH->ht);
pH->ht = new_ht;
pH->htsize = new_size;
xHash = hashFunction(pH->keyClass);
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
next_elem = elem->next;
insertElement(pH, &new_ht[h], elem);
}
}
/* This function (for internal use only) locates an element in an
** hash table that matches the given key. The hash for this key has
** already been computed and is passed as the 4th parameter.
*/
static HashElem *findElementGivenHash(
const Hash *pH, /* The pH to be searched */
const void *pKey, /* The key we are searching for */
int nKey,
int h /* The hash for this key. */
){
HashElem *elem; /* Used to loop thru the element list */
int count; /* Number of elements left to test */
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
if( pH->ht ){
struct _ht *pEntry = &pH->ht[h];
elem = pEntry->chain;
count = pEntry->count;
xCompare = compareFunction(pH->keyClass);
while( count-- && elem ){
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
return elem;
}
elem = elem->next;
}
}
return 0;
}
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
*/
static void removeElementGivenHash(
Hash *pH, /* The pH containing "elem" */
HashElem* elem, /* The element to be removed from the pH */
int h /* Hash value for the element */
){
struct _ht *pEntry;
if( elem->prev ){
elem->prev->next = elem->next;
}else{
pH->first = elem->next;
}
if( elem->next ){
elem->next->prev = elem->prev;
}
pEntry = &pH->ht[h];
if( pEntry->chain==elem ){
pEntry->chain = elem->next;
}
pEntry->count--;
if( pEntry->count<=0 ){
pEntry->chain = 0;
}
if( pH->copyKey && elem->pKey ){
pH->xFree(elem->pKey);
}
pH->xFree( elem );
pH->count--;
if( pH->count<=0 ){
assert( pH->first==0 );
assert( pH->count==0 );
HashClear(pH);
}
}
/* Attempt to locate an element of the hash table pH with a key
** that matches pKey,nKey. Return the data for this element if it is
** found, or NULL if there is no match.
*/
void *HashFind(const Hash *pH, const void *pKey, int nKey){
int h; /* A hash on key */
HashElem *elem; /* The element that matches key */
int (*xHash)(const void*,int); /* The hash function */
if( pH==0 || pH->ht==0 ) return 0;
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
h = (*xHash)(pKey,nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
return elem ? elem->data : 0;
}
/* Insert an element into the hash table pH. The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created. A copy of the key is made if the copyKey
** flag is set. NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance. If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){
int hraw; /* Raw hash value of the key */
int h; /* the hash of the key modulo hash table size */
HashElem *elem; /* Used to loop thru the element list */
HashElem *new_elem; /* New element added to the pH */
int (*xHash)(const void*,int); /* The hash function */
assert( pH!=0 );
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
hraw = (*xHash)(pKey, nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
elem = findElementGivenHash(pH,pKey,nKey,h);
if( elem ){
void *old_data = elem->data;
if( data==0 ){
removeElementGivenHash(pH,elem,h);
}else{
elem->data = data;
}
return old_data;
}
if( data==0 ) return 0;
new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) );
if( new_elem==0 ) return data;
if( pH->copyKey && pKey!=0 ){
new_elem->pKey = pH->xMalloc( nKey );
if( new_elem->pKey==0 ){
pH->xFree(new_elem);
return data;
}
memcpy((void*)new_elem->pKey, pKey, nKey);
}else{
new_elem->pKey = (void*)pKey;
}
new_elem->nKey = nKey;
pH->count++;
if( pH->htsize==0 ){
rehash(pH,8);
if( pH->htsize==0 ){
pH->count = 0;
pH->xFree(new_elem);
return data;
}
}
if( pH->count > pH->htsize ){
rehash(pH,pH->htsize*2);
}
assert( pH->htsize>0 );
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
insertElement(pH, &pH->ht[h], new_elem);
new_elem->data = data;
return 0;
}

View File

@ -1,111 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite. We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
**
*/
#ifndef _HASH_H_
#define _HASH_H_
/* Forward declarations of structures. */
typedef struct Hash Hash;
typedef struct HashElem HashElem;
/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly. Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct Hash {
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
char copyKey; /* True if copy of key made on insert */
int count; /* Number of entries in this table */
HashElem *first; /* The first element of the array */
void *(*xMalloc)(int); /* malloc() function to use */
void (*xFree)(void *); /* free() function to use */
int htsize; /* Number of buckets in the hash table */
struct _ht { /* the hash table */
int count; /* Number of entries with this hash */
HashElem *chain; /* Pointer to first entry with this hash */
} *ht;
};
/* Each element in the hash table is an instance of the following
** structure. All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct HashElem {
HashElem *next, *prev; /* Next and previous elements in the table */
void *data; /* Data associated with this element */
void *pKey; int nKey; /* Key associated with this element */
};
/*
** There are 4 different modes of operation for a hash table:
**
** HASH_INT nKey is used as the key and pKey is ignored.
**
** HASH_POINTER pKey is used as the key and nKey is ignored.
**
** HASH_STRING pKey points to a string that is nKey bytes long
** (including the null-terminator, if any). Case
** is respected in comparisons.
**
** HASH_BINARY pKey points to binary data nKey bytes long.
** memcmp() is used to compare keys.
**
** A copy of the key is made for HASH_STRING and HASH_BINARY
** if the copyKey parameter to HashInit is 1.
*/
/* #define HASH_INT 1 // NOT USED */
/* #define HASH_POINTER 2 // NOT USED */
#define HASH_STRING 3
#define HASH_BINARY 4
/*
** Access routines. To delete, insert a NULL pointer.
*/
void HashInit(Hash*, int keytype, int copyKey);
void *HashInsert(Hash*, const void *pKey, int nKey, void *pData);
void *HashFind(const Hash*, const void *pKey, int nKey);
void HashClear(Hash*);
/*
** Macros for looping over all elements of a hash table. The idiom is
** like this:
**
** Hash h;
** HashElem *p;
** ...
** for(p=HashFirst(&h); p; p=HashNext(p)){
** SomeStructure *pData = HashData(p);
** // do something with pData
** }
*/
#define HashFirst(H) ((H)->first)
#define HashNext(E) ((E)->next)
#define HashData(E) ((E)->data)
#define HashKey(E) ((E)->pKey)
#define HashKeysize(E) ((E)->nKey)
/*
** Number of entries in a hash table
*/
#define HashCount(H) ((H)->count)
#endif /* _HASH_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
#include "sqlite3.h"
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
int sqlite3Fts1Init(sqlite3 *db);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

View File

@ -1,369 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the implementation of generic hash-tables used in SQLite.
** We've modified it slightly to serve as a standalone hash table
** implementation for the full-text indexing module.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
/*
** The code in this file is only compiled if:
**
** * The FTS1 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS1 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
#include "fts1_hash.h"
static void *malloc_and_zero(int n){
void *p = malloc(n);
if( p ){
memset(p, 0, n);
}
return p;
}
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants
** FTS1_HASH_BINARY or FTS1_HASH_STRING. The value of keyClass
** determines what kind of key the hash table will use. "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer.
*/
void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
assert( pNew!=0 );
assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
pNew->keyClass = keyClass;
pNew->copyKey = copyKey;
pNew->first = 0;
pNew->count = 0;
pNew->htsize = 0;
pNew->ht = 0;
pNew->xMalloc = malloc_and_zero;
pNew->xFree = free;
}
/* Remove all entries from a hash table. Reclaim all memory.
** Call this routine to delete a hash table or to reset a hash table
** to the empty state.
*/
void sqlite3Fts1HashClear(fts1Hash *pH){
fts1HashElem *elem; /* For looping over all elements of the table */
assert( pH!=0 );
elem = pH->first;
pH->first = 0;
if( pH->ht ) pH->xFree(pH->ht);
pH->ht = 0;
pH->htsize = 0;
while( elem ){
fts1HashElem *next_elem = elem->next;
if( pH->copyKey && elem->pKey ){
pH->xFree(elem->pKey);
}
pH->xFree(elem);
elem = next_elem;
}
pH->count = 0;
}
/*
** Hash and comparison functions when the mode is FTS1_HASH_STRING
*/
static int strHash(const void *pKey, int nKey){
const char *z = (const char *)pKey;
int h = 0;
if( nKey<=0 ) nKey = (int) strlen(z);
while( nKey > 0 ){
h = (h<<3) ^ h ^ *z++;
nKey--;
}
return h & 0x7fffffff;
}
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
}
/*
** Hash and comparison functions when the mode is FTS1_HASH_BINARY
*/
static int binHash(const void *pKey, int nKey){
int h = 0;
const char *z = (const char *)pKey;
while( nKey-- > 0 ){
h = (h<<3) ^ h ^ *(z++);
}
return h & 0x7fffffff;
}
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return memcmp(pKey1,pKey2,n1);
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** The C syntax in this function definition may be unfamilar to some
** programmers, so we provide the following additional explanation:
**
** The name of the function is "hashFunction". The function takes a
** single parameter "keyClass". The return value of hashFunction()
** is a pointer to another function. Specifically, the return value
** of hashFunction() is a pointer to a function that takes two parameters
** with types "const void*" and "int" and returns an "int".
*/
static int (*hashFunction(int keyClass))(const void*,int){
if( keyClass==FTS1_HASH_STRING ){
return &strHash;
}else{
assert( keyClass==FTS1_HASH_BINARY );
return &binHash;
}
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** For help in interpreted the obscure C code in the function definition,
** see the header comment on the previous function.
*/
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
if( keyClass==FTS1_HASH_STRING ){
return &strCompare;
}else{
assert( keyClass==FTS1_HASH_BINARY );
return &binCompare;
}
}
/* Link an element into the hash table
*/
static void insertElement(
fts1Hash *pH, /* The complete hash table */
struct _fts1ht *pEntry, /* The entry into which pNew is inserted */
fts1HashElem *pNew /* The element to be inserted */
){
fts1HashElem *pHead; /* First element already in pEntry */
pHead = pEntry->chain;
if( pHead ){
pNew->next = pHead;
pNew->prev = pHead->prev;
if( pHead->prev ){ pHead->prev->next = pNew; }
else { pH->first = pNew; }
pHead->prev = pNew;
}else{
pNew->next = pH->first;
if( pH->first ){ pH->first->prev = pNew; }
pNew->prev = 0;
pH->first = pNew;
}
pEntry->count++;
pEntry->chain = pNew;
}
/* Resize the hash table so that it cantains "new_size" buckets.
** "new_size" must be a power of 2. The hash table might fail
** to resize if sqliteMalloc() fails.
*/
static void rehash(fts1Hash *pH, int new_size){
struct _fts1ht *new_ht; /* The new hash table */
fts1HashElem *elem, *next_elem; /* For looping over existing elements */
int (*xHash)(const void*,int); /* The hash function */
assert( (new_size & (new_size-1))==0 );
new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
if( new_ht==0 ) return;
if( pH->ht ) pH->xFree(pH->ht);
pH->ht = new_ht;
pH->htsize = new_size;
xHash = hashFunction(pH->keyClass);
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
next_elem = elem->next;
insertElement(pH, &new_ht[h], elem);
}
}
/* This function (for internal use only) locates an element in an
** hash table that matches the given key. The hash for this key has
** already been computed and is passed as the 4th parameter.
*/
static fts1HashElem *findElementGivenHash(
const fts1Hash *pH, /* The pH to be searched */
const void *pKey, /* The key we are searching for */
int nKey,
int h /* The hash for this key. */
){
fts1HashElem *elem; /* Used to loop thru the element list */
int count; /* Number of elements left to test */
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
if( pH->ht ){
struct _fts1ht *pEntry = &pH->ht[h];
elem = pEntry->chain;
count = pEntry->count;
xCompare = compareFunction(pH->keyClass);
while( count-- && elem ){
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
return elem;
}
elem = elem->next;
}
}
return 0;
}
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
*/
static void removeElementGivenHash(
fts1Hash *pH, /* The pH containing "elem" */
fts1HashElem* elem, /* The element to be removed from the pH */
int h /* Hash value for the element */
){
struct _fts1ht *pEntry;
if( elem->prev ){
elem->prev->next = elem->next;
}else{
pH->first = elem->next;
}
if( elem->next ){
elem->next->prev = elem->prev;
}
pEntry = &pH->ht[h];
if( pEntry->chain==elem ){
pEntry->chain = elem->next;
}
pEntry->count--;
if( pEntry->count<=0 ){
pEntry->chain = 0;
}
if( pH->copyKey && elem->pKey ){
pH->xFree(elem->pKey);
}
pH->xFree( elem );
pH->count--;
if( pH->count<=0 ){
assert( pH->first==0 );
assert( pH->count==0 );
fts1HashClear(pH);
}
}
/* Attempt to locate an element of the hash table pH with a key
** that matches pKey,nKey. Return the data for this element if it is
** found, or NULL if there is no match.
*/
void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){
int h; /* A hash on key */
fts1HashElem *elem; /* The element that matches key */
int (*xHash)(const void*,int); /* The hash function */
if( pH==0 || pH->ht==0 ) return 0;
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
h = (*xHash)(pKey,nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
return elem ? elem->data : 0;
}
/* Insert an element into the hash table pH. The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created. A copy of the key is made if the copyKey
** flag is set. NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance. If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *sqlite3Fts1HashInsert(
fts1Hash *pH, /* The hash table to insert into */
const void *pKey, /* The key */
int nKey, /* Number of bytes in the key */
void *data /* The data */
){
int hraw; /* Raw hash value of the key */
int h; /* the hash of the key modulo hash table size */
fts1HashElem *elem; /* Used to loop thru the element list */
fts1HashElem *new_elem; /* New element added to the pH */
int (*xHash)(const void*,int); /* The hash function */
assert( pH!=0 );
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
hraw = (*xHash)(pKey, nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
elem = findElementGivenHash(pH,pKey,nKey,h);
if( elem ){
void *old_data = elem->data;
if( data==0 ){
removeElementGivenHash(pH,elem,h);
}else{
elem->data = data;
}
return old_data;
}
if( data==0 ) return 0;
new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
if( new_elem==0 ) return data;
if( pH->copyKey && pKey!=0 ){
new_elem->pKey = pH->xMalloc( nKey );
if( new_elem->pKey==0 ){
pH->xFree(new_elem);
return data;
}
memcpy((void*)new_elem->pKey, pKey, nKey);
}else{
new_elem->pKey = (void*)pKey;
}
new_elem->nKey = nKey;
pH->count++;
if( pH->htsize==0 ){
rehash(pH,8);
if( pH->htsize==0 ){
pH->count = 0;
pH->xFree(new_elem);
return data;
}
}
if( pH->count > pH->htsize ){
rehash(pH,pH->htsize*2);
}
assert( pH->htsize>0 );
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
insertElement(pH, &pH->ht[h], new_elem);
new_elem->data = data;
return 0;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */

View File

@ -1,112 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite. We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
**
*/
#ifndef _FTS1_HASH_H_
#define _FTS1_HASH_H_
/* Forward declarations of structures. */
typedef struct fts1Hash fts1Hash;
typedef struct fts1HashElem fts1HashElem;
/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly. Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts1Hash {
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
char copyKey; /* True if copy of key made on insert */
int count; /* Number of entries in this table */
fts1HashElem *first; /* The first element of the array */
void *(*xMalloc)(int); /* malloc() function to use */
void (*xFree)(void *); /* free() function to use */
int htsize; /* Number of buckets in the hash table */
struct _fts1ht { /* the hash table */
int count; /* Number of entries with this hash */
fts1HashElem *chain; /* Pointer to first entry with this hash */
} *ht;
};
/* Each element in the hash table is an instance of the following
** structure. All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts1HashElem {
fts1HashElem *next, *prev; /* Next and previous elements in the table */
void *data; /* Data associated with this element */
void *pKey; int nKey; /* Key associated with this element */
};
/*
** There are 2 different modes of operation for a hash table:
**
** FTS1_HASH_STRING pKey points to a string that is nKey bytes long
** (including the null-terminator, if any). Case
** is respected in comparisons.
**
** FTS1_HASH_BINARY pKey points to binary data nKey bytes long.
** memcmp() is used to compare keys.
**
** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
*/
#define FTS1_HASH_STRING 1
#define FTS1_HASH_BINARY 2
/*
** Access routines. To delete, insert a NULL pointer.
*/
void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
void sqlite3Fts1HashClear(fts1Hash*);
/*
** Shorthand for the functions above
*/
#define fts1HashInit sqlite3Fts1HashInit
#define fts1HashInsert sqlite3Fts1HashInsert
#define fts1HashFind sqlite3Fts1HashFind
#define fts1HashClear sqlite3Fts1HashClear
/*
** Macros for looping over all elements of a hash table. The idiom is
** like this:
**
** fts1Hash h;
** fts1HashElem *p;
** ...
** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
** SomeStructure *pData = fts1HashData(p);
** // do something with pData
** }
*/
#define fts1HashFirst(H) ((H)->first)
#define fts1HashNext(E) ((E)->next)
#define fts1HashData(E) ((E)->data)
#define fts1HashKey(E) ((E)->pKey)
#define fts1HashKeysize(E) ((E)->nKey)
/*
** Number of entries in a hash table
*/
#define fts1HashCount(H) ((H)->count)
#endif /* _FTS1_HASH_H_ */

View File

@ -1,643 +0,0 @@
/*
** 2006 September 30
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** Implementation of the full-text-search tokenizer that implements
** a Porter stemmer.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS1 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS1 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "fts1_tokenizer.h"
/*
** Class derived from sqlite3_tokenizer
*/
typedef struct porter_tokenizer {
sqlite3_tokenizer base; /* Base class */
} porter_tokenizer;
/*
** Class derived from sqlit3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *zInput; /* input we are tokenizing */
int nInput; /* size of the input */
int iOffset; /* current position in zInput */
int iToken; /* index of next token to be returned */
char *zToken; /* storage for current token */
int nAllocated; /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
/* Forward declaration */
static const sqlite3_tokenizer_module porterTokenizerModule;
/*
** Create a new tokenizer instance.
*/
static int porterCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
porter_tokenizer *t;
t = (porter_tokenizer *) calloc(sizeof(*t), 1);
if( t==NULL ) return SQLITE_NOMEM;
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
free(pTokenizer);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is zInput[0..nInput-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int porterOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *zInput, int nInput, /* String to be tokenized */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
porter_tokenizer_cursor *c;
c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
c->zInput = zInput;
if( zInput==0 ){
c->nInput = 0;
}else if( nInput<0 ){
c->nInput = (int)strlen(zInput);
}else{
c->nInput = nInput;
}
c->iOffset = 0; /* start tokenizing at the beginning */
c->iToken = 0;
c->zToken = NULL; /* no space allocated, yet. */
c->nAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** porterOpen() above.
*/
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
free(c->zToken);
free(c);
return SQLITE_OK;
}
/*
** Vowel or consonant
*/
static const char cType[] = {
0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
1, 1, 1, 2, 1
};
/*
** isConsonant() and isVowel() determine if their first character in
** the string they point to is a consonant or a vowel, according
** to Porter ruls.
**
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant,
** in which case it is a vowel.
**
** In these routine, the letters are in reverse order. So the 'y' rule
** is that 'y' is a consonant unless it is followed by another
** consonent.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
int j;
char x = *z;
if( x==0 ) return 0;
assert( x>='a' && x<='z' );
j = cType[x-'a'];
if( j<2 ) return j;
return z[1]==0 || isVowel(z + 1);
}
static int isVowel(const char *z){
int j;
char x = *z;
if( x==0 ) return 0;
assert( x>='a' && x<='z' );
j = cType[x-'a'];
if( j<2 ) return 1-j;
return isConsonant(z + 1);
}
/*
** Let any sequence of one or more vowels be represented by V and let
** C be sequence of one or more consonants. Then every word can be
** represented as:
**
** [C] (VC){m} [V]
**
** In prose: A word is an optional consonant followed by zero or
** vowel-consonant pairs followed by an optional vowel. "m" is the
** number of vowel consonant pairs. This routine computes the value
** of m for the first i bytes of a word.
**
** Return true if the m-value for z is 1 or more. In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order. So we are really looking
** for an instance of of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/* Like mgt0 above except we are looking for a value of m which is
** exactly 1
*/
static int m_eq_1(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
if( *z==0 ) return 0;
while( isVowel(z) ){ z++; }
if( *z==0 ) return 1;
while( isConsonant(z) ){ z++; }
return *z==0;
}
/* Like mgt0 above except we are looking for a value of m>1 instead
** or m>0
*/
static int m_gt_1(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
if( *z==0 ) return 0;
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/*
** Return TRUE if there is a vowel anywhere within z[0..n-1]
*/
static int hasVowel(const char *z){
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here. So we are really looking at
** the first two characters of z[].
*/
static int doubleConsonant(const char *z){
return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
}
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonent and where the final consonant
** is not 'w', 'x', or 'y'.
**
** The word is reversed here. So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
return
z[0]!=0 && isConsonant(z) &&
z[0]!='w' && z[0]!='x' && z[0]!='y' &&
z[1]!=0 && isVowel(z+1) &&
z[2]!=0 && isConsonant(z+2);
}
/*
** If the word ends with zFrom and xCond() is true for the stem
** of the word that preceeds the zFrom ending, then change the
** ending to zTo.
**
** The input word *pz and zFrom are both in reverse order. zTo
** is in normal order.
**
** Return TRUE if zFrom matches. Return FALSE if zFrom does not
** match. Not that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
char **pz, /* The word being stemmed (Reversed) */
const char *zFrom, /* If the ending matches this... (Reversed) */
const char *zTo, /* ... change the ending to this (not reversed) */
int (*xCond)(const char*) /* Condition that must be true */
){
char *z = *pz;
while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
if( *zFrom!=0 ) return 0;
if( xCond && !xCond(z) ) return 1;
while( *zTo ){
*(--z) = *(zTo++);
}
*pz = z;
return 1;
}
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate. The input word is copied into the output with
** US-ASCII case folding. If the input word is too long (more
** than 20 bytes if it contains no digits or more than 6 bytes if
** it contains digits) then word is truncated to 20 or 6 bytes
** by taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
int i, mx, j;
int hasDigit = 0;
for(i=0; i<nIn; i++){
int c = zIn[i];
if( c>='A' && c<='Z' ){
zOut[i] = c - 'A' + 'a';
}else{
if( c>='0' && c<='9' ) hasDigit = 1;
zOut[i] = c;
}
}
mx = hasDigit ? 3 : 10;
if( nIn>mx*2 ){
for(j=mx, i=nIn-mx; i<nIn; i++, j++){
zOut[j] = zOut[i];
}
i = j;
}
zOut[i] = 0;
*pnOut = i;
}
/*
** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
** zOut is at least big enough to hold nIn bytes. Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case. Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word. If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end. For long words without digits, 10 bytes
** are taken from each end. US-ASCII case folding still applies.
**
** If the input word contains not digits but does characters not
** in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the input into the output with US-ASCII
** case folding.
**
** Stemming never increases the length of the word. So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
int i, j, c;
char zReverse[28];
char *z, *z2;
if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
/* The word is too big or too small for the porter stemmer.
** Fallback to the copy stemmer */
copy_stemmer(zIn, nIn, zOut, pnOut);
return;
}
for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
c = zIn[i];
if( c>='A' && c<='Z' ){
zReverse[j] = c + 'a' - 'A';
}else if( c>='a' && c<='z' ){
zReverse[j] = c;
}else{
/* The use of a character not in [a-zA-Z] means that we fallback
** to the copy stemmer */
copy_stemmer(zIn, nIn, zOut, pnOut);
return;
}
}
memset(&zReverse[sizeof(zReverse)-5], 0, 5);
z = &zReverse[j+1];
/* Step 1a */
if( z[0]=='s' ){
if(
!stem(&z, "sess", "ss", 0) &&
!stem(&z, "sei", "i", 0) &&
!stem(&z, "ss", "ss", 0)
){
z++;
}
}
/* Step 1b */
z2 = z;
if( stem(&z, "dee", "ee", m_gt_0) ){
/* Do nothing. The work was all in the test */
}else if(
(stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
&& z!=z2
){
if( stem(&z, "ta", "ate", 0) ||
stem(&z, "lb", "ble", 0) ||
stem(&z, "zi", "ize", 0) ){
/* Do nothing. The work was all in the test */
}else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
z++;
}else if( m_eq_1(z) && star_oh(z) ){
*(--z) = 'e';
}
}
/* Step 1c */
if( z[0]=='y' && hasVowel(z+1) ){
z[0] = 'i';
}
/* Step 2 */
switch( z[1] ){
case 'a':
stem(&z, "lanoita", "ate", m_gt_0) ||
stem(&z, "lanoit", "tion", m_gt_0);
break;
case 'c':
stem(&z, "icne", "ence", m_gt_0) ||
stem(&z, "icna", "ance", m_gt_0);
break;
case 'e':
stem(&z, "rezi", "ize", m_gt_0);
break;
case 'g':
stem(&z, "igol", "log", m_gt_0);
break;
case 'l':
stem(&z, "ilb", "ble", m_gt_0) ||
stem(&z, "illa", "al", m_gt_0) ||
stem(&z, "iltne", "ent", m_gt_0) ||
stem(&z, "ile", "e", m_gt_0) ||
stem(&z, "ilsuo", "ous", m_gt_0);
break;
case 'o':
stem(&z, "noitazi", "ize", m_gt_0) ||
stem(&z, "noita", "ate", m_gt_0) ||
stem(&z, "rota", "ate", m_gt_0);
break;
case 's':
stem(&z, "msila", "al", m_gt_0) ||
stem(&z, "ssenevi", "ive", m_gt_0) ||
stem(&z, "ssenluf", "ful", m_gt_0) ||
stem(&z, "ssensuo", "ous", m_gt_0);
break;
case 't':
stem(&z, "itila", "al", m_gt_0) ||
stem(&z, "itivi", "ive", m_gt_0) ||
stem(&z, "itilib", "ble", m_gt_0);
break;
}
/* Step 3 */
switch( z[0] ){
case 'e':
stem(&z, "etaci", "ic", m_gt_0) ||
stem(&z, "evita", "", m_gt_0) ||
stem(&z, "ezila", "al", m_gt_0);
break;
case 'i':
stem(&z, "itici", "ic", m_gt_0);
break;
case 'l':
stem(&z, "laci", "ic", m_gt_0) ||
stem(&z, "luf", "", m_gt_0);
break;
case 's':
stem(&z, "ssen", "", m_gt_0);
break;
}
/* Step 4 */
switch( z[1] ){
case 'a':
if( z[0]=='l' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'c':
if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
z += 4;
}
break;
case 'e':
if( z[0]=='r' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'i':
if( z[0]=='c' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'l':
if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
z += 4;
}
break;
case 'n':
if( z[0]=='t' ){
if( z[2]=='a' ){
if( m_gt_1(z+3) ){
z += 3;
}
}else if( z[2]=='e' ){
stem(&z, "tneme", "", m_gt_1) ||
stem(&z, "tnem", "", m_gt_1) ||
stem(&z, "tne", "", m_gt_1);
}
}
break;
case 'o':
if( z[0]=='u' ){
if( m_gt_1(z+2) ){
z += 2;
}
}else if( z[3]=='s' || z[3]=='t' ){
stem(&z, "noi", "", m_gt_1);
}
break;
case 's':
if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
z += 3;
}
break;
case 't':
stem(&z, "eta", "", m_gt_1) ||
stem(&z, "iti", "", m_gt_1);
break;
case 'u':
if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
z += 3;
}
break;
case 'v':
case 'z':
if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
z += 3;
}
break;
}
/* Step 5a */
if( z[0]=='e' ){
if( m_gt_1(z+1) ){
z++;
}else if( m_eq_1(z+1) && !star_oh(z+1) ){
z++;
}
}
/* Step 5b */
if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
z++;
}
/* z[] is now the stemmed word in reverse order. Flip it back
** around into forward order and return.
*/
*pnOut = i = strlen(z);
zOut[i] = 0;
while( *z ){
zOut[--i] = *(z++);
}
}
/*
** Characters that can be part of a token. We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token. In other words, delimiters all must have
** values of 0x7f or lower.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
};
#define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to porterOpen().
*/
static int porterNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
const char **pzToken, /* OUT: *pzToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
const char *z = c->zInput;
while( c->iOffset<c->nInput ){
int iStartOffset, ch;
/* Scan past delimiter characters */
while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
c->iOffset++;
}
/* Count non-delimiter characters. */
iStartOffset = c->iOffset;
while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
c->iOffset++;
}
if( c->iOffset>iStartOffset ){
int n = c->iOffset-iStartOffset;
if( n>c->nAllocated ){
c->nAllocated = n+20;
c->zToken = realloc(c->zToken, c->nAllocated);
if( c->zToken==NULL ) return SQLITE_NOMEM;
}
porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
*pzToken = c->zToken;
*piStartOffset = iStartOffset;
*piEndOffset = c->iOffset;
*piPosition = c->iToken++;
return SQLITE_OK;
}
}
return SQLITE_DONE;
}
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
0,
porterCreate,
porterDestroy,
porterOpen,
porterClose,
porterNext,
};
/*
** Allocate a new porter tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts1PorterTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &porterTokenizerModule;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */

View File

@ -1,90 +0,0 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS1_TOKENIZER_H_
#define _FTS1_TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"
/*
** Structures used by the tokenizer interface.
*/
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
struct sqlite3_tokenizer_module {
int iVersion; /* currently 0 */
/*
** Create and destroy a tokenizer. argc/argv are passed down from
** the fulltext virtual table creation to allow customization.
*/
int (*xCreate)(int argc, const char *const*argv,
sqlite3_tokenizer **ppTokenizer);
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Tokenize a particular input. Call xOpen() to prepare to
** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
** xClose() to free any internal state. The pInput passed to
** xOpen() must exist until the cursor is closed. The ppToken
** result from xNext() is only valid until the next call to xNext()
** or until xClose() is called.
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xOpen)(sqlite3_tokenizer *pTokenizer,
const char *pInput, int nBytes,
sqlite3_tokenizer_cursor **ppCursor);
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
const char **ppToken, int *pnBytes,
int *piStartOffset, int *piEndOffset, int *piPosition);
};
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
/*
** Get the module for a tokenizer which generates tokens based on a
** set of non-token characters. The default is to break tokens at any
** non-alnum character, though the set of delimiters can also be
** specified by the first argv argument to xCreate().
*/
/* TODO(shess) This doesn't belong here. Need some sort of
** registration process.
*/
void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
#endif /* _FTS1_TOKENIZER_H_ */

View File

@ -1,221 +0,0 @@
/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS1 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS1 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "fts1_tokenizer.h"
typedef struct simple_tokenizer {
sqlite3_tokenizer base;
char delim[128]; /* flag ASCII delimiters */
} simple_tokenizer;
typedef struct simple_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *pInput; /* input we are tokenizing */
int nBytes; /* size of the input */
int iOffset; /* current position in pInput */
int iToken; /* index of next token to be returned */
char *pToken; /* storage for current token */
int nTokenAllocated; /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;
static int isDelim(simple_tokenizer *t, unsigned char c){
return c<0x80 && t->delim[c];
}
/*
** Create a new tokenizer instance.
*/
static int simpleCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
simple_tokenizer *t;
t = (simple_tokenizer *) calloc(sizeof(*t), 1);
if( t==NULL ) return SQLITE_NOMEM;
/* TODO(shess) Delimiters need to remain the same from run to run,
** else we need to reindex. One solution would be a meta-table to
** track such information in the database, then we'd only want this
** information on the initial create.
*/
if( argc>1 ){
int i, n = strlen(argv[1]);
for(i=0; i<n; i++){
unsigned char ch = argv[1][i];
/* We explicitly don't support UTF-8 delimiters for now. */
if( ch>=0x80 ){
free(t);
return SQLITE_ERROR;
}
t->delim[ch] = 1;
}
} else {
/* Mark non-alphanumeric ASCII characters as delimiters */
int i;
for(i=1; i<0x80; i++){
t->delim[i] = !isalnum(i);
}
}
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
free(pTokenizer);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is pInput[0..nBytes-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int simpleOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *pInput, int nBytes, /* String to be tokenized */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
simple_tokenizer_cursor *c;
c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
c->pInput = pInput;
if( pInput==0 ){
c->nBytes = 0;
}else if( nBytes<0 ){
c->nBytes = (int)strlen(pInput);
}else{
c->nBytes = nBytes;
}
c->iOffset = 0; /* start tokenizing at the beginning */
c->iToken = 0;
c->pToken = NULL; /* no space allocated, yet. */
c->nTokenAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
free(c->pToken);
free(c);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
unsigned char *p = (unsigned char *)c->pInput;
while( c->iOffset<c->nBytes ){
int iStartOffset;
/* Scan past delimiter characters */
while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
/* Count non-delimiter characters. */
iStartOffset = c->iOffset;
while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
if( c->iOffset>iStartOffset ){
int i, n = c->iOffset-iStartOffset;
if( n>c->nTokenAllocated ){
c->nTokenAllocated = n+20;
c->pToken = realloc(c->pToken, c->nTokenAllocated);
if( c->pToken==NULL ) return SQLITE_NOMEM;
}
for(i=0; i<n; i++){
/* TODO(shess) This needs expansion to handle UTF-8
** case-insensitivity.
*/
unsigned char ch = p[iStartOffset+i];
c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
}
*ppToken = c->pToken;
*pnBytes = n;
*piStartOffset = iStartOffset;
*piEndOffset = c->iOffset;
*piPosition = c->iToken++;
return SQLITE_OK;
}
}
return SQLITE_DONE;
}
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
0,
simpleCreate,
simpleDestroy,
simpleOpen,
simpleClose,
simpleNext,
};
/*
** Allocate a new simple tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts1SimpleTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &simpleTokenizerModule;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
#include "sqlite3.h"
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
int fulltext_init(sqlite3 *db);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

View File

@ -1,174 +0,0 @@
/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/
#include <assert.h>
#if !defined(__APPLE__)
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "tokenizer.h"
/* Duplicate a string; the caller must free() the returned string.
* (We don't use strdup() since it's not part of the standard C library and
* may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
char *str = malloc(strlen(s) + 1);
strcpy(str, s);
return str;
}
typedef struct simple_tokenizer {
sqlite3_tokenizer base;
const char *zDelim; /* token delimiters */
} simple_tokenizer;
typedef struct simple_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *pInput; /* input we are tokenizing */
int nBytes; /* size of the input */
const char *pCurrent; /* current position in pInput */
int iToken; /* index of next token to be returned */
char *zToken; /* storage for current token */
int nTokenBytes; /* actual size of current token */
int nTokenAllocated; /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
static int simpleCreate(
int argc, const char **argv,
sqlite3_tokenizer **ppTokenizer
){
simple_tokenizer *t;
t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
/* TODO(shess) Delimiters need to remain the same from run to run,
** else we need to reindex. One solution would be a meta-table to
** track such information in the database, then we'd only want this
** information on the initial create.
*/
if( argc>1 ){
t->zDelim = string_dup(argv[1]);
} else {
/* Build a string excluding alphanumeric ASCII characters */
char zDelim[0x80]; /* nul-terminated, so nul not a member */
int i, j;
for(i=1, j=0; i<0x80; i++){
if( !isalnum(i) ){
zDelim[j++] = i;
}
}
zDelim[j++] = '\0';
assert( j<=sizeof(zDelim) );
t->zDelim = string_dup(zDelim);
}
*ppTokenizer = &t->base;
return SQLITE_OK;
}
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
free((void *) t->zDelim);
free(t);
return SQLITE_OK;
}
static int simpleOpen(
sqlite3_tokenizer *pTokenizer,
const char *pInput, int nBytes,
sqlite3_tokenizer_cursor **ppCursor
){
simple_tokenizer_cursor *c;
c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
c->pInput = pInput;
c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
c->pCurrent = c->pInput; /* start tokenizing at the beginning */
c->iToken = 0;
c->zToken = NULL; /* no space allocated, yet. */
c->nTokenBytes = 0;
c->nTokenAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
if( NULL!=c->zToken ){
free(c->zToken);
}
free(c);
return SQLITE_OK;
}
static int simpleNext(
sqlite3_tokenizer_cursor *pCursor,
const char **ppToken, int *pnBytes,
int *piStartOffset, int *piEndOffset, int *piPosition
){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
int ii;
while( c->pCurrent-c->pInput<c->nBytes ){
int n = (int) strcspn(c->pCurrent, t->zDelim);
if( n>0 ){
if( n+1>c->nTokenAllocated ){
c->zToken = realloc(c->zToken, n+1);
}
for(ii=0; ii<n; ii++){
/* TODO(shess) This needs expansion to handle UTF-8
** case-insensitivity.
*/
char ch = c->pCurrent[ii];
c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;
}
c->zToken[n] = '\0';
*ppToken = c->zToken;
*pnBytes = n;
*piStartOffset = (int) (c->pCurrent-c->pInput);
*piEndOffset = *piStartOffset+n;
*piPosition = c->iToken++;
c->pCurrent += n + 1;
return SQLITE_OK;
}
c->pCurrent += n + 1;
/* TODO(shess) could strspn() to skip delimiters en masse. Needs
** to happen in two places, though, which is annoying.
*/
}
return SQLITE_DONE;
}
static sqlite3_tokenizer_module simpleTokenizerModule = {
0,
simpleCreate,
simpleDestroy,
simpleOpen,
simpleClose,
simpleNext,
};
void get_simple_tokenizer_module(
sqlite3_tokenizer_module **ppModule
){
*ppModule = &simpleTokenizerModule;
}

View File

@ -1,89 +0,0 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _TOKENIZER_H_
#define _TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"
/*
** Structures used by the tokenizer interface.
*/
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
struct sqlite3_tokenizer_module {
int iVersion; /* currently 0 */
/*
** Create and destroy a tokenizer. argc/argv are passed down from
** the fulltext virtual table creation to allow customization.
*/
int (*xCreate)(int argc, const char **argv,
sqlite3_tokenizer **ppTokenizer);
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Tokenize a particular input. Call xOpen() to prepare to
** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
** xClose() to free any internal state. The pInput passed to
** xOpen() must exist until the cursor is closed. The ppToken
** result from xNext() is only valid until the next call to xNext()
** or until xClose() is called.
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xOpen)(sqlite3_tokenizer *pTokenizer,
const char *pInput, int nBytes,
sqlite3_tokenizer_cursor **ppCursor);
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
const char **ppToken, int *pnBytes,
int *piStartOffset, int *piEndOffset, int *piPosition);
};
struct sqlite3_tokenizer {
sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
/*
** Get the module for a tokenizer which generates tokens based on a
** set of non-token characters. The default is to break tokens at any
** non-alnum character, though the set of delimiters can also be
** specified by the first argv argument to xCreate().
*/
/* TODO(shess) This doesn't belong here. Need some sort of
** registration process.
*/
void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule);
#endif /* _TOKENIZER_H_ */

View File

@ -1,133 +0,0 @@
1. FTS2 Tokenizers
When creating a new full-text table, FTS2 allows the user to select
the text tokenizer implementation to be used when indexing text
by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
statement:
CREATE VIRTUAL TABLE <table-name> USING fts2(
<columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
);
The built-in tokenizers (valid values to pass as <tokenizer name>) are
"simple" and "porter".
<tokenizer-args> should consist of zero or more white-space separated
arguments to pass to the selected tokenizer implementation. The
interpretation of the arguments, if any, depends on the individual
tokenizer.
2. Custom Tokenizers
FTS2 allows users to provide custom tokenizer implementations. The
interface used to create a new tokenizer is defined and described in
the fts2_tokenizer.h source file.
Registering a new FTS2 tokenizer is similar to registering a new
virtual table module with SQLite. The user passes a pointer to a
structure containing pointers to various callback functions that
make up the implementation of the new tokenizer type. For tokenizers,
the structure (defined in fts2_tokenizer.h) is called
"sqlite3_tokenizer_module".
FTS2 does not expose a C-function that users call to register new
tokenizer types with a database handle. Instead, the pointer must
be encoded as an SQL blob value and passed to FTS2 through the SQL
engine by evaluating a special scalar function, "fts2_tokenizer()".
The fts2_tokenizer() function may be called with one or two arguments,
as follows:
SELECT fts2_tokenizer(<tokenizer-name>);
SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
Where <tokenizer-name> is a string identifying the tokenizer and
<sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
structure encoded as an SQL blob. If the second argument is present,
it is registered as tokenizer <tokenizer-name> and a copy of it
returned. If only one argument is passed, a pointer to the tokenizer
implementation currently registered as <tokenizer-name> is returned,
encoded as a blob. Or, if no such tokenizer exists, an SQL exception
(error) is raised.
SECURITY: If the fts2 extension is used in an environment where potentially
malicious users may execute arbitrary SQL (i.e. gears), they should be
prevented from invoking the fts2_tokenizer() function, possibly using the
authorisation callback.
See "Sample code" below for an example of calling the fts2_tokenizer()
function from C code.
3. ICU Library Tokenizers
If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
symbol defined, then there exists a built-in tokenizer named "icu"
implemented using the ICU library. The first argument passed to the
xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
an ICU locale identifier. For example "tr_TR" for Turkish as used
in Turkey, or "en_AU" for English as used in Australia. For example:
"CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and discards
any tokens that consist entirely of white-space. This may be suitable
for some applications in some locales, but not all. If more complex
processing is required, for example to implement stemming or
discard punctuation, this can be done by creating a tokenizer
implementation that uses the ICU tokenizer as part of its implementation.
When using the ICU tokenizer this way, it is safe to overwrite the
contents of the strings returned by the xNext() method (see
fts2_tokenizer.h).
4. Sample code.
The following two code samples illustrate the way C code should invoke
the fts2_tokenizer() scalar function:
int registerTokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module *p
){
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
sqlite3_step(pStmt);
return sqlite3_finalize(pStmt);
}
int queryTokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module **pp
){
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts2_tokenizer(?)";
*pp = 0;
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
if( SQLITE_ROW==sqlite3_step(pStmt) ){
if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
}
}
return sqlite3_finalize(pStmt);
}

View File

@ -1,4 +0,0 @@
This folder contains source code to the second full-text search
extension for SQLite. While the API is the same, this version uses a
substantially different storage schema from fts1, so tables will need
to be rebuilt.

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +0,0 @@
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This header file is used by programs that want to link against the
** FTS2 library. All it does is declare the sqlite3Fts2Init() interface.
*/
#include "sqlite3.h"
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
int sqlite3Fts2Init(sqlite3 *db);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

View File

@ -1,376 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the implementation of generic hash-tables used in SQLite.
** We've modified it slightly to serve as a standalone hash table
** implementation for the full-text indexing module.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS2 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS2 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fts2_hash.h"
/*
** Malloc and Free functions
*/
static void *fts2HashMalloc(int n){
void *p = sqlite3_malloc(n);
if( p ){
memset(p, 0, n);
}
return p;
}
static void fts2HashFree(void *p){
sqlite3_free(p);
}
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants
** FTS2_HASH_BINARY or FTS2_HASH_STRING. The value of keyClass
** determines what kind of key the hash table will use. "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer.
*/
void sqlite3Fts2HashInit(fts2Hash *pNew, int keyClass, int copyKey){
assert( pNew!=0 );
assert( keyClass>=FTS2_HASH_STRING && keyClass<=FTS2_HASH_BINARY );
pNew->keyClass = keyClass;
pNew->copyKey = copyKey;
pNew->first = 0;
pNew->count = 0;
pNew->htsize = 0;
pNew->ht = 0;
}
/* Remove all entries from a hash table. Reclaim all memory.
** Call this routine to delete a hash table or to reset a hash table
** to the empty state.
*/
void sqlite3Fts2HashClear(fts2Hash *pH){
fts2HashElem *elem; /* For looping over all elements of the table */
assert( pH!=0 );
elem = pH->first;
pH->first = 0;
fts2HashFree(pH->ht);
pH->ht = 0;
pH->htsize = 0;
while( elem ){
fts2HashElem *next_elem = elem->next;
if( pH->copyKey && elem->pKey ){
fts2HashFree(elem->pKey);
}
fts2HashFree(elem);
elem = next_elem;
}
pH->count = 0;
}
/*
** Hash and comparison functions when the mode is FTS2_HASH_STRING
*/
static int strHash(const void *pKey, int nKey){
const char *z = (const char *)pKey;
int h = 0;
if( nKey<=0 ) nKey = (int) strlen(z);
while( nKey > 0 ){
h = (h<<3) ^ h ^ *z++;
nKey--;
}
return h & 0x7fffffff;
}
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
}
/*
** Hash and comparison functions when the mode is FTS2_HASH_BINARY
*/
static int binHash(const void *pKey, int nKey){
int h = 0;
const char *z = (const char *)pKey;
while( nKey-- > 0 ){
h = (h<<3) ^ h ^ *(z++);
}
return h & 0x7fffffff;
}
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
if( n1!=n2 ) return 1;
return memcmp(pKey1,pKey2,n1);
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** The C syntax in this function definition may be unfamilar to some
** programmers, so we provide the following additional explanation:
**
** The name of the function is "hashFunction". The function takes a
** single parameter "keyClass". The return value of hashFunction()
** is a pointer to another function. Specifically, the return value
** of hashFunction() is a pointer to a function that takes two parameters
** with types "const void*" and "int" and returns an "int".
*/
static int (*hashFunction(int keyClass))(const void*,int){
if( keyClass==FTS2_HASH_STRING ){
return &strHash;
}else{
assert( keyClass==FTS2_HASH_BINARY );
return &binHash;
}
}
/*
** Return a pointer to the appropriate hash function given the key class.
**
** For help in interpreted the obscure C code in the function definition,
** see the header comment on the previous function.
*/
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
if( keyClass==FTS2_HASH_STRING ){
return &strCompare;
}else{
assert( keyClass==FTS2_HASH_BINARY );
return &binCompare;
}
}
/* Link an element into the hash table
*/
static void insertElement(
fts2Hash *pH, /* The complete hash table */
struct _fts2ht *pEntry, /* The entry into which pNew is inserted */
fts2HashElem *pNew /* The element to be inserted */
){
fts2HashElem *pHead; /* First element already in pEntry */
pHead = pEntry->chain;
if( pHead ){
pNew->next = pHead;
pNew->prev = pHead->prev;
if( pHead->prev ){ pHead->prev->next = pNew; }
else { pH->first = pNew; }
pHead->prev = pNew;
}else{
pNew->next = pH->first;
if( pH->first ){ pH->first->prev = pNew; }
pNew->prev = 0;
pH->first = pNew;
}
pEntry->count++;
pEntry->chain = pNew;
}
/* Resize the hash table so that it cantains "new_size" buckets.
** "new_size" must be a power of 2. The hash table might fail
** to resize if sqliteMalloc() fails.
*/
static void rehash(fts2Hash *pH, int new_size){
struct _fts2ht *new_ht; /* The new hash table */
fts2HashElem *elem, *next_elem; /* For looping over existing elements */
int (*xHash)(const void*,int); /* The hash function */
assert( (new_size & (new_size-1))==0 );
new_ht = (struct _fts2ht *)fts2HashMalloc( new_size*sizeof(struct _fts2ht) );
if( new_ht==0 ) return;
fts2HashFree(pH->ht);
pH->ht = new_ht;
pH->htsize = new_size;
xHash = hashFunction(pH->keyClass);
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
next_elem = elem->next;
insertElement(pH, &new_ht[h], elem);
}
}
/* This function (for internal use only) locates an element in an
** hash table that matches the given key. The hash for this key has
** already been computed and is passed as the 4th parameter.
*/
static fts2HashElem *findElementGivenHash(
const fts2Hash *pH, /* The pH to be searched */
const void *pKey, /* The key we are searching for */
int nKey,
int h /* The hash for this key. */
){
fts2HashElem *elem; /* Used to loop thru the element list */
int count; /* Number of elements left to test */
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
if( pH->ht ){
struct _fts2ht *pEntry = &pH->ht[h];
elem = pEntry->chain;
count = pEntry->count;
xCompare = compareFunction(pH->keyClass);
while( count-- && elem ){
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
return elem;
}
elem = elem->next;
}
}
return 0;
}
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
*/
static void removeElementGivenHash(
fts2Hash *pH, /* The pH containing "elem" */
fts2HashElem* elem, /* The element to be removed from the pH */
int h /* Hash value for the element */
){
struct _fts2ht *pEntry;
if( elem->prev ){
elem->prev->next = elem->next;
}else{
pH->first = elem->next;
}
if( elem->next ){
elem->next->prev = elem->prev;
}
pEntry = &pH->ht[h];
if( pEntry->chain==elem ){
pEntry->chain = elem->next;
}
pEntry->count--;
if( pEntry->count<=0 ){
pEntry->chain = 0;
}
if( pH->copyKey && elem->pKey ){
fts2HashFree(elem->pKey);
}
fts2HashFree( elem );
pH->count--;
if( pH->count<=0 ){
assert( pH->first==0 );
assert( pH->count==0 );
fts2HashClear(pH);
}
}
/* Attempt to locate an element of the hash table pH with a key
** that matches pKey,nKey. Return the data for this element if it is
** found, or NULL if there is no match.
*/
void *sqlite3Fts2HashFind(const fts2Hash *pH, const void *pKey, int nKey){
int h; /* A hash on key */
fts2HashElem *elem; /* The element that matches key */
int (*xHash)(const void*,int); /* The hash function */
if( pH==0 || pH->ht==0 ) return 0;
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
h = (*xHash)(pKey,nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
return elem ? elem->data : 0;
}
/* Insert an element into the hash table pH. The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created. A copy of the key is made if the copyKey
** flag is set. NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance. If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *sqlite3Fts2HashInsert(
fts2Hash *pH, /* The hash table to insert into */
const void *pKey, /* The key */
int nKey, /* Number of bytes in the key */
void *data /* The data */
){
int hraw; /* Raw hash value of the key */
int h; /* the hash of the key modulo hash table size */
fts2HashElem *elem; /* Used to loop thru the element list */
fts2HashElem *new_elem; /* New element added to the pH */
int (*xHash)(const void*,int); /* The hash function */
assert( pH!=0 );
xHash = hashFunction(pH->keyClass);
assert( xHash!=0 );
hraw = (*xHash)(pKey, nKey);
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
elem = findElementGivenHash(pH,pKey,nKey,h);
if( elem ){
void *old_data = elem->data;
if( data==0 ){
removeElementGivenHash(pH,elem,h);
}else{
elem->data = data;
}
return old_data;
}
if( data==0 ) return 0;
new_elem = (fts2HashElem*)fts2HashMalloc( sizeof(fts2HashElem) );
if( new_elem==0 ) return data;
if( pH->copyKey && pKey!=0 ){
new_elem->pKey = fts2HashMalloc( nKey );
if( new_elem->pKey==0 ){
fts2HashFree(new_elem);
return data;
}
memcpy((void*)new_elem->pKey, pKey, nKey);
}else{
new_elem->pKey = (void*)pKey;
}
new_elem->nKey = nKey;
pH->count++;
if( pH->htsize==0 ){
rehash(pH,8);
if( pH->htsize==0 ){
pH->count = 0;
fts2HashFree(new_elem);
return data;
}
}
if( pH->count > pH->htsize ){
rehash(pH,pH->htsize*2);
}
assert( pH->htsize>0 );
assert( (pH->htsize & (pH->htsize-1))==0 );
h = hraw & (pH->htsize-1);
insertElement(pH, &pH->ht[h], new_elem);
new_elem->data = data;
return 0;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

View File

@ -1,110 +0,0 @@
/*
** 2001 September 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite. We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
**
*/
#ifndef _FTS2_HASH_H_
#define _FTS2_HASH_H_
/* Forward declarations of structures. */
typedef struct fts2Hash fts2Hash;
typedef struct fts2HashElem fts2HashElem;
/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly. Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts2Hash {
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
char copyKey; /* True if copy of key made on insert */
int count; /* Number of entries in this table */
fts2HashElem *first; /* The first element of the array */
int htsize; /* Number of buckets in the hash table */
struct _fts2ht { /* the hash table */
int count; /* Number of entries with this hash */
fts2HashElem *chain; /* Pointer to first entry with this hash */
} *ht;
};
/* Each element in the hash table is an instance of the following
** structure. All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts2HashElem {
fts2HashElem *next, *prev; /* Next and previous elements in the table */
void *data; /* Data associated with this element */
void *pKey; int nKey; /* Key associated with this element */
};
/*
** There are 2 different modes of operation for a hash table:
**
** FTS2_HASH_STRING pKey points to a string that is nKey bytes long
** (including the null-terminator, if any). Case
** is respected in comparisons.
**
** FTS2_HASH_BINARY pKey points to binary data nKey bytes long.
** memcmp() is used to compare keys.
**
** A copy of the key is made if the copyKey parameter to fts2HashInit is 1.
*/
#define FTS2_HASH_STRING 1
#define FTS2_HASH_BINARY 2
/*
** Access routines. To delete, insert a NULL pointer.
*/
void sqlite3Fts2HashInit(fts2Hash*, int keytype, int copyKey);
void *sqlite3Fts2HashInsert(fts2Hash*, const void *pKey, int nKey, void *pData);
void *sqlite3Fts2HashFind(const fts2Hash*, const void *pKey, int nKey);
void sqlite3Fts2HashClear(fts2Hash*);
/*
** Shorthand for the functions above
*/
#define fts2HashInit sqlite3Fts2HashInit
#define fts2HashInsert sqlite3Fts2HashInsert
#define fts2HashFind sqlite3Fts2HashFind
#define fts2HashClear sqlite3Fts2HashClear
/*
** Macros for looping over all elements of a hash table. The idiom is
** like this:
**
** fts2Hash h;
** fts2HashElem *p;
** ...
** for(p=fts2HashFirst(&h); p; p=fts2HashNext(p)){
** SomeStructure *pData = fts2HashData(p);
** // do something with pData
** }
*/
#define fts2HashFirst(H) ((H)->first)
#define fts2HashNext(E) ((E)->next)
#define fts2HashData(E) ((E)->data)
#define fts2HashKey(E) ((E)->pKey)
#define fts2HashKeysize(E) ((E)->nKey)
/*
** Number of entries in a hash table
*/
#define fts2HashCount(H) ((H)->count)
#endif /* _FTS2_HASH_H_ */

View File

@ -1,260 +0,0 @@
/*
** 2007 June 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
**
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#ifdef SQLITE_ENABLE_ICU
#include <assert.h>
#include <string.h>
#include "fts2_tokenizer.h"
#include <unicode/ubrk.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;
struct IcuTokenizer {
sqlite3_tokenizer base;
char *zLocale;
};
struct IcuCursor {
sqlite3_tokenizer_cursor base;
UBreakIterator *pIter; /* ICU break-iterator object */
int nChar; /* Number of UChar elements in pInput */
UChar *aChar; /* Copy of input using utf-16 encoding */
int *aOffset; /* Offsets of each character in utf-8 input */
int nBuffer;
char *zBuffer;
int iToken;
};
/*
** Create a new tokenizer instance.
*/
static int icuCreate(
int argc, /* Number of entries in argv[] */
const char * const *argv, /* Tokenizer creation arguments */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
){
IcuTokenizer *p;
int n = 0;
if( argc>0 ){
n = strlen(argv[0])+1;
}
p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
if( !p ){
return SQLITE_NOMEM;
}
memset(p, 0, sizeof(IcuTokenizer));
if( n ){
p->zLocale = (char *)&p[1];
memcpy(p->zLocale, argv[0], n);
}
*ppTokenizer = (sqlite3_tokenizer *)p;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
sqlite3_free(p);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is pInput[0..nBytes-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int icuOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *zInput, /* Input string */
int nInput, /* Length of zInput in bytes */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
IcuCursor *pCsr;
const int32_t opt = U_FOLD_CASE_DEFAULT;
UErrorCode status = U_ZERO_ERROR;
int nChar;
UChar32 c;
int iInput = 0;
int iOut = 0;
*ppCursor = 0;
if( nInput<0 ){
nInput = strlen(zInput);
}
nChar = nInput+1;
pCsr = (IcuCursor *)sqlite3_malloc(
sizeof(IcuCursor) + /* IcuCursor */
((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
(nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
);
if( !pCsr ){
return SQLITE_NOMEM;
}
memset(pCsr, 0, sizeof(IcuCursor));
pCsr->aChar = (UChar *)&pCsr[1];
pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
pCsr->aOffset[iOut] = iInput;
U8_NEXT(zInput, iInput, nInput, c);
while( c>0 ){
int isError = 0;
c = u_foldCase(c, opt);
U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
if( isError ){
sqlite3_free(pCsr);
return SQLITE_ERROR;
}
pCsr->aOffset[iOut] = iInput;
if( iInput<nInput ){
U8_NEXT(zInput, iInput, nInput, c);
}else{
c = 0;
}
}
pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
if( !U_SUCCESS(status) ){
sqlite3_free(pCsr);
return SQLITE_ERROR;
}
pCsr->nChar = iOut;
ubrk_first(pCsr->pIter);
*ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to icuOpen().
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
IcuCursor *pCsr = (IcuCursor *)pCursor;
ubrk_close(pCsr->pIter);
sqlite3_free(pCsr->zBuffer);
sqlite3_free(pCsr);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor.
*/
static int icuNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
IcuCursor *pCsr = (IcuCursor *)pCursor;
int iStart = 0;
int iEnd = 0;
int nByte = 0;
while( iStart==iEnd ){
UChar32 c;
iStart = ubrk_current(pCsr->pIter);
iEnd = ubrk_next(pCsr->pIter);
if( iEnd==UBRK_DONE ){
return SQLITE_DONE;
}
while( iStart<iEnd ){
int iWhite = iStart;
U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
if( u_isspace(c) ){
iStart = iWhite;
}else{
break;
}
}
assert(iStart<=iEnd);
}
do {
UErrorCode status = U_ZERO_ERROR;
if( nByte ){
char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
if( !zNew ){
return SQLITE_NOMEM;
}
pCsr->zBuffer = zNew;
pCsr->nBuffer = nByte;
}
u_strToUTF8(
pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
&pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
&status /* Output success/failure */
);
} while( nByte>pCsr->nBuffer );
*ppToken = pCsr->zBuffer;
*pnBytes = nByte;
*piStartOffset = pCsr->aOffset[iStart];
*piEndOffset = pCsr->aOffset[iEnd];
*piPosition = pCsr->iToken++;
return SQLITE_OK;
}
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
0, /* iVersion */
icuCreate, /* xCreate */
icuDestroy, /* xCreate */
icuOpen, /* xOpen */
icuClose, /* xClose */
icuNext, /* xNext */
};
/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
void sqlite3Fts2IcuTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &icuTokenizerModule;
}
#endif /* defined(SQLITE_ENABLE_ICU) */
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

View File

@ -1,644 +0,0 @@
/*
** 2006 September 30
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** Implementation of the full-text-search tokenizer that implements
** a Porter stemmer.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS2 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS2 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fts2_tokenizer.h"
/*
** Class derived from sqlite3_tokenizer
*/
typedef struct porter_tokenizer {
sqlite3_tokenizer base; /* Base class */
} porter_tokenizer;
/*
** Class derived from sqlit3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *zInput; /* input we are tokenizing */
int nInput; /* size of the input */
int iOffset; /* current position in zInput */
int iToken; /* index of next token to be returned */
char *zToken; /* storage for current token */
int nAllocated; /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
/* Forward declaration */
static const sqlite3_tokenizer_module porterTokenizerModule;
/*
** Create a new tokenizer instance.
*/
static int porterCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
porter_tokenizer *t;
t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
sqlite3_free(pTokenizer);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is zInput[0..nInput-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int porterOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *zInput, int nInput, /* String to be tokenized */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
porter_tokenizer_cursor *c;
c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
c->zInput = zInput;
if( zInput==0 ){
c->nInput = 0;
}else if( nInput<0 ){
c->nInput = (int)strlen(zInput);
}else{
c->nInput = nInput;
}
c->iOffset = 0; /* start tokenizing at the beginning */
c->iToken = 0;
c->zToken = NULL; /* no space allocated, yet. */
c->nAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** porterOpen() above.
*/
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
sqlite3_free(c->zToken);
sqlite3_free(c);
return SQLITE_OK;
}
/*
** Vowel or consonant
*/
static const char cType[] = {
0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
1, 1, 1, 2, 1
};
/*
** isConsonant() and isVowel() determine if their first character in
** the string they point to is a consonant or a vowel, according
** to Porter ruls.
**
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant,
** in which case it is a vowel.
**
** In these routine, the letters are in reverse order. So the 'y' rule
** is that 'y' is a consonant unless it is followed by another
** consonent.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
int j;
char x = *z;
if( x==0 ) return 0;
assert( x>='a' && x<='z' );
j = cType[x-'a'];
if( j<2 ) return j;
return z[1]==0 || isVowel(z + 1);
}
static int isVowel(const char *z){
int j;
char x = *z;
if( x==0 ) return 0;
assert( x>='a' && x<='z' );
j = cType[x-'a'];
if( j<2 ) return 1-j;
return isConsonant(z + 1);
}
/*
** Let any sequence of one or more vowels be represented by V and let
** C be sequence of one or more consonants. Then every word can be
** represented as:
**
** [C] (VC){m} [V]
**
** In prose: A word is an optional consonant followed by zero or
** vowel-consonant pairs followed by an optional vowel. "m" is the
** number of vowel consonant pairs. This routine computes the value
** of m for the first i bytes of a word.
**
** Return true if the m-value for z is 1 or more. In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order. So we are really looking
** for an instance of of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/* Like mgt0 above except we are looking for a value of m which is
** exactly 1
*/
static int m_eq_1(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
if( *z==0 ) return 0;
while( isVowel(z) ){ z++; }
if( *z==0 ) return 1;
while( isConsonant(z) ){ z++; }
return *z==0;
}
/* Like mgt0 above except we are looking for a value of m>1 instead
** or m>0
*/
static int m_gt_1(const char *z){
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
if( *z==0 ) return 0;
while( isVowel(z) ){ z++; }
if( *z==0 ) return 0;
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/*
** Return TRUE if there is a vowel anywhere within z[0..n-1]
*/
static int hasVowel(const char *z){
while( isConsonant(z) ){ z++; }
return *z!=0;
}
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here. So we are really looking at
** the first two characters of z[].
*/
static int doubleConsonant(const char *z){
return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
}
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonent and where the final consonant
** is not 'w', 'x', or 'y'.
**
** The word is reversed here. So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
return
z[0]!=0 && isConsonant(z) &&
z[0]!='w' && z[0]!='x' && z[0]!='y' &&
z[1]!=0 && isVowel(z+1) &&
z[2]!=0 && isConsonant(z+2);
}
/*
** If the word ends with zFrom and xCond() is true for the stem
** of the word that preceeds the zFrom ending, then change the
** ending to zTo.
**
** The input word *pz and zFrom are both in reverse order. zTo
** is in normal order.
**
** Return TRUE if zFrom matches. Return FALSE if zFrom does not
** match. Not that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
char **pz, /* The word being stemmed (Reversed) */
const char *zFrom, /* If the ending matches this... (Reversed) */
const char *zTo, /* ... change the ending to this (not reversed) */
int (*xCond)(const char*) /* Condition that must be true */
){
char *z = *pz;
while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
if( *zFrom!=0 ) return 0;
if( xCond && !xCond(z) ) return 1;
while( *zTo ){
*(--z) = *(zTo++);
}
*pz = z;
return 1;
}
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate. The input word is copied into the output with
** US-ASCII case folding. If the input word is too long (more
** than 20 bytes if it contains no digits or more than 6 bytes if
** it contains digits) then word is truncated to 20 or 6 bytes
** by taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
int i, mx, j;
int hasDigit = 0;
for(i=0; i<nIn; i++){
int c = zIn[i];
if( c>='A' && c<='Z' ){
zOut[i] = c - 'A' + 'a';
}else{
if( c>='0' && c<='9' ) hasDigit = 1;
zOut[i] = c;
}
}
mx = hasDigit ? 3 : 10;
if( nIn>mx*2 ){
for(j=mx, i=nIn-mx; i<nIn; i++, j++){
zOut[j] = zOut[i];
}
i = j;
}
zOut[i] = 0;
*pnOut = i;
}
/*
** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
** zOut is at least big enough to hold nIn bytes. Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case. Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word. If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end. For long words without digits, 10 bytes
** are taken from each end. US-ASCII case folding still applies.
**
** If the input word contains not digits but does characters not
** in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the input into the output with US-ASCII
** case folding.
**
** Stemming never increases the length of the word. So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
int i, j, c;
char zReverse[28];
char *z, *z2;
if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
/* The word is too big or too small for the porter stemmer.
** Fallback to the copy stemmer */
copy_stemmer(zIn, nIn, zOut, pnOut);
return;
}
for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
c = zIn[i];
if( c>='A' && c<='Z' ){
zReverse[j] = c + 'a' - 'A';
}else if( c>='a' && c<='z' ){
zReverse[j] = c;
}else{
/* The use of a character not in [a-zA-Z] means that we fallback
** to the copy stemmer */
copy_stemmer(zIn, nIn, zOut, pnOut);
return;
}
}
memset(&zReverse[sizeof(zReverse)-5], 0, 5);
z = &zReverse[j+1];
/* Step 1a */
if( z[0]=='s' ){
if(
!stem(&z, "sess", "ss", 0) &&
!stem(&z, "sei", "i", 0) &&
!stem(&z, "ss", "ss", 0)
){
z++;
}
}
/* Step 1b */
z2 = z;
if( stem(&z, "dee", "ee", m_gt_0) ){
/* Do nothing. The work was all in the test */
}else if(
(stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
&& z!=z2
){
if( stem(&z, "ta", "ate", 0) ||
stem(&z, "lb", "ble", 0) ||
stem(&z, "zi", "ize", 0) ){
/* Do nothing. The work was all in the test */
}else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
z++;
}else if( m_eq_1(z) && star_oh(z) ){
*(--z) = 'e';
}
}
/* Step 1c */
if( z[0]=='y' && hasVowel(z+1) ){
z[0] = 'i';
}
/* Step 2 */
switch( z[1] ){
case 'a':
stem(&z, "lanoita", "ate", m_gt_0) ||
stem(&z, "lanoit", "tion", m_gt_0);
break;
case 'c':
stem(&z, "icne", "ence", m_gt_0) ||
stem(&z, "icna", "ance", m_gt_0);
break;
case 'e':
stem(&z, "rezi", "ize", m_gt_0);
break;
case 'g':
stem(&z, "igol", "log", m_gt_0);
break;
case 'l':
stem(&z, "ilb", "ble", m_gt_0) ||
stem(&z, "illa", "al", m_gt_0) ||
stem(&z, "iltne", "ent", m_gt_0) ||
stem(&z, "ile", "e", m_gt_0) ||
stem(&z, "ilsuo", "ous", m_gt_0);
break;
case 'o':
stem(&z, "noitazi", "ize", m_gt_0) ||
stem(&z, "noita", "ate", m_gt_0) ||
stem(&z, "rota", "ate", m_gt_0);
break;
case 's':
stem(&z, "msila", "al", m_gt_0) ||
stem(&z, "ssenevi", "ive", m_gt_0) ||
stem(&z, "ssenluf", "ful", m_gt_0) ||
stem(&z, "ssensuo", "ous", m_gt_0);
break;
case 't':
stem(&z, "itila", "al", m_gt_0) ||
stem(&z, "itivi", "ive", m_gt_0) ||
stem(&z, "itilib", "ble", m_gt_0);
break;
}
/* Step 3 */
switch( z[0] ){
case 'e':
stem(&z, "etaci", "ic", m_gt_0) ||
stem(&z, "evita", "", m_gt_0) ||
stem(&z, "ezila", "al", m_gt_0);
break;
case 'i':
stem(&z, "itici", "ic", m_gt_0);
break;
case 'l':
stem(&z, "laci", "ic", m_gt_0) ||
stem(&z, "luf", "", m_gt_0);
break;
case 's':
stem(&z, "ssen", "", m_gt_0);
break;
}
/* Step 4 */
switch( z[1] ){
case 'a':
if( z[0]=='l' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'c':
if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
z += 4;
}
break;
case 'e':
if( z[0]=='r' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'i':
if( z[0]=='c' && m_gt_1(z+2) ){
z += 2;
}
break;
case 'l':
if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
z += 4;
}
break;
case 'n':
if( z[0]=='t' ){
if( z[2]=='a' ){
if( m_gt_1(z+3) ){
z += 3;
}
}else if( z[2]=='e' ){
stem(&z, "tneme", "", m_gt_1) ||
stem(&z, "tnem", "", m_gt_1) ||
stem(&z, "tne", "", m_gt_1);
}
}
break;
case 'o':
if( z[0]=='u' ){
if( m_gt_1(z+2) ){
z += 2;
}
}else if( z[3]=='s' || z[3]=='t' ){
stem(&z, "noi", "", m_gt_1);
}
break;
case 's':
if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
z += 3;
}
break;
case 't':
stem(&z, "eta", "", m_gt_1) ||
stem(&z, "iti", "", m_gt_1);
break;
case 'u':
if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
z += 3;
}
break;
case 'v':
case 'z':
if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
z += 3;
}
break;
}
/* Step 5a */
if( z[0]=='e' ){
if( m_gt_1(z+1) ){
z++;
}else if( m_eq_1(z+1) && !star_oh(z+1) ){
z++;
}
}
/* Step 5b */
if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
z++;
}
/* z[] is now the stemmed word in reverse order. Flip it back
** around into forward order and return.
*/
*pnOut = i = strlen(z);
zOut[i] = 0;
while( *z ){
zOut[--i] = *(z++);
}
}
/*
** Characters that can be part of a token. We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token. In other words, delimiters all must have
** values of 0x7f or lower.
*/
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
};
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to porterOpen().
*/
static int porterNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
const char **pzToken, /* OUT: *pzToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
const char *z = c->zInput;
while( c->iOffset<c->nInput ){
int iStartOffset, ch;
/* Scan past delimiter characters */
while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
c->iOffset++;
}
/* Count non-delimiter characters. */
iStartOffset = c->iOffset;
while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
c->iOffset++;
}
if( c->iOffset>iStartOffset ){
int n = c->iOffset-iStartOffset;
if( n>c->nAllocated ){
c->nAllocated = n+20;
c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
if( c->zToken==NULL ) return SQLITE_NOMEM;
}
porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
*pzToken = c->zToken;
*piStartOffset = iStartOffset;
*piEndOffset = c->iOffset;
*piPosition = c->iToken++;
return SQLITE_OK;
}
}
return SQLITE_DONE;
}
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
0,
porterCreate,
porterDestroy,
porterOpen,
porterClose,
porterNext,
};
/*
** Allocate a new porter tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts2PorterTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &porterTokenizerModule;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

View File

@ -1,375 +0,0 @@
/*
** 2007 June 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This is part of an SQLite module implementing full-text search.
** This particular file implements the generic tokenizer interface.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS2 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS2 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fts2_hash.h"
#include "fts2_tokenizer.h"
#include <assert.h>
/*
** Implementation of the SQL scalar function for accessing the underlying
** hash table. This function may be called as follows:
**
** SELECT <function-name>(<key-name>);
** SELECT <function-name>(<key-name>, <pointer>);
**
** where <function-name> is the name passed as the second argument
** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
**
** If the <pointer> argument is specified, it must be a blob value
** containing a pointer to be stored as the hash data corresponding
** to the string <key-name>. If <pointer> is not specified, then
** the string <key-name> must already exist in the has table. Otherwise,
** an error is returned.
**
** Whether or not the <pointer> argument is specified, the value returned
** is a blob containing the pointer stored as the hash data corresponding
** to string <key-name> (after the hash-table is updated, if applicable).
*/
static void scalarFunc(
sqlite3_context *context,
int argc,
sqlite3_value **argv
){
fts2Hash *pHash;
void *pPtr = 0;
const unsigned char *zName;
int nName;
assert( argc==1 || argc==2 );
pHash = (fts2Hash *)sqlite3_user_data(context);
zName = sqlite3_value_text(argv[0]);
nName = sqlite3_value_bytes(argv[0])+1;
if( argc==2 ){
void *pOld;
int n = sqlite3_value_bytes(argv[1]);
if( n!=sizeof(pPtr) ){
sqlite3_result_error(context, "argument type mismatch", -1);
return;
}
pPtr = *(void **)sqlite3_value_blob(argv[1]);
pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
if( pOld==pPtr ){
sqlite3_result_error(context, "out of memory", -1);
return;
}
}else{
pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
if( !pPtr ){
char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
sqlite3_result_error(context, zErr, -1);
sqlite3_free(zErr);
return;
}
}
sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
}
#ifdef SQLITE_TEST
#if defined(INCLUDE_SQLITE_TCL_H)
# include "sqlite_tcl.h"
#else
# include "tcl.h"
#endif
#include <string.h>
/*
** Implementation of a special SQL scalar function for testing tokenizers
** designed to be used in concert with the Tcl testing framework. This
** function must be called with two arguments:
**
** SELECT <function-name>(<key-name>, <input-string>);
** SELECT <function-name>(<key-name>, <pointer>);
**
** where <function-name> is the name passed as the second argument
** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
**
** The return value is a string that may be interpreted as a Tcl
** list. For each token in the <input-string>, three elements are
** added to the returned list. The first is the token position, the
** second is the token text (folded, stemmed, etc.) and the third is the
** substring of <input-string> associated with the token. For example,
** using the built-in "simple" tokenizer:
**
** SELECT fts_tokenizer_test('simple', 'I don't see how');
**
** will return the string:
**
** "{0 i I 1 dont don't 2 see see 3 how how}"
**
*/
static void testFunc(
sqlite3_context *context,
int argc,
sqlite3_value **argv
){
fts2Hash *pHash;
sqlite3_tokenizer_module *p;
sqlite3_tokenizer *pTokenizer = 0;
sqlite3_tokenizer_cursor *pCsr = 0;
const char *zErr = 0;
const char *zName;
int nName;
const char *zInput;
int nInput;
const char *zArg = 0;
const char *zToken;
int nToken;
int iStart;
int iEnd;
int iPos;
Tcl_Obj *pRet;
assert( argc==2 || argc==3 );
nName = sqlite3_value_bytes(argv[0]);
zName = (const char *)sqlite3_value_text(argv[0]);
nInput = sqlite3_value_bytes(argv[argc-1]);
zInput = (const char *)sqlite3_value_text(argv[argc-1]);
if( argc==3 ){
zArg = (const char *)sqlite3_value_text(argv[1]);
}
pHash = (fts2Hash *)sqlite3_user_data(context);
p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
if( !p ){
char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
sqlite3_result_error(context, zErr, -1);
sqlite3_free(zErr);
return;
}
pRet = Tcl_NewObj();
Tcl_IncrRefCount(pRet);
if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
zErr = "error in xCreate()";
goto finish;
}
pTokenizer->pModule = p;
if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
zErr = "error in xOpen()";
goto finish;
}
pCsr->pTokenizer = pTokenizer;
while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
zToken = &zInput[iStart];
nToken = iEnd-iStart;
Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
}
if( SQLITE_OK!=p->xClose(pCsr) ){
zErr = "error in xClose()";
goto finish;
}
if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
zErr = "error in xDestroy()";
goto finish;
}
finish:
if( zErr ){
sqlite3_result_error(context, zErr, -1);
}else{
sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
}
Tcl_DecrRefCount(pRet);
}
static
int registerTokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module *p
){
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
sqlite3_step(pStmt);
return sqlite3_finalize(pStmt);
}
static
int queryFts2Tokenizer(
sqlite3 *db,
char *zName,
const sqlite3_tokenizer_module **pp
){
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts2_tokenizer(?)";
*pp = 0;
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
if( SQLITE_ROW==sqlite3_step(pStmt) ){
if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
}
}
return sqlite3_finalize(pStmt);
}
void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
/*
** Implementation of the scalar function fts2_tokenizer_internal_test().
** This function is used for testing only, it is not included in the
** build unless SQLITE_TEST is defined.
**
** The purpose of this is to test that the fts2_tokenizer() function
** can be used as designed by the C-code in the queryFts2Tokenizer and
** registerTokenizer() functions above. These two functions are repeated
** in the README.tokenizer file as an example, so it is important to
** test them.
**
** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
** function with no arguments. An assert() will fail if a problem is
** detected. i.e.:
**
** SELECT fts2_tokenizer_internal_test();
**
*/
static void intTestFunc(
sqlite3_context *context,
int argc,
sqlite3_value **argv
){
int rc;
const sqlite3_tokenizer_module *p1;
const sqlite3_tokenizer_module *p2;
sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
/* Test the query function */
sqlite3Fts2SimpleTokenizerModule(&p1);
rc = queryFts2Tokenizer(db, "simple", &p2);
assert( rc==SQLITE_OK );
assert( p1==p2 );
rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
assert( rc==SQLITE_ERROR );
assert( p2==0 );
assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
/* Test the storage function */
rc = registerTokenizer(db, "nosuchtokenizer", p1);
assert( rc==SQLITE_OK );
rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
assert( rc==SQLITE_OK );
assert( p2==p1 );
sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}
#endif
/*
** Set up SQL objects in database db used to access the contents of
** the hash table pointed to by argument pHash. The hash table must
** been initialized to use string keys, and to take a private copy
** of the key when a value is inserted. i.e. by a call similar to:
**
** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
**
** This function adds a scalar function (see header comment above
** scalarFunc() in this file for details) and, if ENABLE_TABLE is
** defined at compilation time, a temporary virtual table (see header
** comment above struct HashTableVtab) to the database schema. Both
** provide read/write access to the contents of *pHash.
**
** The third argument to this function, zName, is used as the name
** of both the scalar and, if created, the virtual table.
*/
int sqlite3Fts2InitHashTable(
sqlite3 *db,
fts2Hash *pHash,
const char *zName
){
int rc = SQLITE_OK;
void *p = (void *)pHash;
const int any = SQLITE_ANY;
char *zTest = 0;
char *zTest2 = 0;
#ifdef SQLITE_TEST
void *pdb = (void *)db;
zTest = sqlite3_mprintf("%s_test", zName);
zTest2 = sqlite3_mprintf("%s_internal_test", zName);
if( !zTest || !zTest2 ){
rc = SQLITE_NOMEM;
}
#endif
if( rc!=SQLITE_OK
|| (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
|| (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
|| (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
|| (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
|| (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
#endif
);
sqlite3_free(zTest);
sqlite3_free(zTest2);
return rc;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

View File

@ -1,145 +0,0 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS2_TOKENIZER_H_
#define _FTS2_TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts2 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts2 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
struct sqlite3_tokenizer_module {
/*
** Structure version. Should always be set to 0.
*/
int iVersion;
/*
** Create a new tokenizer. The values in the argv[] array are the
** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
** TABLE statement that created the fts2 table. For example, if
** the following SQL is executed:
**
** CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2)
**
** then argc is set to 2, and the argv[] array contains pointers
** to the strings "arg1" and "arg2".
**
** This method should return either SQLITE_OK (0), or an SQLite error
** code. If SQLITE_OK is returned, then *ppTokenizer should be set
** to point at the newly created tokenizer structure. The generic
** sqlite3_tokenizer.pModule variable should not be initialized by
** this callback. The caller will do so.
*/
int (*xCreate)(
int argc, /* Size of argv array */
const char *const*argv, /* Tokenizer argument strings */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
);
/*
** Destroy an existing tokenizer. The fts2 module calls this method
** exactly once for each successful call to xCreate().
*/
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Create a tokenizer cursor to tokenize an input buffer. The caller
** is responsible for ensuring that the input buffer remains valid
** until the cursor is closed (using the xClose() method).
*/
int (*xOpen)(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
);
/*
** Destroy an existing tokenizer cursor. The fts2 module calls this
** method exactly once for each successful call to xOpen().
*/
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
/*
** Retrieve the next token from the tokenizer cursor pCursor. This
** method should either return SQLITE_OK and set the values of the
** "OUT" variables identified below, or SQLITE_DONE to indicate that
** the end of the buffer has been reached, or an SQLite error code.
**
** *ppToken should be set to point at a buffer containing the
** normalized version of the token (i.e. after any case-folding and/or
** stemming has been performed). *pnBytes should be set to the length
** of this buffer in bytes. The input text that generated the token is
** identified by the byte offsets returned in *piStartOffset and
** *piEndOffset.
**
** The buffer *ppToken is set to point at is managed by the tokenizer
** implementation. It is only required to be valid until the next call
** to xNext() or xClose().
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xNext)(
sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
int *piStartOffset, /* OUT: Byte offset of token in input buffer */
int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
int *piPosition /* OUT: Number of tokens returned before this one */
);
};
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
#endif /* _FTS2_TOKENIZER_H_ */

View File

@ -1,233 +0,0 @@
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "simple" full-text-search tokenizer.
*/
/*
** The code in this file is only compiled if:
**
** * The FTS2 module is being built as an extension
** (in which case SQLITE_CORE is not defined), or
**
** * The FTS2 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fts2_tokenizer.h"
typedef struct simple_tokenizer {
sqlite3_tokenizer base;
char delim[128]; /* flag ASCII delimiters */
} simple_tokenizer;
typedef struct simple_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
const char *pInput; /* input we are tokenizing */
int nBytes; /* size of the input */
int iOffset; /* current position in pInput */
int iToken; /* index of next token to be returned */
char *pToken; /* storage for current token */
int nTokenAllocated; /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;
static int simpleDelim(simple_tokenizer *t, unsigned char c){
return c<0x80 && t->delim[c];
}
/*
** Create a new tokenizer instance.
*/
static int simpleCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
simple_tokenizer *t;
t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
/* TODO(shess) Delimiters need to remain the same from run to run,
** else we need to reindex. One solution would be a meta-table to
** track such information in the database, then we'd only want this
** information on the initial create.
*/
if( argc>1 ){
int i, n = strlen(argv[1]);
for(i=0; i<n; i++){
unsigned char ch = argv[1][i];
/* We explicitly don't support UTF-8 delimiters for now. */
if( ch>=0x80 ){
sqlite3_free(t);
return SQLITE_ERROR;
}
t->delim[ch] = 1;
}
} else {
/* Mark non-alphanumeric ASCII characters as delimiters */
int i;
for(i=1; i<0x80; i++){
t->delim[i] = !((i>='0' && i<='9') || (i>='A' && i<='Z') ||
(i>='a' && i<='z'));
}
}
*ppTokenizer = &t->base;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
sqlite3_free(pTokenizer);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is pInput[0..nBytes-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int simpleOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *pInput, int nBytes, /* String to be tokenized */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
simple_tokenizer_cursor *c;
c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
c->pInput = pInput;
if( pInput==0 ){
c->nBytes = 0;
}else if( nBytes<0 ){
c->nBytes = (int)strlen(pInput);
}else{
c->nBytes = nBytes;
}
c->iOffset = 0; /* start tokenizing at the beginning */
c->iToken = 0;
c->pToken = NULL; /* no space allocated, yet. */
c->nTokenAllocated = 0;
*ppCursor = &c->base;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
sqlite3_free(c->pToken);
sqlite3_free(c);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
unsigned char *p = (unsigned char *)c->pInput;
while( c->iOffset<c->nBytes ){
int iStartOffset;
/* Scan past delimiter characters */
while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
/* Count non-delimiter characters. */
iStartOffset = c->iOffset;
while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
c->iOffset++;
}
if( c->iOffset>iStartOffset ){
int i, n = c->iOffset-iStartOffset;
if( n>c->nTokenAllocated ){
c->nTokenAllocated = n+20;
c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
if( c->pToken==NULL ) return SQLITE_NOMEM;
}
for(i=0; i<n; i++){
/* TODO(shess) This needs expansion to handle UTF-8
** case-insensitivity.
*/
unsigned char ch = p[iStartOffset+i];
c->pToken[i] = (ch>='A' && ch<='Z') ? (ch - 'A' + 'a') : ch;
}
*ppToken = c->pToken;
*pnBytes = n;
*piStartOffset = iStartOffset;
*piEndOffset = c->iOffset;
*piPosition = c->iToken++;
return SQLITE_OK;
}
}
return SQLITE_DONE;
}
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
0,
simpleCreate,
simpleDestroy,
simpleOpen,
simpleClose,
simpleNext,
};
/*
** Allocate a new simple tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts2SimpleTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &simpleTokenizerModule;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

View File

@ -1,116 +0,0 @@
#!/usr/bin/tclsh
#
# This script builds a single C code file holding all of FTS2 code.
# The name of the output file is fts2amal.c. To build this file,
# first do:
#
# make target_source
#
# The make target above moves all of the source code files into
# a subdirectory named "tsrc". (This script expects to find the files
# there and will not work if they are not found.)
#
# After the "tsrc" directory has been created and populated, run
# this script:
#
# tclsh mkfts2amal.tcl
#
# The amalgamated FTS2 code will be written into fts2amal.c
#
# Open the output file and write a header comment at the beginning
# of the file.
#
set out [open fts2amal.c w]
set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
puts $out [subst \
{/******************************************************************************
** This file is an amalgamation of separate C source files from the SQLite
** Full Text Search extension 2 (fts2). By combining all the individual C
** code files into this single large file, the entire code can be compiled
** as a one translation unit. This allows many compilers to do optimizations
** that would not be possible if the files were compiled separately. It also
** makes the code easier to import into other projects.
**
** This amalgamation was generated on $today.
*/}]
# These are the header files used by FTS2. The first time any of these
# files are seen in a #include statement in the C code, include the complete
# text of the file in-line. The file only needs to be included once.
#
foreach hdr {
fts2.h
fts2_hash.h
fts2_tokenizer.h
sqlite3.h
sqlite3ext.h
} {
set available_hdr($hdr) 1
}
# 78 stars used for comment formatting.
set s78 \
{*****************************************************************************}
# Insert a comment into the code
#
proc section_comment {text} {
global out s78
set n [string length $text]
set nstar [expr {60 - $n}]
set stars [string range $s78 0 $nstar]
puts $out "/************** $text $stars/"
}
# Read the source file named $filename and write it into the
# sqlite3.c output file. If any #include statements are seen,
# process them approprately.
#
proc copy_file {filename} {
global seen_hdr available_hdr out
set tail [file tail $filename]
section_comment "Begin file $tail"
set in [open $filename r]
while {![eof $in]} {
set line [gets $in]
if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
if {[info exists available_hdr($hdr)]} {
if {$available_hdr($hdr)} {
section_comment "Include $hdr in the middle of $tail"
copy_file tsrc/$hdr
section_comment "Continuing where we left off in $tail"
}
} elseif {![info exists seen_hdr($hdr)]} {
set seen_hdr($hdr) 1
puts $out $line
}
} elseif {[regexp {^#ifdef __cplusplus} $line]} {
puts $out "#if 0"
} elseif {[regexp {^#line} $line]} {
# Skip #line directives.
} else {
puts $out $line
}
}
close $in
section_comment "End of $tail"
}
# Process the source files. Process files containing commonly
# used subroutines first in order to help the compiler find
# inlining opportunities.
#
foreach file {
fts2.c
fts2_hash.c
fts2_porter.c
fts2_tokenizer.c
fts2_tokenizer1.c
fts2_icu.c
} {
copy_file tsrc/$file
}
close $out

48
main.mk
View File

@ -192,24 +192,6 @@ SRC = \
# Source code for extensions
#
SRC += \
$(TOP)/ext/fts1/fts1.c \
$(TOP)/ext/fts1/fts1.h \
$(TOP)/ext/fts1/fts1_hash.c \
$(TOP)/ext/fts1/fts1_hash.h \
$(TOP)/ext/fts1/fts1_porter.c \
$(TOP)/ext/fts1/fts1_tokenizer.h \
$(TOP)/ext/fts1/fts1_tokenizer1.c
SRC += \
$(TOP)/ext/fts2/fts2.c \
$(TOP)/ext/fts2/fts2.h \
$(TOP)/ext/fts2/fts2_hash.c \
$(TOP)/ext/fts2/fts2_hash.h \
$(TOP)/ext/fts2/fts2_icu.c \
$(TOP)/ext/fts2/fts2_porter.c \
$(TOP)/ext/fts2/fts2_tokenizer.h \
$(TOP)/ext/fts2/fts2_tokenizer.c \
$(TOP)/ext/fts2/fts2_tokenizer1.c
SRC += \
$(TOP)/ext/fts3/fts3.c \
$(TOP)/ext/fts3/fts3.h \
@ -397,7 +379,6 @@ TESTSRC += \
$(TOP)/ext/recover/test_recover.c
#TESTSRC += $(TOP)/ext/fts2/fts2_tokenizer.c
#TESTSRC += $(TOP)/ext/fts3/fts3_tokenizer.c
TESTSRC2 = \
@ -482,14 +463,6 @@ HDR = \
# Header files used by extensions
#
EXTHDR += \
$(TOP)/ext/fts1/fts1.h \
$(TOP)/ext/fts1/fts1_hash.h \
$(TOP)/ext/fts1/fts1_tokenizer.h
EXTHDR += \
$(TOP)/ext/fts2/fts2.h \
$(TOP)/ext/fts2/fts2_hash.h \
$(TOP)/ext/fts2/fts2_tokenizer.h
EXTHDR += \
$(TOP)/ext/fts3/fts3.h \
$(TOP)/ext/fts3/fts3Int.h \
@ -690,9 +663,6 @@ sqlite3.c-debug: target_source $(TOP)/tool/mksqlite3c.tcl
sqlite3-all.c: sqlite3.c $(TOP)/tool/split-sqlite3c.tcl
tclsh $(TOP)/tool/split-sqlite3c.tcl
fts2amal.c: target_source $(TOP)/ext/fts2/mkfts2amal.tcl
tclsh $(TOP)/ext/fts2/mkfts2amal.tcl
# Rules to build the LEMON compiler generator
#
lemon: $(TOP)/tool/lemon.c $(TOP)/tool/lempar.c
@ -787,24 +757,6 @@ shell.c: $(SHELL_SRC) $(TOP)/tool/mkshellc.tcl
icu.o: $(TOP)/ext/icu/icu.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/icu/icu.c
fts2.o: $(TOP)/ext/fts2/fts2.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2.c
fts2_hash.o: $(TOP)/ext/fts2/fts2_hash.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_hash.c
fts2_icu.o: $(TOP)/ext/fts2/fts2_icu.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_icu.c
fts2_porter.o: $(TOP)/ext/fts2/fts2_porter.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_porter.c
fts2_tokenizer.o: $(TOP)/ext/fts2/fts2_tokenizer.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_tokenizer.c
fts2_tokenizer1.o: $(TOP)/ext/fts2/fts2_tokenizer1.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts2/fts2_tokenizer1.c
fts3.o: $(TOP)/ext/fts3/fts3.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3.c

View File

@ -1,11 +1,11 @@
C Add\sOOM\scheck,\sper\stip\sat\s[forum:/forumpost/933479b2d5|forum\spost\s933479b2d5]
D 2023-01-14T19:27:40.985
C Omit\sthe\slong-disused\sFTS1\sand\sFTS2\simplements\sfrom\sthe\sactive\ssource\stree.\nThe\scode\swill\spersist\sforever\sin\sthe\ssource\srepository,\sbut\sthere\sis\sno\spoint\nin\scarrying\sit\saround\sin\sthe\slatest\starballs\swhere\sit\sis\snever\sused.
D 2023-01-14T19:53:42.098
F .fossil-settings/empty-dirs dbb81e8fc0401ac46a1491ab34a7f2c7c0452f2f06b54ebb845d024ca8283ef1
F .fossil-settings/ignore-glob 35175cdfcf539b2318cb04a9901442804be81cd677d8b889fcc9149c21f239ea
F LICENSE.md df5091916dbb40e6e9686186587125e1b2ff51f022cc334e886c19a0e9982724
F Makefile.in 341f02570d220695100004c447773ecb6c082e24178fc45dcbc0a212abaa0655
F Makefile.in 4bba1e74ea2ffc95b9f299d72aebec06dc55c2d93948c93a2f356cc9682912ea
F Makefile.linux-gcc f609543700659711fbd230eced1f01353117621dccae7b9fb70daa64236c5241
F Makefile.msc 5a13f784bd91c246673648f056e7f3193c45d53dff874eecde547ec854c292b8
F Makefile.msc 6de67bb5a8e849f9a7a11085366cc8daf99845fa21368e89b3cd195abd62cd2a
F README.md 8b8df9ca852aeac4864eb1e400002633ee6db84065bd01b78c33817f97d31f5e
F VERSION 413ec94920a487ae32c9a2a8819544d690662d6f7c7ce025c0d0b8a1e74fa9db
F aclocal.m4 a5c22d164aff7ed549d53a90fa56d56955281f50
@ -54,32 +54,6 @@ F ext/expert/expert1.test 95b00567ce0775126a1b788af2d055255014714ecfddc97913864d
F ext/expert/sqlite3expert.c a912efbad597eafdb0ce934ebc11039f3190b2d479685d89184e107f65d856e1
F ext/expert/sqlite3expert.h ca81efc2679a92373a13a3e76a6138d0310e32be53d6c3bfaedabd158ea8969b
F ext/expert/test_expert.c d56c194b769bdc90cf829a14c9ecbc1edca9c850b837a4d0b13be14095c32a72
F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e
F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b
F ext/fts1/ft_hash.h 06df7bba40dadd19597aa400a875dbc2fed705ea
F ext/fts1/fts1.c a39f7d21c2994d27c959ef9c3505c81542c81432
F ext/fts1/fts1.h 6060b8f62c1d925ea8356cb1a6598073eb9159a6
F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114
F ext/fts1/fts1_hash.h e7f0d761353996a8175eda351104acfde23afcb0
F ext/fts1/fts1_porter.c b1c7304b8988ba3f764a147cdd32043b4913ea7b
F ext/fts1/fts1_tokenizer.h fdea722c38a9f82ed921642981234f666e47919c
F ext/fts1/fts1_tokenizer1.c fd00d1fe4dc30dfc5c64cba695ce34f4af20d2fa
F ext/fts1/fulltext.c 37698e1909584f6d8ea67d1485e3ad39dbf42d19
F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
F ext/fts1/simple_tokenizer.c bbfa4e3b2a26ef17d4edc6d98cd4a3f5396d998a
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
F ext/fts2/README.tokenizers 21e3684ea5a095b55d70f6878b4ce6af5932dfb7
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
F ext/fts2/fts2.c 72c816a9ae448049fbbe8f18a85698765fc7956c
F ext/fts2/fts2.h da5f76c65163301d1068a971fd32f4119e3c95fa
F ext/fts2/fts2_hash.c 011a1d32de45bb1b519a1fd0048e857d6a843558
F ext/fts2/fts2_hash.h 1824b99dfd8d0225facbdb26a2c87289b2e7dcf8
F ext/fts2/fts2_icu.c 51c5cd3c04954badd329fa738c95fcdb717b5188
F ext/fts2/fts2_porter.c 2cd4a507bf3c3085fe66f59b0f2a325f65aaacf5
F ext/fts2/fts2_tokenizer.c b529493d55e55497213c37e1f31680a77746be26
F ext/fts2/fts2_tokenizer.h 27a1a99ca2d615cf7e142839b8d79e8751b4529e
F ext/fts2/fts2_tokenizer1.c 07e223eecb483d448313b5f1553a4f299a7fb7a1
F ext/fts2/mkfts2amal.tcl 974d5d438cb3f7c4a652639262f82418c1e4cff0
F ext/fts3/README.content b9078d0843a094d86af0d48dffbff13c906702b4c3558012e67b9c7cc3bf59ee
F ext/fts3/README.syntax a19711dc5458c20734b8e485e75fb1981ec2427a
F ext/fts3/README.tokenizers b92bdeb8b46503f0dd301d364efc5ef59ef9fa8e2758b8e742f39fa93a2e422d
@ -565,7 +539,7 @@ F ext/wasm/wasmfs.make cf9a68162d92ca2bcb0b9528b244cb36d5cc2d84ccc9c2d398461927d
F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 x
F ltmain.sh 3ff0879076df340d2e23ae905484d8c15d5fdea8
F magic.txt 5ade0bc977aa135e79e3faaea894d5671b26107cc91e70783aa7dc83f22f3ba0
F main.mk ed4950f3e1d03687e6c94c0f9aa26373ddab628030480d1e298097cfe0b9a5cf
F main.mk e8aca588ea9b98108e303972fbe6fa08d0481afbad21e81c288ef44a9a695dab
F mkso.sh fd21c06b063bb16a5d25deea1752c2da6ac3ed83
F mptest/config01.test 3c6adcbc50b991866855f1977ff172eb6d901271
F mptest/config02.test 4415dfe36c48785f751e16e32c20b077c28ae504
@ -2069,8 +2043,8 @@ F vsixtest/vsixtest.tcl 6a9a6ab600c25a91a7acc6293828957a386a8a93
F vsixtest/vsixtest.vcxproj.data 2ed517e100c66dc455b492e1a33350c1b20fbcdc
F vsixtest/vsixtest.vcxproj.filters 37e51ffedcdb064aad6ff33b6148725226cd608e
F vsixtest/vsixtest_TemporaryKey.pfx e5b1b036facdb453873e7084e1cae9102ccc67a0
P eac135fd6a5dd220a0f7c1b61f987bbd801faabdb76846c4417fef0dd03fcb87
R 99e97b5c1ed332ad2978f8ea2c2fcf8e
U larrybr
Z ebace40712668be3208071b1f9005e7b
P eda84dcffee6016fb3f8588d96c7ffb6275edd626b11f6fe12e81be90226c7d8
R 7852d0ec8fdd16208bf35878fba2c912
U drh
Z 906fb928e09759a07caadcd98e3668bb
# Remove this line to create a well-formed Fossil manifest.

View File

@ -1 +1 @@
eda84dcffee6016fb3f8588d96c7ffb6275edd626b11f6fe12e81be90226c7d8
2bb50d5aedef0fd216d94058f477a58d88aa3a68bbadc94fa67998b7c391a8ff