2b85d5f46e
us to break any UTF-8 code points, unless they were already broken in the input. (CVS 3376) FossilOrigin-Name: 6c77c2d5e15e9d3efed3e274bc93cd5a4868f574
175 lines
4.7 KiB
C
175 lines
4.7 KiB
C
/*
|
|
** The author disclaims copyright to this source code.
|
|
**
|
|
*************************************************************************
|
|
** Implementation of the "simple" full-text-search tokenizer.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#if !defined(__APPLE__)
|
|
#include <malloc.h>
|
|
#else
|
|
#include <stdlib.h>
|
|
#endif
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
|
|
#include "tokenizer.h"
|
|
|
|
/*
** Return a freshly malloc()ed copy of the nul-terminated string s.
** The caller owns the result and must free() it.
**
** Returns NULL if s is NULL or if the allocation fails, so callers
** must check the result before using it.
**
** (We don't use strdup() since it's not part of the standard C library
** and may not be available everywhere.)
**
** TODO(shess) Copied from fulltext.c, consider util.c for such
** things.
*/
static char *string_dup(const char *s){
  size_t nByte;
  char *str;

  if( s==NULL ) return NULL;

  nByte = strlen(s) + 1;        /* +1 so the terminator is copied too */
  str = malloc(nByte);
  if( str!=NULL ){
    memcpy(str, s, nByte);
  }
  return str;
}
|
|
|
|
typedef struct simple_tokenizer {
|
|
sqlite3_tokenizer base;
|
|
const char *zDelim; /* token delimiters */
|
|
} simple_tokenizer;
|
|
|
|
typedef struct simple_tokenizer_cursor {
|
|
sqlite3_tokenizer_cursor base;
|
|
const char *pInput; /* input we are tokenizing */
|
|
int nBytes; /* size of the input */
|
|
const char *pCurrent; /* current position in pInput */
|
|
int iToken; /* index of next token to be returned */
|
|
char *zToken; /* storage for current token */
|
|
int nTokenBytes; /* actual size of current token */
|
|
int nTokenAllocated; /* space allocated to zToken buffer */
|
|
} simple_tokenizer_cursor;
|
|
|
|
/* Forward declaration of the module's method table.  The initialized
** definition appears later in this file, after the callback functions
** it lists. */
static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
|
|
|
|
static int simpleCreate(
|
|
int argc, const char **argv,
|
|
sqlite3_tokenizer **ppTokenizer
|
|
){
|
|
simple_tokenizer *t;
|
|
|
|
t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
|
|
/* TODO(shess) Delimiters need to remain the same from run to run,
|
|
** else we need to reindex. One solution would be a meta-table to
|
|
** track such information in the database, then we'd only want this
|
|
** information on the initial create.
|
|
*/
|
|
if( argc>1 ){
|
|
t->zDelim = string_dup(argv[1]);
|
|
} else {
|
|
/* Build a string excluding alphanumeric ASCII characters */
|
|
char zDelim[0x80]; /* nul-terminated, so nul not a member */
|
|
int i, j;
|
|
for(i=1, j=0; i<0x80; i++){
|
|
if( !isalnum(i) ){
|
|
zDelim[j++] = i;
|
|
}
|
|
}
|
|
zDelim[j++] = '\0';
|
|
assert( j<=sizeof(zDelim) );
|
|
t->zDelim = string_dup(zDelim);
|
|
}
|
|
|
|
*ppTokenizer = &t->base;
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
** Release a tokenizer: frees the delimiter string it holds and then
** the tokenizer object itself.  Always returns SQLITE_OK.
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *pSimple = (simple_tokenizer *)pTokenizer;
  void *pDelim = (void *)pSimple->zDelim;  /* cast drops the const qualifier */

  free(pDelim);
  free(pSimple);
  return SQLITE_OK;
}
|
|
|
|
static int simpleOpen(
|
|
sqlite3_tokenizer *pTokenizer,
|
|
const char *pInput, int nBytes,
|
|
sqlite3_tokenizer_cursor **ppCursor
|
|
){
|
|
simple_tokenizer_cursor *c;
|
|
|
|
c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
|
|
c->pInput = pInput;
|
|
c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
|
|
c->pCurrent = c->pInput; /* start tokenizing at the beginning */
|
|
c->iToken = 0;
|
|
c->zToken = NULL; /* no space allocated, yet. */
|
|
c->nTokenBytes = 0;
|
|
c->nTokenAllocated = 0;
|
|
|
|
*ppCursor = &c->base;
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
** Close a cursor: frees its token buffer (if one was ever allocated)
** and then the cursor object.  Always returns SQLITE_OK.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *)pCursor;

  /* free(NULL) is a no-op, so no guard is needed for an unused buffer. */
  free(pCsr->zToken);
  free(pCsr);
  return SQLITE_OK;
}
|
|
|
|
static int simpleNext(
|
|
sqlite3_tokenizer_cursor *pCursor,
|
|
const char **ppToken, int *pnBytes,
|
|
int *piStartOffset, int *piEndOffset, int *piPosition
|
|
){
|
|
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
|
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
|
int ii;
|
|
|
|
while( c->pCurrent-c->pInput<c->nBytes ){
|
|
int n = (int) strcspn(c->pCurrent, t->zDelim);
|
|
if( n>0 ){
|
|
if( n+1>c->nTokenAllocated ){
|
|
c->zToken = realloc(c->zToken, n+1);
|
|
}
|
|
for(ii=0; ii<n; ii++){
|
|
/* TODO(shess) This needs expansion to handle UTF-8
|
|
** case-insensitivity.
|
|
*/
|
|
char ch = c->pCurrent[ii];
|
|
c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
|
|
}
|
|
c->zToken[n] = '\0';
|
|
*ppToken = c->zToken;
|
|
*pnBytes = n;
|
|
*piStartOffset = (int) (c->pCurrent-c->pInput);
|
|
*piEndOffset = *piStartOffset+n;
|
|
*piPosition = c->iToken++;
|
|
c->pCurrent += n + 1;
|
|
|
|
return SQLITE_OK;
|
|
}
|
|
c->pCurrent += n + 1;
|
|
/* TODO(shess) could strspn() to skip delimiters en masse. Needs
|
|
** to happen in two places, though, which is annoying.
|
|
*/
|
|
}
|
|
return SQLITE_DONE;
|
|
}
|
|
|
|
static sqlite3_tokenizer_module simpleTokenizerModule = {
|
|
0,
|
|
simpleCreate,
|
|
simpleDestroy,
|
|
simpleOpen,
|
|
simpleClose,
|
|
simpleNext,
|
|
};
|
|
|
|
void get_simple_tokenizer_module(
|
|
sqlite3_tokenizer_module **ppModule
|
|
){
|
|
*ppModule = &simpleTokenizerModule;
|
|
}
|