Add some documentation for user-defined fts2 tokenizers. (CVS 4116)
FossilOrigin-Name: 5a9eee86587219a68655d548864d129edec969ae
This commit is contained in:
parent
27b1f95f08
commit
24e1afa222
124
ext/fts2/README.tokenizers
Normal file
124
ext/fts2/README.tokenizers
Normal file
@ -0,0 +1,124 @@
|
||||
|
||||
1. FTS2 Tokenizers
|
||||
|
||||
When creating a new full-text table, FTS2 allows the user to select
|
||||
the text tokenizer implementation to be used when indexing text
|
||||
by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
|
||||
statement:
|
||||
|
||||
CREATE VIRTUAL TABLE <table-name> USING fts2(
|
||||
<columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
|
||||
);
|
||||
|
||||
The built-in tokenizers (valid values to pass as <tokenizer-name>) are
|
||||
"simple" and "porter".
|
||||
|
||||
<tokenizer-args> should consist of zero or more white-space separated
|
||||
arguments to pass to the selected tokenizer implementation. The
|
||||
interpretation of the arguments, if any, depends on the individual
|
||||
tokenizer.
|
||||
|
||||
2. Custom Tokenizers
|
||||
|
||||
FTS2 allows users to provide custom tokenizer implementations. The
|
||||
interface used to create a new tokenizer is defined and described in
|
||||
the fts2_tokenizer.h source file.
|
||||
|
||||
Registering a new FTS2 tokenizer is similar to registering a new
|
||||
virtual table module with SQLite. The user passes a pointer to a
|
||||
structure containing pointers to various callback functions that
|
||||
make up the implementation of the new tokenizer type. For tokenizers,
|
||||
the structure (defined in fts2_tokenizer.h) is called
|
||||
"sqlite3_tokenizer_module".
|
||||
|
||||
FTS2 does not expose a C-function that users call to register new
|
||||
tokenizer types with a database handle. Instead, the pointer must
|
||||
be encoded as an SQL blob value and passed to FTS2 through the SQL
|
||||
engine by evaluating a special scalar function, "fts2_tokenizer()".
|
||||
The fts2_tokenizer() function may be called with one or two arguments,
|
||||
as follows:
|
||||
|
||||
SELECT fts2_tokenizer(<tokenizer-name>);
|
||||
SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
|
||||
|
||||
Where <tokenizer-name> is a string identifying the tokenizer and
|
||||
<sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
|
||||
structure encoded as an SQL blob. If the second argument is present,
|
||||
it is registered as tokenizer <tokenizer-name> and a copy of it
|
||||
is returned. If only one argument is passed, a pointer to the tokenizer
|
||||
implementation currently registered as <tokenizer-name> is returned,
|
||||
encoded as a blob. Or, if no such tokenizer exists, an SQL NULL value
|
||||
is returned.
|
||||
|
||||
SECURITY: If the fts2 extension is used in an environment where potentially
|
||||
malicious users may execute arbitrary SQL (e.g. Google Gears), they should be
|
||||
prevented from invoking the fts2_tokenizer() function, possibly using the
|
||||
authorisation callback.
|
||||
|
||||
See "Sample code" below for an example of calling the fts2_tokenizer()
|
||||
function from C code.
|
||||
|
||||
3. ICU Library Tokenizers
|
||||
|
||||
If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
|
||||
symbol defined, then there exists a built-in tokenizer named "icu"
|
||||
implemented using the ICU library. The first argument passed to the
|
||||
xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
|
||||
an ICU locale identifier. For example "tr_TR" for Turkish as used
|
||||
in Turkey, or "en_AU" for English as used in Australia. A complete statement:
|
||||
|
||||
"CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
|
||||
|
||||
The ICU tokenizer implementation is very simple. It splits the input
|
||||
text according to the ICU rules for finding word boundaries and discards
|
||||
any tokens that consist entirely of white-space. This may be suitable
|
||||
for some applications in some locales, but not all. If more complex
|
||||
processing is required, for example to implement stemming or
|
||||
discard punctuation, this can be done by creating a tokenizer
|
||||
implementation that uses the ICU tokenizer as part of its implementation.
|
||||
|
||||
When using the ICU tokenizer this way, it is safe to overwrite the
|
||||
contents of the strings returned by the xNext() method (see
|
||||
fts2_tokenizer.h).
|
||||
|
||||
4. Sample code.
|
||||
|
||||
The following two code samples illustrate the way C code should invoke
|
||||
the fts2_tokenizer() scalar function:
|
||||
|
||||
/*
** Register the tokenizer implementation p with database handle db under
** the name zName by evaluating "SELECT fts2_tokenizer(?, ?)".
**
** The value bound to the second SQL parameter is the pointer p itself
** (sizeof(p) bytes copied from &p), not the structure it points to.
**
** If the statement cannot be prepared, the error code from
** sqlite3_prepare_v2() is returned. Otherwise the return value is
** whatever sqlite3_finalize() reports after the statement has been stepped.
*/
int registerTokenizer(sqlite3 *db, char *zName, sqlite3_tokenizer_module *p){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts2_tokenizer(?, ?)";

  /* nByte==-1: zSql is nul-terminated. pzTail==0: the tail is not needed. */
  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* SQLITE_STATIC is sufficient here: zName and p both outlive pStmt,
  ** which is finalized before this function returns. */
  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}
|
||||
|
||||
/*
** Look up the tokenizer implementation registered with database handle db
** under the name zName by evaluating "SELECT fts2_tokenizer(?)".
**
** *pp is set to 0 before the query is run. If the query returns a row whose
** first column is a blob, the first sizeof(*pp) bytes of that blob are
** copied into *pp (the blob is the encoded module pointer). In every other
** case *pp is left as 0.
**
** If the statement cannot be prepared, the error code from
** sqlite3_prepare_v2() is returned. Otherwise the return value is
** whatever sqlite3_finalize() reports after the statement has been stepped.
*/
int queryTokenizer(sqlite3 *db, char *zName, sqlite3_tokenizer_module **pp){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts2_tokenizer(?)";

  *pp = 0;

  /* nByte==-1: zSql is nul-terminated. pzTail==0: the tail is not needed. */
  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( SQLITE_ROW==sqlite3_step(pStmt) ){
    /* Any non-blob result (for example SQL NULL) leaves *pp as 0. */
    if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
      memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
    }
  }

  return sqlite3_finalize(pStmt);
}
|
||||
|
11
manifest
11
manifest
@ -1,5 +1,5 @@
|
||||
C Make\sthe\sauto_vacuum\smode\speristent\sin\sall\scases.\s(CVS\s4115)
|
||||
D 2007-06-25T08:16:58
|
||||
C Add\ssome\sdocumentation\sfor\suser-defined\sfts2\stokenizers.\s(CVS\s4116)
|
||||
D 2007-06-25T09:52:31
|
||||
F Makefile.in 7f7485a4cc039476a42e534b3f26ec90e2f9753e
|
||||
F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
|
||||
F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
|
||||
@ -35,6 +35,7 @@ F ext/fts1/fulltext.c d935e600d87bc86b7d64f55c7520ea41d6034c5c
|
||||
F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
|
||||
F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
|
||||
F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
|
||||
F ext/fts2/README.tokenizers f358364121285c402d7b38fd44ba87b40903859b
|
||||
F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
|
||||
F ext/fts2/fts2.c 841766f2f14d68e623404f9531d98afa0f7cbf05
|
||||
F ext/fts2/fts2.h 591916a822cfb6426518fdbf6069359119bc46eb
|
||||
@ -514,7 +515,7 @@ F www/tclsqlite.tcl 8be95ee6dba05eabcd27a9d91331c803f2ce2130
|
||||
F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
|
||||
F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
|
||||
F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
|
||||
P bc61dcbf64af56d4a1394c8ff46e91245dc16d15
|
||||
R ad646b50fb78ca315c96e2f2605f5241
|
||||
P 5b0408ddd0f1c825f402d0f5a3088a61b5ecd2c3
|
||||
R ba9f214b2c9893e0ab7d0a1d3ada75bc
|
||||
U danielk1977
|
||||
Z 7178761d0ea9f7ce5c978317ce1d9a00
|
||||
Z 128ebbe7f73a75dd3845535d3b5f71db
|
||||
|
@ -1 +1 @@
|
||||
5b0408ddd0f1c825f402d0f5a3088a61b5ecd2c3
|
||||
5a9eee86587219a68655d548864d129edec969ae
|
Loading…
x
Reference in New Issue
Block a user