/* ** 2014 May 31 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** ** ** Interfaces to extend FTS5. Using the interfaces defined in this file, ** FTS5 may be extended with: ** ** * custom tokenizers, and ** * custom auxiliary functions. */ #ifndef _FTS5_H #define _FTS5_H #include "sqlite3.h" #ifdef __cplusplus extern "C" { #endif /************************************************************************* ** CUSTOM AUXILIARY FUNCTIONS ** ** Virtual table implementations may overload SQL functions by implementing ** the sqlite3_module.xFindFunction() method. */ typedef struct Fts5ExtensionApi Fts5ExtensionApi; typedef struct Fts5Context Fts5Context; typedef struct Fts5PhraseIter Fts5PhraseIter; typedef void (*fts5_extension_function)( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ sqlite3_context *pCtx, /* Context for returning result/error */ int nVal, /* Number of values in apVal[] array */ sqlite3_value **apVal /* Array of trailing arguments */ ); struct Fts5PhraseIter { const unsigned char *a; const unsigned char *b; }; /* ** EXTENSION API FUNCTIONS ** ** xUserData(pFts): ** Return a copy of the context pointer the extension function was ** registered with. ** ** xColumnTotalSize(pFts, iCol, pnToken): ** If parameter iCol is less than zero, set output variable *pnToken ** to the total number of tokens in the FTS5 table. Or, if iCol is ** non-negative but less than the number of columns in the table, return ** the total number of tokens in column iCol, considering all rows in ** the FTS5 table. ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. ** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** xColumnCount(pFts): ** Return the number of columns in the table. ** ** xColumnSize(pFts, iCol, pnToken): ** If parameter iCol is less than zero, set output variable *pnToken ** to the total number of tokens in the current row. Or, if iCol is ** non-negative but less than the number of columns in the table, set ** *pnToken to the number of tokens in column iCol of the current row. ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. ** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** This function may be quite inefficient if used with an FTS5 table ** created with the "columnsize=0" option. ** ** xColumnText: ** This function attempts to retrieve the text of column iCol of the ** current document. If successful, (*pz) is set to point to a buffer ** containing the text in utf-8 encoding, (*pn) is set to the size in bytes ** (not characters) of the buffer and SQLITE_OK is returned. Otherwise, ** if an error occurs, an SQLite error code is returned and the final values ** of (*pz) and (*pn) are undefined. ** ** xPhraseCount: ** Returns the number of phrases in the current query expression. ** ** xPhraseSize: ** Returns the number of tokens in phrase iPhrase of the query. Phrases ** are numbered starting from zero. ** ** xInstCount: ** Set *pnInst to the total number of occurrences of all phrases within ** the query within the current row. Return SQLITE_OK if successful, or ** an error code (i.e. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. If the FTS5 table is created ** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always returns 0. ** ** xInst: ** Query for the details of phrase match iIdx within the current row. ** Phrase matches are numbered starting from zero, so the iIdx argument ** should be greater than or equal to zero and smaller than the value ** output by xInstCount(). ** ** Usually, output parameter *piPhrase is set to the phrase number, *piCol ** to the column in which it occurs and *piOff the token offset of the ** first token of the phrase. Returns SQLITE_OK if successful, or an error ** code (i.e. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. ** ** xRowid: ** Returns the rowid of the current row. ** ** xTokenize: ** Tokenize text using the tokenizer belonging to the FTS5 table. ** ** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback): ** This API function is used to query the FTS table for phrase iPhrase ** of the current query. Specifically, a query equivalent to: ** ** ... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid ** ** with $p set to a phrase equivalent to the phrase iPhrase of the ** current query is executed. Any column filter that applies to ** phrase iPhrase of the current query is included in $p. For each ** row visited, the callback function passed as the fourth argument ** is invoked. The context and API objects passed to the callback ** function may be used to access the properties of each matched row. ** Invoking Api.xUserData() returns a copy of the pointer passed as ** the third argument to pUserData. ** ** If the callback function returns any value other than SQLITE_OK, the ** query is abandoned and the xQueryPhrase function returns immediately. ** If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK. ** Otherwise, the error code is propagated upwards. ** ** If the query runs to completion without incident, SQLITE_OK is returned. ** Or, if some error occurs before the query completes or is aborted by ** the callback, an SQLite error code is returned. ** ** ** xSetAuxdata(pFts5, pAux, xDelete) ** ** Save the pointer passed as the second argument as the extension function's ** "auxiliary data". The pointer may then be retrieved by the current or any ** future invocation of the same fts5 extension function made as part of ** the same MATCH query using the xGetAuxdata() API. ** ** Each extension function is allocated a single auxiliary data slot for ** each FTS query (MATCH expression). If the extension function is invoked ** more than once for a single FTS query, then all invocations share a ** single auxiliary data context. ** ** If there is already an auxiliary data pointer when this function is ** invoked, then it is replaced by the new pointer. If an xDelete callback ** was specified along with the original pointer, it is invoked at this ** point. ** ** The xDelete callback, if one is specified, is also invoked on the ** auxiliary data pointer after the FTS5 query has finished. ** ** If an error (e.g. an OOM condition) occurs within this function, ** the auxiliary data is set to NULL and an error code returned. If the ** xDelete parameter was not NULL, it is invoked on the auxiliary data ** pointer before returning. ** ** ** xGetAuxdata(pFts5, bClear) ** ** Returns the current auxiliary data pointer for the fts5 extension ** function. See the xSetAuxdata() method for details. ** ** If the bClear argument is non-zero, then the auxiliary data is cleared ** (set to NULL) before this function returns. In this case the xDelete, ** if any, is not invoked. ** ** ** xRowCount(pFts5, pnRow) ** ** This function is used to retrieve the total number of rows in the table. ** In other words, the same value that would be returned by: ** ** SELECT count(*) FROM ftstable; ** ** xPhraseFirst() ** This function is used, along with type Fts5PhraseIter and the xPhraseNext ** method, to iterate through all instances of a single query phrase within ** the current row. This is the same information as is accessible via the ** xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient ** to use, this API may be faster under some circumstances. To iterate ** through instances of phrase iPhrase, use the following code: ** ** Fts5PhraseIter iter; ** int iCol, iOff; ** for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff); ** iCol>=0; ** pApi->xPhraseNext(pFts, &iter, &iCol, &iOff) ** ){ ** // An instance of phrase iPhrase at offset iOff of column iCol ** } ** ** The Fts5PhraseIter structure is defined above. Applications should not ** modify this structure directly - it should only be used as shown above ** with the xPhraseFirst() and xPhraseNext() API methods (and by ** xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below). ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. If the FTS5 table is created ** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always iterates ** through an empty set (all calls to xPhraseFirst() set iCol to -1). ** ** xPhraseNext() ** See xPhraseFirst above. ** ** xPhraseFirstColumn() ** This function and xPhraseNextColumn() are similar to the xPhraseFirst() ** and xPhraseNext() APIs described above. The difference is that instead ** of iterating through all instances of a phrase in the current row, these ** APIs are used to iterate through the set of columns in the current row ** that contain one or more instances of a specified phrase. For example: ** ** Fts5PhraseIter iter; ** int iCol; ** for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol); ** iCol>=0; ** pApi->xPhraseNextColumn(pFts, &iter, &iCol) ** ){ ** // Column iCol contains at least one instance of phrase iPhrase ** } ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" option. If the FTS5 table is created with either ** "detail=none" "content=" option (i.e. if it is a contentless table), ** then this API always iterates through an empty set (all calls to ** xPhraseFirstColumn() set iCol to -1). ** ** The information accessed using this API and its companion ** xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext ** (or xInst/xInstCount). The chief advantage of this API is that it is ** significantly more efficient than those alternatives when used with ** "detail=column" tables. ** ** xPhraseNextColumn() ** See xPhraseFirstColumn above. ** ** xQueryToken(pFts5, iPhrase, iToken, ppToken, pnToken) ** This is used to access token iToken of phrase iPhrase of the current ** query. Before returning, output parameter *ppToken is set to point ** to a buffer containing the requested token, and *pnToken to the ** size of this buffer in bytes. ** ** The output text is not a copy of the query text that specified the ** token. It is the output of the tokenizer module. For tokendata=1 ** tables, this includes any embedded 0x00 and trailing data. ** ** xInstToken(pFts5, iIdx, iToken, ppToken, pnToken) ** This is used to access token iToken of phrase hit iIdx within the ** current row. ** ** The output text is not a copy of the document text that was tokenized. ** It is the output of the tokenizer module. For tokendata=1 tables, this ** includes any embedded 0x00 and trailing data. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. */ struct Fts5ExtensionApi { int iVersion; /* Currently always set to 3 */ void *(*xUserData)(Fts5Context*); int (*xColumnCount)(Fts5Context*); int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow); int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken); int (*xTokenize)(Fts5Context*, const char *pText, int nText, /* Text to tokenize */ void *pCtx, /* Context passed to xToken() */ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ ); int (*xPhraseCount)(Fts5Context*); int (*xPhraseSize)(Fts5Context*, int iPhrase); int (*xInstCount)(Fts5Context*, int *pnInst); int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff); sqlite3_int64 (*xRowid)(Fts5Context*); int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn); int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken); int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData, int(*)(const Fts5ExtensionApi*,Fts5Context*,void*) ); int (*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*)); void *(*xGetAuxdata)(Fts5Context*, int bClear); int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*); void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff); int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*); void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol); /* Below this point are iVersion>=3 only */ int (*xQueryToken)(Fts5Context*, int iPhrase, int iToken, const char **ppToken, int *pnToken ); int (*xInstToken)(Fts5Context*, int iIdx, int iToken, const char**, int*); }; /* ** CUSTOM AUXILIARY FUNCTIONS *************************************************************************/ /************************************************************************* ** CUSTOM TOKENIZERS ** ** Applications may also register custom tokenizer types. A tokenizer ** is registered by providing fts5 with a populated instance of the ** following structure. All structure methods must be defined, setting ** any member of the fts5_tokenizer struct to NULL leads to undefined ** behaviour. The structure methods are expected to function as follows: ** ** xCreate: ** This function is used to allocate and initialize a tokenizer instance. ** A tokenizer instance is required to actually tokenize text. ** ** The first argument passed to this function is a copy of the (void*) ** pointer provided by the application when the fts5_tokenizer object ** was registered with FTS5 (the third argument to xCreateTokenizer()). ** The second and third arguments are an array of nul-terminated strings ** containing the tokenizer arguments, if any, specified following the ** tokenizer name as part of the CREATE VIRTUAL TABLE statement used ** to create the FTS5 table. ** ** The final argument is an output variable. If successful, (*ppOut) ** should be set to point to the new tokenizer handle and SQLITE_OK ** returned. If an error occurs, some value other than SQLITE_OK should ** be returned. In this case, fts5 assumes that the final value of *ppOut ** is undefined. ** ** xDelete: ** This function is invoked to delete a tokenizer handle previously ** allocated using xCreate(). Fts5 guarantees that this function will ** be invoked exactly once for each successful call to xCreate(). ** ** xTokenize: ** This function is expected to tokenize the nText byte string indicated ** by argument pText. pText may or may not be nul-terminated. The first ** argument passed to this function is a pointer to an Fts5Tokenizer object ** returned by an earlier call to xCreate(). ** ** The second argument indicates the reason that FTS5 is requesting ** tokenization of the supplied text. This is always one of the following ** four values: ** ** ** ** For each token in the input string, the supplied callback xToken() must ** be invoked. The first argument to it should be a copy of the pointer ** passed as the second argument to xTokenize(). The third and fourth ** arguments are a pointer to a buffer containing the token text, and the ** size of the token in bytes. The 4th and 5th arguments are the byte offsets ** of the first byte of and first byte immediately following the text from ** which the token is derived within the input. ** ** The second argument passed to the xToken() callback ("tflags") should ** normally be set to 0. The exception is if the tokenizer supports ** synonyms. In this case see the discussion below for details. ** ** FTS5 assumes the xToken() callback is invoked for each token in the ** order that they occur within the input text. ** ** If an xToken() callback returns any value other than SQLITE_OK, then ** the tokenization should be abandoned and the xTokenize() method should ** immediately return a copy of the xToken() return value. Or, if the ** input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally, ** if an error occurs with the xTokenize() implementation itself, it ** may abandon the tokenization and return any error code other than ** SQLITE_OK or SQLITE_DONE. ** ** SYNONYM SUPPORT ** ** Custom tokenizers may also support synonyms. Consider a case in which a ** user wishes to query for a phrase such as "first place". Using the ** built-in tokenizers, the FTS5 query 'first + place' will match instances ** of "first place" within the document set, but not alternative forms ** such as "1st place". In some applications, it would be better to match ** all instances of "first place" or "1st place" regardless of which form ** the user specified in the MATCH query text. ** ** There are several ways to approach this in FTS5: ** **
  1. By mapping all synonyms to a single token. In this case, using ** the above example, this means that the tokenizer returns the ** same token for inputs "first" and "1st". Say that token is in ** fact "first", so that when the user inserts the document "I won ** 1st place" entries are added to the index for tokens "i", "won", ** "first" and "place". If the user then queries for '1st + place', ** the tokenizer substitutes "first" for "1st" and the query works ** as expected. ** **
  2. By querying the index for all synonyms of each query term ** separately. In this case, when tokenizing query text, the ** tokenizer may provide multiple synonyms for a single term ** within the document. FTS5 then queries the index for each ** synonym individually. For example, faced with the query: ** ** ** ... MATCH 'first place' ** ** the tokenizer offers both "1st" and "first" as synonyms for the ** first token in the MATCH query and FTS5 effectively runs a query ** similar to: ** ** ** ... MATCH '(first OR 1st) place' ** ** except that, for the purposes of auxiliary functions, the query ** still appears to contain just two phrases - "(first OR 1st)" ** being treated as a single phrase. ** **
  3. By adding multiple synonyms for a single term to the FTS index. ** Using this method, when tokenizing document text, the tokenizer ** provides multiple synonyms for each token. So that when a ** document such as "I won first place" is tokenized, entries are ** added to the FTS index for "i", "won", "first", "1st" and ** "place". ** ** This way, even if the tokenizer does not provide synonyms ** when tokenizing query text (it should not - to do so would be ** inefficient), it doesn't matter if the user queries for ** 'first + place' or '1st + place', as there are entries in the ** FTS index corresponding to both forms of the first token. **
** ** Whether it is parsing document or query text, any call to xToken that ** specifies a tflags argument with the FTS5_TOKEN_COLOCATED bit ** is considered to supply a synonym for the previous token. For example, ** when parsing the document "I won first place", a tokenizer that supports ** synonyms would call xToken() 5 times, as follows: ** ** ** xToken(pCtx, 0, "i", 1, 0, 1); ** xToken(pCtx, 0, "won", 3, 2, 5); ** xToken(pCtx, 0, "first", 5, 6, 11); ** xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11); ** xToken(pCtx, 0, "place", 5, 12, 17); ** ** ** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time ** xToken() is called. Multiple synonyms may be specified for a single token ** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. ** There is no limit to the number of synonyms that may be provided for a ** single token. ** ** In many cases, method (1) above is the best approach. It does not add ** extra data to the FTS index or require FTS5 to query for multiple terms, ** so it is efficient in terms of disk space and query speed. However, it ** does not support prefix queries very well. If, as suggested above, the ** token "first" is substituted for "1st" by the tokenizer, then the query: ** ** ** ... MATCH '1s*' ** ** will not match documents that contain the token "1st" (as the tokenizer ** will probably not map "1s" to any prefix of "first"). ** ** For full prefix support, method (3) may be preferred. In this case, ** because the index contains entries for both "first" and "1st", prefix ** queries such as 'fi*' or '1s*' will match correctly. However, because ** extra entries are added to the FTS index, this method uses more space ** within the database. ** ** Method (2) offers a midpoint between (1) and (3). Using this method, ** a query such as '1s*' will match documents that contain the literal ** token "1st", but not "first" (assuming the tokenizer is not able to ** provide synonyms for prefixes). However, a non-prefix query like '1st' ** will match against "1st" and "first". This method does not require ** extra disk space, as no extra entries are added to the FTS index. ** On the other hand, it may require more CPU cycles to run MATCH queries, ** as separate queries of the FTS index are required for each synonym. ** ** When using methods (2) or (3), it is important that the tokenizer only ** provide synonyms when tokenizing document text (method (3)) or query ** text (method (2)), not both. Doing so will not cause any errors, but is ** inefficient. */ typedef struct Fts5Tokenizer Fts5Tokenizer; typedef struct fts5_tokenizer fts5_tokenizer; struct fts5_tokenizer { int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut); void (*xDelete)(Fts5Tokenizer*); int (*xTokenize)(Fts5Tokenizer*, void *pCtx, int flags, /* Mask of FTS5_TOKENIZE_* flags */ const char *pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Pointer to buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Byte offset of token within input text */ int iEnd /* Byte offset of end of token within input text */ ) ); }; /* Flags that may be passed as the third argument to xTokenize() */ #define FTS5_TOKENIZE_QUERY 0x0001 #define FTS5_TOKENIZE_PREFIX 0x0002 #define FTS5_TOKENIZE_DOCUMENT 0x0004 #define FTS5_TOKENIZE_AUX 0x0008 /* Flags that may be passed by the tokenizer implementation back to FTS5 ** as the third argument to the supplied xToken callback. */ #define FTS5_TOKEN_COLOCATED 0x0001 /* Same position as prev. token */ /* ** END OF CUSTOM TOKENIZERS *************************************************************************/ /************************************************************************* ** FTS5 EXTENSION REGISTRATION API */ typedef struct fts5_api fts5_api; struct fts5_api { int iVersion; /* Currently always set to 2 */ /* Create a new tokenizer */ int (*xCreateTokenizer)( fts5_api *pApi, const char *zName, void *pUserData, fts5_tokenizer *pTokenizer, void (*xDestroy)(void*) ); /* Find an existing tokenizer */ int (*xFindTokenizer)( fts5_api *pApi, const char *zName, void **ppUserData, fts5_tokenizer *pTokenizer ); /* Create a new auxiliary function */ int (*xCreateFunction)( fts5_api *pApi, const char *zName, void *pUserData, fts5_extension_function xFunction, void (*xDestroy)(void*) ); }; /* ** END OF REGISTRATION API *************************************************************************/ #ifdef __cplusplus } /* end of the 'extern "C"' block */ #endif #endif /* _FTS5_H */