Add the "matchlen" column to the spellfix1 virtual table.

FossilOrigin-Name: f24b9d87f6b0e8b4d26669d5c1191f9280ba14a3
This commit is contained in:
dan 2012-07-13 19:26:34 +00:00
parent 2deb165f4b
commit 8512752407
4 changed files with 296 additions and 28 deletions

View File

@ -1,5 +1,5 @@
C Update\stest_spellfix.c\swith\slatest\schanges.
D 2012-07-13T16:15:20.128
C Add\sthe\s"matchlen"\scolumn\sto\sthe\sspellfix1\svirtual\stable.
D 2012-07-13T19:26:34.617
F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
F Makefile.in 8f6d858bf3df9978ba43df19985146a1173025e4
F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
@ -221,7 +221,7 @@ F src/test_quota.h 8761e463b25e75ebc078bd67d70e39b9c817a0cb
F src/test_rtree.c aba603c949766c4193f1068b91c787f57274e0d9
F src/test_schema.c 8c06ef9ddb240c7a0fcd31bc221a6a2aade58bf0
F src/test_server.c 2f99eb2837dfa06a4aacf24af24c6affdf66a84f
F src/test_spellfix.c 1de8d8c086efa50bb6660ea5988e8630ef9144aa
F src/test_spellfix.c 1c900928dad9b71c0fdcbdda9e2f52234f283660
F src/test_stat.c d1569c7a4839f13e80187e2c26b2ab4da2d03935
F src/test_superlock.c 2b97936ca127d13962c3605dbc9a4ef269c424cd
F src/test_syscall.c a992d8c80ea91fbf21fb2dd570db40e77dd7e6ae
@ -716,6 +716,7 @@ F test/speed3.test d32043614c08c53eafdc80f33191d5bd9b920523
F test/speed4.test abc0ad3399dcf9703abed2fff8705e4f8e416715
F test/speed4p.explain 6b5f104ebeb34a038b2f714150f51d01143e59aa
F test/speed4p.test 0e51908951677de5a969b723e03a27a1c45db38b
F test/spellfix.test 936be6f7ba1c4d096adb280c68b32f4848af8d2e
F test/sqllimits1.test b1aae27cc98eceb845e7f7adf918561256e31298
F test/stat.test 08e8185b3fd5b010c90d7ad82b9dd4ea1cbf14b0
F test/stmt.test 25d64e3dbf9a3ce89558667d7f39d966fe2a71b9
@ -1004,7 +1005,10 @@ F tool/tostr.awk e75472c2f98dd76e06b8c9c1367f4ab07e122d06
F tool/vdbe-compress.tcl d70ea6d8a19e3571d7ab8c9b75cba86d1173ff0f
F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
F tool/warnings.sh fbc018d67fd7395f440c28f33ef0f94420226381
P 7fac56ed9feda819e66070bd5e06db8cad77e8bd
R 8322d35d4441c91cd733d64b44bd2f2e
P cba2a65870481df213e006b07e74f0ca19d2d57c
R 79ffd07a9c878240f73dd72d96e3ef36
T *branch * spellfix-matchlen
T *sym-spellfix-matchlen *
T -sym-trunk *
U dan
Z 4817f8644451c5f84c464c55e7d56257
Z 642149d4a86bbb287ab54c68c79dc818

View File

@ -1 +1 @@
cba2a65870481df213e006b07e74f0ca19d2d57c
f24b9d87f6b0e8b4d26669d5c1191f9280ba14a3

View File

@ -101,6 +101,11 @@
** by default (unless overridden by ORDER BY) returns
** results in order of increasing score.
**
** matchlen For prefix queries, the number of characters in the prefix
** of the returned value (word) that matched the query term.
** For non-prefix queries, the number of characters in the
** returned value.
**
** top (HIDDEN) For any query, this value is the same on all
** rows. It is an integer which is the maximum number of
** rows that will be output. The actual number of rows
@ -605,8 +610,14 @@ static int substituteCost(char cPrev, char cFrom, char cTo){
** -1 One of the inputs is NULL
** -2 Non-ASCII characters on input
** -3 Unable to allocate memory
**
** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
** of zB that matched the pattern in zA. If zA does not end with a '*',
** then this value is always the number of bytes in zB (i.e. strlen(zB)).
** If zA does end in a '*', then it is the number of bytes in the prefix
** of zB that was deemed to match zA.
*/
static int editdist1(const char *zA, const char *zB, int iLangId){
static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){
int nA, nB; /* Number of characters in zA[] and zB[] */
int xA, xB; /* Loop counters for zA[] and zB[] */
char cA, cB; /* Current character of zA and zB */
@ -619,12 +630,14 @@ static int editdist1(const char *zA, const char *zB, int iLangId){
char *cx; /* Corresponding character values */
int *toFree = 0; /* Malloced space */
int mStack[60+15]; /* Stack space to use if not too much is needed */
int nMatch = 0;
/* Early out if either input is NULL */
if( zA==0 || zB==0 ) return -1;
/* Skip any common prefix */
while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; }
while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
if( pnMatch ) *pnMatch = nMatch;
if( zA[0]==0 && zB[0]==0 ) return 0;
#if 0
@ -737,10 +750,14 @@ static int editdist1(const char *zA, const char *zB, int iLangId){
if( cA=='*' ){
res = m[1];
for(xB=1; xB<=nB; xB++){
if( m[xB]<res ) res = m[xB];
if( m[xB]<res ){
res = m[xB];
if( pnMatch ) *pnMatch = xB+nMatch;
}
}
}else{
res = m[nB];
if( pnMatch ) *pnMatch = -1;
}
sqlite3_free(toFree);
return res;
@ -764,7 +781,7 @@ static void editdistSqlFunc(
int res = editdist1(
(const char*)sqlite3_value_text(argv[0]),
(const char*)sqlite3_value_text(argv[1]),
langid);
langid, 0);
if( res<0 ){
if( res==(-3) ){
sqlite3_result_error_nomem(context);
@ -1142,12 +1159,20 @@ static void updateCost(
/* Compute the edit distance between two strings.
**
** If an error occurs, return a negative number which is the error code.
**
** If pnMatch is not NULL, then *pnMatch is set to the number of characters
** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
** not contain the pattern for a prefix-search, then this is always the number
** of characters in z2. If pFrom does contain a prefix search pattern, then
** it is the number of characters in the prefix of z2 that was deemed to
** match pFrom.
*/
static int editDist3Core(
EditDist3FromString *pFrom, /* The FROM string */
const char *z2, /* The TO string */
int n2, /* Length of the TO string */
const EditDist3Lang *pLang /* Edit weights for a particular language ID */
const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
int *pnMatch /* OUT: Characters in matched prefix */
){
int k, n;
int i1, b1;
@ -1282,10 +1307,16 @@ static int editDist3Core(
/* Free memory allocations and return the result */
res = (int)m[szRow*(n2+1)-1];
if( f.isPrefix ){
for(i2=f.n; i2<n2; i2++){
*pnMatch = n2;
for(i2=1; i2<=n2; i2++){
int b = m[szRow*i2-1];
if( b<res ) res = b;
if( b<=res ){
res = b;
if( pnMatch ) *pnMatch = i2-1;
}
}
}else if( pnMatch ){
*pnMatch = n2;
}
editDist3Abort:
@ -1344,7 +1375,7 @@ static void editDist3SqlFunc(
sqlite3_result_error_nomem(context);
return;
}
dist = editDist3Core(pFrom, zB, nB, pLang);
dist = editDist3Core(pFrom, zB, nB, pLang, 0);
editDist3FromStringDelete(pFrom);
sqlite3_result_int(context, dist);
}
@ -1418,6 +1449,21 @@ static int utf8Read(const unsigned char *z, int n, int *pSize){
return c;
}
/*
** Count the characters held in the first nIn bytes of the UTF-8 buffer zIn.
**
** The buffer is walked one character at a time with utf8Read(), which
** reports the encoded size of the character at the current offset. The
** decoded value itself is ignored; only the number of steps needed to
** consume the buffer is returned.
*/
static int utf8Charlen(const char *zIn, int nIn){
  const unsigned char *zBuf = (const unsigned char *)zIn;
  int iOff = 0;                 /* Byte offset of the next character */
  int nFound = 0;               /* Characters counted so far */
  while( iOff<nIn ){
    int nByte;                  /* Encoded size of the current character */
    utf8Read(&zBuf[iOff], nIn-iOff, &nByte);
    iOff += nByte;
    nFound++;
  }
  return nFound;
}
/*
** Table of translations from unicode characters into ASCII.
*/
@ -1868,6 +1914,45 @@ static unsigned char *transliterate(const unsigned char *zIn, int nIn){
return zOut;
}
/*
** Convert a length measured in bytes of transliterated output back into a
** length measured in characters of the original input.
**
** zIn/nIn is a UTF-8 buffer and nTrans is a byte count within its ASCII
** transliteration. The return value is the number of leading characters
** of zIn that are consumed before the running output size reaches nTrans
** bytes. If the entire input accounts for fewer than nTrans bytes, the
** total number of characters in zIn is returned.
**
** Output size is tallied per character as follows: one byte for every
** character; one further byte when the character is found in translit[]
** with a non-zero cTo1; and two further bytes when the character is
** U+0429 or U+0449 and is found in translit[].
*/
static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
  const unsigned char *zBuf = (const unsigned char *)zIn;
  int iOff = 0;                 /* Bytes of zIn consumed so far */
  int nAscii = 0;               /* Transliterated bytes accounted for */
  int nChar = 0;                /* Characters of zIn consumed so far */

  while( iOff<nIn && nAscii<nTrans ){
    int nByte;                  /* Encoded size of the current character */
    int iLo, iHi;               /* Bounds of the binary search */
    int cp = utf8Read(&zBuf[iOff], nIn-iOff, &nByte);

    iOff += nByte;
    nChar++;
    nAscii++;                   /* Every character counts for one byte */
    if( cp<128 ) continue;      /* No table lookup for values below 128 */

    /* Binary search of translit[] for an entry whose cFrom equals cp */
    iLo = 0;
    iHi = sizeof(translit)/sizeof(translit[0]) - 1;
    while( iLo<=iHi ){
      int iMid = (iHi + iLo)/2;
      if( translit[iMid].cFrom==cp ){
        if( translit[iMid].cTo1 ) nAscii++;
        if( cp==0x0429 || cp==0x0449 ) nAscii += 2;
        break;
      }
      if( translit[iMid].cFrom>cp ){
        iHi = iMid-1;
      }else{
        iLo = iMid+1;
      }
    }
  }
  return nChar;
}
/*
** spellfix1_translit(X)
**
@ -2092,6 +2177,7 @@ struct spellfix1_vtab {
struct spellfix1_cursor {
sqlite3_vtab_cursor base; /* Base class - must be first */
spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
char *zPattern; /* rhs of MATCH clause */
int nRow; /* Number of rows of content */
int nAlloc; /* Number of allocated rows */
int iRow; /* Current row of content */
@ -2105,6 +2191,7 @@ struct spellfix1_cursor {
int iRank; /* Rank for this row */
int iDistance; /* Distance from pattern for this row */
int iScore; /* Score for sorting */
int iMatchlen; /* Value of matchlen column (or -1) */
char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
} *a;
};
@ -2200,7 +2287,7 @@ static char *spellfix1Dequote(const char *zIn){
** argv[0] -> module name ("spellfix1")
** argv[1] -> database name
** argv[2] -> table name
** argv[3].. -> optional arguments (currently ignored)
** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
*/
static int spellfix1Init(
int isCreate,
@ -2238,21 +2325,23 @@ static int spellfix1Init(
rc = SQLITE_NOMEM;
}else{
rc = sqlite3_declare_vtab(db,
"CREATE TABLE x(word,rank,distance,langid,"
"score, phonehash,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"
"soundslike HIDDEN,command HIDDEN)"
"CREATE TABLE x(word,rank,distance,langid, "
"score, matchlen, phonehash, "
"top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
"soundslike HIDDEN, command HIDDEN)"
);
#define SPELLFIX_COL_WORD 0
#define SPELLFIX_COL_RANK 1
#define SPELLFIX_COL_DISTANCE 2
#define SPELLFIX_COL_LANGID 3
#define SPELLFIX_COL_SCORE 4
#define SPELLFIX_COL_PHONEHASH 5
#define SPELLFIX_COL_TOP 6
#define SPELLFIX_COL_SCOPE 7
#define SPELLFIX_COL_SRCHCNT 8
#define SPELLFIX_COL_SOUNDSLIKE 9
#define SPELLFIX_COL_COMMAND 10
#define SPELLFIX_COL_MATCHLEN 5
#define SPELLFIX_COL_PHONEHASH 6
#define SPELLFIX_COL_TOP 7
#define SPELLFIX_COL_SCOPE 8
#define SPELLFIX_COL_SRCHCNT 9
#define SPELLFIX_COL_SOUNDSLIKE 10
#define SPELLFIX_COL_COMMAND 11
}
if( rc==SQLITE_OK && isCreate ){
sqlite3_uint64 r;
@ -2350,6 +2439,7 @@ static int spellfix1Close(sqlite3_vtab_cursor *cur){
spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
spellfix1ResetCursor(pCur);
spellfix1ResizeCursor(pCur, 0);
sqlite3_free(pCur->zPattern);
sqlite3_free(pCur);
return SQLITE_OK;
}
@ -2583,15 +2673,16 @@ static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
}
}
while( sqlite3_step(pStmt)==SQLITE_ROW ){
int iMatchlen = -1;
iRank = sqlite3_column_int(pStmt, 2);
if( p->pMatchStr3 ){
int nWord = sqlite3_column_bytes(pStmt, 1);
zWord = (const char*)sqlite3_column_text(pStmt, 1);
iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang);
iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
}else{
zK1 = (const char*)sqlite3_column_text(pStmt, 3);
if( zK1==0 ) continue;
iDist = editdist1(p->zPattern, zK1, pCur->iLang);
iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0);
}
pCur->nSearch++;
iScore = spellfix1Score(iDist,iRank);
@ -2615,6 +2706,7 @@ static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
pCur->a[idx].iRank = iRank;
pCur->a[idx].iDistance = iDist;
pCur->a[idx].iScore = iScore;
pCur->a[idx].iMatchlen = iMatchlen;
memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
if( pCur->nRow==pCur->nAlloc ){
@ -2696,6 +2788,8 @@ static int spellfix1FilterForMatch(
x.pLang = 0;
}
zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
sqlite3_free(pCur->zPattern);
pCur->zPattern = zPattern;
if( zPattern==0 ) return SQLITE_NOMEM;
nPattern = strlen(zPattern);
if( zPattern[nPattern-1]=='*' ) nPattern--;
@ -2746,7 +2840,6 @@ static int spellfix1FilterForMatch(
pCur->iScope = iScope;
}
sqlite3_finalize(pStmt);
sqlite3_free(zPattern);
editDist3FromStringDelete(pMatchStr3);
return pCur->a ? x.rc : SQLITE_NOMEM;
}
@ -2830,6 +2923,30 @@ static int spellfix1Column(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i
sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
break;
}
case SPELLFIX_COL_MATCHLEN: {
int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
if( iMatchlen<0 ){
int nPattern = strlen(pCur->zPattern);
char *zWord = pCur->a[pCur->iRow].zWord;
int nWord = strlen(zWord);
if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
char *zTranslit;
int res;
zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
if( !zTranslit ) return SQLITE_NOMEM;
res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen);
sqlite3_free(zTranslit);
if( res<0 ) return SQLITE_NOMEM;
iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
}else{
iMatchlen = utf8Charlen(zWord, nWord);
}
}
sqlite3_result_int(ctx, iMatchlen);
break;
}
case SPELLFIX_COL_PHONEHASH: {
sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
break;

147
test/spellfix.test Normal file
View File

@ -0,0 +1,147 @@
# 2012 July 12
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Load the shared test harness from the directory containing this script.
set testdir [file dirname $argv0]
source $testdir/tester.tcl
# NOTE(review): testprefix is presumably consumed by tester.tcl when naming
# test cases -- confirm against the harness.
set testprefix spellfix
# Register the spellfix module on the "db" handle. Presumably required by
# the CREATE VIRTUAL TABLE ... USING spellfix1 statements later in this file.
register_spellfix_module db
# Word list inserted into table t1 by test 1.1. The expected results of the
# queries against t1 are written against exactly this set of words.
set vocab {
rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer
racers races racetrack racial racially racing rack racked racket racketeer
racketeering racketeers rackets racking racks radar radars radial radially
radian radiance radiant radiantly radiate radiated radiates radiating radiation
radiations radiator radiators radical radically radicals radices radii radio
radioactive radioastronomy radioed radiography radioing radiology radios radish
radishes radium radius radix radon raft rafter rafters rafts rag rage raged
rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders
raiding raids rail railed railer railers railing railroad railroaded railroader
railroaders railroading railroads rails railway railways raiment rain rainbow
raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining
rains rainstorm rainy raise raised raiser raisers raises raisin raising rake
raked rakes raking rallied rallies rally rallying ram ramble rambler rambles
rambling ramblings ramification ramifications ramp rampage rampant rampart
ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid
random randomization randomize randomized randomizes randomly randomness randy
rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked
ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack
ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted
ranter ranters ranting rants rap rapacious rape raped raper rapes rapid
rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly
rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal
rascally rascals rash rasher rashly rashness rasp raspberry rasped rasping
rasps raster rat rate rated rater raters rates rather ratification ratified
ratifies ratify ratifying rating ratings ratio ration rational rationale
rationales rationalities rationality rationalization rationalizations
rationalize rationalized rationalizes rationalizing rationally rationals
rationing rations ratios rats rattle rattled rattler rattlers rattles
rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers
ravages ravaging rave raved raven ravening ravenous ravenously ravens raves
ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze
razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach
reachability reachable reachably reached reacher reaches reaching reacquired
react reacted reacting reaction reactionaries reactionary reactions reactivate
reactivated reactivates reactivating reactivation reactive reactively
reactivity reactor reactors reacts read readability readable reader readers
readied readier readies readiest readily readiness reading readings readjusted
readout readouts reads ready readying real realest realign realigned realigning
realigns realism realist realistic realistically realists realities reality
}
# Test 1.1: create spellfix1 virtual table t1 with no arguments and insert
# each word of $vocab into its "word" column, one INSERT per word. The
# expected script result is the empty string (foreach returns "").
do_test 1.1 {
execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 }
foreach word $vocab {
execsql { INSERT INTO t1(word) VALUES($word) }
}
} {}
# Tests 1.2.*: each triple below is (case number, MATCH pattern, expected
# result). The pattern is run against t1 and up to five (word, matchlen)
# pairs are compared with the expected list. Patterns ending in '*' are
# prefix queries; case 7 uses the same pattern as case 6 without the
# trailing '*'.
foreach {tn word res} {
1 raxpi* {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 4}
2 ril* {rail 4 railway 4 railing 4 rails 4 railways 4}
3 rilis* {realist 6 realistic 6 realistically 6 realists 6 realism 6}
4 reail* {realities 3 reality 3 real 3 realest 3 realist 3}
5 ras* {rasp 3 rash 3 rasped 3 rasping 3 rasps 3}
6 realistss* {realists 8 realigns 8 realistic 9 realistically 9 realest 7}
7 realistss {realists 8 realist 7 realigns 8 realistic 9 realest 7}
8 rllation* {realities 9 reality 7 rallied 7 railed 4}
9 renstom* {rainstorm 8 ransomer 6 ransom 6 ransoming 6 ransoms 6}
} {
do_execsql_test 1.2.$tn {
SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5
} $res
}
# Tests 2.*: rows in t2 are inserted with an explicit soundslike value that
# differs from the word itself. The expected matchlen values (4 for
# 'psalm', 6 for 'school') are compared against prefix queries 'sar*' and
# 'skol*'.
do_execsql_test 2.1 {
CREATE VIRTUAL TABLE t2 USING spellfix1;
INSERT INTO t2 (word, soundslike) VALUES('school', 'skuul');
INSERT INTO t2 (word, soundslike) VALUES('psalm', 'sarm');
SELECT word, matchlen FROM t2 WHERE word MATCH 'sar*' LIMIT 5;
} {psalm 4}
do_execsql_test 2.2 {
SELECT word, matchlen FROM t2 WHERE word MATCH 'skol*' LIMIT 5;
} {school 6}
# Second word list. This replaces the earlier value of $vocab and is the
# list inserted into table t3 by test 3.2.
set vocab {
kangaroo kanji kappa karate keel keeled keeling keels keen keener keenest
keenly keenness keep keeper keepers keeping keeps ken kennel kennels kept
kerchief kerchiefs kern kernel kernels kerosene ketchup kettle
kettles key keyboard keyboards keyed keyhole keying keynote keypad keypads keys
keystroke keystrokes keyword keywords kick kicked kicker kickers kicking
kickoff kicks kid kidded kiddie kidding kidnap kidnapper kidnappers kidnapping
kidnappings kidnaps kidney kidneys kids kill killed killer killers killing
killingly killings killjoy kills kilobit kilobits kiloblock kilobyte kilobytes
kilogram kilograms kilohertz kilohm kilojoule kilometer kilometers kiloton
kilovolt kilowatt kiloword kimono kin kind kinder kindergarten kindest
kindhearted kindle kindled kindles kindling kindly kindness kindred kinds
kinetic king kingdom kingdoms kingly kingpin kings kink kinky kinship kinsman
kiosk kiss kissed kisser kissers kisses kissing kit kitchen kitchenette
kitchens kite kited kites kiting kits kitten kittenish kittens kitty klaxon
kludge kludges klystron knack knapsack knapsacks knave knaves knead kneads knee
kneecap kneed kneeing kneel kneeled kneeling kneels knees knell knells knelt
knew knife knifed knifes knifing knight knighted knighthood knighting knightly
knights knit knits knives knob knobs knock knockdown knocked knocker knockers
knocking knockout knocks knoll knolls knot knots knotted knotting know knowable
knower knowhow knowing knowingly knowledge knowledgeable known knows knuckle
knuckled knuckles koala kosher kudo
}
# Test 3.1: build an edit-cost table "costs" with five rows for iLang 0,
# each giving cost 1 to one vowel substitution (a->e, e->i, i->o, o->u,
# u->a), then create spellfix1 table t3 that names it through the
# edit_cost_table parameter.
do_execsql_test 3.1 {
CREATE TABLE costs(iLang, cFrom, cTo, iCost);
INSERT INTO costs VALUES(0, 'a', 'e', 1);
INSERT INTO costs VALUES(0, 'e', 'i', 1);
INSERT INTO costs VALUES(0, 'i', 'o', 1);
INSERT INTO costs VALUES(0, 'o', 'u', 1);
INSERT INTO costs VALUES(0, 'u', 'a', 1);
CREATE VIRTUAL TABLE t3 USING spellfix1(edit_cost_table=costs);
}
# Test 3.2: insert each word of $vocab into t3, one INSERT per word. The
# expected script result is the empty string.
do_test 3.2 {
foreach w $vocab {
execsql { INSERT INTO t3(word) VALUES($w) }
}
} {}
# Tests 3.3.*: each triple below is (case number, MATCH pattern, expected
# result). The pattern is run against t3, the table created with
# edit_cost_table=costs, and up to five (word, matchlen) pairs are compared
# with the expected list. Cases 1 and 2 are prefix queries (trailing '*');
# case 3 is the non-prefix form of case 2.
#
# The cases are numbered 3.3.* rather than 1.2.* so that their names do not
# collide with the 1.2.* cases that query t1 earlier in this file. The
# leftover debugging "breakpoint" command that preceded this loop has been
# dropped.
foreach {tn word res} {
1 kos* {kosher 3 kiosk 4 kudo 2 kappa 1 keypad 1}
2 kellj* {killjoy 5 killed 4 killingly 4 kill 4 killer 4}
3 kellj {kill 4 kills 5 killjoy 7 keel 4 killed 6}
} {
do_execsql_test 3.3.$tn {
SELECT word, matchlen FROM t3 WHERE word MATCH $word LIMIT 5
} $res
}
finish_test