# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the built-in fts5 tokenizers.
#

# Load the shared fts5 test helpers from the directory containing this script.
source [file join [file dirname [info script]] fts5_common.tcl]

# Name prefix for this test file (presumably consumed by the test harness
# when it builds full test names -- confirm against tester.tcl).
set testprefix fts5tokenizer

# 1.0 - 1.4: The tokenize= option is accepted with the tokenizer name bare or
# quoted, with or without whitespace around "=", and with a nested
# tokenizer specification ('porter ascii').
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter');
  DROP TABLE ft1;
}
do_execsql_test 1.2 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter);
  DROP TABLE ft1;
}
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  DROP TABLE ft1;
}

# 1.5: Naming a tokenizer that does not exist is reported as such.
do_catchsql_test 1.5 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch');
} {1 {no such tokenizer: nosuch}}

# 1.6: A bad argument following "porter" is reported as a constructor error.
do_catchsql_test 1.6 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch');
} {1 {error in tokenizer constructor}}

# 2.*: With the porter tokenizer, queries for 'embedding' and 'database'
# match a row containing 'embedded databases'.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 {
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding'
} 1

# tcl_create ARGS...
#
# Constructor callback for the test tokenizer registered under the name
# "tcl". It stores the arguments it was invoked with in the global variable
# ::targs so that a test can inspect them afterwards, then always raises
# the Tcl error "failed".
#
proc tcl_create {args} {
  set ::targs $args
  error "failed"
}

# Register tcl_create as the constructor of a tokenizer named "tcl".
sqlite3_fts5_create_tokenizer db tcl tcl_create

# 3.*: For each tokenize= directive, CREATE VIRTUAL TABLE must fail with a
# constructor error (tcl_create always raises one), and ::targs must hold
# the argument list that was passed to the constructor.
foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}                {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''}    {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}          {g h i}
  4 {tokenize = tcl}                      {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  do_test 3.$tn.2 { set ::targs } $expected
}

# 4.*: Malformed table-definition arguments produce parse errors.
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {unrecognized column option: y}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
# The same checks run once for the ascii tokenizer (tn=1) and once for
# unicode61 (tn=2). The characters ",.:" are configured as token characters
# and "xyz" as separators, so in the inserted text "abc", "def" and "ghi"
# are separate tokens while "jkl.mno,pqr:stu" is a single token.
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }

  # Each triple is: test number, token to search for, expected rowid list.
  foreach {tn2 token res} {
    1 abc 1     2 def 1     3 ghi 1     4 jkl {}
    5 mno {}    6 pqr {}    7 stu {}    8 jkl.mno,pqr:stu 1
    9 vw 1
  } {
    do_execsql_test 5.$tn.2.$tn2 "
      SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
    " $res
  }
}

#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
# 5.3.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#        'separators' option. But unicode61 does not.
#
# 5.4.*: An option without an argument, or an option name the tokenizer
#        does not accept, is an error.
#
# These tests are numbered 5.3.* and 5.4.* because the names 5.1.* and
# 5.2.* are already generated by the tokenchars/separators loop earlier in
# this file (5.$tn.1 and 5.$tn.2.$tn2 for tn = 1, 2).
#

do_test 5.3.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
} {}

do_test 5.3.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
} {1}

do_catchsql_test 5.4.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 5.4.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE
# correctly.
#

# test_token_cb VARNAME TOKEN ISTART IEND
#
# Token callback. Appends TOKEN to the list variable named VARNAME in the
# caller's scope. Returns "SQLITE_DONE" when that list holds exactly three
# elements after the append, otherwise "SQLITE_OK". ISTART and IEND are
# accepted but not used.
#
proc test_token_cb {varname token iStart iEnd} {
  upvar $varname var
  lappend var $token
  if {[llength $var]==3} { return "SQLITE_DONE" }
  return "SQLITE_OK"
}

# tokenize CMD
#
# Body of the fts5 auxiliary function "tokenize". CMD is the command
# through which the function reaches the fts5 extension API. The text of
# column 0 is fetched with xColumnText and passed to xTokenize, with
# test_token_cb collecting the tokens into the local list "res". Returns
# that list.
#
proc tokenize {cmd} {
  set res [list]
  $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res]
  set res
}

# Expose the Tcl proc "tokenize" as an fts5 auxiliary function of the same name.
sqlite3_fts5_create_function db tokenize tokenize

# 6.0 / 6.1: For both ascii and unicode61, tokenization stops after the
# third token, when the callback returns SQLITE_DONE, so only the first
# three tokens of each matching row are reported.
do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii);
  INSERT INTO x1 VALUES('q w e r t y');
  INSERT INTO x1 VALUES('y t r e w q');
  SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r';
} {
  {q w e} {y t r}
}

do_execsql_test 6.1 {
  CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61);
  INSERT INTO x2 VALUES('q w e r t y');
  INSERT INTO x2 VALUES('y t r e w q');
  SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r';
} {
  {q w e} {y t r}
}

#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
# Each statement passes unicode61 an argument list that its constructor is
# expected to reject. Numbering starts at 6.2 because the name 6.1 is
# already used by the SQLITE_DONE test for unicode61 earlier in this file.
#
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 2'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.5 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}

#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
# Three single-character-repeat tokens of 100, 500 and 1000 characters are
# inserted pairwise into three rows; each token must match exactly the two
# rows that contain it.
#
set a [string repeat a 100]
set b [string repeat b 500]
set c [string repeat c 1000]
do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 }
do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 }
do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 }

finish_test