sqlite/ext/fts5/test/fts5tok1.test
dan d1d43efa4f Prevent fts5 tokenizer unicode61 from considering '\0' to be a token character, even if other characters of class "Cc" are.
FossilOrigin-Name: b7b7bde9b7a03665e3691c6d51118965f216d2dfb1617f138b9f9e60e418ed2f
2020-10-26 13:24:36 +00:00

151 lines
3.5 KiB
Plaintext

# 2016 Jan 15
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
#
# Load fts5_common.tcl from the directory that contains this script.
source [file join [file dirname [info script]] fts5_common.tcl]
# Everything below needs fts5; when that capability is absent, finish the
# test run and stop evaluating this file.
ifcapable !fts5 { finish_test ; return }
# Presumably used by the harness as the prefix for the test names below
# (1.0, 1.1.1, ...) - confirm against the tester script.
set ::testprefix fts5tok1
# Make the fts5tokenize module available on the "db" handle. The
# CREATE VIRTUAL TABLE ... USING fts5tokenize statements below depend on it.
sqlite3_fts5_register_fts5tokenize db
#-------------------------------------------------------------------------
# Basic tokenization checks. Three fts5tokenize tables are created:
#
#   t1 - names the ascii tokenizer explicitly
#   t2 - passes no arguments at all
#   t3 - ascii with 'xyz' as extra separators and "'" as an extra token char
#
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5tokenize(ascii);
  CREATE VIRTUAL TABLE t2 USING fts5tokenize();
  CREATE VIRTUAL TABLE t3 USING fts5tokenize(
      ascii, 'separators', 'xyz', tokenchars, ''''
  );
}

# Run the same pair of queries against each of the three tables. All of
# them are expected to produce identical output for this simple input.
foreach {n vtab} {
  1 t1
  2 t2
  3 t3
} {
  # Input supplied with table-valued-function syntax. One row comes back
  # per token, together with the numeric columns of the table.
  do_execsql_test 1.$n.1 "SELECT input, * FROM $vtab ('one two three')" {
    {one two three} one   0  3 0
    {one two three} two   4  7 1
    {one two three} three 8 13 2
  }

  # Input supplied through a WHERE constraint. Mixed-case text comes back
  # as lower-case tokens.
  do_execsql_test 1.$n.2 "
    SELECT token FROM $vtab WHERE input = 'OnE tWo tHrEe'
  " {
    one two three
  }
}
# With 'xyz' configured as separators (t3), each "x" splits the input.
do_execsql_test 1.4 {
  SELECT token FROM t3 WHERE input = '1x2x3x'
} {1 2 3}

# Plain ascii (t1) treats "x" as an ordinary token character.
do_execsql_test 1.5 {
  SELECT token FROM t1 WHERE input = '1x2x3x'
} {1x2x3x}

# The apostrophe is a token character for t3, so 1'2 stays in one piece.
do_execsql_test 1.6 {
  SELECT token FROM t3 WHERE input = '1''2x3x'
} {1'2 3}

# An empty string yields no tokens.
do_execsql_test 1.7 {
  SELECT token FROM t3 WHERE input = ''
} {}

# Neither does an "input = NULL" constraint.
do_execsql_test 1.8 {
  SELECT token FROM t3 WHERE input = NULL
} {}

# An integer input is tokenized as its text form.
do_execsql_test 1.9 {
  SELECT input, * FROM t3 WHERE input = 123
} {123 123 0 3 0}

# An additional constraint on the token column filters the output rows,
# whichever side of the AND it is written on (1.10 and 1.11).
do_execsql_test 1.10 {
  SELECT input, * FROM t1 WHERE input = 'a b c' AND token = 'b';
} {
  {a b c} b 2 3 1
}

do_execsql_test 1.11 {
  SELECT input, * FROM t1 WHERE token = 'b' AND input = 'a b c';
} {
  {a b c} b 2 3 1
}

# A range constraint on the input column next to the equality constraint.
# Every token of 'a b c' is still returned.
do_execsql_test 1.12 {
  SELECT input, * FROM t1 WHERE input < 'b' AND input = 'a b c';
} {
  {a b c} a 0 1 0
  {a b c} b 2 3 1
  {a b c} c 4 5 2
}
# An ordinary table that supplies the tokenizer input for the join below.
do_execsql_test 1.13.1 {
  CREATE TABLE c1(x);
  INSERT INTO c1(x) VALUES('a b c');
  INSERT INTO c1(x) VALUES('d e f');
}

# Join c1 to the tokenizer table, feeding c1.x in as the input. The extra
# rowid equality keeps a single token per c1 row: the first token for the
# first row and the second token for the second row.
do_execsql_test 1.13.2 {
  SELECT c1.*, input, t1.*
    FROM c1, t1
   WHERE input = x AND c1.rowid=t1.rowid;
} {
  {a b c} {a b c} a 0 1 0
  {d e f} {d e f} e 2 3 1
}
#-------------------------------------------------------------------------
# Failure paths of the fts5tokenize module.
#

# Naming a tokenizer that does not exist makes the constructor fail.
do_catchsql_test 2.0 {
  CREATE VIRTUAL TABLE tX USING fts5tokenize(nosuchtokenizer);
} {1 {vtable constructor failed: tX}}

# Table t4 is declared with no argument list at all and then scanned with
# no input constraint. The statement pair is expected to end in an error.
do_catchsql_test 2.1 {
  CREATE VIRTUAL TABLE t4 USING fts5tokenize;
  SELECT * FROM t4;
} {1 {SQL logic error}}
#-------------------------------------------------------------------------
# Text containing an embedded 0x00 byte, using an fts5 table with its
# tokenizer left unspecified.
#
reset_db

# The 0x00 between "abc" and "def" separates two tokens, as reported by the
# fts5vocab 'instance' table.
do_execsql_test 3.1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(z);
  CREATE VIRTUAL TABLE tt USING fts5vocab(t1, 'instance');

  INSERT INTO t1 VALUES('abc' || char(0) || 'def');
  SELECT * FROM tt;
} {
  abc 1 z 0
  def 1 z 1
}

# The stored value keeps all seven bytes, including the 0x00.
do_execsql_test 3.1.1 {
  SELECT hex(z) FROM t1;
} {61626300646566}

# The index agrees with the stored content.
do_execsql_test 3.1.2 {
  INSERT INTO t1(t1) VALUES('integrity-check');
} {}
# Same scenario as 3.1.*, but on table t2 whose unicode61 tokenizer is
# configured with the "Cc" category among its token characters. Even so,
# the 0x00 byte must still split "abc" and "def" into two tokens.
do_execsql_test 3.2.0 {
  CREATE VIRTUAL TABLE t2 USING fts5(z,
      tokenize="unicode61 categories 'L* N* Co Cc'"
  );
  CREATE VIRTUAL TABLE tu USING fts5vocab(t2, 'instance');

  INSERT INTO t2 VALUES('abc' || char(0) || 'def');
  SELECT * FROM tu;
} { abc 1 z 0 def 1 z 1 }

# Check the content stored in t2. This previously queried t1, which only
# repeated test 3.1.1 and left the table created in 3.2.0 unverified.
do_execsql_test 3.2.1 {
  SELECT hex(z) FROM t2;
} {61626300646566}

# Run the integrity check on t2 as well (this also targeted t1 before), so
# that the index built with the custom tokenizer is compared against the
# stored content.
do_execsql_test 3.2.2 {
  INSERT INTO t2(t2) VALUES('integrity-check');
} {}

finish_test