1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
|
# 2007 June 21
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The focus
# of this script is testing the pluggable tokeniser feature of the
# FTS2 module.
#
# $Id: fts2token.test,v 1.3 2007/06/25 12:05:40 danielk1977 Exp $
#
# Locate the directory holding this script and load the shared test
# harness from it.
set testdir [file dirname $argv0]
source $testdir/tester.tcl
# If SQLITE_ENABLE_FTS2 is not defined, omit this file: without the fts2
# capability none of the tests below can run, so finish immediately.
ifcapable !fts2 {
finish_test
return
}
# escape_string STR
#
# Return a copy of STR in which every character whose code point is above
# 127 is replaced by a backslash, the letter x and the code point written
# as at least four lower-case hexadecimal digits. Characters with code
# points 0..127 are copied through unchanged.
#
proc escape_string {str} {
  set escaped {}
  foreach ch [split $str {}] {
    # Obtain the numeric code point of this single character.
    scan $ch %c code
    if {$code > 127} {
      append escaped [format {\x%.4x} $code]
    } else {
      append escaped $ch
    }
  }
  return $escaped
}
#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
#
# 1: Verify that there is no such fts2 tokenizer as 'blah'.
#
# 2: Query for the built-in tokenizer 'simple'. Insert a copy of the
# retrieved value as tokenizer 'blah'.
#
# 3: Test that the value returned for tokenizer 'blah' is now the
# same as that retrieved for 'simple'.
#
# 4: Test that it is now possible to create an fts2 table using
# tokenizer 'blah' (it was not possible in step 1).
#
# 5: Test that the table created to use tokenizer 'blah' is usable.
#
# Step 1: before anything is registered, creating a table with tokenizer
# 'blah' must fail with an "unknown tokenizer" error.
do_test fts2token-1.1 {
catchsql {
CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
}
} {1 {unknown tokenizer: blah}}
# Step 2: register 'blah' using the value looked up for 'simple'. The
# two-argument call is expected to return a non-NULL value.
do_test fts2token-1.2 {
execsql {
SELECT fts2_tokenizer('blah', fts2_tokenizer('simple')) IS NULL;
}
} {0}
# Step 3: looking up 'blah' now yields the same value as 'simple'.
do_test fts2token-1.3 {
execsql {
SELECT fts2_tokenizer('blah') == fts2_tokenizer('simple');
}
} {1}
# Step 4: the CREATE statement that failed in test 1.1 now succeeds.
do_test fts2token-1.4 {
catchsql {
CREATE VIRTUAL TABLE t1 USING fts2(content, tokenize blah);
}
} {0 {}}
# Step 5: insert three rows and check that a MATCH query on a word that
# occurs only in the first row returns exactly that row.
do_test fts2token-1.5 {
execsql {
INSERT INTO t1(content) VALUES('There was movement at the station');
INSERT INTO t1(content) VALUES('For the word has passed around');
INSERT INTO t1(content) VALUES('That the colt from ol regret had got away');
SELECT content FROM t1 WHERE content MATCH 'movement'
}
} {{There was movement at the station}}
#--------------------------------------------------------------------------
# Test cases fts2token-2.* test error cases in the scalar function based
# API for getting and setting tokenizers.
#
# Looking up a tokenizer name that has not been registered is expected to
# raise an "unknown tokenizer" error.
do_test fts2token-2.1 {
catchsql {
SELECT fts2_tokenizer('nosuchtokenizer');
}
} {1 {unknown tokenizer: nosuchtokenizer}}
#--------------------------------------------------------------------------
# Test cases fts2token-3.* test the three built-in tokenizers with a
# simple input string via the built-in test function. This is as much
# to test the test function as the tokenizer implementations.
#
# Judging by the expected values, fts2_tokenizer_test() reports each token
# as three fields: its index, the token itself and the text as it appeared
# in the input.
#
# The 'simple' tokenizer lower-cases "I" and splits "don't" at the
# apostrophe into the two tokens "don" and "t".
do_test fts2token-3.1 {
execsql {
SELECT fts2_tokenizer_test('simple', 'I don''t see how');
}
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
# For this input the 'porter' tokenizer is expected to produce exactly the
# same tokens as 'simple'.
do_test fts2token-3.2 {
execsql {
SELECT fts2_tokenizer_test('porter', 'I don''t see how');
}
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
# Only run when the icu capability is available. Unlike the two tokenizers
# above, 'icu' is expected to keep "don't" as a single token and to leave
# the case of "I" unchanged.
ifcapable icu {
do_test fts2token-3.3 {
execsql {
SELECT fts2_tokenizer_test('icu', 'I don''t see how');
}
} {{0 i I 1 don't don't 2 see see 3 how how}}
}
#--------------------------------------------------------------------------
# Test cases fts2token-4.* test the ICU tokenizer. In practice, this
# tokenizer only has two modes - "thai" and "everybody else". Some other
# Asian languages (Lao, Khmer etc.) require the same special treatment as
# Thai, but ICU doesn't support them yet.
#
ifcapable icu {
# do_icu_test NAME LOCALE INPUT OUTPUT
#
# Tokenize INPUT with the 'icu' tokenizer for LOCALE by calling the SQL
# function fts2_tokenizer_test(), then run a do_test case called NAME that
# compares the first element of the query result against OUTPUT.
#
# The query result is stored in the global variable ::out because the
# do_test script is brace-quoted, so it cannot see this procedure's local
# variables by substitution here.
#
proc do_icu_test {name locale input output} {
  # $locale and $input are left unsubstituted in the SQL text so that
  # "db eval" receives them as variable references, not as literal text.
  set sql { SELECT fts2_tokenizer_test('icu', $locale, $input) }
  set ::out [db eval $sql]
  do_test $name { lindex $::out 0 } $output
}
# Empty input is expected to produce empty output.
do_icu_test fts2token-4.1 en_US {} {}
# Plain English text: three whitespace-separated words give three tokens,
# with "Test" lower-cased to "test".
do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \
0 test Test 1 cases cases 2 fts2 fts2
]
# The following test shows that ICU is smart enough to recognise
# Thai characters, even when the locale is set to English/United
# States.
#
# The input is ten Thai characters with no spaces between them. The
# expected output splits them into three tokens of four, two and four
# characters respectively.
set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
set output "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
append output "1 \u0e19\u0e30 \u0e19\u0e30 "
append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"
do_icu_test fts2token-4.3 th_TH $input $output
do_icu_test fts2token-4.4 en_US $input $output
# ICU handles an unknown locale by falling back to the default.
# So this is not an error.
do_icu_test fts2token-4.5 MiddleOfTheOcean $input $output
# Build an input that ends in one very long token. Going by the token's
# own wording, it is meant to be long enough to force a realloc inside
# the ICU tokenizer code - TODO confirm the buffer size against the
# tokenizer implementation.
set longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
append longtoken "AReallocInTheIcuTokenizerCode"
set input "short tokens then "
append input $longtoken
# The long token is expected back lower-cased, followed by its original
# mixed-case spelling.
set output "0 short short "
append output "1 tokens tokens "
append output "2 then then "
append output "3 [string tolower $longtoken] $longtoken"
# The same result is expected for an unknown, a Thai and an English locale.
do_icu_test fts2token-4.6 MiddleOfTheOcean $input $output
do_icu_test fts2token-4.7 th_TH $input $output
do_icu_test fts2token-4.8 en_US $input $output
}
# Call the SQL function fts2_tokenizer_internal_test() and require that it
# returns the string "ok". What it checks internally is not visible from
# this script - presumably self-tests of the tokenizer registry; verify
# against the FTS2 source.
do_test fts2token-internal {
execsql { SELECT fts2_tokenizer_internal_test() }
} {ok}
# Hand control back to the test harness; no test cases follow.
finish_test
|