"""Tests for the lunr query lexer (``lunr.query_lexer.QueryLexer``)."""
from lunr.query_lexer import QueryLexer
def _lex(string):
    """Build a ``QueryLexer`` for *string*, run it, and return it.

    Convenience wrapper so each test can inspect ``lexer.lexemes``
    without repeating the construct/run boilerplate.
    """
    query_lexer = QueryLexer(string)
    query_lexer.run()
    return query_lexer
class TestQueryLexer:
    """Unit tests for ``QueryLexer``.

    Each test lexes a query string and checks the resulting lexemes'
    ``type``, ``string``, ``start`` and ``end`` fields.  ``start``/``end``
    are offsets into the original query string (end is exclusive).
    """

    def test_single_term_produces_one_lexeme(self):
        """A bare word lexes to a single TERM lexeme spanning the word."""
        lexer = _lex("foo")
        assert len(lexer.lexemes) == 1
        lexeme = lexer.lexemes[0]
        assert lexeme["type"] == QueryLexer.TERM
        assert lexeme["string"] == "foo"
        assert lexeme["start"] == 0
        assert lexeme["end"] == 3

    def test_term_escape_character(self):
        """A backslash-escaped colon is kept in the term, not a field split."""
        lexer = _lex("foo\\:bar")
        assert len(lexer.lexemes) == 1
        lexeme = lexer.lexemes[0]
        assert lexeme["type"] == QueryLexer.TERM
        assert lexeme["string"] == "foo:bar"
        assert lexeme["start"] == 0
        # end covers the raw input span, including the escape character
        assert lexeme["end"] == 8

    def test_multiple_terms(self):
        """Space-separated words produce one TERM lexeme each."""
        lexer = _lex("foo bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes
        assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert bar_lexeme["string"] == "bar"
        assert foo_lexeme["start"] == 0
        assert bar_lexeme["start"] == 4
        assert foo_lexeme["end"] == 3
        assert bar_lexeme["end"] == 7

    def test_separator_length_greater_than_one(self):
        """A run of separator characters is consumed; offsets reflect it.

        BUG FIX: the input previously used a single space ("foo bar"),
        which contradicts the offsets asserted below — ``bar`` at
        start 7 / end 10 requires a four-character separator.
        """
        lexer = _lex("foo    bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes
        assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert bar_lexeme["string"] == "bar"
        assert foo_lexeme["start"] == 0
        assert bar_lexeme["start"] == 7
        assert foo_lexeme["end"] == 3
        assert bar_lexeme["end"] == 10

    def test_hyphen_is_considered_a_separator(self):
        """A hyphen splits a word into two terms, like whitespace."""
        lexer = _lex("foo-bar")
        assert len(lexer.lexemes) == 2

    def test_term_with_field(self):
        """``field:term`` yields a FIELD lexeme followed by a TERM lexeme."""
        lexer = _lex("title:foo")
        assert len(lexer.lexemes) == 2
        field_lexeme, term_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert field_lexeme["string"] == "title"
        assert term_lexeme["string"] == "foo"
        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 6
        assert field_lexeme["end"] == 5
        assert term_lexeme["end"] == 9

    def test_term_with_field_with_escape_character(self):
        """An escaped colon inside the field name is kept literal."""
        lexer = _lex("ti\\:tle:foo")
        assert len(lexer.lexemes) == 2
        field_lexeme, term_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert field_lexeme["string"] == "ti:tle"
        assert term_lexeme["string"] == "foo"
        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 8
        assert field_lexeme["end"] == 7
        assert term_lexeme["end"] == 11

    def test_term_with_edit_distance(self):
        """``term~n`` yields TERM then EDIT_DISTANCE lexemes."""
        lexer = _lex("foo~2")
        assert len(lexer.lexemes) == 2
        term_lexeme, edit_distance_lexeme = lexer.lexemes
        assert term_lexeme["type"] == QueryLexer.TERM
        assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE
        assert term_lexeme["string"] == "foo"
        assert edit_distance_lexeme["string"] == "2"
        assert term_lexeme["start"] == 0
        assert edit_distance_lexeme["start"] == 4
        assert term_lexeme["end"] == 3
        assert edit_distance_lexeme["end"] == 5

    def test_term_with_boost(self):
        """``term^n`` yields TERM then BOOST lexemes."""
        lexer = _lex("foo^10")
        assert len(lexer.lexemes) == 2
        term_lexeme, boost_lexeme = lexer.lexemes
        assert term_lexeme["type"] == QueryLexer.TERM
        assert boost_lexeme["type"] == QueryLexer.BOOST
        assert term_lexeme["string"] == "foo"
        assert boost_lexeme["string"] == "10"
        assert term_lexeme["start"] == 0
        assert boost_lexeme["start"] == 4
        assert term_lexeme["end"] == 3
        assert boost_lexeme["end"] == 6

    def test_term_with_field_boost_and_edit_distance(self):
        """All modifiers combine: FIELD, TERM, BOOST, EDIT_DISTANCE in order."""
        lexer = _lex("title:foo^10~5")
        assert len(lexer.lexemes) == 4
        field_lexeme, term_lexeme, boost_lexeme, edit_distance_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert boost_lexeme["type"] == QueryLexer.BOOST
        assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE
        assert field_lexeme["string"] == "title"
        assert term_lexeme["string"] == "foo"
        assert boost_lexeme["string"] == "10"
        assert edit_distance_lexeme["string"] == "5"
        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 6
        assert boost_lexeme["start"] == 10
        assert edit_distance_lexeme["start"] == 13
        assert field_lexeme["end"] == 5
        assert term_lexeme["end"] == 9
        assert boost_lexeme["end"] == 12
        assert edit_distance_lexeme["end"] == 14

    def test_single_term_with_hyphen_produces_two_lexemes(self):
        """Embedded hyphens should not be confused with presence operators."""
        lexer = _lex("foo-bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes
        assert foo_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert foo_lexeme["start"] == 0
        assert foo_lexeme["end"] == 3
        assert bar_lexeme["type"] == QueryLexer.TERM
        assert bar_lexeme["string"] == "bar"
        assert bar_lexeme["start"] == 4
        assert bar_lexeme["end"] == 7

    def test_single_term_with_presence_produces_two_lexemes(self):
        """A leading ``+`` yields a PRESENCE lexeme before the TERM."""
        lexer = _lex("+foo")
        assert len(lexer.lexemes) == 2
        presence_lexeme, term_lexeme = lexer.lexemes
        assert presence_lexeme["type"] == QueryLexer.PRESENCE
        assert presence_lexeme["string"] == "+"
        assert presence_lexeme["start"] == 0
        assert presence_lexeme["end"] == 1
        assert term_lexeme["type"] == QueryLexer.TERM
        assert term_lexeme["string"] == "foo"
        assert term_lexeme["start"] == 1
        assert term_lexeme["end"] == 4

    def test_multiple_terms_with_presence_produces_four_lexemes(self):
        """Each ``+term`` contributes a PRESENCE/TERM lexeme pair."""
        lexer = _lex("+foo +bar")
        assert len(lexer.lexemes) == 4
        (
            foo_presence_lexeme,
            foo_term_lexeme,
            bar_presence_lexeme,
            bar_term_lexeme,
        ) = lexer.lexemes
        assert foo_term_lexeme["type"] == QueryLexer.TERM
        assert foo_term_lexeme["string"] == "foo"
        assert foo_term_lexeme["start"] == 1
        assert foo_term_lexeme["end"] == 4
        assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert foo_presence_lexeme["string"] == "+"
        assert foo_presence_lexeme["start"] == 0
        assert foo_presence_lexeme["end"] == 1
        assert bar_term_lexeme["type"] == QueryLexer.TERM
        assert bar_term_lexeme["string"] == "bar"
        assert bar_term_lexeme["start"] == 6
        assert bar_term_lexeme["end"] == 9
        assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert bar_presence_lexeme["string"] == "+"
        assert bar_presence_lexeme["start"] == 5
        assert bar_presence_lexeme["end"] == 6

    def test_multiple_terms_with_presence_and_fuzz(self):
        """Presence and edit-distance modifiers can be mixed across terms."""
        lexer = _lex("+foo~1 +bar")
        assert len(lexer.lexemes) == 5
        (
            foo_presence_lexeme,
            foo_term_lexeme,
            foo_fuzz_lexeme,
            bar_presence_lexeme,
            bar_term_lexeme,
        ) = lexer.lexemes
        assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert foo_presence_lexeme["string"] == "+"
        assert foo_presence_lexeme["start"] == 0
        assert foo_presence_lexeme["end"] == 1
        assert foo_term_lexeme["type"] == QueryLexer.TERM
        assert foo_term_lexeme["string"] == "foo"
        assert foo_term_lexeme["start"] == 1
        assert foo_term_lexeme["end"] == 4
        assert foo_fuzz_lexeme["type"] == QueryLexer.EDIT_DISTANCE
        assert foo_fuzz_lexeme["string"] == "1"
        assert foo_fuzz_lexeme["start"] == 5
        assert foo_fuzz_lexeme["end"] == 6
        assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert bar_presence_lexeme["string"] == "+"
        assert bar_presence_lexeme["start"] == 7
        assert bar_presence_lexeme["end"] == 8
        assert bar_term_lexeme["type"] == QueryLexer.TERM
        assert bar_term_lexeme["string"] == "bar"
        assert bar_term_lexeme["start"] == 8
        assert bar_term_lexeme["end"] == 11