File: test_query_lexer.py

from lunr.query_lexer import QueryLexer


def _lex(string):
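    """Run the QueryLexer over ``string`` and return it with its lexemes populated."""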
    lexer = QueryLexer(string)
    lexer.run()
    return lexer


class TestQueryLexer:
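    """Lexing tests for QueryLexer.

    Each lexeme is expected to be a dict with "type", "string", "start" and
    "end" keys; the tests below assert on all four for every token kind
    (TERM, FIELD, BOOST, EDIT_DISTANCE and PRESENCE).
    """
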
    def test_single_term_produces_one_lexeme(self):
        lexer = _lex("foo")
        assert len(lexer.lexemes) == 1
        lexeme = lexer.lexemes[0]
        assert lexeme["type"] == QueryLexer.TERM
        assert lexeme["string"] == "foo"
        assert lexeme["start"] == 0
        assert lexeme["end"] == 3

    def test_term_escape_character(self):
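        """A backslash-escaped colon stays inside the term instead of acting as a field separator."""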
        lexer = _lex("foo\\:bar")
        assert len(lexer.lexemes) == 1
        lexeme = lexer.lexemes[0]
        assert lexeme["type"] == QueryLexer.TERM
        assert lexeme["string"] == "foo:bar"
        assert lexeme["start"] == 0
        assert lexeme["end"] == 8

    def test_multiple_terms(self):
        lexer = _lex("foo bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes
        assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert bar_lexeme["string"] == "bar"
        assert foo_lexeme["start"] == 0
        assert bar_lexeme["start"] == 4
        assert foo_lexeme["end"] == 3
        assert bar_lexeme["end"] == 7

    def test_separator_length_greater_than_one(self):
        lexer = _lex("foo    bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes
        assert foo_lexeme["type"] == bar_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert bar_lexeme["string"] == "bar"
        assert foo_lexeme["start"] == 0
        assert bar_lexeme["start"] == 7
        assert foo_lexeme["end"] == 3
        assert bar_lexeme["end"] == 10

    def test_hyphen_is_considered_a_separator(self):
        lexer = _lex("foo-bar")
        assert len(lexer.lexemes) == 2

    def test_term_with_field(self):
        lexer = _lex("title:foo")
        assert len(lexer.lexemes) == 2
        field_lexeme, term_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert field_lexeme["string"] == "title"
        assert term_lexeme["string"] == "foo"
        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 6
        assert field_lexeme["end"] == 5
        assert term_lexeme["end"] == 9

    def test_term_with_field_with_escape_character(self):
        lexer = _lex("ti\\:tle:foo")
        assert len(lexer.lexemes) == 2
        field_lexeme, term_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert field_lexeme["string"] == "ti:tle"
        assert term_lexeme["string"] == "foo"
        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 8
        assert field_lexeme["end"] == 7
        assert term_lexeme["end"] == 11

    def test_term_with_edit_distance(self):
        lexer = _lex("foo~2")
        assert len(lexer.lexemes) == 2
        term_lexeme, edit_distance_lexeme = lexer.lexemes
        assert term_lexeme["type"] == QueryLexer.TERM
        assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE
        assert term_lexeme["string"] == "foo"
        assert edit_distance_lexeme["string"] == "2"
        assert term_lexeme["start"] == 0
        assert edit_distance_lexeme["start"] == 4
        assert term_lexeme["end"] == 3
        assert edit_distance_lexeme["end"] == 5

    def test_term_with_boost(self):
        lexer = _lex("foo^10")
        assert len(lexer.lexemes) == 2
        term_lexeme, boost_lexeme = lexer.lexemes
        assert term_lexeme["type"] == QueryLexer.TERM
        assert boost_lexeme["type"] == QueryLexer.BOOST
        assert term_lexeme["string"] == "foo"
        assert boost_lexeme["string"] == "10"
        assert term_lexeme["start"] == 0
        assert boost_lexeme["start"] == 4
        assert term_lexeme["end"] == 3
        assert boost_lexeme["end"] == 6

    def test_term_with_field_boost_and_edit_distance(self):
        lexer = _lex("title:foo^10~5")
        assert len(lexer.lexemes) == 4
        field_lexeme, term_lexeme, boost_lexeme, edit_distance_lexeme = lexer.lexemes
        assert field_lexeme["type"] == QueryLexer.FIELD
        assert term_lexeme["type"] == QueryLexer.TERM
        assert boost_lexeme["type"] == QueryLexer.BOOST
        assert edit_distance_lexeme["type"] == QueryLexer.EDIT_DISTANCE

        assert field_lexeme["string"] == "title"
        assert term_lexeme["string"] == "foo"
        assert boost_lexeme["string"] == "10"
        assert edit_distance_lexeme["string"] == "5"

        assert field_lexeme["start"] == 0
        assert term_lexeme["start"] == 6
        assert boost_lexeme["start"] == 10
        assert edit_distance_lexeme["start"] == 13

        assert field_lexeme["end"] == 5
        assert term_lexeme["end"] == 9
        assert boost_lexeme["end"] == 12
        assert edit_distance_lexeme["end"] == 14

    def test_single_term_with_hyphen_produces_two_lexemes(self):
        """Embedded hyphens should not be confused with presence operators."""
        lexer = _lex("foo-bar")
        assert len(lexer.lexemes) == 2
        foo_lexeme, bar_lexeme = lexer.lexemes

        assert foo_lexeme["type"] == QueryLexer.TERM
        assert foo_lexeme["string"] == "foo"
        assert foo_lexeme["start"] == 0
        assert foo_lexeme["end"] == 3

        assert bar_lexeme["type"] == QueryLexer.TERM
        assert bar_lexeme["string"] == "bar"
        assert bar_lexeme["start"] == 4
        assert bar_lexeme["end"] == 7

    def test_single_term_with_presence_produces_two_lexemes(self):
        lexer = _lex("+foo")
        assert len(lexer.lexemes) == 2
        presence_lexeme, term_lexeme = lexer.lexemes

        assert presence_lexeme["type"] == QueryLexer.PRESENCE
        assert presence_lexeme["string"] == "+"
        assert presence_lexeme["start"] == 0
        assert presence_lexeme["end"] == 1

        assert term_lexeme["type"] == QueryLexer.TERM
        assert term_lexeme["string"] == "foo"
        assert term_lexeme["start"] == 1
        assert term_lexeme["end"] == 4

    def test_multiple_terms_with_presence_produces_four_lexemes(self):
        lexer = _lex("+foo +bar")
        assert len(lexer.lexemes) == 4
        (
            foo_presence_lexeme,
            foo_term_lexeme,
            bar_presence_lexeme,
            bar_term_lexeme,
        ) = lexer.lexemes

        assert foo_term_lexeme["type"] == QueryLexer.TERM
        assert foo_term_lexeme["string"] == "foo"
        assert foo_term_lexeme["start"] == 1
        assert foo_term_lexeme["end"] == 4

        assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert foo_presence_lexeme["string"] == "+"
        assert foo_presence_lexeme["start"] == 0
        assert foo_presence_lexeme["end"] == 1

        assert bar_term_lexeme["type"] == QueryLexer.TERM
        assert bar_term_lexeme["string"] == "bar"
        assert bar_term_lexeme["start"] == 6
        assert bar_term_lexeme["end"] == 9

        assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert bar_presence_lexeme["string"] == "+"
        assert bar_presence_lexeme["start"] == 5
        assert bar_presence_lexeme["end"] == 6

    def test_multiple_terms_with_presence_and_fuzz(self):
        lexer = _lex("+foo~1 +bar")
        assert len(lexer.lexemes) == 5

        (
            foo_presence_lexeme,
            foo_term_lexeme,
            foo_fuzz_lexeme,
            bar_presence_lexeme,
            bar_term_lexeme,
        ) = lexer.lexemes

        assert foo_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert foo_presence_lexeme["string"] == "+"
        assert foo_presence_lexeme["start"] == 0
        assert foo_presence_lexeme["end"] == 1

        assert foo_term_lexeme["type"] == QueryLexer.TERM
        assert foo_term_lexeme["string"] == "foo"
        assert foo_term_lexeme["start"] == 1
        assert foo_term_lexeme["end"] == 4

        assert foo_fuzz_lexeme["type"] == QueryLexer.EDIT_DISTANCE
        assert foo_fuzz_lexeme["string"] == "1"
        assert foo_fuzz_lexeme["start"] == 5
        assert foo_fuzz_lexeme["end"] == 6

        assert bar_presence_lexeme["type"] == QueryLexer.PRESENCE
        assert bar_presence_lexeme["string"] == "+"
        assert bar_presence_lexeme["start"] == 7
        assert bar_presence_lexeme["end"] == 8

        assert bar_term_lexeme["type"] == QueryLexer.TERM
        assert bar_term_lexeme["string"] == "bar"
        assert bar_term_lexeme["start"] == 8
        assert bar_term_lexeme["end"] == 11