File: query_lexer.py

package info (click to toggle)
python-lunr 0.8.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,644 kB
  • sloc: python: 3,811; javascript: 114; makefile: 60
file content (153 lines) | stat: -rw-r--r-- 3,827 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from lunr.tokenizer import default_separator


class QueryLexer:
    """Split a query string into lexemes using a small state machine.

    Every ``lex_*`` method is a state: it consumes input, may emit lexemes
    and returns the next state to run, or ``None`` once lexing is finished.
    Emitted lexemes are dicts with the keys ``type``, ``string``, ``start``
    and ``end`` and are appended to ``self.lexemes`` in input order.
    """

    # TODO: use iteration protocol?
    # Sentinel returned by ``next`` when the input is exhausted; it is longer
    # than one character, so it can never collide with a real character.
    EOS = "EOS"
    FIELD = "FIELD"
    TERM = "TERM"
    EDIT_DISTANCE = "EDIT_DISTANCE"
    BOOST = "BOOST"
    PRESENCE = "PRESENCE"

    def __init__(self, string):
        """Prepare to lex ``string``; call ``run`` to fill ``lexemes``."""
        self.lexemes = []
        self.string = string
        self.length = len(string)
        # ``start`` marks the beginning of the lexeme being built and ``pos``
        # the next character to read.
        self.pos = 0
        self.start = 0
        # Indexes of backslashes inside the current lexeme; they are dropped
        # from the lexeme text by ``slice_string``.
        self.escape_char_positions = []

    @property
    def width(self):
        """Number of characters consumed for the current lexeme."""
        return self.pos - self.start

    def ignore(self):
        """Discard the pending input, skipping one character if none is pending."""
        if self.start == self.pos:
            self.pos += 1

        self.start = self.pos

    def backup(self):
        """Step back one character so that it will be read again."""
        self.pos -= 1

    def accept_digit_run(self):
        """Consume consecutive ASCII digits, stopping before the first non-digit."""
        char = self.next()
        while char != self.EOS and "0" <= char <= "9":
            char = self.next()

        # ``next`` does not advance at end of input, so only un-read a real
        # character.
        if char != self.EOS:
            self.backup()

    def run(self):
        """Run the state machine until a state returns no successor."""
        state = self.lex_text()
        while state:
            state = state()

    def slice_string(self):
        """Return the current lexeme text without its escape characters.

        The recorded escape positions are cleared as a side effect.
        """
        subslices = []
        slice_start = self.start

        for escape_char_position in self.escape_char_positions:
            subslices.append(self.string[slice_start:escape_char_position])
            slice_start = escape_char_position + 1

        subslices.append(self.string[slice_start : self.pos])
        self.escape_char_positions = []

        return "".join(subslices)

    def next(self):
        """Return the next character and advance, or ``EOS`` at end of input."""
        if self.pos >= self.length:
            return self.EOS

        char = self.string[self.pos]
        self.pos += 1
        return char

    def emit(self, type_):
        """Append a lexeme of ``type_`` covering ``start``..``pos`` and start a new one."""
        self.lexemes.append(
            {
                "type": type_,
                "string": self.slice_string(),
                "start": self.start,
                "end": self.pos,
            }
        )
        self.start = self.pos

    def escape_character(self):
        """Record the backslash just read and skip the character it escapes."""
        self.escape_char_positions.append(self.pos - 1)
        # A trailing backslash has nothing after it to escape; do not move
        # past the end of the input, otherwise the emitted lexeme would
        # report an ``end`` offset beyond the query string.
        if self.pos < self.length:
            self.pos += 1

    def lex_field(self):
        """Emit the text before a ':' as a FIELD and drop the ':' itself."""
        self.backup()
        self.emit(self.FIELD)
        self.ignore()
        return self.lex_text

    def lex_term(self):
        """Emit the text before a separator as a TERM and drop the separator."""
        # The separator has already been consumed, so a width of 1 means
        # there is no term text in front of it.
        if self.width > 1:
            self.backup()
            self.emit(self.TERM)

        self.ignore()

        return self.lex_text

    def lex_edit_distance(self):
        """Drop the '~' and emit the digits that follow as an EDIT_DISTANCE."""
        self.ignore()
        self.accept_digit_run()
        self.emit(self.EDIT_DISTANCE)
        return self.lex_text

    def lex_boost(self):
        """Drop the '^' and emit the digits that follow as a BOOST."""
        self.ignore()
        self.accept_digit_run()
        self.emit(self.BOOST)
        return self.lex_text

    def lex_EOS(self):
        """Emit any pending text as a final TERM; returns ``None`` to stop."""
        if self.width > 0:
            self.emit(self.TERM)

    def lex_text(self):
        """Scan characters until one selects another state, and return that state."""
        while True:
            char = self.next()
            if char == self.EOS:
                return self.lex_EOS

            if char == "\\":  # escape character
                self.escape_character()
                continue

            if char == ":":
                return self.lex_field

            if char == "~":
                self.backup()
                if self.width > 0:
                    self.emit(self.TERM)

                return self.lex_edit_distance

            if char == "^":
                self.backup()
                if self.width > 0:
                    self.emit(self.TERM)

                return self.lex_boost

            # '+' indicates term presence is required, check for length to
            # ensure only a leading '+' is considered
            if char == "+" and self.width == 1:
                self.emit(self.PRESENCE)
                return self.lex_text

            # '-' indicates term presence is prohibited
            if char == "-" and self.width == 1:
                self.emit(self.PRESENCE)
                return self.lex_text

            if default_separator(char):
                return self.lex_term