File: token_set.py

Package: python-lunr 0.8.0-1

class TokenSet:
    """
    A token set is used to store the unique list of all tokens
    within an index. Token sets are also used to represent an
    incoming query to the index; this query token set and the index
    token set are then intersected to find which tokens to look
    up in the inverted index.

    A token set can hold multiple tokens, as in the case of the
    index token set, or it can hold a single token, as in the
    case of a simple query token set.

    Additionally, token sets are used to perform wildcard matching.
    Leading, contained and trailing wildcards are supported, and
    from this edit distance matching can also be provided.

    Token sets are implemented as a minimal finite state automaton,
    where both common prefixes and suffixes are shared between tokens.
    This helps to reduce the space used for storing the token set.
    TODO: consider https://github.com/glyph/automat
    """

    _next_id = 1

    def __init__(self):
        self.final = False
        self.edges = {}
        self.id = self._next_id
        self.__class__._next_id += 1

    def __str__(self):
        try:
            return self._string
        except AttributeError:
            pass

        string = "1" if self.final else "0"
        for label in sorted(list(self.edges.keys())):
            node = self.edges[label]
            try:
                node_id = str(node.id)
            except AttributeError:
                # TODO: JS seems to rely on undefined for the id attribute?
                node_id = ""

            string = string + label + node_id

        return string

    def __repr__(self):
        return '<TokenSet "{}">'.format(str(self))

    @classmethod
    def from_string(cls, string):
        """Creates a TokenSet from a string.

        The string may contain one or more wildcard characters (*) that will
        allow wildcard matching when intersecting with another TokenSet.
        """
        node = TokenSet()
        root = node

        # Iterate through all characters in the passed string, appending
        # a node for each character.
        # When a wildcard character is found, a self-referencing edge
        # is introduced to continually match any number of characters.
        for i, char in enumerate(string):
            final = i == len(string) - 1
            if char == "*":
                node.edges[char] = node
                node.final = final
            else:
                next_ = TokenSet()
                next_.final = final
                node.edges[char] = next_
                node = next_

        return root
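
    # Illustrative sketch: from_string("foo*") builds the chain f -> o -> o,
    # where the last "o" node is marked final and carries a "*" edge pointing
    # back at itself. During intersection that self-loop matches any number
    # of trailing characters, so "foo*" matches "foo", "food", "football" and
    # so on. Note that wildcard edges are only expanded on the token set
    # passed into intersect() (the q_node side), which is consistent with
    # lunr intersecting the index token set with the query token set rather
    # than the other way around.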

    @classmethod
    def from_fuzzy_string(cls, string, edit_distance):
        """Creates a token set representing a single string with a specified
        edit distance.

        Insertions, deletions, substitutions and transpositions are each
        treated as an edit distance of 1.

        Increasing the allowed edit distance will have a dramatic impact
        on the performance of both creating and intersecting these TokenSets.
        It is advised to keep the edit distance less than 3.
        """
        root = TokenSet()

        stack = [{"node": root, "edits_remaining": edit_distance, "string": string}]

        while stack:
            frame = stack.pop()
            # no edit
            if len(frame["string"]) > 0:
                char = frame["string"][0]
                no_edit_node = None
                if char in frame["node"].edges:
                    no_edit_node = frame["node"].edges[char]
                else:
                    no_edit_node = TokenSet()
                    frame["node"].edges[char] = no_edit_node

                if len(frame["string"]) == 1:
                    no_edit_node.final = True

                stack.append(
                    {
                        "node": no_edit_node,
                        "edits_remaining": frame["edits_remaining"],
                        "string": frame["string"][1:],
                    }
                )

            if frame["edits_remaining"] == 0:
                continue

            # insertion, can only do an insertion if there are edits remaining
            if "*" in frame["node"].edges:
                insertion_node = frame["node"].edges["*"]
            else:
                insertion_node = TokenSet()
                frame["node"].edges["*"] = insertion_node

            if len(frame["string"]) == 0:
                insertion_node.final = True

            stack.append(
                {
                    "node": insertion_node,
                    "edits_remaining": frame["edits_remaining"] - 1,
                    "string": frame["string"],
                }
            )

            # deletion, can only do a deletion if we have enough edits
            # remaining and if there are characters left to delete in the string
            if len(frame["string"]) > 1:
                stack.append(
                    {
                        "node": frame["node"],
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": frame["string"][1:],
                    }
                )

            # deletion, just removing the last character of the string
            if len(frame["string"]) == 1:
                frame["node"].final = True

            # substitution, can only do a substitution if we have enough edits
            # remaining and there are characters left to substitute
            if len(frame["string"]) >= 1:
                if "*" in frame["node"].edges:
                    substitution_node = frame["node"].edges["*"]
                else:
                    substitution_node = TokenSet()
                    frame["node"].edges["*"] = substitution_node

                if len(frame["string"]) == 1:
                    substitution_node.final = True

                stack.append(
                    {
                        "node": substitution_node,
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": frame["string"][1:],
                    }
                )

            # transposition, can only do a transposition if there are edits
            # remaining and there are enough characters to transpose
            if frame["edits_remaining"] and len(frame["string"]) > 1:
                char_a = frame["string"][0]
                char_b = frame["string"][1]
                transpose_node = None

                if char_b in frame["node"].edges:
                    transpose_node = frame["node"].edges[char_b]
                else:
                    transpose_node = TokenSet()
                    frame["node"].edges[char_b] = transpose_node

                if len(frame["string"]) == 1:
                    transpose_node.final = True

                stack.append(
                    {
                        "node": transpose_node,
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": char_a + frame["string"][2:],
                    }
                )

        return root
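
    # Illustrative sketch: from_fuzzy_string("cat", 1) builds an automaton
    # that, when intersected with an index token set, matches terms within
    # one edit of "cat", for example "cat" itself, "cats" (insertion),
    # "at" (deletion), "cut" (substitution, via a "*" edge) and "act"
    # (transposition).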

    @classmethod
    def from_list(cls, list_of_words):
        from lunr.token_set_builder import TokenSetBuilder

        builder = TokenSetBuilder()
        for word in list_of_words:
            builder.insert(word)

        builder.finish()
        return builder.root

    @classmethod
    def from_clause(cls, clause):
        if clause.edit_distance:
            return cls.from_fuzzy_string(clause.term, clause.edit_distance)
        else:
            return cls.from_string(clause.term)

    def to_list(self):
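        """Converts this TokenSet into a list of the strings contained
        within it.

        This is not intended to be used on a TokenSet that contains
        wildcards: the self-referencing wildcard edges would keep the
        traversal from terminating.
        """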
        words = []
        stack = [{"prefix": "", "node": self}]

        while stack:
            frame = stack.pop()
            if frame["node"].final:
                words.append(frame["prefix"])

            for edge in frame["node"].edges.keys():
                stack.append(
                    {
                        "prefix": frame["prefix"] + str(edge),
                        "node": frame["node"].edges[edge],
                    }
                )

        return words

    def intersect(self, other):
        """Returns a new TokenSet that is the intersection of this TokenSet
        and the passed TokenSet.

        This intersection will take into account any wildcards contained
        within the passed TokenSet.
        """
        output = TokenSet()
        stack = [{"node": self, "q_node": other, "output": output}]

        while stack:
            frame = stack.pop()
            for q_edge in frame["q_node"].edges.keys():
                for n_edge in frame["node"].edges.keys():
                    if n_edge == q_edge or q_edge == "*":
                        node = frame["node"].edges[n_edge]
                        q_node = frame["q_node"].edges[q_edge]
                        final = node.final and q_node.final
                        next_ = None

                        if n_edge in frame["output"].edges:
                            next_ = frame["output"].edges[n_edge]
                            next_.final = next_.final or final
                        else:
                            next_ = TokenSet()
                            next_.final = final
                            frame["output"].edges[n_edge] = next_

                        stack.append({"node": node, "q_node": q_node, "output": next_})

        return output
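

if __name__ == "__main__":
    # Illustrative usage sketch: an index-side token set built from a single
    # term, intersected with a wildcard query and with a fuzzy query. The
    # wildcard and fuzzy "*" edges are expanded on the argument passed to
    # intersect(), mirroring how lunr intersects the index token set with
    # the query token set.
    index_side = TokenSet.from_string("category")

    wildcard_query = TokenSet.from_string("cat*")
    print(index_side.intersect(wildcard_query).to_list())  # ['category']

    fuzzy_query = TokenSet.from_fuzzy_string("catagory", 1)  # one edit away
    print(index_side.intersect(fuzzy_query).to_list())  # ['category']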