File: token_set_builder.py

package info (click to toggle)
python-lunr 0.8.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,644 kB
  • sloc: python: 3,811; javascript: 114; makefile: 60
file content (58 lines) | stat: -rw-r--r-- 1,660 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from lunr.token_set import TokenSet
from lunr.exceptions import BaseLunrException


class TokenSetBuilder:
    def __init__(self):
        self.previous_word = ""
        self.root = TokenSet()
        self.unchecked_nodes = []
        self.minimized_nodes = {}

    def insert(self, word):
        if word < self.previous_word:
            raise BaseLunrException("Out of order word insertion")

        common_prefix = 0
        for i in range(min(len(word), len(self.previous_word))):
            if word[i] != self.previous_word[i]:
                break

            common_prefix += 1

        self.minimize(common_prefix)

        node = (
            self.root if not self.unchecked_nodes else self.unchecked_nodes[-1]["child"]
        )

        for i in range(common_prefix, len(word)):
            next_node = TokenSet()
            char = word[i]

            node.edges[char] = next_node

            self.unchecked_nodes.append(
                {"parent": node, "char": char, "child": next_node}
            )

            node = next_node

        node.final = True
        self.previous_word = word

    def finish(self):
        self.minimize(0)

    def minimize(self, down_to):
        for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
            node = self.unchecked_nodes[i]
            child_key = str(node["child"])

            if child_key in self.minimized_nodes:
                node["parent"].edges[node["char"]] = self.minimized_nodes[child_key]
            else:
                node["child"]._str = child_key
                self.minimized_nodes[child_key] = node["child"]

            self.unchecked_nodes.pop()