File: fuzzy_search.py

Package: textual-autocomplete 4.0.5-1
"""
Fuzzy matcher.

This is the matcher that powers Textual's [command palette](/guide/command_palette);
it is used to match search terms against candidate strings.

Thanks to Will McGugan for the implementation.
"""

from __future__ import annotations

from operator import itemgetter
from re import IGNORECASE, escape, finditer, search
from typing import Iterable, NamedTuple

from textual.cache import LRUCache


class _Search(NamedTuple):
    """Internal structure to keep track of a recursive search."""

    candidate_offset: int = 0
    query_offset: int = 0
    offsets: tuple[int, ...] = ()

    def branch(self, offset: int) -> tuple[_Search, _Search]:
        """Branch this search when an offset is found.

        Args:
            offset: Offset of a matching letter in the query.

        Returns:
            A pair of search objects.
        """
        _, query_offset, offsets = self
        return (
            _Search(offset + 1, query_offset + 1, offsets + (offset,)),
            _Search(offset + 1, query_offset, offsets),
        )

    @property
    def groups(self) -> int:
        """Number of groups in offsets."""
        groups = 1
        last_offset, *offsets = self.offsets
        for offset in offsets:
            if offset != last_offset + 1:
                groups += 1
            last_offset = offset
        return groups


class FuzzySearch:
    """Performs a fuzzy search.

    Unlike a regex solution, this will find all possible matches.
    """

    cache: LRUCache[tuple[str, str, bool], tuple[float, tuple[int, ...]]] = LRUCache(
        1024 * 4
    )
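    # Note: `cache` is a class attribute, so it is shared by every FuzzySearch
    # instance; the key includes the case_sensitive flag, so differently
    # configured instances never collide.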

    def __init__(self, case_sensitive: bool = False) -> None:
        """Initialize fuzzy search.

        Args:
            case_sensitive: Is the match case sensitive?
        """

        self.case_sensitive = case_sensitive

    def match(self, query: str, candidate: str) -> tuple[float, tuple[int, ...]]:
        """Match against a query.

        Args:
            query: The fuzzy query.
            candidate: A candidate to check.

        Returns:
            A pair of (score, tuple of offsets). `(0, ())` for no result.
        """
        query_regex = ".*?".join(f"({escape(character)})" for character in query)
        if not search(
            query_regex, candidate, flags=0 if self.case_sensitive else IGNORECASE
        ):
            # Bail out early if there is no possibility of a match
            return (0.0, ())

        cache_key = (query, candidate, self.case_sensitive)
        if cache_key in self.cache:
            return self.cache[cache_key]
        result = max(
            self._match(query, candidate), key=itemgetter(0), default=(0.0, ())
        )
        self.cache[cache_key] = result
        return result
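
    # For example (illustrative): with the default case-insensitive search,
    # match("fb", "foo bar") returns a positive score with offsets (0, 4),
    # while match("xyz", "foo bar") returns (0.0, ()).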

    def _match(
        self, query: str, candidate: str
    ) -> Iterable[tuple[float, tuple[int, ...]]]:
        """Generator to do the matching.

        Args:
            query: Query to match.
            candidate: Candidate to check against.

        Yields:
            Pairs of score and tuple of offsets.
        """
        if not self.case_sensitive:
            query = query.lower()
            candidate = candidate.lower()

        # We need this to give a bonus to first letters.
        first_letters = {match.start() for match in finditer(r"\w+", candidate)}

        def score(search: _Search) -> float:
            """Sore a search.

            Args:
                search: Search object.

            Returns:
                Score.

            """
            # This is a heuristic, and can be tweaked for better results
            # Boost first letter matches
            offset_count = len(search.offsets)
            score: float = offset_count + len(
                first_letters.intersection(search.offsets)
            )
            # Boost to favor fewer groups
            normalized_groups = (offset_count - (search.groups - 1)) / offset_count
            score *= 1 + (normalized_groups * normalized_groups)
            return score
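
        # Worked example (illustrative): for candidate "foo bar" and query
        # "fb" the offsets are (0, 4) and both are first letters, so the base
        # score is 2 + 2 = 4; there are two groups, so normalized_groups is
        # (2 - 1) / 2 = 0.5 and the final score is 4 * 1.25 = 5.0.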

        stack: list[_Search] = [_Search()]
        push = stack.append
        pop = stack.pop
        query_size = len(query)
        find = candidate.find
        # Limit the number of loops out of an abundance of caution.
        # This should be hard to reach without contrived data.
        remaining_loops = 10_000
        while stack and (remaining_loops := remaining_loops - 1):
            search = pop()
            offset = find(query[search.query_offset], search.candidate_offset)
            if offset != -1:
                if not set(candidate[search.candidate_offset :]).issuperset(
                    query[search.query_offset :]
                ):
                    # Early out if there is no chance of a match
                    continue
                advance_branch, branch = search.branch(offset)
                if advance_branch.query_offset == query_size:
                    yield score(advance_branch), advance_branch.offsets
                    push(branch)
                else:
                    push(branch)
                    push(advance_branch)
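

# Usage sketch (illustrative, not part of the upstream module). It assumes the
# `textual` package is installed, since FuzzySearch depends on textual.cache.
# Each candidate is scored against a query and the results printed best-first.
if __name__ == "__main__":
    searcher = FuzzySearch()
    query = "fb"
    candidates = ["foo bar", "foobar", "bazaar"]
    scored = sorted(
        ((searcher.match(query, candidate), candidate) for candidate in candidates),
        reverse=True,
    )
    for (score, offsets), candidate in scored:
        print(f"{candidate!r}: score={score:.2f}, offsets={offsets}")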