File: fuzzy_search.py

package info (click to toggle)
textual-autocomplete 4.0.6-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 796 kB
sloc: python: 1,907; makefile: 4
file content (222 lines) | stat: -rw-r--r-- 6,969 bytes
"""
Fuzzy matcher.

This class is used by the [command palette](/guide/command_palette) to match search terms.

This is the matcher that powers Textual's command palette.

Thanks to Will McGugan for the implementation.
"""

from __future__ import annotations

from functools import lru_cache
from operator import itemgetter
from re import finditer
from typing import Iterable, Sequence

import rich.repr

from textual.cache import LRUCache
from textual.content import Content
from textual.style import Style


class FuzzySearch:
    """Performs a fuzzy search.

    Unlike a regex solution, this will finds all possible matches.
    """

    def __init__(
        self, case_sensitive: bool = False, *, cache_size: int = 1024 * 4
    ) -> None:
        """Initialize fuzzy search.

        Args:
            case_sensitive: Is the match case sensitive?
            cache_size: Number of queries to cache.
        """

        self.case_sensitive = case_sensitive
        self.cache: LRUCache[tuple[str, str], tuple[float, Sequence[int]]] = LRUCache(
            cache_size
        )

    def match(self, query: str, candidate: str) -> tuple[float, Sequence[int]]:
        """Match against a query.

        Args:
            query: The fuzzy query.
            candidate: A candidate to check,.

        Returns:
            A pair of (score, tuple of offsets). `(0, ())` for no result.
        """

        cache_key = (query, candidate)
        if cache_key in self.cache:
            return self.cache[cache_key]
        default: tuple[float, Sequence[int]] = (0.0, [])
        result = max(self._match(query, candidate), key=itemgetter(0), default=default)
        self.cache[cache_key] = result
        return result

    @classmethod
    @lru_cache(maxsize=1024)
    def get_first_letters(cls, candidate: str) -> frozenset[int]:
        return frozenset({match.start() for match in finditer(r"\w+", candidate)})

    def score(self, candidate: str, positions: Sequence[int]) -> float:
        """Score a search.

        Args:
            search: Search object.

        Returns:
            Score.
        """
        first_letters = self.get_first_letters(candidate)
        # This is a heuristic, and can be tweaked for better results
        # Boost first letter matches
        offset_count = len(positions)
        score: float = offset_count + len(first_letters.intersection(positions))

        groups = 1
        last_offset, *offsets = positions
        for offset in offsets:
            if offset != last_offset + 1:
                groups += 1
            last_offset = offset

        # Boost to favor less groups
        normalized_groups = (offset_count - (groups - 1)) / offset_count
        score *= 1 + (normalized_groups * normalized_groups)
        return score

    def _match(
        self, query: str, candidate: str
    ) -> Iterable[tuple[float, Sequence[int]]]:
        letter_positions: list[list[int]] = []
        position = 0

        if not self.case_sensitive:
            candidate = candidate.lower()
            query = query.lower()
        score = self.score
        if query in candidate:
            # Quick exit when the query exists as a substring
            query_location = candidate.rfind(query)
            offsets = list(range(query_location, query_location + len(query)))
            yield (
                score(candidate, offsets) * (2.0 if candidate == query else 1.5),
                offsets,
            )
            return

        for offset, letter in enumerate(query):
            last_index = len(candidate) - offset
            positions: list[int] = []
            letter_positions.append(positions)
            index = position
            while (location := candidate.find(letter, index)) != -1:
                positions.append(location)
                index = location + 1
                if index >= last_index:
                    break
            if not positions:
                yield (0.0, ())
                return
            position = positions[0] + 1

        possible_offsets: list[list[int]] = []
        query_length = len(query)

        def get_offsets(offsets: list[int], positions_index: int) -> None:
            """Recursively match offsets.

            Args:
                offsets: A list of offsets.
                positions_index: Index of query letter.

            """
            for offset in letter_positions[positions_index]:
                if not offsets or offset > offsets[-1]:
                    new_offsets = [*offsets, offset]
                    if len(new_offsets) == query_length:
                        possible_offsets.append(new_offsets)
                    else:
                        get_offsets(new_offsets, positions_index + 1)

        get_offsets([], 0)

        for offsets in possible_offsets:
            yield score(candidate, offsets), offsets


@rich.repr.auto
class Matcher:
    """A fuzzy matcher."""

    def __init__(
        self,
        query: str,
        *,
        match_style: Style | None = None,
        case_sensitive: bool = False,
    ) -> None:
        """Initialise the fuzzy matching object.

        Args:
            query: A query as typed in by the user.
            match_style: The style to use to highlight matched portions of a string.
            case_sensitive: Should matching be case sensitive?
        """
        self._query = query
        self._match_style = Style(reverse=True) if match_style is None else match_style
        self._case_sensitive = case_sensitive
        self.fuzzy_search = FuzzySearch()

    @property
    def query(self) -> str:
        """The query string to look for."""
        return self._query

    @property
    def match_style(self) -> Style:
        """The style that will be used to highlight hits in the matched text."""
        return self._match_style

    @property
    def case_sensitive(self) -> bool:
        """Is this matcher case sensitive?"""
        return self._case_sensitive

    def match(self, candidate: str) -> float:
        """Match the candidate against the query.

        Args:
            candidate: Candidate string to match against the query.

        Returns:
            Strength of the match from 0 to 1.
        """
        return self.fuzzy_search.match(self.query, candidate)[0]

    def highlight(self, candidate: str) -> Content:
        """Highlight the candidate with the fuzzy match.

        Args:
            candidate: The candidate string to match against the query.

        Returns:
            A [`Text`][rich.text.Text] object with highlighted matches.
        """
        content = Content.from_markup(candidate)
        score, offsets = self.fuzzy_search.match(self.query, candidate)
        if not score:
            return content
        for offset in offsets:
            if not candidate[offset].isspace():
                content = content.stylize(self._match_style, offset, offset + 1)
        return content