File: stemmer.py

package info (click to toggle)
python-lunr 0.8.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,644 kB
  • sloc: python: 3,811; javascript: 114; makefile: 60
file content (27 lines) | stat: -rw-r--r-- 881 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def get_language_stemmer(language):
    """Build the NLTK SnowballStemmer that corresponds to a language code.

    Args:
        language (str): ISO-639-1 code of the language. It is used as the
            key into ``lunr.languages.SUPPORTED_LANGUAGES``.

    Returns:
        SnowballStemmer: A stemmer constructed from the value that
        ``SUPPORTED_LANGUAGES`` maps ``language`` to.

    Raises:
        KeyError: If ``language`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    # Both imports are resolved at call time rather than at module import
    # time, so merely importing this module does not import nltk.
    from lunr.languages import SUPPORTED_LANGUAGES
    from nltk.stem.snowball import SnowballStemmer  # type: ignore

    snowball_language = SUPPORTED_LANGUAGES[language]
    return SnowballStemmer(snowball_language)


def nltk_stemmer(stemmer, token, i=None, tokens=None):
    """Apply a stemmer to a token through the token's ``update`` method.

    The string handed over by ``token.update`` is passed to
    ``stemmer.stem`` and the stemmed result is given back to ``update``.

    Args:
        stemmer (SnowballStemmer): Stemmer instance that performs the
            stemming; only its ``stem`` method is used.
        token (lunr.Token): The token to stem; only its ``update`` method
            is used.
        i (int): The index of the token in a set. Not used by this function.
        tokens (list): A list of tokens representing the set. Not used by
            this function.

    Returns:
        Whatever ``token.update`` returns.
    """
    # The callback accepts an optional second argument (metadata) that
    # ``update`` may supply; it is ignored and only the string is stemmed.
    return token.update(lambda text, metadata=None: stemmer.stem(text))