1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
|
def get_language_stemmer(language):
    """Build the NLTK SnowballStemmer that corresponds to a language.

    Args:
        language (str): ISO-639-1 code of the language. It is used as the
            key into ``lunr.languages.SUPPORTED_LANGUAGES``.

    Returns:
        SnowballStemmer: A stemmer constructed from the value that
            ``SUPPORTED_LANGUAGES`` holds for ``language``.
    """
    # Both imports are deliberately function-local, so the optional
    # dependencies are only loaded when a stemmer is actually requested.
    from lunr.languages import SUPPORTED_LANGUAGES
    from nltk.stem.snowball import SnowballStemmer  # type: ignore

    snowball_name = SUPPORTED_LANGUAGES[language]
    return SnowballStemmer(snowball_name)
def nltk_stemmer(stemmer, token, i=None, tokens=None):
    """Stem a token by delegating to a NLTK SnowballStemmer.

    The stemming itself is handed to ``token.update`` as a callback, so the
    token decides how and when the stemmer is applied to its contents.

    Args:
        stemmer (SnowballStemmer): Stemmer instance that performs the
            stemming through its ``stem`` method.
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set. Not used by this function.
        tokens (list): A list of tokens representing the set. Not used by
            this function.

    Returns:
        Whatever ``token.update`` returns for the stemming callback.
    """

    def _apply_stem(token, metadata=None):
        # The callback keeps the parameter names ``token`` and ``metadata``
        # because ``token.update`` decides how it is invoked; ``metadata``
        # is accepted but ignored.
        return stemmer.stem(token)

    updated = token.update(_apply_stem)
    return updated
|