"""A simple English lemmatizer that finds and removes known suffixes."""
from enum import Flag, auto
from typing import TypeAlias
import wn
from wn._types import LemmatizeResult
from wn.constants import ADJ, ADJ_SAT, ADV, NOUN, PARTS_OF_SPEECH, VERB
POSExceptionMap: TypeAlias = dict[str, set[str]]
ExceptionMap: TypeAlias = dict[str, POSExceptionMap]
class _System(Flag):
    """Flags to track suffix rules in various implementations of Morphy.

    Each detachment rule carries a combination of these flags naming the
    implementations that apply it; a lemmatizer keeps a rule only when
    its own flag is set on that rule.
    """

    # NOTE(review): presumably the Princeton WordNet implementation -- confirm
    PWN = auto()
    # NOTE(review): presumably the NLTK implementation -- confirm
    NLTK = auto()
    # the flag this module's Morphy class filters its rules on
    WN = auto()
    # convenience mask for rules shared by every implementation
    ALL = PWN | NLTK | WN
# Short private aliases so the rule table stays compact.
_PWN, _NLTK, _WN = _System.PWN, _System.NLTK, _System.WN
_ALL = _System.ALL

# A rule is (suffix to detach, replacement to append, systems that apply it).
Rule: TypeAlias = tuple[str, str, _System]
# Suffix detachment rules, keyed by part of speech.  For each rule, a form
# ending in the first string has that ending swapped for the second string;
# the flag says which Morphy implementations apply the rule.
DETACHMENT_RULES: dict[str, list[Rule]] = {
    NOUN: [
        ("s", "", _ALL),
        ("ces", "x", _WN),  # e.g. matrices -> matrix
        ("ses", "s", _ALL),
        ("ves", "f", _NLTK | _WN),  # e.g. wolves -> wolf
        ("ives", "ife", _WN),  # e.g. knives -> knife
        ("xes", "x", _ALL),
        ("xes", "xis", _WN),  # e.g. axes -> axis
        ("zes", "z", _ALL),
        ("ches", "ch", _ALL),
        ("shes", "sh", _ALL),
        ("men", "man", _ALL),
        ("ies", "y", _ALL),
    ],
    VERB: [
        ("s", "", _ALL),
        ("ies", "y", _ALL),
        ("es", "e", _ALL),
        ("es", "", _ALL),
        ("ed", "e", _ALL),
        ("ed", "", _ALL),
        ("ing", "e", _ALL),
        ("ing", "", _ALL),
    ],
    ADJ: [
        ("er", "", _ALL),
        ("est", "", _ALL),
        ("er", "e", _ALL),
        ("est", "e", _ALL),
    ],
    # adverbs have no detachment rules
    ADV: [],
}
# Satellite adjectives reuse the very same rule list object as adjectives.
DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ]
class Morphy:
    """The Morphy lemmatizer class.

    Objects of this class are callables that take a wordform and an
    optional part of speech and return a dictionary mapping parts of
    speech to lemmas. If objects of this class are not created with a
    :class:`wn.Wordnet` object, the returned lemmas may be invalid.

    Arguments:
        wordnet: optional :class:`wn.Wordnet` instance

    Example:

        >>> import wn
        >>> from wn.morphy import Morphy
        >>> ewn = wn.Wordnet("ewn:2020")
        >>> m = Morphy(ewn)
        >>> m("axes", pos="n")
        {'n': {'axe', 'ax', 'axis'}}
        >>> m("geese", pos="n")
        {'n': {'goose'}}
        >>> m("gooses")
        {'n': {'goose'}, 'v': {'goose'}}
        >>> m("goosing")
        {'v': {'goose'}}

    """

    def __init__(self, wordnet: wn.Wordnet | None = None):
        """Select the applicable rules and index the wordnet, if given.

        Without a wordnet the lemma and exception tables stay empty and
        the instance is marked uninitialized, so candidates produced by
        suffix detachment are returned without being checked.
        """
        # keep only the rules flagged for the WN system
        self._rules = {
            pos: [rule for rule in rules if rule[2] & _System.WN]
            for pos, rules in DETACHMENT_RULES.items()
        }
        exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH}
        all_lemmas: dict[str, set[str]] = {pos: set() for pos in PARTS_OF_SPEECH}
        if wordnet:
            for word in wordnet.words():
                pos = word.pos
                pos_exc = exceptions[pos]
                # the first form is taken as the lemma, the rest as variants
                lemma, *others = word.forms()
                # store every lemma whether it has other forms or not
                all_lemmas[pos].add(lemma)
                # those with other forms map to the original lemmas
                for other in others:
                    pos_exc.setdefault(other, set()).add(lemma)
            self._initialized = True
        else:
            self._initialized = False
        self._exceptions = exceptions
        self._all_lemmas = all_lemmas

    def __call__(self, form: str, pos: str | None = None) -> LemmatizeResult:
        """Return candidate lemmas for *form*, grouped by part of speech.

        Arguments:
            form: the wordform to lemmatize
            pos: restrict the search to this part of speech; when
                ``None``, every part of speech with detachment rules
                is tried

        Returns:
            A dictionary mapping each part of speech to a non-empty set
            of candidate lemmas. When the instance was created without
            a wordnet, the dictionary also maps *pos* itself (possibly
            ``None``) to a set containing the original *form*.
        """
        result: LemmatizeResult = {}
        if not self._initialized:
            result[pos] = {form}  # always include original when not initialized

        if pos is None:
            pos_list = list(DETACHMENT_RULES)
        elif pos in DETACHMENT_RULES:
            pos_list = [pos]
        else:
            pos_list = []  # not handled by morphy

        no_pos_forms = result.get(None, set())  # avoid unnecessary duplicates
        for _pos in pos_list:
            candidates = self._morphstr(form, _pos) - no_pos_forms
            if candidates:
                result.setdefault(_pos, set()).update(candidates)

        return result

    def _morphstr(self, form: str, pos: str) -> set[str]:
        """Collect candidate lemmas of *form* for a single part of speech.

        When initialized, candidates come from the known lemmas, the
        exception table, and those suffix-detachment results that are
        known lemmas. When uninitialized, every suffix-detachment
        result is accepted.
        """
        candidates: set[str] = set()
        initialized = self._initialized

        if initialized:
            all_lemmas = self._all_lemmas[pos]
            if form in all_lemmas:
                candidates.add(form)
            candidates.update(self._exceptions[pos].get(form, set()))
        else:
            all_lemmas = set()

        for suffix, repl, _ in self._rules[pos]:
            # avoid applying rules that perform full suppletion
            if form.endswith(suffix) and len(suffix) < len(form):
                candidate = f"{form[: -len(suffix)]}{repl}"
                if not initialized or candidate in all_lemmas:
                    candidates.add(candidate)

        return candidates
# Module-level default lemmatizer. It is built without a wordnet, so the
# candidates it returns are not checked against any lexicon.
morphy = Morphy()