1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
|
import re
import logging
from functools import lru_cache
from typing import Callable, Dict, Match, Optional
from normality import WS
from fingerprints.cleanup import clean_name_ascii
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# A normalizer maps a string to its normalized form; a ``None`` result is
# treated by normalize_replacements() as "normalized to null" and skipped.
NormFunc = Callable[[str], Optional[str]]
# A replacer maps a text to the rewritten text.
ReplaceFunc = Callable[[str], str]
class Replacer(object):
    """Callable that rewrites known forms in a text using one compiled regex.

    In replace mode (the default) every occurrence of a key of
    ``replacements`` that is delimited by word boundaries is substituted
    with the value it maps to. In remove mode every key *and* every value
    is matched, and each match is replaced by the ``WS`` constant imported
    from ``normality``.
    """

    def __init__(self, replacements: Dict[str, str], remove: bool = False) -> None:
        """Build the matcher for the given forms.

        Args:
            replacements: Mapping from a form to look for to the form it
                should be rewritten to.
            remove: When true, matched forms (keys and values alike) are
                replaced by ``WS`` instead of being rewritten.
        """
        self.replacements = replacements
        self.remove = remove
        forms = set(self.replacements.keys())
        if remove:
            forms.update(self.replacements.values())
        # An empty alternative would match at every word boundary and
        # inject text into otherwise untouched input.
        forms.discard("")
        if forms:
            # Longest first, so a long form is tried before any shorter
            # form that happens to be its prefix.
            forms_sorted = sorted(forms, key=len, reverse=True)
            # Forms are literal text, not patterns: escape them so that
            # characters such as "(", "|" or "+" cannot break or alter
            # the compiled expression.
            escaped = "|".join(re.escape(form) for form in forms_sorted)
            forms_regex = r"\b(%s)\b" % escaped
        else:
            # Nothing to look for: an empty negative lookahead never matches.
            forms_regex = r"(?!)"
        self.matcher = re.compile(forms_regex, re.U)

    def get_canonical(self, match: Match[str]) -> str:
        """Return the substitution text for a single regex match."""
        if self.remove:
            return WS
        found = match.group(1)
        # Fall back to the matched text itself if it has no mapping.
        return self.replacements.get(found, found)

    def __call__(self, text: str) -> str:
        """Apply the replacements (or removals) to ``text`` and return it."""
        return self.matcher.sub(self.get_canonical, text)
def normalize_replacements(norm_func: NormFunc) -> Dict[str, str]:
    """Build the mapping from normalized alternate forms to normalized main forms.

    Every entry of ``TYPES["types"]`` contributes its ``"main"`` form and its
    list of ``"forms"``; both are passed through ``norm_func`` first.

    Entries and forms are skipped (with a logged warning) when normalization
    yields ``None``. Forms that normalize to their own main form are skipped
    silently. If a form was already mapped to a different main form, the
    first mapping is kept and a warning is logged.

    Args:
        norm_func: Normalizer applied to every main form and alternate form.

    Returns:
        A dict mapping each normalized alternate form to its normalized
        main form.
    """
    from fingerprints.types.data import TYPES

    mapping: Dict[str, str] = {}
    for entry in TYPES["types"]:
        canonical = norm_func(entry["main"])
        if canonical is None:
            log.warning("Main form is normalized to null: %r", entry["main"])
            continue
        for variant in entry["forms"]:
            normalized = norm_func(variant)
            if normalized is None:
                log.warning(
                    "Form is normalized to null [%r]: %r", entry["main"], variant
                )
                continue
            if normalized == canonical:
                # Mapping a form onto itself would be a no-op.
                continue
            # Register the mapping unless the form is already known; the
            # first main form seen for a given form always wins.
            existing = mapping.setdefault(normalized, canonical)
            if existing != canonical:
                log.warning(
                    "Form has duplicate mains: %r (%r, %r)",
                    variant,
                    existing,
                    canonical,
                )
    return mapping
@lru_cache(maxsize=None)
def get_replacer(
    clean: NormFunc = clean_name_ascii, remove: bool = False
) -> ReplaceFunc:
    """Return a replacer built from the type data normalized with ``clean``.

    Results are memoized per ``(clean, remove)`` argument combination, so the
    replacement table is normalized and the regex compiled only once for
    each combination.

    Args:
        clean: Normalizer applied to every form before it is registered.
        remove: Passed through to ``Replacer``; when true, matched forms are
            removed rather than rewritten.

    Returns:
        A callable that takes a text and returns the rewritten text.
    """
    table = normalize_replacements(clean)
    replacer = Replacer(table, remove=remove)
    return replacer
# Running the module directly builds the default replacer once; any warnings
# raised while normalizing the type data are emitted through the logger.
if __name__ == "__main__":
    get_replacer()
|