File: cleanup.py

package info (click to toggle)
python-fingerprints 1.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 920 kB
  • sloc: python: 1,290; makefile: 17
file content (101 lines) | stat: -rw-r--r-- 2,622 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
import logging
from typing import Optional
from functools import lru_cache
from normality import WS, squash_spaces, ascii_text, category_replace

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Matches a period, a straight apostrophe (') or a typographic apostrophe (’).
# The clean_name_* functions in this module delete these characters outright
# by substituting the empty string.
CHARACTERS_REMOVE_RE = re.compile(r"[\.\'’]")

# Honorifics, titles and particles that may precede a name. The entries are
# joined verbatim into a regex alternation below, so they must not contain
# unescaped regex metacharacters. Matching is case-insensitive (see PREFIXES).
# Each entry appears exactly once: a repeated entry only adds a redundant
# alternative to the compiled pattern.
PREFIXES_RAW_LIST = [
    "Mr",
    "Ms",
    "Mrs",
    "Mister",
    "Miss",
    "Madam",
    "Madame",
    "Monsieur",
    "Honorable",
    "Honourable",
    "Mme",
    "Mmme",
    "Herr",
    "Hr",
    "Frau",
    "Fr",
    "The",
    "Fräulein",
    "Senor",
    "Senorita",
    "Sheik",
    "Sheikh",
    "Shaikh",
    "Sr",
    "Sir",
    "Lady",
    "de",
    "of",
]
PREFIXES_RAW = "|".join(PREFIXES_RAW_LIST)
# Pattern layout, left to right:
#   ^\W*               leading non-word characters (discarded)
#   ((%s)\.?\s+)*      any number of prefixes, each optionally followed by a
#                      dot and necessarily followed by whitespace
#   (?P<term>.*?)      the remaining name, captured lazily as "term"
#   ([\'’]s)?          an optional trailing possessive ('s or ’s)
#   \W*$               trailing non-word characters (discarded)
NAME_PATTERN_ = r"^\W*((%s)\.?\s+)*(?P<term>.*?)([\'’]s)?\W*$"
NAME_PATTERN_ = NAME_PATTERN_ % PREFIXES_RAW
PREFIXES = re.compile(NAME_PATTERN_, re.I | re.U)
# Matches one "(...)" or "[...]" span that contains no bracket of the same
# kind inside it, i.e. only the innermost level of nested brackets.
BRACKETED = re.compile(r"(\([^\(\)]*\)|\[[^\[\]]*\])")


def clean_entity_prefix(name: str) -> str:
    """Strip leading prefixes such as "Mr." or "Mrs." from a name.

    The name is matched against the module-level ``PREFIXES`` pattern and
    its ``term`` group is returned. If the pattern does not match, the
    input is returned unchanged.
    """
    found = PREFIXES.match(name)
    return name if found is None else found.group("term")


def clean_brackets(text: str) -> str:
    """Replace every bracketed span in ``text`` with the ``WS`` constant.

    Intended for company names that carry a jurisdiction in brackets, for
    example: Turtle Management (Seychelles) Ltd.
    """
    without_brackets = BRACKETED.sub(WS, text)
    return without_brackets


@lru_cache(maxsize=2000)
def clean_name_ascii(text: Optional[str]) -> Optional[str]:
    """
    Clean and normalize a name, transliterating it to ASCII first.

    The steps are, in order: ASCII transliteration, removal of the
    characters matched by ``CHARACTERS_REMOVE_RE``, lowercasing, character
    category replacement, and squashing of repeated spaces. Results are
    memoized for up to 2000 distinct inputs.

    Args:
        text (Optional[str]): The input text to be cleaned.

    Returns:
        Optional[str]: The cleaned text, or None if the input is None or the
        cleaned text is shorter than two characters.
    """
    if text is None:
        return None
    # Transliterate before anything else so the later steps see ASCII text.
    latin = ascii_text(text)
    # Drop the removable characters entirely, then lowercase.
    stripped = CHARACTERS_REMOVE_RE.sub("", latin).lower()
    normalized = squash_spaces(category_replace(stripped))
    return normalized if len(normalized) >= 2 else None


@lru_cache(maxsize=2000)
def clean_name_light(text: str) -> Optional[str]:
    """Normalize a name for comparison while keeping its original script.

    Removes the characters matched by ``CHARACTERS_REMOVE_RE``, lowercases
    the text, applies character category replacement and squashes repeated
    spaces. No ASCII/Latin transliteration is performed. Returns None when
    the result is shorter than two characters. Results are memoized for up
    to 2000 distinct inputs.
    """
    # Drop the removable characters entirely, then lowercase.
    lowered = CHARACTERS_REMOVE_RE.sub("", text).lower()
    normalized = squash_spaces(category_replace(lowered))
    if len(normalized) >= 2:
        return normalized
    return None