1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
"""
Used to regenerate character tables in ftfy/chardata.py with explanatory comments.
"""
import unicodedata
from dataclasses import dataclass
from ftfy.chardata import UTF8_CLUES
@dataclass
class CharData:
    """A character together with the bytes it maps to in the supported encodings."""

    # Unicode name of the character (show_char_table stores "<unknown>" when
    # unicodedata has no name for it).
    name: str
    # Unicode code point of the character.
    codept: int
    # (encoding name, byte value) pairs, one per encoding that can represent
    # the character.
    encodings: list[tuple[str, int]]

    def sort_key(self) -> tuple[int, str, int]:
        """
        Return the ordering key used when printing a table.

        Characters whose name starts with "LATIN " form the first group and
        are ordered by name, then code point. Every other character goes in
        a second group ordered by code point alone.
        """
        is_latin = self.name.startswith("LATIN ")
        group = 0 if is_latin else 1
        name_component = self.name if is_latin else ""
        return (group, name_component, self.codept)
# Encodings that show_char_table tries for every character. Order matters:
# the explanatory comment printed for a character names the first encoding in
# this list whose byte falls inside the requested range.
SAFE_ENCODINGS: list[str] = (
    "latin-1 "
    "windows-1252 windows-1251 windows-1250 "
    "windows-1253 windows-1254 windows-1257"
).split()
# Placeholder stored as the name of a character that unicodedata cannot name.
_UNKNOWN_NAME = "<unknown>"


def _char_escape(name: str, codept: int) -> str:
    """Return the escape sequence that spells the character in a string literal.

    Named characters use the ``\\N{NAME}`` form. A character without a name
    cannot be written that way, so it falls back to a numeric escape sized to
    fit the code point.
    """
    if name != _UNKNOWN_NAME:
        return f"\\N{{{name}}}"
    if codept <= 0xFF:
        return f"\\x{codept:02x}"
    if codept <= 0xFFFF:
        return f"\\u{codept:04x}"
    return f"\\U{codept:08x}"


def show_char_table(chars: str, byte_min: int = 0, byte_max: int = 0xFF) -> None:
    """
    Print one commented string-literal line for each character in `chars`.

    Each character is encoded with every entry of SAFE_ENCODINGS, and the
    first byte of each successful encoding is recorded. A character that none
    of those encodings can represent is reported with a "No relevant encoding"
    line and left out of the table. The remaining characters are printed in
    `CharData.sort_key` order.

    The trailing comment on each line is `encoding:HEXBYTE` for the first
    encoding whose byte lies in the inclusive range `byte_min`..`byte_max`,
    or `???` when no recorded byte lies in that range.

    Args:
        chars: The characters to tabulate.
        byte_min: Lowest byte value (inclusive) accepted for the comment.
        byte_max: Highest byte value (inclusive) accepted for the comment.
    """
    char_data: list[CharData] = []
    for char in chars:
        name = unicodedata.name(char, _UNKNOWN_NAME)
        codept = ord(char)
        encodings: list[tuple[str, int]] = []
        for encoding in SAFE_ENCODINGS:
            try:
                encoded: bytes = char.encode(encoding)
            except UnicodeEncodeError:
                # This encoding has no byte for the character; try the next.
                continue
            encodings.append((encoding, encoded[0]))

        if encodings:
            char_data.append(CharData(name=name, codept=codept, encodings=encodings))
        else:
            print(f"No relevant encoding for {codept=}, {name=}")

    char_data.sort(key=CharData.sort_key)
    for cd in char_data:
        # Only the first in-range encoding is shown, so stop at the first hit.
        encoding_explanation = next(
            (
                f"{encoding}:{byte:X}"
                for encoding, byte in cd.encodings
                if byte_min <= byte <= byte_max
            ),
            "???",
        )
        print(f' "{_char_escape(cd.name, cd.codept)}" # {encoding_explanation}')
def run() -> None:
    """
    Print a commented character table for each UTF8_CLUES entry handled here.

    The three "first of N" tables are limited to the byte range given next to
    their key. The two continuation tables use the range 0x80..0xBF.
    """
    lead_byte_tables = (
        ("utf8_first_of_2", 0xC2, 0xDF),
        ("utf8_first_of_3", 0xE0, 0xEF),
        ("utf8_first_of_4", 0xF0, 0xF3),
    )
    for key, low, high in lead_byte_tables:
        print(f"# {key}")
        show_char_table(UTF8_CLUES[key], low, high)

    for key in ("utf8_continuation", "utf8_continuation_strict"):
        print(f"# {key}")
        # The range is printed literally and the first three characters of the
        # clue string are skipped. Presumably those three characters are this
        # same range -- confirm against ftfy.chardata.
        print(r' "\x80-\xbf"')
        show_char_table(UTF8_CLUES[key][3:], 0x80, 0xBF)


if __name__ == "__main__":
    run()
|