File: ucd.py

package info (click to toggle)
libxkbcommon 1.12.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 8,184 kB
  • sloc: ansic: 57,023; xml: 8,785; python: 7,449; yacc: 913; sh: 253; makefile: 23
file content (200 lines) | stat: -rwxr-xr-x 6,296 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python3

# Copyright © 2025 Pierre Le Marre <dev@wismill.eu>
# SPDX-License-Identifier: MIT

"""
Utils to parse the Unicode database files
"""

from collections.abc import Callable, Iterator
from dataclasses import dataclass
from pathlib import Path
import sys
from typing import ClassVar, Self


def parse_code_point(raw: str) -> int | None:
    return None if not raw else int(raw, 16)


@dataclass
class CodePointRange:
    start: int
    end: int

    def __iter__(self) -> Iterator[int]:
        yield from range(self.start, self.end + 1)

    @classmethod
    def parse(cls, raw: str) -> Self:
        start, *end = raw.strip().split("..")
        return cls(
            start=int(start, 16), end=int(start, 16) if not end else int(end[0], 16)
        )


@dataclass
class PropertyEntry:
    """A single code point paired with the name of one of its properties."""

    code_point: int
    property: str

    @classmethod
    def parse_file(
        cls, path: Path, filter: Callable[[str], bool] | None = None
    ) -> Iterator[Self]:
        """Yield one entry per code point listed in the property file *path*.

        Each data line is read as ``<range> ; <property> [; ...]``: text from
        the first ``#`` onwards is dropped, blank lines are skipped, and fields
        after the second one are ignored. The range is expanded so that every
        code point it contains produces its own entry.

        When *filter* is given, only lines whose property name it accepts
        produce entries.
        """
        with path.open("rt", encoding="utf-8") as fd:
            for raw_line in fd:
                # Keep only what precedes the comment marker
                content = raw_line.split("#", 1)[0].strip()
                if not content:
                    continue
                raw_range, raw_name, *_ = content.split(";")
                # The range is parsed before filtering, so a malformed range
                # raises even on lines the filter would reject
                code_points = CodePointRange.parse(raw_range)
                name = raw_name.strip()
                if not filter or filter(name):
                    for code_point in code_points:
                        yield cls(code_point=code_point, property=name)


@dataclass
class UnicodeDataEntry:
    """Subset of one ``;``-separated record: category and case mappings."""

    code_point: int
    general_category: str
    lower_case: int | None
    upper_case: int | None
    title_case: int | None

    @classmethod
    def parse_file(cls, path: Path) -> Iterator[Self]:
        """Yield one entry for each data line of the file *path*.

        Blank lines and lines starting with ``#`` are skipped. A line with
        fewer than 15 fields raises ``ValueError``.

        NOTE(review): every line produces exactly one entry; if the file
        encodes a block of code points as a pair of first/last lines, the code
        points in between are not generated — confirm callers do not need them.
        """
        with path.open("rt", encoding="utf-8") as fd:
            for raw_line in fd:
                record = raw_line.strip()
                if record.startswith("#") or not record:
                    continue
                fields = record.split(";")
                # Fields 12 to 14 are the upper, lower and title case
                # mappings. Unpacking the slice first rejects short lines.
                upper, lower, title = fields[12:15]
                yield cls(
                    code_point=int(fields[0], 16),
                    general_category=fields[2],
                    lower_case=parse_code_point(lower),
                    upper_case=parse_code_point(upper),
                    title_case=parse_code_point(title),
                )


@dataclass
class DB:
    lower_case: set[int]
    upper_case: set[int]
    title_case: set[int]
    lower_case_mappings: dict[int, int]
    upper_case_mappings: dict[int, int]
    title_case_mappings: dict[int, int]

    case_properties: ClassVar[frozenset[str]] = frozenset(("Lowercase", "Uppercase"))

    @classmethod
    def filter_case_properties(cls, property: str) -> bool:
        return property in cls.case_properties

    @classmethod
    def parse_ucd(cls, path: Path) -> Self:
        lower_case: set[int] = set()
        upper_case: set[int] = set()
        title_case: set[int] = set()

        lower_case_mappings: dict[int, int] = {}
        upper_case_mappings: dict[int, int] = {}
        title_case_mappings: dict[int, int] = {}

        for entry in UnicodeDataEntry.parse_file(path / "UnicodeData.txt"):
            if entry.general_category == "Lt":
                title_case.add(entry.code_point)
            if entry.lower_case is not None:
                lower_case_mappings[entry.code_point] = entry.lower_case
            if entry.upper_case is not None:
                upper_case_mappings[entry.code_point] = entry.upper_case
            if entry.title_case is not None:
                title_case_mappings[entry.code_point] = entry.title_case

        for entry in PropertyEntry.parse_file(
            path / "DerivedCoreProperties.txt", filter=cls.filter_case_properties
        ):
            match entry.property:
                case "Lowercase":
                    lower_case.add(entry.code_point)
                case "Uppercase":
                    upper_case.add(entry.code_point)
                case _:
                    raise ValueError(entry)

        return cls(
            lower_case=lower_case,
            upper_case=upper_case,
            title_case=title_case,
            lower_case_mappings=lower_case_mappings,
            upper_case_mappings=upper_case_mappings,
            title_case_mappings=title_case_mappings,
        )

    def isULowercase(self, cp: int) -> bool:
        return cp in self.lower_case

    def isUUppercase(self, cp: int) -> bool:
        return cp in self.upper_case

    def istitle(self, cp: int) -> bool:
        return cp in self.title_case

    def tolower(self, cp_or_char: int | str) -> int | str:
        cp = cp_or_char if isinstance(cp_or_char, int) else ord(cp_or_char)
        mapping = self.lower_case_mappings.get(cp, cp)
        return mapping if isinstance(cp_or_char, int) else chr(mapping)

    def toupper(self, cp_or_char: int | str) -> int | str:
        cp = cp_or_char if isinstance(cp_or_char, int) else ord(cp_or_char)
        mapping = self.upper_case_mappings.get(cp, cp)
        return mapping if isinstance(cp_or_char, int) else chr(mapping)


if __name__ == "__main__":
    # Self-test: check the parsed database against ICU for every code point.
    # Needs the third-party PyICU package; the directory containing the UCD
    # files is taken from the first command-line argument.
    import icu

    c = icu.Locale.createFromName("C")
    icu.Locale.setDefault(c)

    path = Path(sys.argv[1])
    db = DB.parse_ucd(path)

    # PyICU exposes the character functions as static methods of `icu.Char`
    # (there is no `icu.DB`); the `DB` methods above use the same names.
    # Each assertion carries the code point and both values to ease debugging.
    for cp in range(0, 0x10FFFF + 1):
        assert db.isULowercase(cp) == icu.Char.isULowercase(cp), (
            cp,
            db.isULowercase(cp),
            icu.Char.isULowercase(cp),
        )
        assert db.isUUppercase(cp) == icu.Char.isUUppercase(cp), (
            cp,
            db.isUUppercase(cp),
            icu.Char.isUUppercase(cp),
        )
        assert db.istitle(cp) == icu.Char.istitle(cp), (
            cp,
            db.istitle(cp),
            icu.Char.istitle(cp),
        )
        assert db.tolower(cp) == icu.Char.tolower(cp), (
            cp,
            db.tolower(cp),
            icu.Char.tolower(cp),
        )
        assert db.toupper(cp) == icu.Char.toupper(cp), (
            cp,
            db.toupper(cp),
            icu.Char.toupper(cp),
        )