File: codepointset.py

package info (click to toggle)
python-precis-i18n 1.1.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,916 kB
  • sloc: python: 1,824; sh: 28; makefile: 3
file content (182 lines) | stat: -rw-r--r-- 4,937 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""Implements the CodepointSet class."""

import io
import re
from bisect import bisect_left


class CodepointSet:
    """Concrete class for an immutable set of Unicode code points.

    Inclusive ranges [a, b] are stored as adjacent unicode characters in a
    string. The low end of a range has an even index. The high end is at an
    odd index. Singleton ranges are stored as [c, c].

    To test if a code point is in the set, search for the code point. If it
    matches a char in the string, the code point is in the set. If not, find
    the index of the next largest char. If this index is odd, the code point is
    in the set.

    This class is constructed from a multi-line string containing a sequence of
    codepoints and codepoint ranges.

      HHHH
      HHHH..HHHH

    H is a hexadecimal digit. Comment lines begin with '#'. Blank lines are
    ignored.

    Note: Sets with any non-BMP codepoints will use 32-bits for all codepoints.
    (PEP 393 Flexible String Representation)

    Args:
        table (str): Multi-line string of code point ranges.
    """

    def __init__(self, table):
        self._table = _stringify(_coalesce(_parse(table)))
        assert (len(self._table) % 2) == 0

    def __contains__(self, cp):
        """Check if code point `cp` is in the set.

        Args:
            cp (int): Code point.

        Returns:
            bool: True if `cp` is in the set.
        """
        if not 0 <= cp <= 0x10FFFF:
            return False
        char = chr(cp)
        idx = bisect_left(self._table, char)
        if idx >= len(self._table):
            return False
        return (idx % 2) == 1 or self._table[idx] == char

    def __len__(self):
        """Return size of set.

        Used for debugging only.

        Returns:
            int: Count of total code points.
        """
        count = 0
        for lo, hi in self.items():
            count += hi - lo + 1
        return count

    def __eq__(self, rhs):
        """Check if set is equal to other set.

        Args:
            rhs (CodepointSet): Other set.

        Returns:
            bool: True if sets are equal.
        """
        # pylint: disable=protected-access
        if self.__class__ != rhs.__class__:
            return False
        return self._table == rhs._table

    def __repr__(self):
        """Return string representation of set.

        Example:
            "CodepointSet('0000\n0010..00FF')"
        """
        elems = "\\n".join(_repr(elem) for elem in self.items())
        return "CodepointSet('%s')" % elems

    def items(self):
        """Generator yielding sequence of range tuples (lo, hi)."""
        for i in range(len(self._table) // 2):
            lo = ord(self._table[2 * i])
            hi = ord(self._table[2 * i + 1])
            yield (lo, hi)


def _parse(table):
    """Parse a multi-line string containing codepoint ranges.

    Args:
        table (str): Multi-line string of code point ranges.

    Returns:
        list: List of 2-tuples (lo, hi) representing code point ranges.

    Raises:
        ValueError: Error while parsing `table`.
    """
    codepoint = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?$")
    elems = []
    for line in io.StringIO(table):
        line = line.strip()
        m = codepoint.match(line)
        if not m:
            if line and line[0] != "#":
                raise ValueError("Unable to parse line: %s" % line)
            continue
        lo = int(m.group(1), 16)
        hi = int(m.group(2), 16) if m.group(2) else lo
        if lo > hi:
            raise ValueError("Invalid range (lo > hi): %s" % line)
        elems.append((lo, hi))
    return elems


def _coalesce(elems):
    """Sort and coalesce adjacent ranges (in-place).

    Args:
        elems (list): List of 2-tuples (lo, hi).

    Returns:
        list: List `elems` after sorting in-place and coalescing ranges.

    Raises:
        ValueError: Overlapping ranges.
    """
    elems.sort()
    i = 0
    while i < len(elems) - 1:
        (lo0, hi0), (lo1, hi1) = elems[i : i + 2]
        if not lo0 <= hi0 < lo1 <= hi1:
            raise ValueError("Range overlaps at index %d: %r" % (i, elems[i : i + 2]))
        if lo1 == hi0 + 1:
            elems[i : i + 2] = [(lo0, hi1)]
        else:
            i += 1
    return elems


def _stringify(elems):
    """Convert a sequence of ranges into a string.

    Args:
        elems (list): List of 2-tuples representing ranges.

    Returns:
        str: String with lo..hi ranges concatenated.
    """
    return "".join(chr(lo) + chr(hi) for (lo, hi) in elems)


def _repr(elem):
    """Return string representation for tuple (lo, hi).

    Examples:
        "HHHH"
        "HHHH..HHHH"

    Args:
        elem (tuple): 2-tuple (lo, hi)

    Returns:
        str: String representation of range tuple.
    """
    if elem[0] == elem[1]:
        return "%04X" % elem[0]
    return "%04X..%04X" % elem