File: wchar_helper.py

package info (click to toggle)
pypy3 7.0.0%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 111,848 kB
  • sloc: python: 1,291,746; ansic: 74,281; asm: 5,187; cpp: 3,017; sh: 2,533; makefile: 544; xml: 243; lisp: 45; csh: 21; awk: 4
file content (192 lines) | stat: -rw-r--r-- 6,398 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
from rpython.rtyper.annlowlevel import llunicode
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw

# Size of lltype.UniChar as reported by rffi.sizeof().  The helpers in this
# module compare it against 2 and 4 to select their implementation.
SIZE_UNICODE = rffi.sizeof(lltype.UniChar)


if SIZE_UNICODE != 4:
    def ordinal_to_unicode(ordinal):    # 'ordinal' is a r_uint
        """Return the unicode string that stands for 'ordinal'.

        Ordinals up to 0xFFFF give a one-character string, ordinals up
        to 0x10FFFF give a two-character surrogate pair, and anything
        larger raises OutOfRange.
        """
        if ordinal > 0x10ffff:
            raise OutOfRange(ordinal)
        if ordinal <= 0xffff:
            return unichr(intmask(ordinal))
        # The offset above 0x10000 fits in 20 bits: the upper 10 go into
        # the first character, the lower 10 into the second one.
        offset = intmask(ordinal - 0x10000)
        lead = unichr(0xD800 | (offset >> 10))
        trail = unichr(0xDC00 | (offset & 0x3FF))
        return lead + trail
else:
    def ordinal_to_unicode(ordinal):    # 'ordinal' is a r_uint
        """Return unichr() of 'ordinal'; no range check is done here."""
        return unichr(intmask(ordinal))

def is_surrogate(u, index):
    """Tell whether u[index] and u[index + 1] form a surrogate pair.

    u[index] must lie in 0xD800..0xDBFF and u[index + 1] in
    0xDC00..0xDFFF.  u[index + 1] is only read when the first test
    succeeds; no bounds check is done here.
    """
    lead = u[index]
    if lead < unichr(0xD800) or lead > unichr(0xDBFF):
        return False
    trail = u[index + 1]
    return unichr(0xDC00) <= trail <= unichr(0xDFFF)

def as_surrogate(u, index):
    """Combine the pair u[index], u[index + 1] into one ordinal (r_uint).

    The two characters are not validated here; see is_surrogate().
    """
    high_bits = ord(u[index]) - 0xD800
    low_bits = ord(u[index + 1]) - 0xDC00
    return r_uint(((high_bits << 10) | low_bits) + 0x10000)

def unicode_to_ordinal(u):
    """Return the ordinal (as a r_uint) of the single character in 'u'.

    'u' must have length 1 or, when SIZE_UNICODE == 2, be exactly one
    surrogate pair.  Any other input raises ValueError.
    """
    length = len(u)
    if length == 1:
        return r_uint(ord(u[0]))
    if SIZE_UNICODE == 2 and length == 2 and is_surrogate(u, 0):
        return r_uint(as_surrogate(u, 0))
    raise ValueError


class OutOfRange(Exception):
    """Raised in this module for ordinals above 0x10FFFF.

    The offending value is kept in the 'ordinal' attribute, after being
    cast to rffi.INT and converted with intmask().
    """
    # class-level default for the attribute set in __init__
    ordinal = 0

    def __init__(self, ordinal):
        as_c_int = rffi.cast(rffi.INT, ordinal)
        self.ordinal = intmask(as_c_int)

def _unicode_from_wchar(ptr, length):
    """Build a unicode string from 'length' items at 'ptr', which is
    cast to rffi.CWCHARP and handed to rffi.wcharpsize2unicode()."""
    wchar_ptr = rffi.cast(rffi.CWCHARP, ptr)
    return rffi.wcharpsize2unicode(wchar_ptr, length)


if SIZE_UNICODE == 2:
    def unicode_from_char32(ptr, length):
        """Convert 'length' 32-bit integers at 'ptr' to a unicode string.

        Values above 0xFFFF are written as a surrogate pair; values
        above 0x10FFFF raise OutOfRange.
        """
        source = rffi.cast(rffi.UINTP, ptr)

        # First pass: every value above 0xFFFF needs one extra character.
        needed = length
        pos = 0
        while pos < length:
            if rffi.cast(lltype.Unsigned, source[pos]) > 0xFFFF:
                needed += 1
            pos += 1

        # Second pass: fill the preallocated list of characters.
        chars = [u'\x00'] * needed
        out = 0
        pos = 0
        while pos < length:
            code = rffi.cast(lltype.Unsigned, source[pos])
            pos += 1
            if code <= 0xFFFF:
                chars[out] = unichr(intmask(code))
                out += 1
                continue
            if code > 0x10FFFF:
                raise OutOfRange(code)
            offset = intmask(code - 0x10000)
            chars[out] = unichr(0xD800 | (offset >> 10))
            chars[out + 1] = unichr(0xDC00 | (offset & 0x3FF))
            out += 2
        assert out == len(chars)
        return u''.join(chars)

    # 16-bit items have the size of a UniChar here: plain wchar copy.
    unicode_from_char16 = _unicode_from_wchar

else:
    # 32-bit items have the size of a UniChar here: plain wchar copy.
    unicode_from_char32 = _unicode_from_wchar

    def unicode_from_char16(ptr, length):
        """Convert 'length' 16-bit integers at 'ptr' to a unicode string.

        A value in 0xD800..0xDBFF directly followed by a value in
        0xDC00..0xDFFF is merged into a single character; every other
        value is copied as is.
        """
        units = rffi.cast(rffi.USHORTP, ptr)
        # 'length' is an upper bound; the unused tail is removed below.
        chars = [u'\x00'] * length
        written = 0
        pos = 0
        while pos < length:
            code = intmask(units[pos])
            pos += 1
            if pos < length and 0xD800 <= code <= 0xDBFF:
                trail = intmask(units[pos])
                if 0xDC00 <= trail <= 0xDFFF:
                    code = 0x10000 + (((code & 0x3FF) << 10) |
                                      (trail & 0x3FF))
                    pos += 1
            chars[written] = unichr(code)
            written += 1
        del chars[written:]
        return u''.join(chars)


@specialize.ll()
def _measure_length(ptr, maxlen):
    """Count the items of 'ptr' that come before the first zero item.

    A negative 'maxlen' means no limit.  Otherwise at most 'maxlen'
    items are inspected and the result never exceeds 'maxlen'.
    """
    count = 0
    while maxlen < 0 or count < maxlen:
        if intmask(ptr[count]) == 0:
            break
        count += 1
    return count

def measure_length_16(ptr, maxlen=-1):
    """_measure_length() applied to 'ptr' viewed as rffi.USHORTP."""
    units = rffi.cast(rffi.USHORTP, ptr)
    return _measure_length(units, maxlen)

def measure_length_32(ptr, maxlen=-1):
    """_measure_length() applied to 'ptr' viewed as rffi.UINTP."""
    units = rffi.cast(rffi.UINTP, ptr)
    return _measure_length(units, maxlen)


def unicode_size_as_char16(u):
    """Return the number of 16-bit items needed to store 'u'.

    This is len(u), plus one for each character above 0xFFFF when
    SIZE_UNICODE == 4.
    """
    size = len(u)
    if SIZE_UNICODE != 4:
        return size
    for ch in u:
        if ord(ch) > 0xFFFF:
            size += 1
    return size

def unicode_size_as_char32(u):
    """Return the number of 32-bit items needed to store 'u'.

    This is len(u), minus one for each surrogate pair found in 'u' when
    SIZE_UNICODE == 2.
    """
    length = len(u)
    if SIZE_UNICODE != 2 or length <= 1:
        return length
    pairs = 0
    index = 0
    last = length - 1      # is_surrogate() also reads u[index + 1]
    while index < last:
        if is_surrogate(u, index):
            pairs += 1
        index += 1
    return length - pairs


def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
    """Copy 'u' into the raw buffer of wchars at 'target_ptr'.

    'target_length' is assumed to be equal to len(u).  If
    'add_final_zero' is true, one more u'\\x00' item is written at
    position 'target_length'.
    """
    dest = rffi.cast(rffi.CWCHARP, target_ptr)
    copy_unicode_to_raw(llunicode(u), dest, 0, target_length)
    if not add_final_zero:
        return
    dest[target_length] = u'\x00'


if SIZE_UNICODE == 2:
    def unicode_to_char32(u, target_ptr, target_length, add_final_zero):
        """Write 'u' as 'target_length' 32-bit integers at 'target_ptr'.

        'target_length' is assumed to be unicode_size_as_char32(u).
        Each surrogate pair of 'u' becomes one item.  If
        'add_final_zero' is true, a zero item is written at position
        'target_length'.
        """
        dest = rffi.cast(rffi.UINTP, target_ptr)
        src_len = len(u)
        read_pos = 0
        write_pos = 0
        while write_pos < target_length:
            # a pair needs two characters, so only test for one when
            # read_pos + 1 is still inside 'u'
            if read_pos + 1 < src_len and is_surrogate(u, read_pos):
                code = as_surrogate(u, read_pos)
                read_pos += 2
            else:
                code = r_uint(ord(u[read_pos]))
                read_pos += 1
            dest[write_pos] = rffi.cast(rffi.UINT, code)
            write_pos += 1
        if add_final_zero:
            dest[target_length] = rffi.cast(rffi.UINT, 0)

    # 16-bit items have the size of a UniChar here: plain wchar copy.
    unicode_to_char16 = _unicode_to_wchar

else:
    # 32-bit items have the size of a UniChar here: plain wchar copy.
    unicode_to_char32 = _unicode_to_wchar

    def unicode_to_char16(u, target_ptr, target_length, add_final_zero):
        """Write 'u' as 'target_length' 16-bit integers at 'target_ptr'.

        'target_length' is assumed to be unicode_size_as_char16(u).
        Characters above 0xFFFF are written as two items (a surrogate
        pair); characters above 0x10FFFF raise OutOfRange.  If
        'add_final_zero' is true, a zero item follows the last one
        written.
        """
        dest = rffi.cast(rffi.USHORTP, target_ptr)
        pos = 0
        for uc in u:
            code = ord(uc)
            if code <= 0xFFFF:
                dest[pos] = rffi.cast(rffi.USHORT, code)
                pos += 1
                continue
            if code > 0x10FFFF:
                raise OutOfRange(code)
            offset = code - 0x10000
            dest[pos] = rffi.cast(rffi.USHORT, 0xD800 | (offset >> 10))
            dest[pos + 1] = rffi.cast(rffi.USHORT,
                                      0xDC00 | (offset & 0x3FF))
            pos += 2
        # the number of items written must match what the caller sized
        assert pos == target_length
        if add_final_zero:
            dest[pos] = rffi.cast(rffi.USHORT, 0)