File: py_ucs4_type.pyx

package info (click to toggle)
cython 3.0.11%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 19,092 kB
  • sloc: python: 83,539; ansic: 18,831; cpp: 1,402; xml: 1,031; javascript: 511; makefile: 403; sh: 204; sed: 11
file content (411 lines) | stat: -rw-r--r-- 10,741 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# -*- coding: iso-8859-1 -*-
# mode: run
# tag: warnings


cimport cython

cdef Py_UCS4 char_ASCII = u'A'  # C-level Py_UCS4 initialised from an ASCII literal
cdef Py_UCS4 char_KLINGON = u'\uF8D2'  # C-level Py_UCS4 initialised from a non-ASCII literal (0xF8D2 == 63698)
# Module-level Python names bound to the two C values; the doctests pass them as arguments.
u_A = char_ASCII
u_KLINGON = char_KLINGON


def compare_ASCII():
    """Print the results of comparing char_ASCII with three unicode literals.
    >>> compare_ASCII()
    True
    False
    False
    """
    print(char_ASCII == u'A')  # same character
    print(char_ASCII == u'B')  # different ASCII character
    print(char_ASCII == u'\uF8D2')  # non-ASCII character


def compare_klingon():
    """Print the results of comparing char_KLINGON with three unicode literals.
    >>> compare_klingon()
    True
    False
    False
    """
    print(char_KLINGON == u'\uF8D2')  # same character
    print(char_KLINGON == u'A')  # ASCII character
    print(char_KLINGON == u'B')  # another ASCII character


def single_uchar_compare():
    """Assert the relative ordering of two adjacent one-character literals; prints nothing.
    >>> single_uchar_compare()
    """
    assert u'\u0100' < u'\u0101'  # less-than direction
    assert u'\u0101' > u'\u0100'  # greater-than direction, operands swapped


from cpython.unicode cimport PyUnicode_FromOrdinal
import sys

u0 = u'\x00'  # one-character string, code point 0
u1 = u'\x01'  # one-character string, code point 1
umax = PyUnicode_FromOrdinal(sys.maxunicode)  # string built from the ordinal sys.maxunicode

def unicode_ordinal(Py_UCS4 i):
    """Accept a value as a Py_UCS4 argument and return it unchanged (argument conversion test).
    >>> ord(unicode_ordinal(0)) == 0
    True
    >>> ord(unicode_ordinal(1)) == 1
    True
    >>> ord(unicode_ordinal(sys.maxunicode)) == sys.maxunicode
    True

    >>> ord(unicode_ordinal(u0)) == 0
    True
    >>> ord(unicode_ordinal(u1)) == 1
    True
    >>> ord(unicode_ordinal(umax)) == sys.maxunicode
    True

    Value too small:
    >>> unicode_ordinal(-1) #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    OverflowError: ...

    Value too large:
    >>> unicode_ordinal(1114111+1) #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    OverflowError: ...

    Less than one character:
    >>> unicode_ordinal(u0[:0])
    Traceback (most recent call last):
    ...
    ValueError: only single character unicode strings can be converted to Py_UCS4, got length 0

    More than one character:
    >>> unicode_ordinal(u0+u1)
    Traceback (most recent call last):
    ...
    ValueError: only single character unicode strings can be converted to Py_UCS4, got length 2
    """
    return i  # all checks exercised by the doctests happen while converting the argument


def ord_py_ucs4(Py_UCS4 x):
    """Return ord() applied to a Py_UCS4 argument.
    >>> ord_py_ucs4(u0)
    0
    >>> ord_py_ucs4(u_A)
    65
    >>> ord_py_ucs4(u_KLINGON)
    63698
    """
    return ord(x)  # the doctests expect the integer code point


@cython.test_assert_path_exists('//PythonCapiCallNode')  # tree must contain a PythonCapiCallNode
@cython.test_fail_if_path_exists('//SimpleCallNode')  # tree must not contain a SimpleCallNode
def unicode_type_methods(Py_UCS4 uchar):
    """Return a list with the results of the is*() predicates called on a Py_UCS4 value.
    >>> unicode_type_methods(ord('A'))
    [True, True, False, False, False, False, False, True, True]
    >>> unicode_type_methods(ord('a'))
    [True, True, False, False, True, False, False, False, False]
    >>> unicode_type_methods(ord('8'))
    [True, False, True, True, False, True, False, False, False]
    >>> unicode_type_methods(ord('\\t'))
    [False, False, False, False, False, False, True, False, False]
    """
    return [
        # character types
        uchar.isalnum(),
        uchar.isalpha(),
        uchar.isdecimal(),
        uchar.isdigit(),
        uchar.islower(),
        uchar.isnumeric(),
        uchar.isspace(),
        uchar.istitle(),
        uchar.isupper(),
        ]

#@cython.test_assert_path_exists('//PythonCapiCallNode')
#@cython.test_fail_if_path_exists('//SimpleCallNode')
def unicode_methods(Py_UCS4 uchar):
    """Return [lower(), upper(), title()] of a Py_UCS4 value; tree assertions above are disabled.
    >>> unicode_methods(ord('A')) == ['a', 'A', 'A'] or unicode_methods(ord('A'))
    True
    >>> unicode_methods(ord('a')) == ['a', 'A', 'A'] or unicode_methods(ord('a'))
    True
    >>> unicode_methods(0x1E9E) == [u'\\xdf', u'\\u1e9e', u'\\u1e9e'] or unicode_methods(0x1E9E)
    True
    >>> unicode_methods(0x0130) in (
    ...     [u'i\\u0307', u'\\u0130', u'\\u0130'],  # Py3
    ...     [u'i', u'\\u0130', u'\\u0130'],  # Py2
    ... ) or unicode_methods(0x0130)
    True
    """
    # \u1E9E == 'LATIN CAPITAL LETTER SHARP S'
    # \u0130 == 'LATIN CAPITAL LETTER I WITH DOT ABOVE'
    return [
        # character conversion
        uchar.lower(),
        uchar.upper(),
        uchar.title(),
        ]


#@cython.test_assert_path_exists('//PythonCapiCallNode')
#@cython.test_fail_if_path_exists(
#    '//SimpleCallNode',
#    '//CoerceFromPyTypeNode',
#)
def unicode_method_return_type(Py_UCS4 uchar):
    """Store upper()/lower() results in Py_UCS4 variables and report which one equals the input.
    >>> unicode_method_return_type(ord('A'))
    [True, False]
    >>> unicode_method_return_type(ord('a'))
    [False, True]
    """
    cdef Py_UCS4 uc, ul  # conversion results are assigned back to C-level Py_UCS4 variables
    uc, ul = uchar.upper(), uchar.lower()
    return [uc == uchar, ul == uchar]  # [input is its own upper case, input is its own lower case]


@cython.test_assert_path_exists('//IntNode')  # tree must contain an IntNode
@cython.test_fail_if_path_exists('//SimpleCallNode',  # and must not contain either call node type
                                 '//PythonCapiCallNode')
def len_uchar(Py_UCS4 uchar):
    """Return len() of a Py_UCS4 value; the doctest expects 1.
    >>> len_uchar(ord('A'))
    1
    """
    return len(uchar)  # NOTE(review): the tree assertions suggest this is resolved at compile time - confirm

def index_uchar(Py_UCS4 uchar, Py_ssize_t i):
    """Index a Py_UCS4 value with 0, -1 and the runtime index i, returning the three results.
    >>> index_uchar(ord('A'), 0) == ('A', 'A', 'A')
    True
    >>> index_uchar(ord('A'), -1) == ('A', 'A', 'A')
    True
    >>> index_uchar(ord('A'), 1)
    Traceback (most recent call last):
    IndexError: string index out of range
    """
    return uchar[0], uchar[-1], uchar[i]  # per the doctests, only i outside 0/-1 raises IndexError

mixed_ustring = u'AbcDefGhIjKlmnoP'  # 16 characters: 6 upper case, 10 lower case
lower_ustring = mixed_ustring.lower()  # the same 16 characters, all lower case
upper_ustring = mixed_ustring.upper()  # all upper case (was mistakenly built with .lower())

@cython.test_assert_path_exists('//PythonCapiCallNode',  # both node types must be present
                                '//ForFromStatNode')
@cython.test_fail_if_path_exists('//SimpleCallNode',  # neither node type may be present
                                 '//ForInStatNode')
def count_lower_case_characters(unicode ustring):
    """Count the characters of ustring for which islower() is true.
    >>> count_lower_case_characters(mixed_ustring)
    10
    >>> count_lower_case_characters(lower_ustring)
    16
    """
    cdef Py_ssize_t count = 0
    for uchar in ustring:  # loop variable is left untyped on purpose; the loop form is asserted above
         if uchar.islower():
             count += 1
    return count

@cython.test_assert_path_exists('//PythonCapiCallNode',  # both node types must be present
                                '//ForFromStatNode')
@cython.test_fail_if_path_exists('//SimpleCallNode',  # neither node type may be present
                                 '//ForInStatNode')
def count_lower_case_characters_slice(unicode ustring):
    """Count the lower-case characters of ustring[1:-1] (first and last character skipped).
    >>> count_lower_case_characters_slice(mixed_ustring)
    10
    >>> count_lower_case_characters_slice(lower_ustring)
    14
    >>> sum([ 1 for uchar in lower_ustring[1:-1] if uchar.islower() ])
    14
    """
    cdef Py_ssize_t count = 0
    for uchar in ustring[1:-1]:  # iterating directly over a slice is the case under test
         if uchar.islower():
             count += 1
    return count

@cython.test_assert_path_exists('//PythonCapiCallNode',  # both node types must be present
                                '//ForFromStatNode')
@cython.test_fail_if_path_exists('//SimpleCallNode',  # neither node type may be present
                                 '//ForInStatNode')
def count_lower_case_characters_slice_reversed(unicode ustring):
    """Count the lower-case characters of ustring[-2:0:-1] (reverse walk, both ends skipped).
    >>> count_lower_case_characters_slice_reversed(mixed_ustring)
    10
    >>> count_lower_case_characters_slice_reversed(lower_ustring)
    14
    >>> sum([ 1 for uchar in lower_ustring[-2:0:-1] if uchar.islower() ])
    14
    """
    cdef Py_ssize_t count = 0
    for uchar in ustring[-2:0:-1]:  # negative-step slice is the case under test
         if uchar.islower():
             count += 1
    return count

def loop_object_over_latin1_unicode_literal():
    """Iterate a unicode literal ending in 0xD7 with an object-typed loop variable and rejoin it.
    >>> result = loop_object_over_latin1_unicode_literal()
    >>> print(result[:-1])
    abcdefg
    >>> ord(result[-1]) == 0xD7
    True
    """
    cdef object uchar  # explicitly typed as a Python object
    chars = []
    for uchar in u'abcdefg\xD7':
        chars.append(uchar)
    return u''.join(chars)  # join succeeds only if every item is a string

def loop_object_over_unicode_literal():
    """Iterate a unicode literal ending in 0xF8FD with an object-typed loop variable and rejoin it.
    >>> result = loop_object_over_unicode_literal()
    >>> print(result[:-1])
    abcdefg
    >>> ord(result[-1]) == 0xF8FD
    True
    """
    cdef object uchar  # explicitly typed as a Python object
    chars = []
    for uchar in u'abcdefg\uF8FD':
        chars.append(uchar)
    return u''.join(chars)  # join succeeds only if every item is a string

@cython.test_assert_path_exists('//SwitchStatNode')  # tree must contain a SwitchStatNode
@cython.test_fail_if_path_exists('//ForInStatNode')  # tree must not contain a ForInStatNode
def iter_and_in():
    """Print each character of u'abcdefgh' that also occurs in u'abCDefGh', one per line.
    >>> iter_and_in()
    a
    b
    e
    f
    h
    """
    for c in u'abcdefgh':
        if c in u'abCDefGh':
            print(c)  # call form: same output as the old statement form, also valid when print is a function


@cython.test_fail_if_path_exists('//ForInStatNode')  # tree must not contain a ForInStatNode
def iter_inferred():
    """Print each character of a string built via list() and join(), one per line.
    >>> iter_inferred()
    a
    b
    c
    d
    e
    """
    uchars = list(u"abcde")  # first bound to a list ...
    uchars = u''.join(uchars)  # ... then rebound to a unicode string before the loop
    for c in uchars:
        print(c)  # call form: same output as the old statement form, also valid when print is a function


@cython.test_assert_path_exists('//SwitchStatNode',  # both node types must be present
                                '//ForFromStatNode')
@cython.test_fail_if_path_exists('//ForInStatNode')  # tree must not contain a ForInStatNode
def index_and_in():
    """Print each i in 1..8 for which u'abcdefgh'[-i] occurs in u'abCDefGh'.
    >>> index_and_in()
    1
    3
    4
    7
    8
    """
    cdef int i
    for i in range(1,9):
        if u'abcdefgh'[-i] in u'abCDefGh':  # negative index counts from the end of the literal
            print(i)  # call form: same output as the old statement form, also valid when print is a function

# special test for narrow builds
# All literals below use code points above 0xFFFF; they are the inputs of uchar_in().
high_uchar = u'\U00012345'  # the character searched for
high_ustring0 = u'\U00012345\U00012346abc'  # target is the first character
high_ustring1 = u'\U00012346\U00012345abc'  # target is the second character
high_ustring_end = u'\U00012346abc\U00012344\U00012345'  # target is the last character
high_ustring_no = u'\U00012346\U00012346abc'  # target does not occur

def uchar_in(Py_UCS4 uchar, unicode ustring):
    """Return whether the Py_UCS4 value uchar (must be 0x12345) occurs in ustring.
    >>> uchar_in(high_uchar, high_ustring0)
    True
    >>> uchar_in(high_uchar, high_ustring1)
    True
    >>> uchar_in(high_uchar, high_ustring_end)
    True
    >>> uchar_in(high_uchar, high_ustring_no)
    False
    """
    assert uchar == 0x12345, ('%X' % uchar)  # checks the argument conversion; failure message shows the hex value
    return uchar in ustring


def uchar_lookup_in_dict(obj, Py_UCS4 uchar):
    """Look up a Py_UCS4 key in obj twice (dict-typed and untyped) and return both values.
    >>> d = {u_KLINGON: 1234, u0: 0, u1: 1, u_A: 2}
    >>> uchar_lookup_in_dict(d, u_KLINGON)
    (1234, 1234)
    >>> uchar_lookup_in_dict(d, u_A)
    (2, 2)
    >>> uchar_lookup_in_dict(d, u0)
    (0, 0)
    >>> uchar_lookup_in_dict(d, u1)
    (1, 1)
    """
    cdef dict d = obj  # same object, referenced through a dict-typed variable
    dval = d[uchar]  # lookup through the typed reference
    objval = obj[uchar]  # lookup through the untyped reference; keep this line's position (see _WARNINGS)
    return dval, objval


def uchar_cast_to_int(Py_UCS4 uchar):
    """Return the <int> cast of uchar followed by int() of its object/str/unicode/direct forms.

    Per the doctests, the first item is the code point and the remaining four
    are the digit value; a non-digit character raises ValueError.

    >>> ints = uchar_cast_to_int(u'3'); ints == (51, 3, 3, 3, 3) or ints
    True
    >>> ints = uchar_cast_to_int(u'0'); ints == (48, 0, 0, 0, 0) or ints
    True
    >>> uchar_cast_to_int(u'A')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ValueError: invalid literal for int() with base 10: ...A...
    """
    # The same character held in three differently typed variables.
    cdef object ustr_object = uchar
    cdef str ustr_str = str(uchar)
    cdef unicode ustr_unicode = uchar
    return <int>uchar, <int>int(ustr_object[0]), <int>int(ustr_str[0]), <int>int(ustr_unicode[0]), <int>int(uchar)


def uchar_cast_to_float(Py_UCS4 uchar):
    """Return the <double> cast of uchar followed by float() of its object/str/unicode/direct forms.

    Per the doctests, the first item equals the code point and the remaining
    four equal the digit value; a non-digit character raises ValueError.

    >>> floats = uchar_cast_to_float(u'3'); floats == (51, 3, 3, 3, 3) or floats
    True
    >>> floats = uchar_cast_to_float(u'0'); floats == (48, 0, 0, 0, 0) or floats
    True
    >>> uchar_cast_to_float(u'A')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ValueError: could not convert string to float: ...A...
    """
    # The same character held in three differently typed variables.
    cdef object ustr_object = uchar
    cdef str ustr_str = str(uchar)
    cdef unicode ustr_unicode = uchar
    return <double>uchar, <double>float(ustr_object[0]), <double>float(ustr_str[0]), <double>float(ustr_unicode[0]), <double>float(uchar)


# Compiler warning expected for this file, written as line:column within the test file.
# NOTE(review): the position appears to target the untyped obj[uchar] lookup in
# uchar_lookup_in_dict; adding or removing lines above that statement would
# require updating the line number here - confirm against the test runner.
_WARNINGS = """
373:16: Item lookup of unicode character codes now always converts to a Unicode string. Use an explicit C integer cast to get back the previous integer lookup behaviour.
"""