File: unicode.pxd

package info (click to toggle)
cython 0.15.1-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 16,480 kB
  • sloc: python: 45,354; xml: 1,031; cpp: 635; makefile: 229; lisp: 48; ansic: 28
file content (390 lines) | stat: -rw-r--r-- 18,146 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
cdef extern from *:
    # Return true if the object o is a Unicode object or an instance
    # of a Unicode subtype. Changed in version 2.2: Allowed subtypes
    # to be accepted.
    bint PyUnicode_Check(object o)

    # Return true if the object o is a Unicode object, but not an
    # instance of a subtype. New in version 2.2.
    bint PyUnicode_CheckExact(object o)

    # Return the size of the object. o has to be a PyUnicodeObject
    # (not checked).
    Py_ssize_t PyUnicode_GET_SIZE(object o)

    # Return the size of the object's internal buffer in bytes. o has
    # to be a PyUnicodeObject (not checked).
    Py_ssize_t PyUnicode_GET_DATA_SIZE(object o)

    # Return a pointer to the internal Py_UNICODE buffer of the
    # object. o has to be a PyUnicodeObject (not checked).
    Py_UNICODE* PyUnicode_AS_UNICODE(object o)

    # Return a pointer to the internal buffer of the object. o has to
    # be a PyUnicodeObject (not checked).
    char* PyUnicode_AS_DATA(object o)

    # Return 1 or 0 depending on whether ch is a whitespace character.
    bint Py_UNICODE_ISSPACE(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a lowercase character.
    bint Py_UNICODE_ISLOWER(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an uppercase character.
    bint Py_UNICODE_ISUPPER(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a titlecase character.
    bint Py_UNICODE_ISTITLE(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a linebreak character.
    bint Py_UNICODE_ISLINEBREAK(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a decimal character.
    bint Py_UNICODE_ISDECIMAL(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a digit character.
    bint Py_UNICODE_ISDIGIT(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a numeric character.
    bint Py_UNICODE_ISNUMERIC(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an alphabetic character.
    bint Py_UNICODE_ISALPHA(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an alphanumeric character.
    bint Py_UNICODE_ISALNUM(Py_UNICODE ch)

    # Return the character ch converted to lower case.
    Py_UNICODE Py_UNICODE_TOLOWER(Py_UNICODE ch)

    # Return the character ch converted to upper case.
    Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)

    # Return the character ch converted to title case.
    Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)

    # Return the character ch converted to a decimal positive
    # integer. Return -1 if this is not possible. This macro does not
    # raise exceptions.
    int Py_UNICODE_TODECIMAL(Py_UNICODE ch)

    # Return the character ch converted to a single digit
    # integer. Return -1 if this is not possible. This macro does not
    # raise exceptions.
    int Py_UNICODE_TODIGIT(Py_UNICODE ch)

    # Return the character ch converted to a double. Return -1.0 if
    # this is not possible. This macro does not raise exceptions.
    double Py_UNICODE_TONUMERIC(Py_UNICODE ch)

    # To create Unicode objects and access their basic sequence
    # properties, use these APIs:

    # Create a Unicode Object from the Py_UNICODE buffer u of the
    # given size. u may be NULL which causes the contents to be
    # undefined. It is the user's responsibility to fill in the needed
    # data. The buffer is copied into the new object. If the buffer is
    # not NULL, the return value might be a shared object. Therefore,
    # modification of the resulting Unicode object is only allowed
    # when u is NULL.
    object PyUnicode_FromUnicode(Py_UNICODE *u, Py_ssize_t size)

    # Create a Unicode Object from the given Unicode code point ordinal.
    #
    # The ordinal must be in range(0x10000) on narrow Python builds
    # (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError
    # is raised in case it is not.
    object PyUnicode_FromOrdinal(int ordinal)

    # Return a read-only pointer to the Unicode object's internal
    # Py_UNICODE buffer, NULL if unicode is not a Unicode object.
    Py_UNICODE* PyUnicode_AsUnicode(object o) except NULL

    # Return the length of the Unicode object.
    Py_ssize_t PyUnicode_GetSize(object o) except -1

    # Coerce an encoded object obj to an Unicode object and return a
    # reference with incremented refcount.
    # String and other char buffer compatible objects are decoded
    # according to the given encoding and using the error handling
    # defined by errors. Both can be NULL to have the interface use
    # the default values (see the next section for details).
    # All other objects, including Unicode objects, cause a TypeError
    # to be set.
    object PyUnicode_FromEncodedObject(object o, char *encoding, char *errors)

    # Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict")
    # which is used throughout the interpreter whenever coercion to
    # Unicode is needed.
    object PyUnicode_FromObject(object obj)

    # If the platform supports wchar_t and provides a header file
    # wchar.h, Python can interface directly to this type using the
    # following functions. Support is optimized if Python's own
    # Py_UNICODE type is identical to the system's wchar_t.

    #ctypedef int wchar_t

    # Create a Unicode object from the wchar_t buffer w of the given
    # size. Return NULL on failure.
    #PyObject* PyUnicode_FromWideChar(wchar_t *w, Py_ssize_t size)

    #Py_ssize_t PyUnicode_AsWideChar(object o, wchar_t *w, Py_ssize_t size)

# Codecs

    # Create a Unicode object by decoding size bytes of the encoded
    # string s. encoding and errors have the same meaning as the
    # parameters of the same name in the unicode() builtin
    # function. The codec to be used is looked up using the Python
    # codec registry. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_Decode(char *s, Py_ssize_t size, char *encoding, char *errors)

    # Encode the Py_UNICODE buffer of the given size and return a
    # Python string object. encoding and errors have the same meaning
    # as the parameters of the same name in the Unicode encode()
    # method. The codec to be used is looked up using the Python codec
    # registry. Return NULL if an exception was raised by the codec.
    object PyUnicode_Encode(Py_UNICODE *s, Py_ssize_t size,
                            char *encoding, char *errors)

    # Encode a Unicode object and return the result as Python string
    # object. encoding and errors have the same meaning as the
    # parameters of the same name in the Unicode encode() method. The
    # codec to be used is looked up using the Python codec
    # registry. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsEncodedString(object unicode, char *encoding, char *errors)

# These are the UTF-8 codec APIs:

    # Create a Unicode object by decoding size bytes of the UTF-8
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors)

    # If consumed is NULL, behave like PyUnicode_DecodeUTF8(). If
    # consumed is not NULL, trailing incomplete UTF-8 byte sequences
    # will not be treated as an error. Those bytes will not be decoded
    # and the number of bytes that have been decoded will be stored in
    # consumed. New in version 2.4.
    object PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)

    # Encode the Py_UNICODE buffer of the given size using UTF-8 and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using UTF-8 and return the result as Python string object. Error handling is ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsUTF8String(object unicode)

# These are the UTF-16 codec APIs:

    # Decode length bytes from a UTF-16 encoded buffer string and
    # return the corresponding Unicode object. errors (if non-NULL)
    # defines the error handling. It defaults to ``strict''.
    #
    # If byteorder is non-NULL, the decoder starts decoding using the
    # given byte order:
    #
    #   *byteorder == -1: little endian
    #   *byteorder == 0:  native order
    #   *byteorder == 1:  big endian
    #
    # and then switches if the first two bytes of the input data are a
    # byte order mark (BOM) and the specified byte order is native
    # order. This BOM is not copied into the resulting Unicode
    # string. After completion, *byteorder is set to the current byte
    # order at the.
    #
    # If byteorder is NULL, the codec starts in native order mode.
    object PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder)

    # If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If
    # consumed is not NULL, PyUnicode_DecodeUTF16Stateful() will not
    # treat trailing incomplete UTF-16 byte sequences (such as an odd
    # number of bytes or a split surrogate pair) as an error. Those
    # bytes will not be decoded and the number of bytes that have been
    # decoded will be stored in consumed. New in version 2.4.
    object PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed)

    # Return a Python string object holding the UTF-16 encoded value
    # of the Unicode data in s. If byteorder is not 0, output is
    # written according to the following byte order:
    #
    #   byteorder == -1: little endian
    #   byteorder == 0:  native byte order (writes a BOM mark)
    #   byteorder == 1:  big endian
    #
    # If byteorder is 0, the output string will always start with the
    # Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark
    # is prepended.
    #
    # If Py_UNICODE_WIDE is defined, a single Py_UNICODE value may get
    # represented as a surrogate pair. If it is not defined, each
    # Py_UNICODE values is interpreted as an UCS-2 character.
    object PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder)

    # Return a Python string using the UTF-16 encoding in native byte
    # order. The string always starts with a BOM mark. Error handling
    # is ``strict''. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_AsUTF16String(object unicode)

# These are the ``Unicode Escape'' codec APIs:

    # Create a Unicode object by decoding size bytes of the
    # Unicode-Escape encoded string s. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_DecodeUnicodeEscape(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using
    # Unicode-Escape and return a Python string object. Return NULL if
    # an exception was raised by the codec.
    object PyUnicode_EncodeUnicodeEscape(Py_UNICODE *s, Py_ssize_t size)

    # Encode a Unicode objects using Unicode-Escape and return the
    # result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsUnicodeEscapeString(object unicode)

# These are the ``Raw Unicode Escape'' codec APIs:

    # Create a Unicode object by decoding size bytes of the
    # Raw-Unicode-Escape encoded string s. Return NULL if an exception
    # was raised by the codec.
    object PyUnicode_DecodeRawUnicodeEscape(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using
    # Raw-Unicode-Escape and return a Python string object. Return
    # NULL if an exception was raised by the codec.
    object PyUnicode_EncodeRawUnicodeEscape(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using Raw-Unicode-Escape and return the
    # result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsRawUnicodeEscapeString(object unicode)

# These are the Latin-1 codec APIs: Latin-1 corresponds to the first 256 Unicode ordinals and only these are accepted by the codecs during encoding.

    # Create a Unicode object by decoding size bytes of the Latin-1
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using Latin-1 and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using Latin-1 and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsLatin1String(object unicode)

# These are the ASCII codec APIs. Only 7-bit ASCII data is
# accepted. All other codes generate errors.

    # Create a Unicode object by decoding size bytes of the ASCII
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using ASCII and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using ASCII and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsASCIIString(object o)

# These are the mapping codec APIs:
#
# This codec is special in that it can be used to implement many
# different codecs (and this is in fact what was done to obtain most
# of the standard codecs included in the encodings package). The codec
# uses mapping to encode and decode characters.
#
# Decoding mappings must map single string characters to single
# Unicode characters, integers (which are then interpreted as Unicode
# ordinals) or None (meaning "undefined mapping" and causing an
# error).
#
# Encoding mappings must map single Unicode characters to single
# string characters, integers (which are then interpreted as Latin-1
# ordinals) or None (meaning "undefined mapping" and causing an
# error).
#
# The mapping objects provided must only support the __getitem__
# mapping interface.
#
# If a character lookup fails with a LookupError, the character is
# copied as-is meaning that its ordinal value will be interpreted as
# Unicode or Latin-1 ordinal resp. Because of this, mappings only need
# to contain those mappings which map characters to different code
# points.

    # Create a Unicode object by decoding size bytes of the encoded
    # string s using the given mapping object. Return NULL if an
    # exception was raised by the codec. If mapping is NULL latin-1
    # decoding will be done. Else it can be a dictionary mapping byte
    # or a unicode string, which is treated as a lookup table. Byte
    # values greater that the length of the string and U+FFFE
    # "characters" are treated as "undefined mapping". Changed in
    # version 2.4: Allowed unicode string as mapping argument.
    object PyUnicode_DecodeCharmap(char *s, Py_ssize_t size, object mapping, char *errors)

    # Encode the Py_UNICODE buffer of the given size using the given
    # mapping object and return a Python string object. Return NULL if
    # an exception was raised by the codec.
    object PyUnicode_EncodeCharmap(Py_UNICODE *s, Py_ssize_t size, object mapping, char *errors)

    # Encode a Unicode objects using the given mapping object and
    # return the result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsCharmapString(object o, object mapping)

# The following codec API is special in that maps Unicode to Unicode.

    # Translate a Py_UNICODE buffer of the given length by applying a
    # character mapping table to it and return the resulting Unicode
    # object. Return NULL when an exception was raised by the codec.
    #
    # The mapping table must map Unicode ordinal integers to Unicode
    # ordinal integers or None (causing deletion of the character).
    #
    # Mapping tables need only provide the __getitem__() interface;
    # dictionaries and sequences work well. Unmapped character
    # ordinals (ones which cause a LookupError) are left untouched and
    # are copied as-is.
    object PyUnicode_TranslateCharmap(Py_UNICODE *s, Py_ssize_t size,
                                      object table, char *errors)

# These are the MBCS codec APIs. They are currently only available on
# Windows and use the Win32 MBCS converters to implement the
# conversions. Note that MBCS (or DBCS) is a class of encodings, not
# just one. The target encoding is defined by the user settings on the
# machine running the codec.

    # Create a Unicode object by decoding size bytes of the MBCS
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors)

    # If consumed is NULL, behave like PyUnicode_DecodeMBCS(). If
    # consumed is not NULL, PyUnicode_DecodeMBCSStateful() will not
    # decode trailing lead byte and the number of bytes that have been
    # decoded will be stored in consumed. New in version 2.5.
    object PyUnicode_DecodeMBCSStateful(char *s, int size, char *errors, int *consumed)

    # Encode the Py_UNICODE buffer of the given size using MBCS and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using MBCS and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsMBCSString(object o)