1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
|
/* --- Internal Unicode Format -------------------------------------------- */
/* Py_UNICODE was the native Unicode storage format (code unit) used by
Python and represents a single Unicode element in the Unicode type.
With PEP 393, Py_UNICODE is deprecated and replaced with a
typedef to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
/* Py_UCS4 and Py_UCS2 are typedefs for the respective
unicode representations. */
/* PyPy: the cpyext parser does not understand uint32_t, uint16_t, uint8_t */
typedef unsigned int Py_UCS4;
typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
/* --- Unicode Type ------------------------------------------------------- */
typedef struct {
/*
SSTATE_NOT_INTERNED (0)
SSTATE_INTERNED_MORTAL (1)
SSTATE_INTERNED_IMMORTAL (2)
If interned != SSTATE_NOT_INTERNED, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
*/
/* no bit fields on PyPy
unsigned int interned:2; */
unsigned char interned;
/* Character size:
- PyUnicode_WCHAR_KIND (0):
* character type = wchar_t (16 or 32 bits, depending on the
platform)
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
/* no bit fields on PyPy
unsigned int kind:3; */
unsigned char kind;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
/* no bit fields on PyPy
unsigned int compact:1; */
unsigned char compact;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
/* no bit fields on PyPy
unsigned int ascii:1; */
unsigned char ascii;
/* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or
the data pointer is filled out. The bit is redundant, and helps
to minimize the test in PyUnicode_IS_READY(). */
/* no bit fields on PyPy
unsigned int ready:1; */
unsigned char ready;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
/* Not on PyPy, since not using bit fields */
/*unsigned int padding:24; */
} _PyASCIIObject_state_t;
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
structure. state.ascii and state.compact are set, and the data
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
/* There are 4 forms of Unicode strings:
- compact ascii:
* structure = PyASCIIObject
* test: PyUnicode_IS_COMPACT_ASCII(op)
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* (length is the length of the utf8 and wstr strings)
* (data starts just after the structure)
* (since ASCII is decoded from UTF-8, the utf8 string are the data)
- compact:
* structure = PyCompactUnicodeObject
* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* ascii = 0
* utf8 is not shared with data
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data and wstr_length=length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
* wstr_length = 0 if wstr is NULL
* (data starts just after the structure)
- legacy string, not ready:
* structure = PyUnicodeObject
* test: kind == PyUnicode_WCHAR_KIND
* length = 0 (use wstr_length)
* hash = -1
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ascii = 0
* ready = 0
* interned = SSTATE_NOT_INTERNED
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* utf8_length = 0
- legacy string, ready:
* structure = PyUnicodeObject structure
* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* utf8 is shared and utf8_length = length with data.any if ascii = 1
* utf8_length = 0 if utf8 is NULL
* wstr is shared with data.any and wstr_length = length
if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
* wstr_length = 0 if wstr is NULL
Compact strings use only one memory block (structure + characters),
whereas legacy strings use one block for the structure and one block
for characters.
Legacy strings are created by PyUnicode_FromUnicode() and
PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
when PyUnicode_READY() is called.
See also _PyUnicode_CheckConsistency().
*/
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
_PyASCIIObject_state_t state;
wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/* Non-ASCII strings allocated through PyUnicode_New use the
PyCompactUnicodeObject structure. state.compact is set, and the data
immediately follow the structure. */
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
} PyCompactUnicodeObject;
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
PyUnicodeObject structure. The actual string data is initially in the wstr
block, and copied into the data block using _PyUnicode_Ready. */
typedef struct {
PyCompactUnicodeObject _base;
/*
union {
void *any;
Py_UCS1 *latin1;
Py_UCS2 *ucs2;
Py_UCS4 *ucs4;
} */ void * data; /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
/* Values for PyASCIIObject.state: */
/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
/* --- Constants ---------------------------------------------------------- */
/* This Unicode character will be used as replacement character during
decoding if the errors argument is set to "replace". Note: the
Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
Unicode 3.0. */
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
|