File: locale.py

package info (click to toggle)
pypy3 7.0.0%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 111,848 kB
  • sloc: python: 1,291,746; ansic: 74,281; asm: 5,187; cpp: 3,017; sh: 2,533; makefile: 544; xml: 243; lisp: 45; csh: 21; awk: 4
file content (156 lines) | stat: -rw-r--r-- 5,500 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Provides internal 'locale' codecs (via POSIX wcstombs/mbrtowc) for use
by PyUnicode_Decode/EncodeFSDefault during interpreter bootstrap
"""
import os
import py
import sys
from rpython.rlib.objectmodel import we_are_translated
from rpython.rlib.rstring import UnicodeBuilder, assert_str0
from rpython.rlib.runicode import (code_to_unichr,
    default_unicode_error_decode, default_unicode_error_encode)
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.translator import cdir
from rpython.translator.tool.cbuild import ExternalCompilationInfo

# Directory that contains this module; the C helper's header and source are
# looked up relative to it.
cwd = py.path.local(__file__).dirpath()
# Compilation info shared by every external function declared in this file:
# the locale_codec header, the include search path (this directory plus the
# translator's `cdir`) and the C source listed as a separate module file.
eci = ExternalCompilationInfo(
    includes=[cwd.join('locale_codec.h')],
    include_dirs=[str(cwd), cdir],
    separate_module_files=[cwd.join('locale_codec.c')])

def llexternal(*args, **kwargs):
    """Declare an external C function using this module's defaults.

    Thin wrapper around rffi.llexternal: unless the caller supplies them
    explicitly, the declaration uses this module's compilation info (`eci`),
    is marked sandbox-safe and is created without a wrapper.
    """
    module_defaults = (
        ('compilation_info', eci),
        ('sandboxsafe', True),
        ('_nowrapper', True),
    )
    for option, value in module_defaults:
        # setdefault keeps any value the caller passed explicitly.
        kwargs.setdefault(option, value)
    return rffi.llexternal(*args, **kwargs)

# An actual wchar_t*.  rffi.CWCHARP is not used here because it is an array
# of UniChar, which (possibly, on a narrow build) does not have the same
# size as wchar_t.
RAW_WCHARP = lltype.Ptr(lltype.Array(rffi.WCHAR_T, hints={'nolength': True}))
# char* -> wchar_t*.  The second argument is a size_t* out-parameter; the
# decoder in this module reads the resulting length from it.  The result is
# released with pypy_char2wchar_free.
pypy_char2wchar = llexternal('pypy_char2wchar', [rffi.CCHARP, rffi.SIZE_TP],
                             RAW_WCHARP)
pypy_char2wchar_free = llexternal('pypy_char2wchar_free', [RAW_WCHARP],
                                  lltype.Void)
# wchar_t* -> char*.  The second argument is a size_t* out-parameter; the
# encoder in this module reads the error position from it when the call
# returns NULL.  The result is released with pypy_wchar2char_free.
pypy_wchar2char = llexternal('pypy_wchar2char', [RAW_WCHARP, rffi.SIZE_TP],
                             rffi.CCHARP)
pypy_wchar2char_free = llexternal('pypy_wchar2char_free', [rffi.CCHARP],
                                  lltype.Void)


def unicode_encode_locale_surrogateescape(u, errorhandler=None):
    """Encode the unicode string `u` to a byte string with the locale codec.

    The conversion is delegated to the C helper pypy_wchar2char (POSIX
    wcstombs with the surrogateescape handler).

    `errorhandler` is only invoked for fatal errors, always in 'strict'
    mode; it defaults to default_unicode_error_encode.  MemoryError is
    raised when the helper fails and reports an error position of -1.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_encode

    with lltype.scoped_alloc(rffi.SIZE_TP.TO, 1) as errorposp:
        # The raw wchar_t copy of `u` only has to live for the C call.
        with scoped_unicode2rawwcharp(u) as wide:
            encoded = pypy_wchar2char(wide, errorposp)
        try:
            if not encoded:
                # NULL result: the helper stored the failure position.
                failed_at = rffi.cast(lltype.Signed, errorposp[0])
                if failed_at == -1:
                    raise MemoryError
                errorhandler('strict', 'filesystemencoding',
                             _errmsg("pypy_wchar2char"), u,
                             failed_at, failed_at + 1)
            return rffi.charp2str(encoded)
        finally:
            # Runs on both the success and the error path.
            pypy_wchar2char_free(encoded)


def str_decode_locale_surrogateescape(s, errorhandler=None):
    """Decode the byte string `s` to unicode with the locale codec.

    The conversion is delegated to the C helper pypy_char2wchar (POSIX
    mbrtowc with the surrogateescape handler).

    `errorhandler` is only invoked for fatal errors, always in 'strict'
    mode and with the error range (0, 1); it defaults to
    default_unicode_error_decode.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode

    with lltype.scoped_alloc(rffi.SIZE_TP.TO, 1) as lengthp:
        # The raw char* copy of `s` only has to live for the C call.
        with rffi.scoped_str2charp(s) as raw:
            decoded = pypy_char2wchar(raw, lengthp)
        try:
            if not decoded:
                # NULL result: report the failure through the handler.
                errorhandler('strict', 'filesystemencoding',
                             _errmsg("pypy_char2wchar"), s, 0, 1)
            nchars = rffi.cast(lltype.Signed, lengthp[0])
            return rawwcharp2unicoden(decoded, nchars)
        finally:
            # Runs on both the success and the error path.
            pypy_char2wchar_free(decoded)


def _errmsg(what):
    """Return the error message reported when the C helper `what` fails."""
    # The functions in locale_codec.c are believed not to set errno, so the
    # message carries only the name of the failing helper.
    return "%s failed" % (what,)


class scoped_unicode2rawwcharp:
    """Context manager yielding a raw wchar_t* copy of a unicode string.

    A value of None yields a null pointer.  A non-null buffer is freed
    when the `with` block exits.
    """
    def __init__(self, value):
        if value is None:
            self.buf = lltype.nullptr(RAW_WCHARP.TO)
        else:
            self.buf = unicode2rawwcharp(value)

    def __enter__(self):
        return self.buf

    def __exit__(self, *args):
        # Nothing was allocated for a None value.
        if not self.buf:
            return
        lltype.free(self.buf, flavor='raw')

def unicode2rawwcharp(u):
    """Return a raw-malloced, NUL-terminated wchar_t* copy of `u`.

    The caller owns the returned array and must free it with
    lltype.free(..., flavor='raw').
    """
    if _should_merge_surrogates():
        # Dry run with a null destination: only counts the wchar_t slots
        # needed once surrogate pairs are merged.
        nchars = _unicode2rawwcharp_loop(u, lltype.nullptr(RAW_WCHARP.TO))
    else:
        nchars = len(u)
    wide = lltype.malloc(RAW_WCHARP.TO, nchars + 1, flavor='raw')
    _unicode2rawwcharp_loop(u, wide)
    # The fill above writes indices below nchars; the terminator goes last.
    wide[nchars] = rffi.cast(rffi.WCHAR_T, u'\x00')
    return wide
unicode2rawwcharp._annenforceargs_ = [unicode]

def _unicode2rawwcharp_loop(u, array):
    """Copy the code points of `u` into `array`; return how many were used.

    When surrogates should be merged, a high surrogate immediately followed
    by a low surrogate is stored as one combined code point.  If `array` is
    a null pointer nothing is written and only the count is computed.
    """
    merge = _should_merge_surrogates()
    length = len(u)
    src = 0
    dst = 0
    while src < length:
        code = ord(u[src])
        step = 1
        if merge and 0xD800 <= code <= 0xDBFF and src + 1 < length:
            low = ord(u[src + 1])
            if 0xDC00 <= low <= 0xDFFF:
                # Valid pair: combine both halves and consume two units.
                code = (((code & 0x03FF) << 10) | (low & 0x03FF)) + 0x10000
                step = 2
        if array:
            array[dst] = rffi.cast(rffi.WCHAR_T, code)
        src += step
        dst += 1
    return dst
_unicode2rawwcharp_loop._annenforceargs_ = [unicode, None]


def rawwcharp2unicoden(wcp, maxlen):
    """Build a unicode string from at most `maxlen` items of the raw
    wchar_t array `wcp`, stopping early at the first zero item."""
    builder = UnicodeBuilder(maxlen)
    pos = 0
    while pos < maxlen:
        wc = wcp[pos]
        if rffi.cast(lltype.Signed, wc) == 0:
            # NUL terminator reached before maxlen items were read.
            break
        builder.append(code_to_unichr(wc))
        pos += 1
    return assert_str0(builder.build())
rawwcharp2unicoden._annenforceargs_ = [None, int]


def _should_merge_surrogates():
    """Return True when unicode characters are 2 bytes wide but wchar_t is
    4 bytes wide, i.e. surrogate pairs must be merged for the C side."""
    if we_are_translated():
        narrow_unichar = rffi.sizeof(lltype.UniChar) == 2
    else:
        # Untranslated: derive the width from the hosting interpreter.
        narrow_unichar = sys.maxunicode == 0xFFFF
    if not narrow_unichar:
        return False
    return rffi.sizeof(rffi.WCHAR_T) == 4