File: mutf8.py

package info (click to toggle)
python-mutf8 1.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 132 kB
  • sloc: python: 342; ansic: 207; makefile: 3
file content (147 lines) | stat: -rw-r--r-- 4,402 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def decode_modified_utf8(s: bytes) -> str:
    """
    Decodes a bytestring containing modified UTF-8 as defined in section
    4.4.7 of the JVM specification.

    Modified UTF-8 differs from standard UTF-8 in two ways: U+0000 is
    stored as the two bytes ``C0 80`` (a raw ``00`` byte never appears),
    and codepoints above U+FFFF are stored as a UTF-16 surrogate pair with
    each surrogate written as its own three-byte sequence (six bytes
    total).

    Continuation bytes are masked but not validated, so malformed trailing
    bytes are decoded rather than rejected.

    :param s: bytestring to be converted.
    :returns: A unicode representation of the original string.
    :raises UnicodeDecodeError: If the input contains a raw NULL byte or
                                ends in the middle of a multi-byte
                                sequence.
    :raises RuntimeError: If a byte that cannot start a sequence (a
                          continuation byte or a 4-byte UTF-8 lead byte)
                          is found where a lead byte is expected.
    """
    s_out = []
    s_len = len(s)
    s_ix = 0

    while s_ix < s_len:
        b1 = s[s_ix]
        s_ix += 1

        if b1 == 0:
            # NULL is always encoded as C0 80 in modified UTF-8.
            raise UnicodeDecodeError(
                'mutf-8',
                s,
                s_ix - 1,
                s_ix,
                'Embedded NULL byte in input.'
            )
        if b1 < 0x80:
            # ASCII/one-byte codepoint.
            s_out.append(chr(b1))
        elif (b1 & 0xE0) == 0xC0:
            # Two-byte codepoint.
            if s_ix >= s_len:
                raise UnicodeDecodeError(
                        'mutf-8',
                        s,
                        s_ix - 1,
                        s_ix,
                        '2-byte codepoint started, but input too short to'
                        ' finish.'
                    )

            s_out.append(
                chr(
                    (b1 & 0x1F) << 0x06 |
                    (s[s_ix] & 0x3F)
                )
            )
            s_ix += 1
        elif (b1 & 0xF0) == 0xE0:
            # Three-byte codepoint.
            if s_ix + 1 >= s_len:
                raise UnicodeDecodeError(
                        'mutf-8',
                        s,
                        s_ix - 1,
                        s_ix,
                        '3-byte or 6-byte codepoint started, but input too'
                        ' short to finish.'
                    )

            b2 = s[s_ix]
            b3 = s[s_ix + 1]

            # ED Ax xx is a high surrogate. It is only the first half of a
            # six-byte codepoint when a complete low surrogate (ED Bx xx)
            # follows; if fewer than three bytes remain it is a lone
            # surrogate and is decoded as an ordinary three-byte codepoint,
            # exactly as it would be anywhere else in the input.
            if b1 == 0xED and (b2 & 0xF0) == 0xA0 and s_ix + 4 < s_len:
                b4 = s[s_ix + 2]
                b5 = s[s_ix + 3]
                b6 = s[s_ix + 4]

                if b4 == 0xED and (b5 & 0xF0) == 0xB0:
                    # Definite six-byte codepoint. The terms must be
                    # *added*: the low nibble of b2 holds (plane - 1), so
                    # its shifted value can overlap the 0x10000 bit and a
                    # bitwise OR would silently drop it.
                    s_out.append(
                        chr(
                            0x10000
                            + ((b2 & 0x0F) << 0x10)
                            + ((b3 & 0x3F) << 0x0A)
                            + ((b5 & 0x0F) << 0x06)
                            + (b6 & 0x3F)
                        )
                    )
                    s_ix += 5
                    continue

            s_out.append(
                chr(
                    (b1 & 0x0F) << 0x0C |
                    (b2 & 0x3F) << 0x06 |
                    (b3 & 0x3F)
                )
            )
            s_ix += 2
        else:
            # Stray continuation byte (0x80-0xBF) or a 4-byte UTF-8 lead
            # byte (0xF0+), neither of which may start a sequence here.
            raise RuntimeError(
                'Invalid leading byte 0x{:02X} at position {}.'.format(
                    b1, s_ix - 1
                )
            )

    return ''.join(s_out)


def encode_modified_utf8(u: str) -> bytes:
    """
    Encodes a unicode string as modified UTF-8 as defined in section 4.4.7
    of the JVM specification.

    U+0000 is written as ``C0 80`` and codepoints above U+FFFF are written
    as a six-byte surrogate pair. Lone surrogates (U+D800-U+DFFF) are
    written as plain three-byte sequences.

    :param u: unicode string to be converted.
    :returns: The modified UTF-8 encoding of `u` as immutable bytes.
    """
    final_string = bytearray()

    for c in map(ord, u):
        if c == 0x00:
            # NULL byte encoding shortcircuit.
            final_string.extend([0xC0, 0x80])
        elif c <= 0x7F:
            # ASCII
            final_string.append(c)
        elif c <= 0x7FF:
            # Two-byte codepoint.
            final_string.extend([
                (0xC0 | (0x1F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        elif c <= 0xFFFF:
            # Three-byte codepoint.
            final_string.extend([
                (0xE0 | (0x0F & (c >> 0x0C))),
                (0x80 | (0x3F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        else:
            # Six-byte codepoint. The second byte carries bits 20-16 of
            # the codepoint *minus one* (i.e. the top bits of the UTF-16
            # high surrogate), which maps planes 1-16 onto 0x0-0xF.
            final_string.extend([
                0xED,
                0xA0 | (((c >> 0x10) - 1) & 0x0F),
                0x80 | ((c >> 0x0A) & 0x3F),
                0xED,
                0xB0 | ((c >> 0x06) & 0x0F),
                0x80 | (c & 0x3F)
            ])

    return bytes(final_string)