1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
|
def decode_modified_utf8(s: bytes) -> str:
    """
    Decodes a bytestring containing modified UTF-8 as defined in section
    4.4.7 of the JVM specification.

    Modified UTF-8 differs from standard UTF-8 in two ways: U+0000 is
    stored as the two bytes ``C0 80`` (so a raw ``00`` byte never appears),
    and supplementary characters (above U+FFFF) are stored as a UTF-16
    surrogate pair with each surrogate written in three-byte form (six
    bytes total).

    Continuation bytes are masked but not validated; only the leading byte
    of each sequence and the available input length are checked.

    :param s: bytestring to be converted.
    :returns: A unicode representation of the original string.
    :raises UnicodeDecodeError: if the input contains a raw NULL byte or
        ends in the middle of a multi-byte sequence.
    :raises RuntimeError: if a byte that cannot start a sequence
        (``0x80``-``0xBF`` or ``0xF0``-``0xFF``) is found where a leading
        byte is expected.
    """
    s_out = []
    s_len = len(s)
    s_ix = 0

    while s_ix < s_len:
        b1 = s[s_ix]
        s_ix += 1

        if b1 == 0:
            raise UnicodeDecodeError(
                'mutf-8',
                s,
                s_ix - 1,
                s_ix,
                'Embedded NULL byte in input.'
            )

        if b1 < 0x80:
            # ASCII/one-byte codepoint.
            s_out.append(chr(b1))
        elif (b1 & 0xE0) == 0xC0:
            # Two-byte codepoint.
            if s_ix >= s_len:
                raise UnicodeDecodeError(
                    'mutf-8',
                    s,
                    s_ix - 1,
                    s_ix,
                    '2-byte codepoint started, but input too short to'
                    ' finish.'
                )

            s_out.append(
                chr(
                    (b1 & 0x1F) << 0x06 |
                    (s[s_ix] & 0x3F)
                )
            )
            s_ix += 1
        elif (b1 & 0xF0) == 0xE0:
            # Three-byte codepoint.
            if s_ix + 1 >= s_len:
                raise UnicodeDecodeError(
                    'mutf-8',
                    s,
                    s_ix - 1,
                    s_ix,
                    '3-byte or 6-byte codepoint started, but input too'
                    ' short to finish.'
                )

            b2 = s[s_ix]
            b3 = s[s_ix + 1]

            # ED Ax xx is a high surrogate. It only forms a six-byte
            # codepoint when a complete low surrogate (ED Bx xx) follows;
            # otherwise it is an unpaired surrogate and is decoded by the
            # ordinary three-byte path below (the encoder emits exactly
            # that form for lone surrogates).
            if (b1 == 0xED and (b2 & 0xF0) == 0xA0
                    and s_ix + 4 < s_len):
                b4 = s[s_ix + 2]
                b5 = s[s_ix + 3]
                b6 = s[s_ix + 4]

                if b4 == 0xED and (b5 & 0xF0) == 0xB0:
                    # Definite six-byte codepoint. The 20 payload bits are
                    # an offset from U+10000, so they must be *added*:
                    # OR-ing would lose bit 16 of the payload for every
                    # plane above the first supplementary one.
                    s_out.append(
                        chr(
                            0x10000 + (
                                (b2 & 0x0F) << 0x10 |
                                (b3 & 0x3F) << 0x0A |
                                (b5 & 0x0F) << 0x06 |
                                (b6 & 0x3F)
                            )
                        )
                    )
                    s_ix += 5
                    continue

            s_out.append(
                chr(
                    (b1 & 0x0F) << 0x0C |
                    (b2 & 0x3F) << 0x06 |
                    (b3 & 0x3F)
                )
            )
            s_ix += 2
        else:
            # Stray continuation byte or a 4+ byte UTF-8 lead byte, neither
            # of which is legal at the start of a modified UTF-8 sequence.
            raise RuntimeError(
                f'Invalid leading byte 0x{b1:02X} at offset {s_ix - 1}.'
            )

    return ''.join(s_out)


def encode_modified_utf8(u: str) -> bytes:
    """
    Encodes a unicode string as modified UTF-8 as defined in section 4.4.7
    of the JVM specification.

    U+0000 is written as ``C0 80`` so the output never contains a raw NULL
    byte. Characters above U+FFFF are written as a UTF-16 surrogate pair,
    each surrogate in three-byte form (six bytes total). Unpaired
    surrogates already present in ``u`` are written in plain three-byte
    form.

    :param u: unicode string to be converted.
    :returns: The encoded bytes.
    """
    final_string = bytearray()

    for c in map(ord, u):
        if c == 0x00:
            # NULL byte encoding shortcircuit.
            final_string.extend([0xC0, 0x80])
        elif c <= 0x7F:
            # ASCII
            final_string.append(c)
        elif c <= 0x7FF:
            # Two-byte codepoint.
            final_string.extend([
                (0xC0 | (0x1F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        elif c <= 0xFFFF:
            # Three-byte codepoint.
            final_string.extend([
                (0xE0 | (0x0F & (c >> 0x0C))),
                (0x80 | (0x3F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        else:
            # Six-byte codepoint. Surrogate pairs carry the 20-bit offset
            # from U+10000, not the raw code point, so subtract the base
            # before splitting the value into its high/low halves.
            offset = c - 0x10000
            final_string.extend([
                0xED,
                0xA0 | ((offset >> 0x10) & 0x0F),
                0x80 | ((offset >> 0x0A) & 0x3F),
                0xED,
                0xB0 | ((offset >> 0x06) & 0x0F),
                0x80 | (offset & 0x3F)
            ])

    return bytes(final_string)
|