File: mutf8.py

package info (click to toggle)
python-mutf8 1.0.6-4
links: PTS, VCS
area: main
in suites: forky, sid
size: 132 kB
sloc: python: 342; ansic: 207; makefile: 3
file content (147 lines) | stat: -rw-r--r-- 4,402 bytes
parent folder | download | duplicates (2)
def decode_modified_utf8(s: bytes) -> str:
    """
    Decodes a bytestring containing modified UTF-8 as defined in section
    4.4.7 of the JVM specification.

    Supplementary characters (above U+FFFF) are stored as a surrogate pair
    with each half written as its own three-byte sequence; the six bytes
    are recombined here into a single codepoint.

    Continuation bytes are masked but their high bits are not validated.

    :param s: bytestring to be converted.
    :returns: A unicode representation of the original string.
    :raises UnicodeDecodeError: if the input contains a raw NULL byte or
                                ends in the middle of a multi-byte sequence.
    :raises RuntimeError: if a byte that cannot start a one-, two- or
                          three-byte sequence is found in lead position.
    """
    s_out = []
    s_len = len(s)
    s_ix = 0

    while s_ix < s_len:
        b1 = s[s_ix]
        # From here on s_ix points at the byte *after* the lead byte.
        s_ix += 1

        if b1 == 0:
            # Modified UTF-8 never contains a raw 0x00; NULL is written as
            # the two-byte sequence C0 80.
            raise UnicodeDecodeError(
                'mutf-8',
                s,
                s_ix - 1,
                s_ix,
                'Embedded NULL byte in input.'
            )
        if b1 < 0x80:
            # ASCII/one-byte codepoint.
            s_out.append(chr(b1))
        elif (b1 & 0xE0) == 0xC0:
            # Two-byte codepoint.
            if s_ix >= s_len:
                raise UnicodeDecodeError(
                    'mutf-8',
                    s,
                    s_ix - 1,
                    s_ix,
                    '2-byte codepoint started, but input too short to'
                    ' finish.'
                )

            s_out.append(
                chr(
                    (b1 & 0x1F) << 0x06 |
                    (s[s_ix] & 0x3F)
                )
            )
            s_ix += 1
        elif (b1 & 0xF0) == 0xE0:
            # Three-byte codepoint.
            if s_ix + 1 >= s_len:
                raise UnicodeDecodeError(
                    'mutf-8',
                    s,
                    s_ix - 1,
                    s_ix,
                    '3-byte or 6-byte codepoint started, but input too'
                    ' short to finish.'
                )

            b2 = s[s_ix]
            b3 = s[s_ix + 1]

            if b1 == 0xED and (b2 & 0xF0) == 0xA0:
                # Possible six-byte codepoint (ED Ax .. is a high surrogate).
                # NOTE(review): a high surrogate that ends the input is
                # rejected here even though one followed by unrelated bytes
                # is decoded as a plain three-byte codepoint below; kept
                # as-is to preserve existing behaviour.
                if s_ix + 4 >= s_len:
                    raise UnicodeDecodeError(
                        'mutf-8',
                        s,
                        s_ix - 1,
                        s_ix,
                        '3-byte or 6-byte codepoint started, but input too'
                        ' short to finish.'
                    )

                b4 = s[s_ix + 2]
                b5 = s[s_ix + 3]
                b6 = s[s_ix + 4]

                if b4 == 0xED and (b5 & 0xF0) == 0xB0:
                    # Definite six-byte codepoint.
                    # The fields must be *added* to 0x10000, not OR-ed: the
                    # low bit of the (b2 & 0x0F) field lands on bit 16,
                    # the same bit as the 0x10000 offset, so OR would drop
                    # it. Parentheses are required because `+` binds
                    # tighter than `<<`.
                    s_out.append(
                        chr(
                            0x10000 +
                            ((b2 & 0x0F) << 0x10) +
                            ((b3 & 0x3F) << 0x0A) +
                            ((b5 & 0x0F) << 0x06) +
                            (b6 & 0x3F)
                        )
                    )
                    s_ix += 5
                    continue

            s_out.append(
                chr(
                    (b1 & 0x0F) << 0x0C |
                    (b2 & 0x3F) << 0x06 |
                    (b3 & 0x3F)
                )
            )
            s_ix += 2
        else:
            # Lead byte is a stray continuation byte (10xxxxxx) or starts a
            # sequence longer than three bytes (1111xxxx); neither form is
            # handled by this decoder.
            raise RuntimeError(
                'Invalid leading byte 0x{:02X} at offset {}.'.format(
                    b1, s_ix - 1
                )
            )

    return u''.join(s_out)


def encode_modified_utf8(u: str) -> bytes:
    """
    Encodes a unicode string as modified UTF-8 as defined in section 4.4.7
    of the JVM specification.

    Differences from standard UTF-8: U+0000 is written as the two bytes
    C0 80 (so the output never contains a raw NULL byte), and codepoints
    above U+FFFF are written as six bytes (a surrogate pair, each half as
    its own three-byte sequence) instead of a single four-byte sequence.

    :param u: unicode string to be converted.
    :returns: The encoded bytes.
    """
    final_string = bytearray()

    for c in map(ord, u):
        if c == 0x00:
            # NULL byte encoding shortcircuit.
            final_string.extend([0xC0, 0x80])
        elif c <= 0x7F:
            # ASCII
            final_string.append(c)
        elif c <= 0x7FF:
            # Two-byte codepoint.
            final_string.extend([
                (0xC0 | (0x1F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        elif c <= 0xFFFF:
            # Three-byte codepoint.
            final_string.extend([
                (0xE0 | (0x0F & (c >> 0x0C))),
                (0x80 | (0x3F & (c >> 0x06))),
                (0x80 | (0x3F & c))
            ])
        else:
            # Six-byte codepoint.
            # The second byte carries (bits 20-16) - 1, i.e. the plane
            # number minus one, which is what the high surrogate of the
            # UTF-16 pair holds. Without the subtraction every
            # supplementary character is written one plane too high and
            # planes 16 wrap around to 0.
            final_string.extend([
                0xED,
                0xA0 | (((c >> 0x10) - 1) & 0x0F),
                0x80 | ((c >> 0x0A) & 0x3f),
                0xED,
                0xb0 | ((c >> 0x06) & 0x0f),
                0x80 | (c & 0x3f)
            ])

    return bytes(final_string)