File: utf-8.py

package info (click to toggle)
python-bitarray 3.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 1,288 kB
  • sloc: python: 11,456; ansic: 7,657; makefile: 73; sh: 6
file content (39 lines) | stat: -rw-r--r-- 1,116 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from bitarray import bitarray
from bitarray.util import ba2int, pprint


# See: https://en.wikipedia.org/wiki/UTF-8

def code_point(u):
    """Print the UTF-8 encoding of a character and the code point decoded from it.

    The character is encoded to UTF-8, shown as hexadecimal bytes and as a
    bit pattern, and its code point is then reconstructed from that bit
    pattern by dropping the marker bits of the leading byte and of each
    continuation byte and concatenating the remaining payload bits.

    Parameters:
        u: a string consisting of exactly one character.

    Returns:
        None; all results are written to standard output.

    Raises:
        ValueError: if `u` is not exactly one character long, or if the
            first byte of its encoding matches no UTF-8 leading-byte pattern.
    """
    # Validate up front with a real exception.  Previously an empty string
    # fell through to a bare `raise` (RuntimeError, as no exception was
    # active), and longer strings were only rejected by an assert, which
    # is stripped when Python runs with -O.
    if len(u) != 1:
        raise ValueError('expected a single character, got %r' % (u,))

    print('character:', u)
    b = u.encode('utf-8')
    print('hexadecimal:', ' '.join('%02x' % i for i in b))
    # big-endian, so that a[0] is the most significant bit of the first byte
    a = bitarray(b, endian='big')
    pprint(a)

    # calculate binary code point from binary UTF-8 representation
    # (the asserts below are internal sanity checks on the encoder output:
    # every continuation byte must start with '10')
    if a[0:1] == bitarray('0'):
        # 0xxxxxxx
        c = a[1:8]
        assert len(a) == 8
    elif a[0:3] == bitarray('110'):
        # 110xxxxx 10xxxxxx
        c = a[3:8] + a[10:16]
        assert a[8:10] == bitarray('10')
        assert len(a) == 16
    elif a[0:4] == bitarray('1110'):
        # 1110xxxx 10xxxxxx 10xxxxxx
        c = a[4:8] + a[10:16] + a[18:24]
        assert a[8:10] == a[16:18] == bitarray('10')
        assert len(a) == 24
    elif a[0:5] == bitarray('11110'):
        # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        c = a[5:8] + a[10:16] + a[18:24] + a[26:32]
        assert a[8:10] == a[16:18] == a[24:26] == bitarray('10')
        assert len(a) == 32
    else:
        raise ValueError('invalid UTF-8 leading byte: 0x%02x' % b[0])
    # named `cp` rather than `code_point` to avoid shadowing this function
    cp = ba2int(c)

    print('code point:', hex(cp))
    print()


# Demo: space-separated sample characters whose UTF-8 encodings are
# 1, 2, 3 and 4 bytes long; each one is passed through code_point().
_samples = u'\u0024 \u00a2 \u20ac \ud55c \U00010348 \U0010ffff'
for u in _samples.split():
    code_point(u)