File: utf8.py

package info (click to toggle)
felix 1.1.1-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 4,992 kB
  • ctags: 1,178
  • sloc: python: 7,260; makefile: 408; sh: 58
file content (65 lines) | stat: -rw-r--r-- 1,853 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#line 63 "interscript/src/utf8.ipk"
def utf8(i):
  """Encode the integer code value i as a UTF-8 style byte sequence.

  The original (pre-RFC 3629) scheme of up to six bytes is used, so
  values up to 31 bits are representable.  The result is a string
  holding one character per encoded byte; the ordinal of each character
  is the byte value.

  A negative i falls into the first branch and makes chr() raise
  ValueError.  Bits above bit 30 are masked off and silently dropped
  by the six-byte form.
  """
  # The thresholds are plain integer literals: the old 'L' (long)
  # suffix is not needed for values this small and is a syntax error
  # on Python 3.
  if i < 0x80:
    # 7 bits: 0xxxxxxx
    return chr(i)
  if i < 0x800:
    # 11 bits: 110xxxxx 10xxxxxx
    return chr(0xC0 | ((i>>6) & 0x1F))+\
      chr(0x80 | (i & 0x3F))
  if i < 0x10000:
    # 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
    return chr(0xE0 | ((i>>12) & 0xF))+\
      chr(0x80 | ((i>>6) & 0x3F))+\
      chr(0x80 | (i & 0x3F))
  if i < 0x200000:
    # 21 bits: 11110xxx + 3 continuation bytes
    return chr(0xF0 | ((i>>18) & 0x7))+\
      chr(0x80 | ((i>>12) & 0x3F))+\
      chr(0x80 | ((i>>6) & 0x3F))+\
      chr(0x80 | (i & 0x3F))
  if i < 0x4000000:
    # 26 bits: 111110xx + 4 continuation bytes
    return chr(0xF8 | ((i>>24) & 0x3))+\
      chr(0x80 | ((i>>18) & 0x3F))+\
      chr(0x80 | ((i>>12) & 0x3F))+\
      chr(0x80 | ((i>>6) & 0x3F))+\
      chr(0x80 | (i & 0x3F))
  # 31 bits: 1111110x + 5 continuation bytes
  return chr(0xFC | ((i>>30) & 0x1))+\
    chr(0x80 | ((i>>24) & 0x3F))+\
    chr(0x80 | ((i>>18) & 0x3F))+\
    chr(0x80 | ((i>>12) & 0x3F))+\
    chr(0x80 | ((i>>6) & 0x3F))+\
    chr(0x80 | (i & 0x3F))

def seq_to_utf8(a):
  """Return the concatenation of utf8(ch) for every code value ch in a.

  a is any iterable of integer code values; an empty iterable gives ''.
  """
  # One join instead of repeated 's = s + ...', which re-copies the
  # accumulated string on every iteration.
  return ''.join([utf8(ch) for ch in a])

def parse_utf8(s,i):
  """Decode one UTF-8 style sequence from s starting at index i.

  s is a string holding one character per byte.  Returns a tuple
  (value, next_index) where value is the decoded integer and
  next_index is the index just past the consumed bytes.  Sequences of
  one to six bytes are recognised.

  A lead byte that matches no known pattern (a stray continuation byte,
  0xFE or 0xFF) is returned unchanged as the value and a single byte is
  consumed.  Continuation bytes are not validated: only their low six
  bits are used.  If s ends before the sequence is complete, the
  IndexError raised by indexing s propagates to the caller.
  """
  lead = ord(s[i])
  if lead & 0x80 == 0:
    return lead & 0x7F,i+1 # ASCII

  # (marker mask, marker bits, payload mask, total length) per lead-byte
  # form.  The payload mask must exclude every marker bit: masking all
  # forms with 0x1F would leak the 0x10/0x18/0x1C marker bits of the
  # 4-, 5- and 6-byte leads into the decoded value.
  forms = (
    (0xE0, 0xC0, 0x1F, 2),  # 110xxxxx
    (0xF0, 0xE0, 0x0F, 3),  # 1110xxxx
    (0xF8, 0xF0, 0x07, 4),  # 11110xxx
    (0xFC, 0xF8, 0x03, 5),  # 111110xx
    (0xFE, 0xFC, 0x01, 6),  # 1111110x
  )
  for marker_mask, marker, payload_mask, length in forms:
    if lead & marker_mask == marker:
      value = lead & payload_mask
      # Each continuation byte contributes its low six bits.
      for offset in range(1, length):
        value = (value << 6) | (ord(s[i+offset]) & 0x3F)
      return value, i+length
  return lead, i+1 # error, just use bad character