File: utf8.ml

package info (click to toggle)
ulex 0.5-3
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 156 kB
  • ctags: 260
  • sloc: ml: 1,070; makefile: 73; sh: 50
file content (184 lines) | stat: -rw-r--r-- 6,276 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
(** Raised by the functions of this module when a byte sequence is not
    accepted as UTF-8, or when a code point cannot be encoded (surrogate
    or above 0x10ffff). *)
exception MalFormed

(** [width.(b)] is the total length in bytes of a UTF-8 sequence whose
    first byte is [b], or [-1] when [b] cannot start a sequence.

    Only lead bytes up to 0xF4 (244) get width 4: the decoders of this
    module reject 0xF5 and above, so those bytes are left at [-1]. *)
let width = Array.make 256 (-1)
let () =
  (* 0x00..0x7F: single-byte (ASCII) *)
  for i = 0 to 127 do width.(i) <- 1 done;
  (* 0xC0..0xDF: lead byte of a two-byte sequence *)
  for i = 192 to 223 do width.(i) <- 2 done;
  (* 0xE0..0xEF: lead byte of a three-byte sequence *)
  for i = 224 to 239 do width.(i) <- 3 done;
  (* 0xF0..0xF4: lead byte of a four-byte sequence *)
  for i = 240 to 244 do width.(i) <- 4 done

(** [next s i] decodes the UTF-8 sequence starting at byte offset [i] of
    [s] and returns the code point as an integer.

    The lead bytes 0xE0, 0xED, 0xF0 and 0xF4 get a dedicated case because
    their second byte is restricted to a sub-range (to exclude overlong
    three/four-byte forms, surrogates, and values above 0x10ffff).
    Two-byte sequences starting with 0xC0 or 0xC1 are not rejected.

    @raise MalFormed if the lead byte is not accepted or a following byte
    is outside its allowed range.

    A sequence that runs past the end of [s] fails on the string access
    itself (normally [Invalid_argument]), not with [MalFormed]. *)
let next s i =
  match s.[i] with
    | '\000'..'\127' as c ->
        Char.code c
    | '\192'..'\223' as c ->
        let n1 = Char.code c in
        let n2 = Char.code s.[i+1] in
        if n2 lsr 6 <> 0b10 then raise MalFormed;
        ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
    | '\224' ->
        (* 0xE0: second byte must be 0xA0..0xBF *)
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        if (n2 lsr 5 <> 0b101) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | ('\225'..'\236' | '\238'..'\239') as c ->
        let n1 = Char.code c in
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        if (n2 lsr 6 <> 0b10) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | '\237' ->
        (* 0xED: second byte must be 0x80..0x9F, which keeps the result
           below 0xd800 *)
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        if (n2 lsr 5 <> 0b100) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        0xd000 lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | '\240' ->
        (* 0xF0: second byte must be 0x90..0xBF.  A test on the high
           nibble alone would only accept 0x90..0x9F and reject valid
           sequences such as F0 A0 80 80 (U+20000). *)
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        let n4 = Char.code s.[i+3] in
        if (n2 < 0x90) || (n2 > 0xbf) || (n3 lsr 6 <> 0b10)
           || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
    | '\241'..'\243' as c ->
        let n1 = Char.code c in
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        let n4 = Char.code s.[i+3] in
        if (n2 lsr 6 <> 0b10) || (n3 lsr 6 <> 0b10) || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
        ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
    | '\244' ->
        (* 0xF4: second byte must be 0x80..0x8F, which caps the result at
           0x10ffff *)
        let n2 = Char.code s.[i+1] in
        let n3 = Char.code s.[i+2] in
        let n4 = Char.code s.[i+3] in
        if (n2 lsr 4 <> 0b1000) || (n3 lsr 6 <> 0b10) || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        0x100000 lor ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor
        (n4 land 0x3f)
    | _ -> raise MalFormed


(** [from_stream s] removes one UTF-8 encoded code point from the
    character stream [s] and returns it as an integer.

    Following bytes are read in stream order: [n2] is the byte right
    after the lead byte, then [n3], then [n4].  Exactly as many bytes as
    the lead byte announces are consumed.

    With this implementation, a truncated code point will result
    in [Stream.Failure], not in [MalFormed].  Bytes already taken from
    the stream when an exception is raised are not put back.

    @raise MalFormed if the lead byte is not accepted or a following byte
    is outside its allowed range.
    @raise Stream.Failure if the stream ends before the sequence is
    complete (including when it is empty on entry). *)
let from_stream s =
  match Stream.next s with
    | '\000'..'\127' as c ->
        Char.code c
    | '\192'..'\223' as c ->
        let n1 = Char.code c in
        let n2 = Char.code (Stream.next s) in
        if n2 lsr 6 <> 0b10 then raise MalFormed;
        ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
    | '\224' ->
        (* 0xE0: second byte must be 0xA0..0xBF *)
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        if (n2 lsr 5 <> 0b101) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | ('\225'..'\236' | '\238'..'\239') as c ->
        let n1 = Char.code c in
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        if (n2 lsr 6 <> 0b10) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | '\237' ->
        (* 0xED: second byte must be 0x80..0x9F, which keeps the result
           below 0xd800 *)
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        if (n2 lsr 5 <> 0b100) || (n3 lsr 6 <> 0b10) then raise MalFormed;
        0xd000 lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
    | '\240' ->
        (* 0xF0: exactly three following bytes; the second byte of the
           sequence must be 0x90..0xBF *)
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        let n4 = Char.code (Stream.next s) in
        if (n2 < 0x90) || (n2 > 0xbf) || (n3 lsr 6 <> 0b10)
           || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
    | '\241'..'\243' as c ->
        let n1 = Char.code c in
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        let n4 = Char.code (Stream.next s) in
        if (n2 lsr 6 <> 0b10) || (n3 lsr 6 <> 0b10) || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
        ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
    | '\244' ->
        (* 0xF4: second byte must be 0x80..0x8F, which caps the result at
           0x10ffff *)
        let n2 = Char.code (Stream.next s) in
        let n3 = Char.code (Stream.next s) in
        let n4 = Char.code (Stream.next s) in
        if (n2 lsr 4 <> 0b1000) || (n3 lsr 6 <> 0b10) || (n4 lsr 6 <> 0b10)
        then raise MalFormed;
        0x100000 lor ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor
        (n4 land 0x3f)
    | _ -> raise MalFormed



(** [compute_len s pos bytes] counts the code points encoded in the
    [bytes] bytes of [s] starting at offset [pos].

    Only lead bytes are inspected (through the [width] table); the bytes
    in between are skipped without being checked.

    @raise MalFormed if a byte in lead position has no positive width, or
    if the last sequence does not end exactly at [pos + bytes]. *)
let compute_len s pos bytes =
  let stop = pos + bytes in
  let count = ref 0 in
  let cursor = ref pos in
  while !cursor < stop do
    let step = width.(Char.code s.[!cursor]) in
    if step <= 0 then raise MalFormed;
    incr count;
    cursor := !cursor + step
  done;
  (* Overshooting [stop] means the final sequence was cut short. *)
  if !cursor = stop then !count else raise MalFormed

(** [blit_to_int s spos a apos n] decodes [n] consecutive code points from
    [s], starting at byte offset [spos], and writes them into
    [a.(apos)], [a.(apos + 1)], ...  Does nothing when [n <= 0].

    Each code point is decoded with [next]; the source offset is then
    advanced by the [width] of the lead byte just decoded. *)
let blit_to_int s spos a apos n =
  let src = ref spos in
  for k = 1 to n do
    a.(apos + k - 1) <- next s !src;
    src := !src + width.(Char.code s.[!src])
  done

(** [to_int_array s pos bytes] decodes the [bytes] bytes of [s] starting
    at offset [pos] and returns the code points as a fresh array.

    The number of code points is first obtained with [compute_len], then
    the array is filled by [blit_to_int]; exceptions raised by either
    propagate to the caller. *)
let to_int_array s pos bytes =
  let n = compute_len s pos bytes in
  (* [Array.make], as used for [width], instead of the deprecated
     [Array.create] alias. *)
  let a = Array.make n 0 in
  blit_to_int s pos a 0 n;
  a

(**************************)

(** [width_code_point p] is the number of bytes (1 to 4) selected for
    code point [p] by its magnitude alone: up to 0x7f, 0x7ff, 0xffff and
    0x10ffff respectively.  No lower bound or surrogate check is made.

    @raise MalFormed if [p] is greater than 0x10ffff. *)
let width_code_point p =
  if p > 0x10ffff then raise MalFormed
  else if p > 0xffff then 4
  else if p > 0x7ff then 3
  else if p > 0x7f then 2
  else 1

(** [store b p] appends the UTF-8 encoding of code point [p] to the
    buffer [b], using one to four bytes depending on the magnitude of [p].

    @raise MalFormed if [p] is in the surrogate range 0xd800..0xdfff or
    is greater than 0x10ffff.

    A negative [p] takes the first branch and is handed to [Char.chr],
    which rejects it with [Invalid_argument]. *)
let store b p =
  if p <= 0x7f then
    Buffer.add_char b (Char.chr p)
  else if p <= 0x7ff then (
    Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6)));
    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
  )
  else if p <= 0xffff then (
    (* Surrogates are rejected before anything is written to [b]. *)
    if p >= 0xd800 && p < 0xe000 then raise MalFormed;
    Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12)));
    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
  )
  else if p <= 0x10ffff then (
    Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18)));
    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f)));
    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6)  land 0x3f)));
    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
  )
  else raise MalFormed


(** [from_int_array a apos len] encodes the [len] code points
    [a.(apos)], ..., [a.(apos + len - 1)] with [store] and returns the
    resulting string.  The result is empty when [len <= 0].

    The buffer is created with room for four bytes per code point, the
    longest encoding [store] produces. *)
let from_int_array a apos len =
  let buf = Buffer.create (len * 4) in
  for k = 1 to len do
    store buf a.(apos + k - 1)
  done;
  Buffer.contents buf

(** [stream_from_char_stream s] builds a stream of code points that are
    decoded on demand from the character stream [s] with [from_stream].

    The resulting stream ends as soon as [from_stream] raises
    [Stream.Failure]; any other exception from [from_stream] is not
    caught here. *)
let stream_from_char_stream s =
  let pull _ =
    try Some (from_stream s)
    with Stream.Failure -> None
  in
  Stream.from pull