File: test_uutf.ml

package info (click to toggle)
uutf 1.0.4-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220 kB
  • sloc: ml: 1,586; makefile: 29; sh: 18
file content (376 lines) | stat: -rw-r--r-- 15,212 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
(*---------------------------------------------------------------------------
   Copyright (c) 2012 The uutf programmers. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

let u_nl = Uchar.of_int 0x000A
let log f = Format.printf (f ^^ "@?")
let fail fmt =
  let fail _ = failwith (Format.flush_str_formatter ()) in
  Format.kfprintf fail Format.str_formatter fmt

let fail_decode e f =
  fail "expected %a, decoded %a" Uutf.pp_decode e Uutf.pp_decode f

let uchar_succ u = if Uchar.equal u Uchar.max then u else Uchar.succ u
let iter_uchars f =
  for u = 0x0000 to 0xD7FF do f (Uchar.unsafe_of_int u) done;
  for u = 0xE000 to 0x10FFFF do f (Uchar.unsafe_of_int u) done

let codec_test () =
  let codec_uchars encoding s bsize =
    log "Codec every unicode scalar value in %s with buffer size %d.\n%!"
      (Uutf.encoding_to_string encoding) bsize;
    let encode_uchars encoding s bsize =
      let spos = ref 0 in
      let e = Uutf.encoder encoding `Manual in
      let rec encode e v = match Uutf.encode e v with `Ok -> ()
      | `Partial ->
          let brem = Bytes.length s - !spos in
          let drem = Uutf.Manual.dst_rem e in
          let bsize = min bsize brem in
          Uutf.Manual.dst e s !spos bsize;
          spos := !spos + bsize - drem;
          encode e `Await
      in
      let encode_u u = encode e (`Uchar u) in
      iter_uchars encode_u; encode e `End;
      !spos - Uutf.Manual.dst_rem e                       (* encoded length. *)
    in
    let decode_uchars encoding s slen bsize =
      let spos = ref 0 in
      let bsize = min bsize slen in
      let d = Uutf.decoder ~encoding `Manual in
      let rec decode d = match Uutf.decode d with
      | `Malformed _ | `Uchar _ | `End as v -> v
      | `Await ->
          let rem = slen - !spos in
          let bsize = min bsize rem in
          Uutf.Manual.src d s !spos bsize;
          spos := !spos + bsize;
          decode d
      in
      let decode_u u = match decode d with
      | `Uchar u' when u = u' -> ()
      | v -> fail_decode (`Uchar u) v
      in
      iter_uchars decode_u;
      match decode d with
      | `End -> () | v -> fail_decode `End v
    in
    let slen = encode_uchars encoding s bsize in
    decode_uchars encoding s slen bsize
  in
  let full = 4 * 0x10FFFF in        (* will hold everything in any encoding. *)
  let s = Bytes.create full in
  let test encoding =
    (* Test with various sizes to increase condition coverage. *)
    for i = 1 to 11 do codec_uchars encoding s i done;
    codec_uchars encoding s full;
  in
  test `UTF_8; test `UTF_16BE; test `UTF_16LE

let buffer_string_codec_test () =
  let codec_uchars encoding encode decode b =
    log "Buffer/String codec every unicode scalar value in %s.\n%!"
      (Uutf.encoding_to_string encoding);
    Buffer.clear b;
    iter_uchars (encode b);
    let s = Buffer.contents b in
    let check uchar _ = function
    | `Uchar u when Uchar.equal u uchar -> uchar_succ uchar
    | v -> fail_decode (`Uchar uchar) v
    in
    ignore (decode ?pos:None ?len:None check (Uchar.of_int 0x0000) s)
  in
  let b = Buffer.create (4 * 0x10FFFF) in
  codec_uchars `UTF_8 Uutf.Buffer.add_utf_8 Uutf.String.fold_utf_8 b;
  codec_uchars `UTF_16BE Uutf.Buffer.add_utf_16be Uutf.String.fold_utf_16be b;
  codec_uchars `UTF_16LE Uutf.Buffer.add_utf_16le Uutf.String.fold_utf_16le b

let pos_test () =
  let test encoding s =
    log "Test position tracking in %s.\n%!" (Uutf.encoding_to_string encoding);
    let pos d (l, c, k) =
      match Uutf.decoder_line d, Uutf.decoder_col d, Uutf.decoder_count d with
      | (l', c', k') when l = l' && c = c' && k = k' -> ignore (Uutf.decode d)
      | (l', c', k') ->
          fail "Expected position (%d,%d,%d) found (%d,%d,%d)." l c k l' c' k'
    in
    let e = Uutf.decoder ~encoding (`String s) in
    pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3);
    pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 0, 6); pos e (3, 1, 7);
    pos e (3, 2, 8); pos e (4, 0, 9); pos e (4, 0, 10); pos e (5, 0, 11);
    pos e (6, 0, 12); pos e (6, 0, 12); pos e (6, 0, 12);
    let e = Uutf.decoder ~nln:(`ASCII u_nl) ~encoding (`String s) in
    pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3);
    pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7);
    pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10);
    pos e (6, 0, 10);
    let e = Uutf.decoder ~nln:(`NLF u_nl) ~encoding (`String s) in
    pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3);
    pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7);
    pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10);
    pos e (6, 0, 10);
    let e = Uutf.decoder ~nln:(`Readline u_nl) ~encoding (`String s) in
    pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3);
    pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7);
    pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10);
    pos e (6, 0, 10);
  in
  test `UTF_8 "LL\nL\r\nLL\r\n\n\x0C";
  test `UTF_16BE
    "\x00\x4C\x00\x4C\x00\x0A\x00\x4C\x00\x0D\x00\x0A\x00\x4C\x00\x4C\
     \x00\x0D\x00\x0A\x00\x0A\x00\x0C";
  test `UTF_16LE
    "\x4C\x00\x4C\x00\x0A\x00\x4C\x00\x0D\x00\x0A\x00\x4C\x00\x4C\x00\
     \x0D\x00\x0A\x00\x0A\x00\x0C\x00";
  ()

let guess_test () =
  log "Test encoding guessing.\n%!";
  let test (s, enc, removed_bom, seq) =
    let d = Uutf.decoder (`String s) in
    let rec test_seq seq d = match seq, Uutf.decode d with
    | `Uchar u :: vs, `Uchar u' when Uchar.equal u u' -> test_seq vs d
    | `Malformed bs :: vs, `Malformed bs' when bs = bs' -> test_seq vs d
    | [], `End -> ()
    | v :: _, v' -> fail_decode v v'
    | _ , _ -> assert false
    in
    test_seq seq d;
    let guess = Uutf.decoder_encoding d in
    if guess <> enc then fail "expected encoding: %s guessed: %s"
      (Uutf.encoding_to_string enc) (Uutf.encoding_to_string guess);
    let rem_bom = Uutf.decoder_removed_bom d in
    if rem_bom <> removed_bom then
      fail "expected removed bom: %b found: %b" removed_bom rem_bom
  in
  let uchar u = `Uchar (Uchar.unsafe_of_int u) in
  (* UTF-8 guess *)
  test ("", `UTF_8, false, []);
  test ("\xEF", `UTF_8, false, [`Malformed "\xEF";]);
  test ("\xEF\xBB", `UTF_8, false, [`Malformed "\xEF\xBB";]);
  test ("\xEF\xBB\x00", `UTF_8, false, [`Malformed "\xEF\xBB\x00";]);
  test ("\xEF\xBB\xBF\xEF\xBB\xBF", `UTF_8, true, [`Uchar Uutf.u_bom;]);
  test ("\n\r\n", `UTF_8, false, [`Uchar u_nl; uchar 0x0D; `Uchar u_nl;]);
  test ("\n\x80\xEF\xBB\xBF\n", `UTF_8, false,
        [`Uchar u_nl; `Malformed "\x80"; `Uchar Uutf.u_bom; `Uchar u_nl]);
  test ("\n\n\xEF\xBB\x00\n", `UTF_8, false,
        [`Uchar u_nl; `Uchar u_nl; `Malformed "\xEF\xBB\x00"; `Uchar u_nl;]);
  test ("\n\xC8\x99", `UTF_8, false, [`Uchar u_nl; uchar 0x0219;]);
  test ("\xC8\x99\n", `UTF_8, false, [uchar 0x0219; `Uchar u_nl;]);
  test ("\xC8\x99\n\n", `UTF_8, false,
        [uchar 0x0219; `Uchar u_nl; `Uchar u_nl]);
  test ("\xC8\x99\xC8\x99", `UTF_8, false, [uchar 0x0219; uchar 0x0219]);
  test ("\xC8\x99\xF0\x9F\x90\xAB", `UTF_8, false,
        [uchar 0x0219; uchar 0x1F42B]);
  test ("\xF0\x9F\x90\xAB\n", `UTF_8, false, [uchar 0x1F42B; `Uchar u_nl ]);
  (* UTF-16BE guess *)
  test ("\xFE\xFF\xDB\xFF\xDF\xFF\x00\x0A", `UTF_16BE, true,
        [uchar 0x10FFFF; `Uchar u_nl;]);
  test ("\xFE\xFF\xDB\xFF\x00\x0A\x00\x0A", `UTF_16BE, true,
       [`Malformed "\xDB\xFF\x00\x0A"; `Uchar u_nl;]);
  test ("\xFE\xFF\xDB\xFF\xDF", `UTF_16BE, true,
        [`Malformed "\xDB\xFF\xDF";]);
  test ("\x80\x81\xDB\xFF\xDF\xFF\xFE\xFF\xDF\xFF\xDB\xFF", `UTF_16BE, false,
        [uchar 0x8081; uchar 0x10FFFF; `Uchar Uutf.u_bom;
          `Malformed "\xDF\xFF"; `Malformed "\xDB\xFF"]);
  test ("\x80\x81\xDF\xFF\xDB\xFF\xFE", `UTF_16BE, false,
        [uchar 0x8081; `Malformed "\xDF\xFF"; `Malformed "\xDB\xFF\xFE";]);
  test ("\x00\x0A", `UTF_16BE, false, [`Uchar u_nl]);
  test ("\x00\x0A\xDB", `UTF_16BE, false, [`Uchar u_nl; `Malformed "\xDB"]);
  test ("\x00\x0A\xDB\xFF", `UTF_16BE, false,
        [`Uchar u_nl; `Malformed "\xDB\xFF"]);
  test ("\x00\x0A\xDB\xFF\xDF", `UTF_16BE, false,
        [`Uchar u_nl; `Malformed "\xDB\xFF\xDF"]);
  test ("\x00\x0A\xDB\xFF\xDF\xFF", `UTF_16BE, false,
        [`Uchar u_nl; uchar 0x10FFFF]);
  test ("\x00\x0A\x00\x0A", `UTF_16BE, false,
        [`Uchar u_nl; `Uchar u_nl]);
  (* UTF-16LE guess *)
  test ("\xFF\xFE\xFF\xDB\xFF\xDF\x0A\x00", `UTF_16LE, true,
        [uchar 0x10FFFF; `Uchar u_nl;]);
  test ("\xFF\xFE\xFF\xDB\x0A\x00\x0A\x00", `UTF_16LE, true,
       [`Malformed "\xFF\xDB\x0A\x00"; `Uchar u_nl;]);
  test ("\xFF\xFE\xFF\xDB\xDF", `UTF_16LE, true,
        [`Malformed "\xFF\xDB\xDF";]);
  test ("\x0A\x00", `UTF_16LE, false, [`Uchar u_nl]);
  test ("\x0A\x00\xDB", `UTF_16LE, false, [`Uchar u_nl; `Malformed "\xDB"]);
  test ("\x0A\x00\xFF\xDB", `UTF_16LE, false,
        [`Uchar u_nl; `Malformed "\xFF\xDB"]);
  test ("\x0A\x00\xFF\xDB\xDF", `UTF_16LE, false,
        [`Uchar u_nl; `Malformed "\xFF\xDB\xDF"]);
  test ("\x0A\x00\xFF\xDB\xFF\xDF", `UTF_16LE, false,
        [`Uchar u_nl; uchar 0x10FFFF]);
  test ("\x0A\x00\x0A\x00", `UTF_16LE, false,
        [`Uchar u_nl; `Uchar u_nl]);
  ()

let test_sub () =
  log "Test Uutf.String.fold_utf_8 substring";
  let trip fold ~pos ~len s =
    let b = Buffer.create 100 in
    let add _ _ = function
    | `Uchar u -> Uutf.Buffer.add_utf_8 b u
    | `Malformed _ -> assert false
    in
    fold ?pos:(Some pos) ?len:(Some len) add () s;
    assert (String.sub s pos len = Buffer.contents b);
  in
  trip Uutf.String.fold_utf_8 ~pos:4 ~len:4 "hop hap mop";
  trip Uutf.String.fold_utf_8 ~pos:0 ~len:1 "hop hap mop";
  trip Uutf.String.fold_utf_8 ~pos:2 ~len:1 "hop";
  ()

module Int = struct type t = int let compare : int -> int -> int = compare end
module Umap = Map.Make (Uchar)
module Bmap = Map.Make (Bytes)

(* Constructs from the specification, the map from uchars to their valid
   UTF-8 byte sequence and the map reverse map from valid UTF-8 byte sequences
   to their uchar.  *)
let utf8_maps () =
  log "Building UTF-8 codec maps from specification.\n";
  let spec = [        (* UTF-8 byte sequences cf. table 3.7 p. 94 Unicode 6. *)
    (0x0000,0x007F),     [|(0x00,0x7F)|];
    (0x0080,0x07FF),     [|(0xC2,0xDF); (0x80,0xBF)|];
    (0x0800,0x0FFF),     [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
    (0x1000,0xCFFF),     [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
    (0xD000,0xD7FF),     [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
    (0xE000,0xFFFF),     [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
    (0x10000,0x3FFFF),   [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
    (0x40000,0xFFFFF),   [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
    (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]
  in
  let add_range (umap, bmap) ((umin, umax), bytes) =
    let len = Array.length bytes in
    let bmin i = if i < len then fst bytes.(i) else max_int in
    let bmax i = if i < len then snd bytes.(i) else min_int in
    let umap = ref umap in
    let bmap = ref bmap in
    let uchar = ref umin in
    let buf = Bytes.create len in
    let add len' =
      if len <> len' then () else
      begin
        let bytes = Bytes.copy buf in
        let u = Uchar.of_int !uchar in
        umap := Umap.add u bytes !umap;
        bmap := Bmap.add bytes u !bmap;
        incr uchar;
      end
    in
    for b0 = bmin 0 to bmax 0 do
      Bytes.unsafe_set buf 0 (Char.chr b0);
      for b1 = bmin 1 to bmax 1 do
        Bytes.unsafe_set buf 1 (Char.chr b1);
        for b2 = bmin 2 to bmax 2 do
          Bytes.unsafe_set buf 2 (Char.chr b2);
          for b3 = bmin 3 to bmax 3 do
            Bytes.unsafe_set buf 3 (Char.chr b3);
            add 4;
          done;
          add 3;
        done;
        add 2;
      done;
      add 1;
    done;
    assert (!uchar - 1 = umax);
    (!umap, !bmap)
  in
  List.fold_left add_range (Umap.empty, Bmap.empty)  spec

let utf8_encode_test umap =
  log "Testing UTF-8 encoding of every unicode scalar value against spec.\n";
  let buf = Buffer.create 4 in
  let test u =
    let u = Uchar.unsafe_of_int u in
    let bytes = try Umap.find u umap with Not_found -> assert false in
    let bytes = Bytes.unsafe_to_string bytes in
    Buffer.clear buf; Uutf.Buffer.add_utf_8 buf u;
    if bytes = Buffer.contents buf then () else
    fail "UTF-8 encoding error (U+%04X)" (Uchar.to_int u)
  in
  for i = 0x0000 to 0xD7FF do test i done;
  for i = 0xE000 to 0x10FFFF do test i done

let utf8_decode_test bmap =
  log "Testing the UTF-8 decoding of all <= 4 bytes sequences (be patient).\n";
  let spec seq = try `Uchar (Bmap.find seq bmap) with
  | Not_found -> `Malformed (Bytes.unsafe_to_string seq)
  in
  let test seq =
    let sseq = Bytes.unsafe_to_string seq in
    let dec = List.rev (Uutf.String.fold_utf_8 (fun a _ c -> c :: a) [] sseq) in
    match spec seq, dec with
    | `Uchar u, [ `Uchar u' ] when u = u' -> `Decoded
    | `Malformed _, (`Malformed _) :: _ -> `Malformed
    | v, v' :: _ -> fail_decode v v'
    | _ -> fail "This should not have happened on specification '%S'." sseq
  in
  let s1 = Bytes.create 1
  and s2 = Bytes.create 2
  and s3 = Bytes.create 3
  and s4 = Bytes.create 4
  in
  for b0 = 0x00 to 0xFF do
    Bytes.unsafe_set s1 0 (Char.unsafe_chr b0);
    if test s1 = `Decoded then ()
    else begin
      Bytes.unsafe_set s2 0 (Char.unsafe_chr b0);
      for b1 = 0x00 to 0xFF do
        Bytes.unsafe_set s2 1 (Char.unsafe_chr b1);
	if test s2 = `Decoded then ()
        else begin
          Bytes.unsafe_set s3 0 (Char.unsafe_chr b0);
          Bytes.unsafe_set s3 1 (Char.unsafe_chr b1);
	  for b2 = 0x00 to 0xFF do
            Bytes.unsafe_set s3 2 (Char.unsafe_chr b2);
	    if test s3 = `Decoded then ()
            else begin
              Bytes.unsafe_set s4 0 (Char.unsafe_chr b0);
              Bytes.unsafe_set s4 1 (Char.unsafe_chr b1);
              Bytes.unsafe_set s4 2 (Char.unsafe_chr b2);
	      for b3 = 0x00 to 0xFF do
                Bytes.unsafe_set s4 3 (Char.unsafe_chr b3);
		ignore (test s4)
	      done;
	    end
	  done;
	end
      done;
    end
  done

let utf8_test () =                             (* Proof by exhaustiveness... *)
  let umap, bmap = utf8_maps () in
  utf8_encode_test umap;
(*  utf8_decode_test bmap; *)                        (* too long, commented. *)
  ()

let is_uchar_test () =
  log "Testing Uchar.is_valid.\n";
  let test cp expected =
    let is = Uchar.is_valid cp in
    if is <> expected then
    fail "Uutf.is_uchar %04X = %b, expected %b" cp is expected
  in
  for cp = 0x0000 to 0xD7FF do test cp true done;
  for cp = 0xD800 to 0xDFFF do test cp false done;
  for cp = 0xE000 to 0x10FFFF do test cp true done;
  for cp = 0x110000 to 0x120000 do test cp false done

let test () =
  Printexc.record_backtrace true;
  codec_test ();
  buffer_string_codec_test ();
  pos_test ();
  guess_test ();
  test_sub ();
  utf8_test ();
  is_uchar_test ();
  log "All tests succeeded.\n"

let () = if not (!Sys.interactive) then test ()