File: test.ml

package info (click to toggle)
js-of-ocaml 5.9.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,020 kB
  • sloc: ml: 91,250; javascript: 57,289; ansic: 315; makefile: 271; lisp: 23; sh: 6; perl: 4
file content (275 lines) | stat: -rw-r--r-- 10,751 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
(* TEST
*)

(* UTF codec tests *)

let fold_uchars f acc =
  let rec loop f acc u =
    let acc = f acc u in
    if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
  in
  loop f acc Uchar.min

(* This tests that we encode and decode each character according
   to its specification. *)

let utf_8_spec =
  (* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *)
  [(0x0000,0x007F),     [|(0x00,0x7F)|];
   (0x0080,0x07FF),     [|(0xC2,0xDF); (0x80,0xBF)|];
   (0x0800,0x0FFF),     [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
   (0x1000,0xCFFF),     [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
   (0xD000,0xD7FF),     [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
   (0xE000,0xFFFF),     [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
   (0x10000,0x3FFFF),   [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
   (0x40000,0xFFFFF),   [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
   (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]

let utf_16be_spec =
  (* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *)
  [(0x0000,0xD7FF),    [|(0x00,0xD7); (0x00,0xFF)|];
   (0xE000,0xFFFF),    [|(0xE0,0xFF); (0x00,0xFF)|];
   (0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]]

let uchar_map_of_spec spec =
  (* array mapping Uchar.t as ints to byte sequences according to [spec]. *)
  let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in
  let add_range ((umin, umax), bytes) =
    let len = Array.length bytes in
    let bmin i = if i < len then fst bytes.(i) else max_int in
    let bmax i = if i < len then snd bytes.(i) else min_int in
    let uchar = ref umin in
    let buf = Bytes.create len in
    let add len' = match len = len' with
    | false -> ()
    | true -> map.(!uchar) <- Bytes.copy buf; incr uchar
    in
    for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0;
      for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1;
        for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2;
          for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4
          done; add 3;
        done; add 2;
      done; add 1;
    done; assert (!uchar - 1 = umax)
  in
  List.iter add_range spec;
  map

let uchar_map_get u map = map.(Uchar.to_int u)
let utf_8 = uchar_map_of_spec utf_8_spec
let utf_16be = uchar_map_of_spec utf_16be_spec
let utf_16le =
  let swap u b =
    let len = Bytes.length b in
    if len = 0 then () else
    for i = 0 to Bytes.length b / 2 - 1 do
      let j = i * 2 in
      Bytes.set_uint16_le b j (Bytes.get_uint16_be b j);
    done;
  in
  let map = Array.map Bytes.copy utf_16be in
  Array.iteri swap map; map

let test_utf utf utf_len get_utf set_utf utf_is_valid =
  (* Test codec and validation of each Uchar.t against the spec. *)
  let f () u =
    let utf_len = utf_len u in
    let buf = Bytes.create utf_len in
    assert (set_utf buf 0 u = utf_len);
    assert (Bytes.equal buf (uchar_map_get u utf));
    assert (Bytes.equal buf (uchar_map_get u utf));
    let dec = get_utf buf 0 in
    assert (Uchar.utf_decode_is_valid dec);
    assert (Uchar.utf_decode_length dec = utf_len);
    assert (Uchar.equal (Uchar.utf_decode_uchar dec) u);
    assert (utf_is_valid buf);
    ()
  in
  fold_uchars f ()

let () =
  test_utf utf_8 Uchar.utf_8_byte_length
    Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8

let () =
  test_utf utf_16be Uchar.utf_16_byte_length
    Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be

let () =
  test_utf utf_16le Uchar.utf_16_byte_length
    Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le

let () =
  (* Test out of bounds *)
  let raises f = assert (try f (); false with Invalid_argument _ -> true) in
  (raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min);
  (raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min);
  (raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min);
  ()

let () =
  (* Test lack of space encodes *)
  let b = Bytes.make 1 '\xab' in
  assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  ()

let () =
  (* Test bug found during review *)
  let b = Bytes.create 2 in
  let () = Bytes.set_uint8 b 0 0xC3 in
  let () = Bytes.set_uint8 b 1 0x00 in
  assert (not (Bytes.is_valid_utf_8 b))

let () =
  (* Test used bytes and replacement according to WHATWG recommendation.
     This is just a recommendation.
     These examples are from TUS p. 126-127 Unicode 14  *)
  let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in
  let ok i = i = Bytes.length b - 1 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
    end
  done;
  let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in
  let ok i = i = Bytes.length b - 1 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
    end
  done;
  let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in
  let ok i = i = 5 || i = 8 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec)
                (Uchar.of_char (Bytes.get b i)))
    end
  done;
  let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in
  let d0 = Bytes.get_utf_8_uchar b 0 in
  assert (Uchar.utf_decode_is_valid d0 = false);
  assert (Uchar.utf_decode_length d0 = 2);
  assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep);
  let d2 = Bytes.get_utf_8_uchar b 2 in
  assert (Uchar.utf_decode_is_valid d2 = false);
  assert (Uchar.utf_decode_length d2 = 1);
  assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep);
  let d3 = Bytes.get_utf_8_uchar b 3 in
  assert (Uchar.utf_decode_is_valid d3 = false);
  assert (Uchar.utf_decode_length d3 = 3);
  assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep);
  let d6 = Bytes.get_utf_8_uchar b 6 in
  assert (Uchar.utf_decode_is_valid d6 = false);
  assert (Uchar.utf_decode_length d6 = 2);
  assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep);
  let d8 = Bytes.get_utf_8_uchar b 8 in
  assert (Uchar.utf_decode_length d8 = 1);
  assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041));
  ()

let () = Printf.printf "All UTF tests passed!\n"

(* This is a very long test added here for reference just in case. It
   is not run.

   It assumes the good encoding and decodes have been checked by test_utf
   above. It exhaustively tests all 1-4 bytes invalid sequences for decodes.
   This ensures we do not decode invalid sequence to uchars. *)

let test_invalid_decodes () =
  let module Sset = Set.Make (String) in
  let utf_8_encs, utf_16be_encs, utf_16le_encs =
    Printf.printf "Building encoding sequence sets\n%!";
    let add (set8, set16be, set16le) u =
      let s = Bytes.unsafe_to_string in
      let e8 = Bytes.create (Uchar.utf_8_byte_length u) in
      let e16be = Bytes.create (Uchar.utf_16_byte_length u) in
      let e16le = Bytes.create (Uchar.utf_16_byte_length u) in
      ignore (Bytes.set_utf_8_uchar e8 0 u);
      ignore (Bytes.set_utf_16be_uchar e16be 0 u);
      ignore (Bytes.set_utf_16le_uchar e16le 0 u);
      Sset.add (s e8) set8,
      Sset.add (s e16be) set16be,
      Sset.add (s e16le) set16le
    in
    fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
  in
  let test_seqs utf utf_encs get_utf_char is_valid_utf =
    let test seq =
      let dec = get_utf_char seq 0 in
      let valid = Uchar.utf_decode_is_valid dec in
      let is_valid = is_valid_utf seq in
      let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
      if not ((valid && is_enc) || (not valid && not is_enc)) ||
         not ((is_valid && is_enc) || (not is_valid && not is_enc))
      then begin
        for i = 0 to Bytes.length seq - 1 do
          Printf.printf "%02X " (Bytes.get_uint8 seq i);
        done;
        Printf.printf "valid: %b is_encoding: %b decode: U+%04X\n is_valid:%b"
          valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) is_valid;
        assert false
      end;
      valid
    in
    let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in
    let s1 = Bytes.create 1 and s2 = Bytes.create 2
    and s3 = Bytes.create 3 and s4 = Bytes.create 4 in
    Printf.printf "Testing %s invalid decodes...\n%!" utf;
    for b0 = 0x00 to 0xFF do
      set s1 0 b0;
      if test s1 then ((* this prefix decoded, stop here *)) else begin
        set s2 0 b0;
        for b1 = 0x00 to 0xFF do
          set s2 1 b1;
          if test s2 then ((* this prefix decoded, stop here *)) else begin
            set s3 0 b0;
            set s3 1 b1;
            for b2 = 0x00 to 0xFF do
              set s3 2 b2;
              if test s3 then ((* this prefix decoded, stop here *)) else begin
                set s4 0 b0;
                set s4 1 b1;
                set s4 2 b2;
                for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done;
              end
            done;
          end
        done;
      end
    done
  in
  test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
  test_seqs "UTF-16BE"
    utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
  test_seqs "UTF-16LE" utf_16le_encs Bytes.get_utf_16le_uchar
    Bytes.is_valid_utf_16le;
  ()