File: test.ml

package info (click to toggle)
js-of-ocaml 5.9.1-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 32,020 kB
sloc: ml: 91,250; javascript: 57,289; ansic: 315; makefile: 271; lisp: 23; sh: 6; perl: 4
file content (275 lines) | stat: -rw-r--r-- 10,751 bytes
(* TEST
*)

(* UTF codec tests *)

let fold_uchars f acc =
  let rec loop f acc u =
    let acc = f acc u in
    if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
  in
  loop f acc Uchar.min

(* This tests that we encode and decode each character according
   to its specification. *)

let utf_8_spec =
  (* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *)
  [(0x0000,0x007F),     [|(0x00,0x7F)|];
   (0x0080,0x07FF),     [|(0xC2,0xDF); (0x80,0xBF)|];
   (0x0800,0x0FFF),     [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
   (0x1000,0xCFFF),     [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
   (0xD000,0xD7FF),     [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
   (0xE000,0xFFFF),     [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
   (0x10000,0x3FFFF),   [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
   (0x40000,0xFFFFF),   [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
   (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]

let utf_16be_spec =
  (* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *)
  [(0x0000,0xD7FF),    [|(0x00,0xD7); (0x00,0xFF)|];
   (0xE000,0xFFFF),    [|(0xE0,0xFF); (0x00,0xFF)|];
   (0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]]

let uchar_map_of_spec spec =
  (* array mapping Uchar.t as ints to byte sequences according to [spec]. *)
  let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in
  let add_range ((umin, umax), bytes) =
    let len = Array.length bytes in
    let bmin i = if i < len then fst bytes.(i) else max_int in
    let bmax i = if i < len then snd bytes.(i) else min_int in
    let uchar = ref umin in
    let buf = Bytes.create len in
    let add len' = match len = len' with
    | false -> ()
    | true -> map.(!uchar) <- Bytes.copy buf; incr uchar
    in
    for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0;
      for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1;
        for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2;
          for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4
          done; add 3;
        done; add 2;
      done; add 1;
    done; assert (!uchar - 1 = umax)
  in
  List.iter add_range spec;
  map

let uchar_map_get u map = map.(Uchar.to_int u)
let utf_8 = uchar_map_of_spec utf_8_spec
let utf_16be = uchar_map_of_spec utf_16be_spec
let utf_16le =
  let swap u b =
    let len = Bytes.length b in
    if len = 0 then () else
    for i = 0 to Bytes.length b / 2 - 1 do
      let j = i * 2 in
      Bytes.set_uint16_le b j (Bytes.get_uint16_be b j);
    done;
  in
  let map = Array.map Bytes.copy utf_16be in
  Array.iteri swap map; map

let test_utf utf utf_len get_utf set_utf utf_is_valid =
  (* Test codec and validation of each Uchar.t against the spec. *)
  let f () u =
    let utf_len = utf_len u in
    let buf = Bytes.create utf_len in
    assert (set_utf buf 0 u = utf_len);
    assert (Bytes.equal buf (uchar_map_get u utf));
    assert (Bytes.equal buf (uchar_map_get u utf));
    let dec = get_utf buf 0 in
    assert (Uchar.utf_decode_is_valid dec);
    assert (Uchar.utf_decode_length dec = utf_len);
    assert (Uchar.equal (Uchar.utf_decode_uchar dec) u);
    assert (utf_is_valid buf);
    ()
  in
  fold_uchars f ()

let () =
  test_utf utf_8 Uchar.utf_8_byte_length
    Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8

let () =
  test_utf utf_16be Uchar.utf_16_byte_length
    Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be

let () =
  test_utf utf_16le Uchar.utf_16_byte_length
    Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le

let () =
  (* Test out of bounds *)
  let raises f = assert (try f (); false with Invalid_argument _ -> true) in
  (raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min);
  (raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min);
  (raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0);
  (raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min);
  ()

let () =
  (* Test lack of space encodes *)
  let b = Bytes.make 1 '\xab' in
  assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
  ()

let () =
  (* Test bug found during review *)
  let b = Bytes.create 2 in
  let () = Bytes.set_uint8 b 0 0xC3 in
  let () = Bytes.set_uint8 b 1 0x00 in
  assert (not (Bytes.is_valid_utf_8 b))

let () =
  (* Test used bytes and replacement according to WHATWG recommendation.
     This is just a recommendation.
     These examples are from TUS p. 126-127 Unicode 14  *)
  let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in
  let ok i = i = Bytes.length b - 1 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
    end
  done;
  let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in
  let ok i = i = Bytes.length b - 1 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
    end
  done;
  let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in
  let ok i = i = 5 || i = 8 in
  for i = 0 to Bytes.length b - 1 do
    let dec = Bytes.get_utf_8_uchar b i in
    if not (ok i) then begin
      assert (Uchar.utf_decode_is_valid dec = false);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
    end else begin
      assert (Uchar.utf_decode_is_valid dec = true);
      assert (Uchar.utf_decode_length dec = 1);
      assert (Uchar.equal (Uchar.utf_decode_uchar dec)
                (Uchar.of_char (Bytes.get b i)))
    end
  done;
  let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in
  let d0 = Bytes.get_utf_8_uchar b 0 in
  assert (Uchar.utf_decode_is_valid d0 = false);
  assert (Uchar.utf_decode_length d0 = 2);
  assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep);
  let d2 = Bytes.get_utf_8_uchar b 2 in
  assert (Uchar.utf_decode_is_valid d2 = false);
  assert (Uchar.utf_decode_length d2 = 1);
  assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep);
  let d3 = Bytes.get_utf_8_uchar b 3 in
  assert (Uchar.utf_decode_is_valid d3 = false);
  assert (Uchar.utf_decode_length d3 = 3);
  assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep);
  let d6 = Bytes.get_utf_8_uchar b 6 in
  assert (Uchar.utf_decode_is_valid d6 = false);
  assert (Uchar.utf_decode_length d6 = 2);
  assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep);
  let d8 = Bytes.get_utf_8_uchar b 8 in
  assert (Uchar.utf_decode_length d8 = 1);
  assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041));
  ()

let () = Printf.printf "All UTF tests passed!\n"

(* This is a very long test added here for reference just in case. It
   is not run.

   It assumes the good encoding and decodes have been checked by test_utf
   above. It exhaustively tests all 1-4 bytes invalid sequences for decodes.
   This ensures we do not decode invalid sequence to uchars. *)

let test_invalid_decodes () =
  let module Sset = Set.Make (String) in
  let utf_8_encs, utf_16be_encs, utf_16le_encs =
    Printf.printf "Building encoding sequence sets\n%!";
    let add (set8, set16be, set16le) u =
      let s = Bytes.unsafe_to_string in
      let e8 = Bytes.create (Uchar.utf_8_byte_length u) in
      let e16be = Bytes.create (Uchar.utf_16_byte_length u) in
      let e16le = Bytes.create (Uchar.utf_16_byte_length u) in
      ignore (Bytes.set_utf_8_uchar e8 0 u);
      ignore (Bytes.set_utf_16be_uchar e16be 0 u);
      ignore (Bytes.set_utf_16le_uchar e16le 0 u);
      Sset.add (s e8) set8,
      Sset.add (s e16be) set16be,
      Sset.add (s e16le) set16le
    in
    fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
  in
  let test_seqs utf utf_encs get_utf_char is_valid_utf =
    let test seq =
      let dec = get_utf_char seq 0 in
      let valid = Uchar.utf_decode_is_valid dec in
      let is_valid = is_valid_utf seq in
      let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
      if not ((valid && is_enc) || (not valid && not is_enc)) ||
         not ((is_valid && is_enc) || (not is_valid && not is_enc))
      then begin
        for i = 0 to Bytes.length seq - 1 do
          Printf.printf "%02X " (Bytes.get_uint8 seq i);
        done;
        Printf.printf "valid: %b is_encoding: %b decode: U+%04X\n is_valid:%b"
          valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) is_valid;
        assert false
      end;
      valid
    in
    let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in
    let s1 = Bytes.create 1 and s2 = Bytes.create 2
    and s3 = Bytes.create 3 and s4 = Bytes.create 4 in
    Printf.printf "Testing %s invalid decodes...\n%!" utf;
    for b0 = 0x00 to 0xFF do
      set s1 0 b0;
      if test s1 then ((* this prefix decoded, stop here *)) else begin
        set s2 0 b0;
        for b1 = 0x00 to 0xFF do
          set s2 1 b1;
          if test s2 then ((* this prefix decoded, stop here *)) else begin
            set s3 0 b0;
            set s3 1 b1;
            for b2 = 0x00 to 0xFF do
              set s3 2 b2;
              if test s3 then ((* this prefix decoded, stop here *)) else begin
                set s4 0 b0;
                set s4 1 b1;
                set s4 2 b2;
                for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done;
              end
            done;
          end
        done;
      end
    done
  in
  test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
  test_seqs "UTF-16BE"
    utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
  test_seqs "UTF-16LE" utf_16le_encs Bytes.get_utf_16le_uchar
    Bytes.is_valid_utf_16le;
  ()