1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
|
(* TEST
*)
(* UTF codec tests *)
let fold_uchars f acc =
let rec loop f acc u =
let acc = f acc u in
if Uchar.equal u Uchar.max then acc else loop f acc (Uchar.succ u)
in
loop f acc Uchar.min
(* This tests that we encode and decode each character according
to its specification. *)
let utf_8_spec =
(* UTF-8 byte sequences, cf. table 3.7 Unicode 14. *)
[(0x0000,0x007F), [|(0x00,0x7F)|];
(0x0080,0x07FF), [|(0xC2,0xDF); (0x80,0xBF)|];
(0x0800,0x0FFF), [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|];
(0x1000,0xCFFF), [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|];
(0xD000,0xD7FF), [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|];
(0xE000,0xFFFF), [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|];
(0x10000,0x3FFFF), [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|];
(0x40000,0xFFFFF), [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|];
(0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]]
let utf_16be_spec =
(* UTF-16BE byte sequences, derived from table 3.5 Unicode 14. *)
[(0x0000,0xD7FF), [|(0x00,0xD7); (0x00,0xFF)|];
(0xE000,0xFFFF), [|(0xE0,0xFF); (0x00,0xFF)|];
(0x10000,0x10FFFF), [|(0xD8,0xDB); (0x00,0xFF); (0xDC,0xDF); (0x00,0xFF)|]]
let uchar_map_of_spec spec =
(* array mapping Uchar.t as ints to byte sequences according to [spec]. *)
let map = Array.make ((Uchar.to_int Uchar.max) + 1) Bytes.empty in
let add_range ((umin, umax), bytes) =
let len = Array.length bytes in
let bmin i = if i < len then fst bytes.(i) else max_int in
let bmax i = if i < len then snd bytes.(i) else min_int in
let uchar = ref umin in
let buf = Bytes.create len in
let add len' = match len = len' with
| false -> ()
| true -> map.(!uchar) <- Bytes.copy buf; incr uchar
in
for b0 = bmin 0 to bmax 0 do Bytes.set_uint8 buf 0 b0;
for b1 = bmin 1 to bmax 1 do Bytes.set_uint8 buf 1 b1;
for b2 = bmin 2 to bmax 2 do Bytes.set_uint8 buf 2 b2;
for b3 = bmin 3 to bmax 3 do Bytes.set_uint8 buf 3 b3; add 4
done; add 3;
done; add 2;
done; add 1;
done; assert (!uchar - 1 = umax)
in
List.iter add_range spec;
map
let uchar_map_get u map = map.(Uchar.to_int u)
let utf_8 = uchar_map_of_spec utf_8_spec
let utf_16be = uchar_map_of_spec utf_16be_spec
let utf_16le =
let swap u b =
let len = Bytes.length b in
if len = 0 then () else
for i = 0 to Bytes.length b / 2 - 1 do
let j = i * 2 in
Bytes.set_uint16_le b j (Bytes.get_uint16_be b j);
done;
in
let map = Array.map Bytes.copy utf_16be in
Array.iteri swap map; map
let test_utf utf utf_len get_utf set_utf utf_is_valid =
(* Test codec and validation of each Uchar.t against the spec. *)
let f () u =
let utf_len = utf_len u in
let buf = Bytes.create utf_len in
assert (set_utf buf 0 u = utf_len);
assert (Bytes.equal buf (uchar_map_get u utf));
assert (Bytes.equal buf (uchar_map_get u utf));
let dec = get_utf buf 0 in
assert (Uchar.utf_decode_is_valid dec);
assert (Uchar.utf_decode_length dec = utf_len);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) u);
assert (utf_is_valid buf);
()
in
fold_uchars f ()
let () =
test_utf utf_8 Uchar.utf_8_byte_length
Bytes.get_utf_8_uchar Bytes.set_utf_8_uchar Bytes.is_valid_utf_8
let () =
test_utf utf_16be Uchar.utf_16_byte_length
Bytes.get_utf_16be_uchar Bytes.set_utf_16be_uchar Bytes.is_valid_utf_16be
let () =
test_utf utf_16le Uchar.utf_16_byte_length
Bytes.get_utf_16le_uchar Bytes.set_utf_16le_uchar Bytes.is_valid_utf_16le
let () =
(* Test out of bounds *)
let raises f = assert (try f (); false with Invalid_argument _ -> true) in
(raises @@ fun () -> Bytes.get_utf_8_uchar Bytes.empty 0);
(raises @@ fun () -> Bytes.set_utf_8_uchar Bytes.empty 0 Uchar.min);
(raises @@ fun () -> Bytes.get_utf_16le_uchar Bytes.empty 0);
(raises @@ fun () -> Bytes.set_utf_16le_uchar Bytes.empty 0 Uchar.min);
(raises @@ fun () -> Bytes.get_utf_16be_uchar Bytes.empty 0);
(raises @@ fun () -> Bytes.set_utf_16be_uchar Bytes.empty 0 Uchar.min);
()
let () =
(* Test lack of space encodes *)
let b = Bytes.make 1 '\xab' in
assert (Bytes.set_utf_8_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
assert (Bytes.set_utf_16be_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
assert (Bytes.set_utf_16le_uchar b 0 Uchar.max = 0 && Bytes.get b 0 = '\xab');
()
let () =
(* Test bug found during review *)
let b = Bytes.create 2 in
let () = Bytes.set_uint8 b 0 0xC3 in
let () = Bytes.set_uint8 b 1 0x00 in
assert (not (Bytes.is_valid_utf_8 b))
let () =
(* Test used bytes and replacement according to WHATWG recommendation.
This is just a recommendation.
These examples are from TUS p. 126-127 Unicode 14 *)
let b = Bytes.of_string "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41" in
let ok i = i = Bytes.length b - 1 in
for i = 0 to Bytes.length b - 1 do
let dec = Bytes.get_utf_8_uchar b i in
if not (ok i) then begin
assert (Uchar.utf_decode_is_valid dec = false);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
end else begin
assert (Uchar.utf_decode_is_valid dec = true);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
end
done;
let b = Bytes.of_string "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41" in
let ok i = i = Bytes.length b - 1 in
for i = 0 to Bytes.length b - 1 do
let dec = Bytes.get_utf_8_uchar b i in
if not (ok i) then begin
assert (Uchar.utf_decode_is_valid dec = false);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
end else begin
assert (Uchar.utf_decode_is_valid dec = true);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) (Uchar.of_int 0x0041))
end
done;
let b = Bytes.of_string "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42" in
let ok i = i = 5 || i = 8 in
for i = 0 to Bytes.length b - 1 do
let dec = Bytes.get_utf_8_uchar b i in
if not (ok i) then begin
assert (Uchar.utf_decode_is_valid dec = false);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec) Uchar.rep)
end else begin
assert (Uchar.utf_decode_is_valid dec = true);
assert (Uchar.utf_decode_length dec = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar dec)
(Uchar.of_char (Bytes.get b i)))
end
done;
let b = Bytes.of_string "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41" in
let d0 = Bytes.get_utf_8_uchar b 0 in
assert (Uchar.utf_decode_is_valid d0 = false);
assert (Uchar.utf_decode_length d0 = 2);
assert (Uchar.equal (Uchar.utf_decode_uchar d0) Uchar.rep);
let d2 = Bytes.get_utf_8_uchar b 2 in
assert (Uchar.utf_decode_is_valid d2 = false);
assert (Uchar.utf_decode_length d2 = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar d2) Uchar.rep);
let d3 = Bytes.get_utf_8_uchar b 3 in
assert (Uchar.utf_decode_is_valid d3 = false);
assert (Uchar.utf_decode_length d3 = 3);
assert (Uchar.equal (Uchar.utf_decode_uchar d3) Uchar.rep);
let d6 = Bytes.get_utf_8_uchar b 6 in
assert (Uchar.utf_decode_is_valid d6 = false);
assert (Uchar.utf_decode_length d6 = 2);
assert (Uchar.equal (Uchar.utf_decode_uchar d6) Uchar.rep);
let d8 = Bytes.get_utf_8_uchar b 8 in
assert (Uchar.utf_decode_length d8 = 1);
assert (Uchar.equal (Uchar.utf_decode_uchar d8) (Uchar.of_int 0x0041));
()
let () = Printf.printf "All UTF tests passed!\n"
(* This is a very long test added here for reference just in case. It
is not run.
It assumes the good encoding and decodes have been checked by test_utf
above. It exhaustively tests all 1-4 bytes invalid sequences for decodes.
This ensures we do not decode invalid sequence to uchars. *)
let test_invalid_decodes () =
let module Sset = Set.Make (String) in
let utf_8_encs, utf_16be_encs, utf_16le_encs =
Printf.printf "Building encoding sequence sets\n%!";
let add (set8, set16be, set16le) u =
let s = Bytes.unsafe_to_string in
let e8 = Bytes.create (Uchar.utf_8_byte_length u) in
let e16be = Bytes.create (Uchar.utf_16_byte_length u) in
let e16le = Bytes.create (Uchar.utf_16_byte_length u) in
ignore (Bytes.set_utf_8_uchar e8 0 u);
ignore (Bytes.set_utf_16be_uchar e16be 0 u);
ignore (Bytes.set_utf_16le_uchar e16le 0 u);
Sset.add (s e8) set8,
Sset.add (s e16be) set16be,
Sset.add (s e16le) set16le
in
fold_uchars add (Sset.empty, Sset.empty, Sset.empty)
in
let test_seqs utf utf_encs get_utf_char is_valid_utf =
let test seq =
let dec = get_utf_char seq 0 in
let valid = Uchar.utf_decode_is_valid dec in
let is_valid = is_valid_utf seq in
let is_enc = Sset.mem (Bytes.unsafe_to_string seq) utf_encs in
if not ((valid && is_enc) || (not valid && not is_enc)) ||
not ((is_valid && is_enc) || (not is_valid && not is_enc))
then begin
for i = 0 to Bytes.length seq - 1 do
Printf.printf "%02X " (Bytes.get_uint8 seq i);
done;
Printf.printf "valid: %b is_encoding: %b decode: U+%04X\n is_valid:%b"
valid is_enc (Uchar.to_int (Uchar.utf_decode_uchar dec)) is_valid;
assert false
end;
valid
in
let[@inline] set buf i b = Bytes.unsafe_set buf i (Char.unsafe_chr b) in
let s1 = Bytes.create 1 and s2 = Bytes.create 2
and s3 = Bytes.create 3 and s4 = Bytes.create 4 in
Printf.printf "Testing %s invalid decodes...\n%!" utf;
for b0 = 0x00 to 0xFF do
set s1 0 b0;
if test s1 then ((* this prefix decoded, stop here *)) else begin
set s2 0 b0;
for b1 = 0x00 to 0xFF do
set s2 1 b1;
if test s2 then ((* this prefix decoded, stop here *)) else begin
set s3 0 b0;
set s3 1 b1;
for b2 = 0x00 to 0xFF do
set s3 2 b2;
if test s3 then ((* this prefix decoded, stop here *)) else begin
set s4 0 b0;
set s4 1 b1;
set s4 2 b2;
for b3 = 0x00 to 0xFF do set s4 3 b3; ignore (test s4) done;
end
done;
end
done;
end
done
in
test_seqs "UTF-8" utf_8_encs Bytes.get_utf_8_uchar Bytes.is_valid_utf_8;
test_seqs "UTF-16BE"
utf_16be_encs Bytes.get_utf_16be_uchar Bytes.is_valid_utf_16be;
test_seqs "UTF-16LE" utf_16le_encs Bytes.get_utf_16le_uchar
Bytes.is_valid_utf_16le;
()
|