1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
(* [make_iso enc] builds the test input for the 8-bit encoding [enc]: the
 * concatenation of [Netconversion.makechar enc i] for i = 0..255, in
 * ascending order.  A code for which [makechar] raises [Not_found]
 * contributes nothing to the result.
 *)
let make_iso enc =
  (* Accumulate in a Buffer; appending with [^] inside the loop would copy
   * the whole prefix again on every iteration.
   *)
  let acc = Buffer.create 256 in
  for i = 0 to 255 do
    let u =
      try Netconversion.makechar (enc :> Netconversion.encoding) i
      with Not_found -> ""
    in
    Buffer.add_string acc u
  done;
  Buffer.contents acc
;;
(* [make_ucs2 start stop] returns the codes start, start+1, ..., stop-1,
 * each written as two bytes with the high byte first.  The result is
 * 2 * (stop - start) bytes long.
 *
 * Raises [Invalid_argument] if [stop < start], or if a code in the range
 * does not fit into 16 bits (via [Char.chr]).
 *
 * The string is assembled in a Buffer rather than by mutating a string in
 * place, so the code also compiles with immutable strings.
 *)
let make_ucs2 start stop =
  if stop < start then invalid_arg "make_ucs2";
  let out = Buffer.create (2 * (stop - start)) in
  for c = start to stop - 1 do
    Buffer.add_char out (Char.chr (c lsr 8));
    Buffer.add_char out (Char.chr (c land 0xff))
  done;
  Buffer.contents out
;;
(* [make_ucs4 start stop] returns the codes start, start+1, ..., stop-1,
 * each written as four bytes with the most significant byte first.  The
 * result is 4 * (stop - start) bytes long.
 *
 * Raises [Invalid_argument] if [stop < start], or if [c lsr 24] does not
 * fit into one byte for some code c in the range (via [Char.chr]).
 *
 * The string is assembled in a Buffer rather than by mutating a string in
 * place, so the code also compiles with immutable strings.
 *)
let make_ucs4 start stop =
  if stop < start then invalid_arg "make_ucs4";
  let out = Buffer.create (4 * (stop - start)) in
  for c = start to stop - 1 do
    Buffer.add_char out (Char.chr (c lsr 24));
    Buffer.add_char out (Char.chr ((c lsr 16) land 0xff));
    Buffer.add_char out (Char.chr ((c lsr 8) land 0xff));
    Buffer.add_char out (Char.chr (c land 0xff))
  done;
  Buffer.contents out
;;
(* Maps an encoding tag to the charset name that is passed to the iconv
 * command through its -f and -t options.
 *)
let name_of_encoding = function
  (* ISO-8859 family *)
  | `Enc_iso88591  -> "ISO_8859-1"
  | `Enc_iso88592  -> "ISO_8859-2"
  | `Enc_iso88593  -> "ISO_8859-3"
  | `Enc_iso88594  -> "ISO_8859-4"
  | `Enc_iso88595  -> "ISO_8859-5"
  | `Enc_iso88596  -> "ISO_8859-6"
  | `Enc_iso88597  -> "ISO_8859-7"
  | `Enc_iso88598  -> "ISO_8859-8"
  | `Enc_iso88599  -> "ISO_8859-9"
  | `Enc_iso885910 -> "ISO_8859-10"
  | `Enc_iso885913 -> "ISO_8859-13"
  | `Enc_iso885914 -> "ISO_8859-14"
  | `Enc_iso885915 -> "ISO_8859-15"
  (* Unicode encodings *)
  | `Enc_utf8      -> "UTF-8"
  | `Enc_ucs4      -> "UCS-4"
  | `Enc_ucs2      -> "UCS-2"
  | `Enc_utf16     -> "UTF-16"
      (* Note: GNU-iconv assumes big endian byte order *)
;;
(* [iconv_recode_string in_enc out_enc in_s] runs the external command
 * "iconv -f IN -t OUT", where IN and OUT are the names that
 * [name_of_encoding] yields for [in_enc] and [out_enc], feeds [in_s] to
 * its standard input, and returns everything the command writes to its
 * standard output.
 *
 * The exit status of the command is ignored; the callers in this file
 * detect a failed conversion by comparing the returned string with the
 * expected one.
 *)
let iconv_recode_string in_enc out_enc in_s =
  let in_enc_name = name_of_encoding in_enc in
  let out_enc_name = name_of_encoding out_enc in
  let out_ch, in_ch =
    Unix.open_process ("iconv -f " ^ in_enc_name ^ " -t " ^ out_enc_name)
  in
  (* Write in_s to in_ch in a separate thread, so that the current thread
   * can drain the output of the command at the same time.  Closing in_ch
   * signals end of input to the command.
   *)
  let writer =
    Thread.create
      (fun () ->
         output_string in_ch in_s;
         close_out in_ch)
      ()
  in
  (* Read the result in the current thread until end of file.  The data is
   * collected in a Buffer instead of being appended with [^] chunk by
   * chunk; [input_char] goes through the buffering of the channel.
   *)
  let result = Buffer.create 1024 in
  (try
     while true do
       Buffer.add_char result (input_char out_ch)
     done
   with End_of_file -> ());
  (* Wait for the writer before close_process closes in_ch, so that the
   * channel is not closed while the writer may still be using it.
   *)
  Thread.join writer;
  ignore (Unix.close_process (out_ch, in_ch));
  Buffer.contents result
;;
(* Round-trip test for one 8-bit encoding [enc]:
 *  - the input is the string produced by [make_iso enc];
 *  - enc -> UTF-8 must give the same result with Netconversion and iconv;
 *  - UTF-8 -> enc must again agree between both, and must reproduce the
 *    input.
 * Progress is printed to stdout; a mismatch fails an assertion.
 *)
let test_iso_and_utf8 enc =
  let label = name_of_encoding enc in
  print_string ("Recode: " ^ label ^ " and UTF-8... ");
  flush stdout;
  let net_enc = (enc :> Netconversion.encoding) in
  let original = make_iso enc in
  (* Forward direction: enc -> UTF-8 *)
  let utf8_by_net = Netconversion.recode_string net_enc `Enc_utf8 original in
  let utf8_by_iconv = iconv_recode_string enc `Enc_utf8 original in
  assert (utf8_by_net = utf8_by_iconv);
  (* Backward direction: UTF-8 -> enc *)
  let back_by_net =
    Netconversion.recode_string `Enc_utf8 net_enc utf8_by_net in
  let back_by_iconv = iconv_recode_string `Enc_utf8 enc utf8_by_net in
  assert (back_by_net = back_by_iconv && back_by_net = original);
  print_endline "OK";
  flush stdout
;;
(* Round-trip test UTF-16-BE <-> UTF-8 for the codes #0000 to #D7FF.
 * Netconversion and iconv must agree in both directions, and converting
 * back must reproduce the input built by [make_ucs2].
 *)
let test_utf16_and_utf8_0000_d7ff () =
  print_string "Recode: UTF-16-BE and UTF-8, #0000-#D7FF... ";
  flush stdout;
  let original = make_ucs2 0 0xd800 in
  (* Forward direction: UTF-16 -> UTF-8 *)
  let utf8_by_net =
    Netconversion.recode_string `Enc_utf16_be `Enc_utf8 original in
  let utf8_by_iconv = iconv_recode_string `Enc_utf16 `Enc_utf8 original in
  assert (utf8_by_net = utf8_by_iconv);
  (* Backward direction: UTF-8 -> UTF-16 *)
  let back_by_net =
    Netconversion.recode_string `Enc_utf8 `Enc_utf16_be utf8_by_net in
  let back_by_iconv = iconv_recode_string `Enc_utf8 `Enc_utf16 utf8_by_net in
  assert (back_by_net = back_by_iconv && back_by_net = original);
  print_endline "OK";
  flush stdout
;;
(* Round-trip test UTF-16-BE <-> UTF-8 for the codes #E000 to #FFFD.
 * Netconversion and iconv must agree in both directions, and converting
 * back must reproduce the input built by [make_ucs2].
 *)
let test_utf16_and_utf8_e000_fffd () =
  print_string "Recode: UTF-16-BE and UTF-8, #E000-#FFFD... ";
  flush stdout;
  (* The upper bound of make_ucs2 is exclusive, so #FFFD is the last code. *)
  let original = make_ucs2 0xe000 0xfffe in
  (* Forward direction: UTF-16 -> UTF-8 *)
  let utf8_by_net =
    Netconversion.recode_string `Enc_utf16_be `Enc_utf8 original in
  let utf8_by_iconv = iconv_recode_string `Enc_utf16 `Enc_utf8 original in
  assert (utf8_by_net = utf8_by_iconv);
  (* Backward direction: UTF-8 -> UTF-16 *)
  let back_by_net =
    Netconversion.recode_string `Enc_utf8 `Enc_utf16_be utf8_by_net in
  let back_by_iconv = iconv_recode_string `Enc_utf8 `Enc_utf16 utf8_by_net in
  assert (back_by_net = back_by_iconv && back_by_net = original);
  print_endline "OK";
  flush stdout
;;
(* Round-trip test UTF-16-BE <-> UTF-8 for the codes #10000 to #10FFFF,
 * done in 16 chunks of 0x10000 codes each.  For every chunk the UTF-16
 * input is obtained by letting iconv convert the UCS-4 string built by
 * [make_ucs4]; then Netconversion and iconv must agree in both
 * directions, and converting back must reproduce that input.  A "+" is
 * printed after each chunk.
 *)
let test_utf16_and_utf8_10000_10FFFF () =
  print_string "Recode: UTF-16-BE and UTF-8, #10000-#10FFFF... ";
  flush stdout;
  for plane = 1 to 16 do
    let first = plane * 0x10000 in
    let ucs4 = make_ucs4 first (first + 0x10000) in
    let original = iconv_recode_string `Enc_ucs4 `Enc_utf16 ucs4 in
    (* Forward direction: UTF-16 -> UTF-8 *)
    let utf8_by_net =
      Netconversion.recode_string `Enc_utf16_be `Enc_utf8 original in
    let utf8_by_iconv = iconv_recode_string `Enc_utf16 `Enc_utf8 original in
    assert (utf8_by_net = utf8_by_iconv);
    (* Backward direction: UTF-8 -> UTF-16 *)
    let back_by_net =
      Netconversion.recode_string `Enc_utf8 `Enc_utf16_be utf8_by_net in
    let back_by_iconv =
      iconv_recode_string `Enc_utf8 `Enc_utf16 utf8_by_net in
    assert (back_by_net = back_by_iconv && back_by_net = original);
    print_string "+";
    flush stdout
  done;
  print_endline "OK";
  flush stdout
;;
(* Test driver: prints a notice about the required iconv command, then
 * runs the enabled tests in order.  Disabled cases are kept as comments.
 *)
let () =
  print_endline "Warning: You need the command 'iconv' to run this test!";
  flush stdout;
  List.iter test_iso_and_utf8
    [ `Enc_iso88591;
      `Enc_iso88592;
      `Enc_iso88593;
      `Enc_iso88594;
      `Enc_iso88595;
      `Enc_iso88596;
      `Enc_iso88597;
      (* `Enc_iso88598; *)
      `Enc_iso88599;
      `Enc_iso885910;
      (* `Enc_iso885913; *)
      (* `Enc_iso885914; *)
      (* `Enc_iso885915; *)
    ];
  test_utf16_and_utf8_0000_d7ff ();
  test_utf16_and_utf8_e000_fffd ();
  (* This test does not work because iconv does not support the surrogate
   * representation of UTF-16:
   * test_utf16_and_utf8_10000_10FFFF();
   *)
  ()
;;
|