File: test_recode.ml

package info (click to toggle)
netstring 0.10.1-3
  • links: PTS
  • area: main
  • in suites: woody
  • size: 1,000 kB
  • ctags: 895
  • sloc: ml: 8,389; xml: 416; makefile: 188; sh: 103
file content (169 lines) | stat: -rw-r--r-- 5,318 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169


(* [make_iso enc] concatenates, for i = 0 .. 255 in ascending order, the
 * strings returned by [Netconversion.makechar enc i].  Values of i for
 * which [makechar] raises [Not_found] contribute nothing to the result.
 * NOTE(review): presumably Not_found signals that i has no representation
 * in [enc] -- confirm against the Netconversion interface.
 *)
let make_iso enc =
  (* Collect the pieces in a Buffer; appending with ^ would copy the whole
   * accumulated string again on every iteration.
   *)
  let acc = Buffer.create 256 in
  for i = 0 to 255 do
    let u = try Netconversion.makechar (enc :> Netconversion.encoding) i 
            with Not_found -> "" in
    Buffer.add_string acc u
  done;
  Buffer.contents acc
;;

(* [make_ucs2 start stop] returns the code points start .. stop-1 as a
 * string of two-byte units, high byte first (big endian).  The result has
 * (stop - start) * 2 bytes.
 *
 * Raises [Invalid_argument] if stop < start, and (from [Char.chr]) if a
 * code point does not fit into 16 bits.
 *
 * The string is assembled in a Buffer rather than by writing into a
 * freshly created string, so the function does not depend on strings
 * being mutable.
 *)
let make_ucs2 start stop =
  if stop < start then invalid_arg "make_ucs2";
  let acc = Buffer.create ((stop - start) * 2) in
  for c = start to stop - 1 do
    Buffer.add_char acc (Char.chr (c lsr 8));
    Buffer.add_char acc (Char.chr (c land 0xff))
  done;
  Buffer.contents acc
;;

(* [make_ucs4 start stop] returns the code points start .. stop-1 as a
 * string of four-byte units, most significant byte first (big endian).
 * The result has (stop - start) * 4 bytes.
 *
 * Raises [Invalid_argument] if stop < start, and (from [Char.chr]) if
 * c lsr 24 is not in the range 0 .. 255 for some code point c.
 *
 * The string is assembled in a Buffer rather than by writing into a
 * freshly created string, so the function does not depend on strings
 * being mutable.
 *)
let make_ucs4 start stop =
  if stop < start then invalid_arg "make_ucs4";
  let acc = Buffer.create ((stop - start) * 4) in
  for c = start to stop - 1 do
    Buffer.add_char acc (Char.chr (c lsr 24));
    Buffer.add_char acc (Char.chr ((c lsr 16) land 0xff));
    Buffer.add_char acc (Char.chr ((c lsr 8) land 0xff));
    Buffer.add_char acc (Char.chr (c land 0xff))
  done;
  Buffer.contents acc
;;

(* Maps an encoding tag to the name that iconv_recode_string puts on the
 * iconv command line for it.
 *)
let name_of_encoding enc =
  (* All ISO-8859 names share one prefix and differ only in the part number *)
  let iso_8859 part = "ISO_8859-" ^ string_of_int part in
  match enc with
    | `Enc_iso88591  -> iso_8859 1
    | `Enc_iso88592  -> iso_8859 2
    | `Enc_iso88593  -> iso_8859 3
    | `Enc_iso88594  -> iso_8859 4
    | `Enc_iso88595  -> iso_8859 5
    | `Enc_iso88596  -> iso_8859 6
    | `Enc_iso88597  -> iso_8859 7
    | `Enc_iso88598  -> iso_8859 8
    | `Enc_iso88599  -> iso_8859 9
    | `Enc_iso885910 -> iso_8859 10
    | `Enc_iso885913 -> iso_8859 13
    | `Enc_iso885914 -> iso_8859 14
    | `Enc_iso885915 -> iso_8859 15
    | `Enc_utf8      -> "UTF-8"
    (* Note: GNU-iconv assumes big endian byte order *)
    | `Enc_ucs4      -> "UCS-4"
    | `Enc_ucs2      -> "UCS-2"
    | `Enc_utf16     -> "UTF-16"
;;

(* [iconv_recode_string in_enc out_enc in_s] runs the external command
 * iconv with -f and -t set to the names of [in_enc] and [out_enc] (see
 * name_of_encoding), feeds [in_s] to its standard input and returns
 * everything the command writes to its standard output.
 *
 * The exit status of the command is discarded, so a failing iconv shows
 * up only through the returned string.
 *)
let iconv_recode_string in_enc out_enc in_s =
  let in_enc_name  = name_of_encoding in_enc in
  let out_enc_name = name_of_encoding out_enc in

  let out_ch,in_ch = Unix.open_process ("iconv -f " ^ in_enc_name ^ " -t " ^ 
                                        out_enc_name) in
  (* Write in_s to in_ch in a new thread, so that writing the input and
   * reading the output proceed concurrently:
   *)
  let writer =
    Thread.create
      (fun () ->
         output_string in_ch in_s;
         close_out in_ch
      )
      () in
  (* Read the result in the current thread until end of file.  The output
   * is collected in a Buffer: this avoids both the repeated copying of
   * string concatenation and the need for a mutable string as read buffer.
   *)
  let out_buf = Buffer.create 1024 in
  (try
     while true do
       Buffer.add_char out_buf (input_char out_ch)
     done
   with End_of_file -> ());
  (* Let the writer finish before close_process closes in_ch under it: *)
  Thread.join writer;
  ignore(Unix.close_process (out_ch,in_ch));
  Buffer.contents out_buf
;;

(* Round-trip test for one encoding [enc]: the string built by make_iso is
 * recoded to UTF-8 and back again, once with Netconversion and once with
 * iconv.  Both implementations must agree in each direction, and the way
 * back must reproduce the initial string.  Failures surface as assertion
 * failures.
 *)
let test_iso_and_utf8 enc  =
  let label = name_of_encoding enc in
  print_string ("Recode: " ^ label ^ " and UTF-8... "); flush stdout;
  let reference = make_iso enc in
  let net_enc = (enc :> Netconversion.encoding) in

  (* Forward direction: enc -> UTF-8 *)
  let utf8_by_net   = Netconversion.recode_string net_enc `Enc_utf8 reference in
  let utf8_by_iconv = iconv_recode_string enc `Enc_utf8 reference in
  assert(utf8_by_net = utf8_by_iconv);

  (* Backward direction: UTF-8 -> enc *)
  let back_by_net   = Netconversion.recode_string `Enc_utf8 net_enc utf8_by_net in
  let back_by_iconv = iconv_recode_string `Enc_utf8 enc utf8_by_net in
  assert(back_by_net = back_by_iconv && back_by_net = reference);

  print_endline "OK"; flush stdout
;;

(* Round-trip test between big-endian UTF-16 and UTF-8 for the code points
 * #0000-#D7FF (the string comes from make_ucs2 0 0xd800).  Netconversion
 * and iconv must agree in both directions, and converting back must give
 * the initial string.
 *)
let test_utf16_and_utf8_0000_d7ff () =
  print_string "Recode: UTF-16-BE and UTF-8, #0000-#D7FF... "; 
  flush stdout;
  (* The four conversions under test: *)
  let net_to_utf8 x    = Netconversion.recode_string `Enc_utf16_be `Enc_utf8 x in
  let iconv_to_utf8 x  = iconv_recode_string `Enc_utf16 `Enc_utf8 x in
  let net_to_utf16 x   = Netconversion.recode_string `Enc_utf8 `Enc_utf16_be x in
  let iconv_to_utf16 x = iconv_recode_string `Enc_utf8 `Enc_utf16 x in

  let reference = make_ucs2 0 0xd800 in
  let utf8_by_net   = net_to_utf8 reference in
  let utf8_by_iconv = iconv_to_utf8 reference in
  assert(utf8_by_net = utf8_by_iconv);
  let back_by_net   = net_to_utf16 utf8_by_net in
  let back_by_iconv = iconv_to_utf16 utf8_by_net in
  assert(back_by_net = back_by_iconv && back_by_net = reference);
  print_endline "OK"; flush stdout
;;

(* Round-trip test between big-endian UTF-16 and UTF-8 for the code points
 * #E000-#FFFD (the string comes from make_ucs2 0xe000 0xfffe).
 * Netconversion and iconv must agree in both directions, and converting
 * back must give the initial string.
 *)
let test_utf16_and_utf8_e000_fffd () =
  print_string "Recode: UTF-16-BE and UTF-8, #E000-#FFFD... "; 
  flush stdout;
  (* The four conversions under test: *)
  let net_to_utf8 x    = Netconversion.recode_string `Enc_utf16_be `Enc_utf8 x in
  let iconv_to_utf8 x  = iconv_recode_string `Enc_utf16 `Enc_utf8 x in
  let net_to_utf16 x   = Netconversion.recode_string `Enc_utf8 `Enc_utf16_be x in
  let iconv_to_utf16 x = iconv_recode_string `Enc_utf8 `Enc_utf16 x in

  let reference = make_ucs2 0xe000 0xfffe in
  let utf8_by_net   = net_to_utf8 reference in
  let utf8_by_iconv = iconv_to_utf8 reference in
  assert(utf8_by_net = utf8_by_iconv);
  let back_by_net   = net_to_utf16 utf8_by_net in
  let back_by_iconv = iconv_to_utf16 utf8_by_net in
  assert(back_by_net = back_by_iconv && back_by_net = reference);
  print_endline "OK"; flush stdout
;;

(* Round-trip test between big-endian UTF-16 and UTF-8 for the code points
 * #10000-#10FFFF, processed in 16 chunks of 0x10000 code points each.  For
 * every chunk the UTF-16 input is produced by letting iconv convert the
 * UCS-4 string from make_ucs4; then Netconversion and iconv must agree in
 * both directions and the way back must reproduce that UTF-16 input.
 * A plus sign is printed after each chunk as progress indicator.
 *)
let test_utf16_and_utf8_10000_10FFFF () =
  print_string "Recode: UTF-16-BE and UTF-8, #10000-#10FFFF... "; 
  flush stdout;
  (* Checks the 0x10000 code points beginning at [first] *)
  let check_chunk first =
    let ucs4      = make_ucs4 first (first + 0x10000) in
    let reference = iconv_recode_string `Enc_ucs4 `Enc_utf16 ucs4 in
    let utf8_by_net =
      Netconversion.recode_string `Enc_utf16_be `Enc_utf8 reference in
    let utf8_by_iconv =
      iconv_recode_string `Enc_utf16 `Enc_utf8 reference in
    assert(utf8_by_net = utf8_by_iconv);
    let back_by_net =
      Netconversion.recode_string `Enc_utf8 `Enc_utf16_be utf8_by_net in
    let back_by_iconv =
      iconv_recode_string `Enc_utf8 `Enc_utf16 utf8_by_net in
    assert(back_by_net = back_by_iconv && back_by_net = reference);
    print_string "+"; flush stdout
  in
  for chunk = 1 to 16 do
    check_chunk (chunk * 0x10000)
  done;
  print_endline "OK"; flush stdout
;;


(* Test driver: prints a notice about the required iconv command and then
 * runs the enabled tests in a fixed order.  Disabled tests are kept as
 * comments at the position where they would run.
 *)
let () =
  print_endline "Warning: You need the command 'iconv' to run this test!";
  flush stdout;
  List.iter test_iso_and_utf8
    [ `Enc_iso88591;
      `Enc_iso88592;
      `Enc_iso88593;
      `Enc_iso88594;
      `Enc_iso88595;
      `Enc_iso88596;
      `Enc_iso88597;
      (* `Enc_iso88598; *)
      `Enc_iso88599;
      `Enc_iso885910
      (* `Enc_iso885913; *)
      (* `Enc_iso885914; *)
      (* `Enc_iso885915; *)
    ];
  test_utf16_and_utf8_0000_d7ff();
  test_utf16_and_utf8_e000_fffd()
  (* This test does not work because iconv does not support the surrogate
   * representation of UTF-16:
   * test_utf16_and_utf8_10000_10FFFF();
   *)
;;