File: netconversion.mli

package info (click to toggle)
netstring 0.10.1-3
  • links: PTS
  • area: main
  • in suites: woody
  • size: 1,000 kB
  • ctags: 895
  • sloc: ml: 8,389; xml: 416; makefile: 188; sh: 103
file content (238 lines) | stat: -rw-r--r-- 9,057 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
(* $Id: netconversion.mli,v 1.1 2000/08/13 00:02:57 gerd Exp $
 * ----------------------------------------------------------------------
 *)

(* Presumably raised by the conversion functions of this module when a
 * byte sequence is not valid for the assumed encoding -- TODO confirm
 * against the implementation.
 *)
exception Malformed_code

(* Encodings:
 * - With the exception of UTF-8 and UTF-16, only single-byte character sets
 *   are supported.
 * - I took the mappings from www.unicode.org, and the standard names of
 *   the character sets from IANA. Obviously, many character sets that could
 *   be supported are missing, especially ISO646 character sets and many
 *   EBCDIC code pages.
 * - Because of the copyright statement from Unicode, I cannot put the
 *   source tables that describe the mappings into the distribution. They
 *   are publicly available from www.unicode.org.
 * - Because of this, it is difficult for you to extend the list of character 
 *   sets; you need the source tables I am not allowed to distribute.
 *   These tables have a very simple format: Every line describes a pair
 *   of code points; the left code (<= 0xff) is the code in the character
 *   set, the right code (<= 0xffff) is the Unicode equivalent.
 *   For an example, see
 *   http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
 *   You can send me such files, and I will integrate them into the 
 *   distribution (if possible).
 * - I really do not know very much about the character sets used in
 *   East Asia. If you need them, please write the necessary conversion
 *   functions and send them to me.
 *
 * KNOWN PROBLEMS:
 * - The following charsets do not have a bijective mapping to Unicode:
 *   adobe_standard_encoding, adobe_symbol_encoding, 
 *   adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
 *   simply removes one of the conflicting code point pairs - this might
 *   not be what you want.
 *)

(* The character encodings known to this module, given as a closed
 * polymorphic variant with one tag per encoding. The tags are grouped
 * below by standard or vendor.
 *)
type encoding =
  [  `Enc_utf8       (* UTF-8 *)
  |  `Enc_java       (* The variant of UTF-8 used by Java *)
  |  `Enc_utf16      (* UTF-16 with unspecified endianness (restricted usage;
                      * see the comments on makechar and recode) *)
  |  `Enc_utf16_le   (* UTF-16 little endian *)
  |  `Enc_utf16_be   (* UTF-16 big endian *)
  |  `Enc_usascii    (* US-ASCII (only 7 bit) *)
  |  `Enc_iso88591   (* ISO-8859-1 *)
  |  `Enc_iso88592   (* ISO-8859-2 *)
  |  `Enc_iso88593   (* ISO-8859-3 *)
  |  `Enc_iso88594   (* ISO-8859-4 *)
  |  `Enc_iso88595   (* ISO-8859-5 *)
  |  `Enc_iso88596   (* ISO-8859-6 *)
  |  `Enc_iso88597   (* ISO-8859-7 *)
  |  `Enc_iso88598   (* ISO-8859-8 *)
  |  `Enc_iso88599   (* ISO-8859-9 *)
  |  `Enc_iso885910  (* ISO-8859-10 *)
  |  `Enc_iso885913  (* ISO-8859-13 *)
  |  `Enc_iso885914  (* ISO-8859-14 *)
  |  `Enc_iso885915  (* ISO-8859-15 *)
  |  `Enc_koi8r      (* KOI8-R *)
  |  `Enc_jis0201    (* JIS-0201 *)
    (* Microsoft: *)
  |  `Enc_windows1250  (* WINDOWS-1250 *)
  |  `Enc_windows1251  (* WINDOWS-1251 *)
  |  `Enc_windows1252  (* WINDOWS-1252 *)
  |  `Enc_windows1253  (* WINDOWS-1253 *)
  |  `Enc_windows1254  (* WINDOWS-1254 *)
  |  `Enc_windows1255  (* WINDOWS-1255 *)
  |  `Enc_windows1256  (* WINDOWS-1256 *)
  |  `Enc_windows1257  (* WINDOWS-1257 *)
  |  `Enc_windows1258  (* WINDOWS-1258 *)
    (* IBM, ASCII-based code pages (the number is part of the tag name): *)
  |  `Enc_cp437
  |  `Enc_cp737
  |  `Enc_cp775
  |  `Enc_cp850
  |  `Enc_cp852
  |  `Enc_cp855
  |  `Enc_cp856
  |  `Enc_cp857
  |  `Enc_cp860
  |  `Enc_cp861
  |  `Enc_cp862
  |  `Enc_cp863
  |  `Enc_cp864
  |  `Enc_cp865
  |  `Enc_cp866
  |  `Enc_cp869
  |  `Enc_cp874
  |  `Enc_cp1006
   (* IBM, EBCDIC-based code pages: *)
  |  `Enc_cp037
  |  `Enc_cp424
  |  `Enc_cp500
  |  `Enc_cp875
  |  `Enc_cp1026
   (* Adobe: *)
  |  `Enc_adobe_standard_encoding
  |  `Enc_adobe_symbol_encoding
  |  `Enc_adobe_zapf_dingbats_encoding
   (* Apple: *)
  |  `Enc_macroman

  ]


val encoding_of_string : string -> encoding;;
    (* encoding_of_string name:
     * Returns the encoding tag that is designated by the character set
     * name. Fails if the name does not denote a known encoding (the
     * exception is not specified here; presumably Failure -- TODO confirm
     * against the implementation).
     * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
     *)

val string_of_encoding : encoding -> string;;
    (* string_of_encoding enc:
     * Returns the name of the encoding enc as a string.
     * NOTE(review): presumably the result is accepted by
     * encoding_of_string again -- confirm against the implementation.
     *)


val makechar : encoding -> int -> string
  (* makechar enc i:
   * Creates the string representing the code point i in encoding enc.
   *
   * Arguments:
   * - enc: the encoding of the resulting string
   * - i: the code point to encode
   *
   * Raises Not_found if the character is legal but cannot be represented 
   * in enc.
   * NOTE(review): the behaviour for code points that are not legal is
   * not described here -- confirm against the implementation.
   * 
   * Possible encodings: everything but `Enc_utf16.
   *)

val recode : in_enc:encoding -> 
             in_buf:string -> 
	     in_pos:int ->
	     in_len:int -> 
	     out_enc:encoding -> 
	     out_buf:string -> 
	     out_pos:int ->
	     out_len:int ->
	     max_chars:int ->
             subst:(int -> string) -> (int * int * encoding)
  (* 
   * let (in_n, out_n, in_enc') = 
   *     recode in_enc in_buf in_pos in_len out_enc out_buf out_pos out_len
   *            max_chars subst
   * (every argument is passed under the label shown in the signature):
   *
   * Converts the character sequence contained in the at most in_len bytes
   * of in_buf starting at position in_pos, and writes the result 
   * into at most out_len bytes of out_buf starting at out_pos.
   * At most max_chars characters are written into out_buf.
   * The characters in in_buf are assumed to be encoded as in_enc, and the 
   * characters in out_buf will be encoded as out_enc.
   * If there is a code point which cannot be represented in out_enc,
   * the function subst is called with the code point as argument, and the
   * resulting string (which must already be encoded as out_enc) is
   * inserted instead. 
   * Note: It is possible that subst is called several times for the same
   * character.
   *
   * NOTE(review): out_buf has type string although it is written to;
   * this relies on strings being mutable. With immutable strings
   * (-safe-string) the buffer would have to be of type bytes -- confirm
   * which compiler versions this interface targets.
   *
   * Return value: out_n is the actual number of bytes written into out_buf.
   * in_n is the actual number of bytes that have been converted from
   * in_buf; in_n may be smaller than in_len because of incomplete
   * multi-byte characters, or because the output buffer has less space
   * for characters than the input buffer, or because of a change
   * of the encoding variant.
   * If there is at least one complete character in in_buf, and at least
   * space for one complete character in out_buf, and max_chars >= 1, it is 
   * guaranteed that in_n > 0 or out_n > 0.
   * in_enc' is normally identical to in_enc. However, there are cases
   * in which the encoding can be refined when looking at the byte
   * sequence; for example whether a little endian or big endian variant
   * of the encoding is used. in_enc' is the variant of in_enc that was
   * used for the last character that has been converted.
   *
   * NOTES:
   *
   * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
   * 0x10000 to 0x10ffff.
   *
   * Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
   * is also true for the sequence 0xc0 0x80 which is used by some software
   * (Java) as paraphrase for the code point 0.
   *
   * Enc_utf16: When reading from a string encoded as Enc_utf16, a byte
   * order mark is expected at the beginning. The detected variant 
   * (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is
   * not included into the output string. - It is not possible to
   * write as Enc_utf16.
   *
   * Enc_utf16_le, Enc_utf16_be: When reading from such a string, the
   * code point 0xfeff is returned as it is; it is a "zero-width 
   * non-breaking space". The code point 0xfffe is rejected.
   *
   * Surrogate pairs: These are recognized (or written) only for a
   * UTF-16 encoding; and rejected for any other encoding.
   *
   * Rejected byte sequences cause the exception Malformed_code.
   * (Earlier revisions of this comment named Bad_character_stream, which
   * is not declared in this interface -- TODO confirm against the
   * implementation.)
   *)

val recode_string : in_enc:encoding -> 
                    out_enc:encoding ->
		    ?subst:(int -> string) ->
		    string ->
                    string 
  (* recode_string in_enc out_enc subst s
   * (in_enc, out_enc and the optional subst are labeled arguments):
   * Recodes the complete string s from in_enc to out_enc, and returns
   * the recoded string.
   * The function subst is invoked for code points of in_enc that cannot
   * be represented in out_enc, and the result of the function invocation
   * is substituted.
   * If subst is missing, Not_found is raised when such a code point is
   * found.
   *)

(* ======================================================================
 * History:
 * 
 * $Log: netconversion.mli,v $
 * Revision 1.1  2000/08/13 00:02:57  gerd
 * 	Initial revision.
 *
 *
 * ======================================================================
 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
 *
 * Revision 1.4  2000/07/04 22:05:58  gerd
 * 	Enhanced version of 'recode'. Labeled arguments.
 * New function 'recode_string'.
 *
 * Revision 1.3  2000/05/29 23:48:38  gerd
 * 	Changed module names:
 * 		Markup_aux          into Pxp_aux
 * 		Markup_codewriter   into Pxp_codewriter
 * 		Markup_document     into Pxp_document
 * 		Markup_dtd          into Pxp_dtd
 * 		Markup_entity       into Pxp_entity
 * 		Markup_lexer_types  into Pxp_lexer_types
 * 		Markup_reader       into Pxp_reader
 * 		Markup_types        into Pxp_types
 * 		Markup_yacc         into Pxp_yacc
 * See directory "compatibility" for (almost) compatible wrappers emulating
 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 *
 * Revision 1.2  2000/05/29 21:14:57  gerd
 * 	Changed the type 'encoding' into a polymorphic variant.
 *
 * Revision 1.1  2000/05/20 20:30:50  gerd
 * 	Initial revision.
 *
 * 
 *)