1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
|
(* $Id: netconversion.mli,v 1.1 2000/08/13 00:02:57 gerd Exp $
* ----------------------------------------------------------------------
*)
exception Malformed_code
(* NOTE(review): presumably raised by the conversion functions of this
* module when the input contains a byte sequence that is rejected as
* malformed for the assumed encoding - confirm against the implementation.
*)
(* Encodings:
* - With the exception of UTF-8 and UTF-16, only single-byte character sets
* are supported.
* - I took the mappings from www.unicode.org, and the standard names of
* the character sets from IANA. Many character sets that could be
* supported are still missing, in particular the ISO646 character sets
* and many EBCDIC code pages.
* - Because of the copyright statement from Unicode, I cannot put the
* source tables that describe the mappings into the distribution. They
* are publicly available from www.unicode.org.
* - Because of this, it is difficult for you to extend the list of character
* sets; you need the source tables I am not allowed to distribute.
* These tables have a very simple format: Every line describes a pair
* of code points; the left code (<= 0xff) is the code in the character
* set, the right code (<= 0xffff) is the Unicode equivalent.
* For an example, see
* http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
* You can send me such files, and I will integrate them into the
* distribution (if possible).
* - I really do not know very much about the character sets used in
* East Asia. If you need them, please write the necessary conversion
* functions and send them to me.
*
* KNOWN PROBLEMS:
* - The following charsets do not have a bijective mapping to Unicode:
* adobe_standard_encoding, adobe_symbol_encoding,
* adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
* simply removes one of the conflicting code point pairs - this might
* not be what you want.
*)
type encoding =
(* The character encodings known to this module, given as a closed
* polymorphic variant. Apart from the UTF-8 and UTF-16 variants, every
* tag denotes a single-byte character set.
*)
[ `Enc_utf8 (* UTF-8 *)
| `Enc_java (* The variant of UTF-8 used by Java *)
| `Enc_utf16 (* UTF-16 with unspecified endianness (restricted usage: it can only be read, with a leading byte order mark; it cannot be written) *)
| `Enc_utf16_le (* UTF-16 little endian *)
| `Enc_utf16_be (* UTF-16 big endian *)
| `Enc_usascii (* US-ASCII (only 7 bit) *)
| `Enc_iso88591 (* ISO-8859-1 *)
| `Enc_iso88592 (* ISO-8859-2 *)
| `Enc_iso88593 (* ISO-8859-3 *)
| `Enc_iso88594 (* ISO-8859-4 *)
| `Enc_iso88595 (* ISO-8859-5 *)
| `Enc_iso88596 (* ISO-8859-6 *)
| `Enc_iso88597 (* ISO-8859-7 *)
| `Enc_iso88598 (* ISO-8859-8 *)
| `Enc_iso88599 (* ISO-8859-9 *)
| `Enc_iso885910 (* ISO-8859-10 *)
| `Enc_iso885913 (* ISO-8859-13 *)
| `Enc_iso885914 (* ISO-8859-14 *)
| `Enc_iso885915 (* ISO-8859-15 *)
| `Enc_koi8r (* KOI8-R *)
| `Enc_jis0201 (* JIS-0201 *)
(* Microsoft: *)
| `Enc_windows1250 (* WINDOWS-1250 *)
| `Enc_windows1251 (* WINDOWS-1251 *)
| `Enc_windows1252 (* WINDOWS-1252 *)
| `Enc_windows1253 (* WINDOWS-1253 *)
| `Enc_windows1254 (* WINDOWS-1254 *)
| `Enc_windows1255 (* WINDOWS-1255 *)
| `Enc_windows1256 (* WINDOWS-1256 *)
| `Enc_windows1257 (* WINDOWS-1257 *)
| `Enc_windows1258 (* WINDOWS-1258 *)
(* IBM, ASCII-based: *)
| `Enc_cp437
| `Enc_cp737
| `Enc_cp775
| `Enc_cp850
| `Enc_cp852
| `Enc_cp855
| `Enc_cp856
| `Enc_cp857
| `Enc_cp860
| `Enc_cp861
| `Enc_cp862
| `Enc_cp863
| `Enc_cp864
| `Enc_cp865
| `Enc_cp866
| `Enc_cp869
| `Enc_cp874
| `Enc_cp1006
(* IBM, EBCDIC-based: *)
| `Enc_cp037
| `Enc_cp424
| `Enc_cp500
| `Enc_cp875
| `Enc_cp1026
(* Adobe (the mappings of these three to Unicode are not bijective; see
* KNOWN PROBLEMS above): *)
| `Enc_adobe_standard_encoding
| `Enc_adobe_symbol_encoding
| `Enc_adobe_zapf_dingbats_encoding
(* Apple: *)
| `Enc_macroman
]
val encoding_of_string : string -> encoding;;
(* encoding_of_string name:
* Returns the encoding that is denoted by the character set name.
* Fails if the name does not denote a known encoding.
* NOTE(review): "fails" presumably means that Failure is raised - confirm
* against the implementation.
* E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
*)
val string_of_encoding : encoding -> string;;
(* string_of_encoding enc:
* Returns the name of the encoding enc as a string.
* NOTE(review): presumably this is the IANA standard name, and a name
* that encoding_of_string accepts - confirm against the implementation.
*)
val makechar : encoding -> int -> string
(* makechar enc i:
* Creates the string representing the code point i in encoding enc.
* Raises Not_found if the character is legal but cannot be represented
* in enc.
* NOTE(review): the behaviour for code points that are not legal at all
* is not stated here - confirm against the implementation.
*
* Possible encodings: everything but `Enc_utf16. That tag leaves the byte
* order unspecified, and it is not possible to write as `Enc_utf16.
*)
val recode : in_enc:encoding ->
in_buf:string ->
in_pos:int ->
in_len:int ->
out_enc:encoding ->
out_buf:string ->
out_pos:int ->
out_len:int ->
max_chars:int ->
subst:(int -> string) -> (int * int * encoding)
(*
* let (in_n, out_n, in_enc') =
* recode in_enc in_buf in_pos in_len out_enc out_buf out_pos out_len
* max_chars subst:
* (All arguments are labeled; the labels are omitted here for brevity.)
* Converts the character sequence contained in the at most in_len bytes
* of in_buf starting at position in_pos, and writes the result
* into at most out_len bytes of out_buf starting at out_pos.
* At most max_chars characters are written into out_buf.
* The characters in in_buf are assumed to be encoded as in_enc, and the
* characters in out_buf will be encoded as out_enc.
* If there is a code point which cannot be represented in out_enc,
* the function subst is called with the code point as argument, and the
* resulting string (which must already be encoded as out_enc) is
* inserted instead.
* Note: It is possible that subst is called several times for the same
* character.
* Return value: out_n is the actual number of bytes written into out_buf.
* in_n is the actual number of bytes that have been converted from
* in_buf; in_n may be smaller than in_len because of incomplete
* multi-byte characters, or because the output buffer has less space
* for characters than the input buffer, or because of a change
* of the encoding variant.
* If there is at least one complete character in in_buf, and at least
* space for one complete character in out_buf, and max_chars >= 1, it is
* guaranteed that in_n > 0 or out_n > 0.
* in_enc' is normally identical to in_enc. However, there are cases
* in which the encoding can be refined when looking at the byte
* sequence; for example whether a little endian or big endian variant
* of the encoding is used. in_enc' is the variant of in_enc that was
* used for the last character that has been converted.
*
* NOTES:
*
* Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
* 0x10000 to 0x10ffff.
*
* Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
* is also true for the sequence 0xc0 0x80 which is used by some software
* (Java) as paraphrase for the code point 0.
*
* Enc_utf16: When reading from a string encoded as Enc_utf16, a byte
* order mark is expected at the beginning. The detected variant
* (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is
* not included into the output string. - It is not possible to
* write as Enc_utf16.
*
* Enc_utf16_le, Enc_utf16_be: When reading from such a string, the
* code point 0xfeff is returned as it is; it is a "zero-width
* non-breaking space". The code point 0xfffe is rejected.
*
* Surrogate pairs: These are recognized (or written) only for a
* UTF-16 encoding; and rejected for any other encoding.
*
* Rejected byte sequences cause the exception Malformed_code.
* NOTE(review): this comment formerly named Bad_character_stream, which
* is not declared in this interface; Malformed_code is the only exception
* declared here - confirm against the implementation.
*)
val recode_string : in_enc:encoding ->
out_enc:encoding ->
?subst:(int -> string) ->
string ->
string
(* recode_string in_enc out_enc subst s:
* Recodes the complete string s from in_enc to out_enc, and returns the
* recoded string.
* The optional function subst is invoked for code points of in_enc that
* cannot be represented in out_enc, and the result of the function
* invocation is substituted.
* If subst is missing, Not_found is raised in this case.
*)
(* ======================================================================
* History:
*
* $Log: netconversion.mli,v $
* Revision 1.1 2000/08/13 00:02:57 gerd
* Initial revision.
*
*
* ======================================================================
* OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
*
* Revision 1.4 2000/07/04 22:05:58 gerd
* Enhanced version of 'recode'. Labeled arguments.
* New function 'recode_string'.
*
* Revision 1.3 2000/05/29 23:48:38 gerd
* Changed module names:
* Markup_aux into Pxp_aux
* Markup_codewriter into Pxp_codewriter
* Markup_document into Pxp_document
* Markup_dtd into Pxp_dtd
* Markup_entity into Pxp_entity
* Markup_lexer_types into Pxp_lexer_types
* Markup_reader into Pxp_reader
* Markup_types into Pxp_types
* Markup_yacc into Pxp_yacc
* See directory "compatibility" for (almost) compatible wrappers emulating
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
*
* Revision 1.2 2000/05/29 21:14:57 gerd
* Changed the type 'encoding' into a polymorphic variant.
*
* Revision 1.1 2000/05/20 20:30:50 gerd
* Initial revision.
*
*
*)
|