File: netconversion.ml

package info (click to toggle)
netstring 0.10.1-3
links: PTS
area: main
in suites: woody
size: 1,000 kB
ctags: 895
sloc: ml: 8,389; xml: 416; makefile: 188; sh: 103
file content (861 lines) | stat: -rw-r--r-- 26,919 bytes
(* $Id: netconversion.ml,v 1.2 2000/08/29 00:46:41 gerd Exp $
 * ----------------------------------------------------------------------
 *)

exception Malformed_code


type encoding =
  [  `Enc_utf8       (* UTF-8 *)
  |  `Enc_java
  |  `Enc_utf16      (* UTF-16 with unspecified endianess (restricted usage) *)
  |  `Enc_utf16_le   (* UTF-16 little endian *)
  |  `Enc_utf16_be   (* UTF-16 big endian *)
  |  `Enc_usascii    (* US-ASCII (only 7 bit) *)
  |  `Enc_iso88591   (* ISO-8859-1 *)
  |  `Enc_iso88592   (* ISO-8859-2 *)
  |  `Enc_iso88593   (* ISO-8859-3 *)
  |  `Enc_iso88594   (* ISO-8859-4 *)
  |  `Enc_iso88595   (* ISO-8859-5 *)
  |  `Enc_iso88596   (* ISO-8859-6 *)
  |  `Enc_iso88597   (* ISO-8859-7 *)
  |  `Enc_iso88598   (* ISO-8859-8 *)
  |  `Enc_iso88599   (* ISO-8859-9 *)
  |  `Enc_iso885910  (* ISO-8859-10 *)
  |  `Enc_iso885913  (* ISO-8859-13 *)
  |  `Enc_iso885914  (* ISO-8859-14 *)
  |  `Enc_iso885915  (* ISO-8859-15 *)
  |  `Enc_koi8r      (* KOI8-R *)
  |  `Enc_jis0201    (* JIS-0201 *)
    (* Microsoft: *)
  |  `Enc_windows1250  (* WINDOWS-1250 *)
  |  `Enc_windows1251  (* WINDOWS-1251 *)
  |  `Enc_windows1252  (* WINDOWS-1252 *)
  |  `Enc_windows1253  (* WINDOWS-1253 *)
  |  `Enc_windows1254  (* WINDOWS-1254 *)
  |  `Enc_windows1255  (* WINDOWS-1255 *)
  |  `Enc_windows1256  (* WINDOWS-1256 *)
  |  `Enc_windows1257  (* WINDOWS-1257 *)
  |  `Enc_windows1258  (* WINDOWS-1258 *)
    (* IBM, ASCII-based: *)
  |  `Enc_cp437
  |  `Enc_cp737
  |  `Enc_cp775
  |  `Enc_cp850
  |  `Enc_cp852
  |  `Enc_cp855
  |  `Enc_cp856
  |  `Enc_cp857
  |  `Enc_cp860
  |  `Enc_cp861
  |  `Enc_cp862
  |  `Enc_cp863
  |  `Enc_cp864
  |  `Enc_cp865
  |  `Enc_cp866
  |  `Enc_cp869
  |  `Enc_cp874
  |  `Enc_cp1006
   (* IBM, EBCDIC-based: *)
  |  `Enc_cp037
  |  `Enc_cp424
  |  `Enc_cp500
  |  `Enc_cp875
  |  `Enc_cp1026
   (* Adobe: *)
  |  `Enc_adobe_standard_encoding
  |  `Enc_adobe_symbol_encoding
  |  `Enc_adobe_zapf_dingbats_encoding
   (* Apple: *)
  |  `Enc_macroman

  ]
;;


let norm_enc_name e =
  (* Removes some characters from e; uppercase *)
  let e' = String.create (String.length e) in
  let rec next i j =
    if i < String.length e then
      match e.[i] with
	  ('-'|'_'|'.') -> next (i+1) j
	| c             -> e'.[j] <- c; next (i+1) (j+1)
    else
      j
  in
  let l = next 0 0 in
  String.uppercase(String.sub e' 0 l)
;;


let encoding_of_string e =
  match norm_enc_name e with
      ("UTF16"|"UCS2"|"ISO10646UCS2")                 -> `Enc_utf16
    | "UTF16BE"                                       -> `Enc_utf16_be
    | "UTF16LE"                                       -> `Enc_utf16_le
    | "UTF8"                                          -> `Enc_utf8
    | ("UTF8JAVA"|"JAVA")                             -> `Enc_java
    | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii
    | ("ISO88591"|"LATIN1"|"IBM819"|"CP819")          -> `Enc_iso88591
    | ("ISO88592"|"LATIN2")                           -> `Enc_iso88592
    | ("ISO88593"|"LATIN3")                           -> `Enc_iso88593
    | ("ISO88594"|"LATIN4")                           -> `Enc_iso88594
    | ("ISO88595"|"CYRILLIC")                         -> `Enc_iso88595
    | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708")       -> `Enc_iso88596
    | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597
    | ("ISO88598"|"HEBREW")                           -> `Enc_iso88598
    | ("ISO88599"|"LATIN5")                           -> `Enc_iso88599
    | ("ISO885910"|"LATIN6")                          -> `Enc_iso885910
    | "ISO885913"                                     -> `Enc_iso885913
    | "ISO885914"                                     -> `Enc_iso885914
    | "ISO885915"                                     -> `Enc_iso885915
    | "KOI8R"                                         -> `Enc_koi8r
    | ("JIS0201"|"JISX0201"|"X0201")                  -> `Enc_jis0201

    | "WINDOWS1250"                                   -> `Enc_windows1250
    | "WINDOWS1251"                                   -> `Enc_windows1251
    | "WINDOWS1252"                                   -> `Enc_windows1252
    | "WINDOWS1253"                                   -> `Enc_windows1253
    | "WINDOWS1254"                                   -> `Enc_windows1254
    | "WINDOWS1255"                                   -> `Enc_windows1255
    | "WINDOWS1256"                                   -> `Enc_windows1256
    | "WINDOWS1257"                                   -> `Enc_windows1257
    | "WINDOWS1258"                                   -> `Enc_windows1258

    | ("CP437"|"IBM437")                              -> `Enc_cp437
    | ("CP737"|"IBM737")                              -> `Enc_cp737
    | ("CP775"|"IBM775")                              -> `Enc_cp775
    | ("CP850"|"IBM850")                              -> `Enc_cp850
    | ("CP852"|"IBM852")                              -> `Enc_cp852
    | ("CP855"|"IBM855")                              -> `Enc_cp855
    | ("CP856"|"IBM856")                              -> `Enc_cp856
    | ("CP857"|"IBM857")                              -> `Enc_cp857
    | ("CP860"|"IBM860")                              -> `Enc_cp860
    | ("CP861"|"IBM861")                              -> `Enc_cp861
    | ("CP862"|"IBM862")                              -> `Enc_cp862
    | ("CP863"|"IBM863")                              -> `Enc_cp863
    | ("CP864"|"IBM864")                              -> `Enc_cp864
    | ("CP865"|"IBM865")                              -> `Enc_cp865
    | ("CP866"|"IBM866")                              -> `Enc_cp866
    | ("CP869"|"IBM869")                              -> `Enc_cp869
    | ("CP874"|"IBM874")                              -> `Enc_cp874
    | ("CP1006"|"IBM1006")                            -> `Enc_cp1006

    | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"|
       "EBCDICCPNL")                                  -> `Enc_cp037
    | ("CP424"|"IBM424"|"EBCDICCPHE")                 -> `Enc_cp424
    | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH")    -> `Enc_cp500
    | ("CP875"|"IBM875")                              -> `Enc_cp875
    | ("CP1026"|"IBM1026")                            -> `Enc_cp1026

    | "ADOBESTANDARDENCODING"       -> `Enc_adobe_standard_encoding
    | "ADOBESYMBOLENCODING"         -> `Enc_adobe_symbol_encoding
    | "ADOBEZAPFDINGBATSENCODING"   -> `Enc_adobe_zapf_dingbats_encoding

    | "MACINTOSH"                   -> `Enc_macroman

    | _ ->
	failwith "Netconversion.encoding_of_string: unknown encoding"
;;


let string_of_encoding (e : encoding) =
  (* If there is a "preferred MIME name", this name is returned (see IANA). *)
  match e with
      `Enc_utf16    -> "UTF-16"
    | `Enc_utf16_be -> "UTF-16-BE"
    | `Enc_utf16_le -> "UTF-16-LE"
    | `Enc_utf8     -> "UTF-8"
    | `Enc_java     -> "UTF-8-JAVA"
    | `Enc_usascii  -> "US-ASCII"
    | `Enc_iso88591 -> "ISO-8859-1"
    | `Enc_iso88592 -> "ISO-8859-2"
    | `Enc_iso88593 -> "ISO-8859-3"
    | `Enc_iso88594 -> "ISO-8859-4"
    | `Enc_iso88595 -> "ISO-8859-5"
    | `Enc_iso88596 -> "ISO-8859-6"
    | `Enc_iso88597 -> "ISO-8859-7"
    | `Enc_iso88598 -> "ISO-8859-8"
    | `Enc_iso88599 -> "ISO-8859-9"
    | `Enc_iso885910 -> "ISO-8859-10"
    | `Enc_iso885913 -> "ISO-8859-13"
    | `Enc_iso885914 -> "ISO-8859-14"
    | `Enc_iso885915 -> "ISO-8859-15"
    | `Enc_koi8r     -> "KOI8-R"
    | `Enc_jis0201   -> "JIS_X0201"
    | `Enc_windows1250 -> "WINDOWS-1250"
    | `Enc_windows1251 -> "WINDOWS-1251"
    | `Enc_windows1252 -> "WINDOWS-1252"
    | `Enc_windows1253 -> "WINDOWS-1253"
    | `Enc_windows1254 -> "WINDOWS-1254"
    | `Enc_windows1255 -> "WINDOWS-1255"
    | `Enc_windows1256 -> "WINDOWS-1256"
    | `Enc_windows1257 -> "WINDOWS-1257"
    | `Enc_windows1258 -> "WINDOWS-1258"
    | `Enc_cp437   -> "CP437"
    | `Enc_cp737   -> "CP737"
    | `Enc_cp775   -> "CP775"
    | `Enc_cp850   -> "CP850"
    | `Enc_cp852   -> "CP852"
    | `Enc_cp855   -> "CP855"
    | `Enc_cp856   -> "CP856"
    | `Enc_cp857   -> "CP857"
    | `Enc_cp860   -> "CP860"
    | `Enc_cp861   -> "CP861"
    | `Enc_cp862   -> "CP862"
    | `Enc_cp863   -> "CP863"
    | `Enc_cp864   -> "CP864"
    | `Enc_cp865   -> "CP865"
    | `Enc_cp866   -> "CP866"
    | `Enc_cp869   -> "CP869"
    | `Enc_cp874   -> "CP874"
    | `Enc_cp1006  -> "CP1006"
    | `Enc_cp037   -> "CP037"
    | `Enc_cp424   -> "CP424"
    | `Enc_cp500   -> "CP500"
    | `Enc_cp875   -> "CP875"
    | `Enc_cp1026  -> "CP1026"
    | `Enc_adobe_standard_encoding      -> "ADOBE-STANDARD-ENCODING"
    | `Enc_adobe_symbol_encoding        -> "ADOBE-SYMBOL-ENCODING"
    | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING"
    | `Enc_macroman                     -> "MACINTOSH"
;;


let read_iso88591 write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in < l_in then begin
      let p = Char.code s_in.[p_in + k_in] in
      let n = write p k_out c_out in
      if n < 0 then
	k_in, k_out, `Enc_iso88591
      else
	scan (k_in + 1) (k_out + n) (c_out + 1)
    end
    else
      k_in, k_out, `Enc_iso88591
  in
  scan 0 0 0
;;


let read_usascii write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in < l_in then begin
      let p = Char.code s_in.[p_in + k_in] in
      if p >= 0x80 then raise Malformed_code;
      let n = write p k_out c_out in
      if n < 0 then
	k_in, k_out, `Enc_usascii
      else
	scan (k_in + 1) (k_out + n) (c_out + 1)
    end
    else
      k_in, k_out, `Enc_usascii
  in
  scan 0 0 0
;;


let read_8bit m_to_unicode enc write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in < l_in then begin
      let p_local = Char.code s_in.[p_in + k_in] in
      let p_uni = Array.unsafe_get m_to_unicode p_local in
      if p_uni < 0 then raise Malformed_code;
      let n = write p_uni k_out c_out in
      if n < 0 then
	k_in, k_out, enc
      else
	scan (k_in + 1) (k_out + n) (c_out + 1)
    end
    else
      k_in, k_out, enc
  in
  scan 0 0 0
;;


let read_utf8 is_java write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in < l_in then begin
      let n_out, n_in =
	match s_in.[p_in + k_in] with
	    '\000' ->
	      if is_java then raise Malformed_code;
	      write 0 k_out c_out, 1
	  | ('\001'..'\127' as c) ->
	      write (Char.code c) k_out c_out, 1
	  | ('\128'..'\223' as c) ->
	      if k_in + 1 >= l_in then
		-1, 0
	      else begin
		let n1 = Char.code c in
		let n2 = Char.code (s_in.[p_in + k_in + 1]) in
		if is_java && (n1 = 0x80 && n2 = 0xc0) then
		  write 0 k_out c_out, 2
		else begin
		  if n2 < 128 or n2 > 191 then raise Malformed_code;
		  let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in
		  if p < 128 then raise Malformed_code;
		  write p k_out c_out, 2
		end
	      end
	  | ('\224'..'\239' as c) ->
	      if k_in + 2 >= l_in then
		-1, 0
	      else begin
		let n1 = Char.code c in
		let n2 = Char.code (s_in.[p_in + k_in + 1]) in
		let n3 = Char.code (s_in.[p_in + k_in + 2]) in
		if n2 < 128 or n2 > 191 then raise Malformed_code;
		if n3 < 128 or n3 > 191 then raise Malformed_code;
		let p =
		  ((n1 land 0b1111) lsl 12) lor
		  ((n2 land 0b111111) lsl 6) lor
		  (n3 land 0b111111)
		in
		if p < 0x800 then raise Malformed_code;
		if (p >= 0xd800 && p < 0xe000) then
		  (* Surrogate pairs are not supported in UTF-8 *)
		  raise Malformed_code;
		if (p >= 0xfffe && p <= 0xffff) then
		  raise Malformed_code;
		write p k_out c_out, 3
	      end
	  | ('\240'..'\247' as c) ->
	      if k_in + 3 >= l_in then
		-1, 0
	      else begin
		let n1 = Char.code c in
		let n2 = Char.code (s_in.[p_in + k_in + 1]) in
		let n3 = Char.code (s_in.[p_in + k_in + 2]) in
		let n4 = Char.code (s_in.[p_in + k_in + 3]) in
		if n2 < 128 or n2 > 191 then raise Malformed_code;
		if n3 < 128 or n3 > 191 then raise Malformed_code;
		if n4 < 128 or n4 > 191 then raise Malformed_code;
		let p = ((n1 land 0b111) lsl 18) lor
			((n2 land 0b111111) lsl 12) lor
			((n3 land 0b111111) lsl 6) lor
			(n4 land 0b111111)
		in
		if p < 0x10000 then raise Malformed_code;
		if p >= 0x110000 then
		  (* These code points are not supported. *)
		  raise Malformed_code;
		write p k_out c_out, 4
	      end
	  | _ ->
	      (* Outside the valid range of XML characters *)
	      raise Malformed_code;
      in
      (* n_out: number of written bytes; -1 means out buf is full
       * n_in: number of read bytes; 0 means end of in buf reached
       * n_in = 0  implies  n_out = -1
       *)
      if n_out < 0 then
	k_in, k_out, `Enc_utf8
      else
	scan (k_in + n_in) (k_out + n_out) (c_out + 1)
    end
    else
      k_in, k_out, `Enc_utf8
  in
  scan 0 0 0
;;


let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;;
	
let read_utf16_le k_in_0 write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in + 1 < l_in then begin
      let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in

      if p >= 0xd800 & p < 0xe000 then begin
	(* This is a surrogate pair. *)
	if k_in + 3 < l_in then begin
	  if p <= 0xdbff then begin
	    let q = (Char.code s_in.[p_in + k_in + 2 ]) lor
		    ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in
	    if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
	    let eff_p = (p lsl 10) + q + surrogate_offset in
	    let n = write eff_p k_out c_out in
	    if n < 0 then
	      k_in, k_out, `Enc_utf16_le
	    else
	      scan (k_in + 4) (k_out + n) (c_out + 1)
	  end
	  else
	    (* Malformed pair: *)
	    raise Malformed_code;
	end
	else 
	  (* Incomplete pair: *)
	  k_in, k_out, `Enc_utf16_le
      end

      else
	if p = 0xfffe then 
	  (* Big endian byte order mark: It is illegal here *)
	  raise Malformed_code
	else begin
	  (* A regular code point *)
	  let n = write p k_out c_out in
	  if n < 0 then
	    k_in, k_out, `Enc_utf16_le
	  else
	    scan (k_in + 2) (k_out + n) (c_out + 1)
	end
    end
    else
      (* Incomplete character: *)
      k_in, k_out, `Enc_utf16_le
  in
  scan k_in_0 0 0
;;


let read_utf16_be k_in_0 write s_in p_in l_in =
  let rec scan k_in k_out c_out =
    if k_in + 1 < l_in then begin
      let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in

      if p >= 0xd800 & p < 0xe000 then begin
	(* This is a surrogate pair. *)
	if k_in + 3 < l_in then begin
	  if p <= 0xdbff then begin
	    let q = (Char.code s_in.[p_in + k_in + 3 ]) lor
		    ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in
	    if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
	    let eff_p = (p lsl 10) + q + surrogate_offset in
	    let n = write eff_p k_out c_out in
	    if n < 0 then
	      k_in, k_out, `Enc_utf16_be
	    else
	      scan (k_in + 4) (k_out + n) (c_out + 1)
	  end
	  else
	    (* Malformed pair: *)
	    raise Malformed_code;
	end
	else 
	  (* Incomplete pair: *)
	  k_in, k_out, `Enc_utf16_be
      end

      else
	if p = 0xfffe then
	  (* Little endian byte order mark: It is illegal here *)
	  raise Malformed_code
	else begin
	  (* A regular code point *)
	  let n = write p k_out c_out in
	  if n < 0 then
	    k_in, k_out, `Enc_utf16_be
	  else
	    scan (k_in + 2) (k_out + n) (c_out + 1)
	end

    end
    else
      (* Incomplete character: *)
      k_in, k_out, `Enc_utf16_be
  in
  scan k_in_0 0 0
;;


let read_utf16 write s_in p_in l_in =
  (* Expect a BOM at the beginning of the text *)
  if l_in >= 2 then begin
    let c0 = s_in.[p_in + 0] in
    let c1 = s_in.[p_in + 1] in
    if c0 = '\254' & c1 = '\255' then begin
      (* 0xfeff as big endian *)
      read_utf16_be 2 write s_in p_in l_in
    end
    else 
      if c0 = '\255' & c1 = '\254' then begin
	(* 0xfeff as little endian *)
	read_utf16_le 2 write s_in p_in l_in
      end
      else
	(* byte order mark missing *)
	raise Malformed_code
  end
  else
    0, 0, `Enc_utf16
;;


let write_iso88591 s_out p_out l_out max_chars w p k_out c_out =
  if k_out < l_out && c_out < max_chars then begin
    if p > 255 then begin
      let subst = w p in
      let l_subst =  String.length subst in
      if k_out + l_subst <= l_out then begin
	(* Enough space to store 'subst': *)
	String.blit subst 0 s_out (k_out+p_out) l_subst;
	l_subst
      end
      else
	(* Not enough space: Stop this round of recoding *)
	-1
    end
    else begin
      s_out.[p_out + k_out] <- Char.chr p;
      1
    end
  end
  else
    -1   (* End-of-buffer indicator *)
;;


let write_usascii s_out p_out l_out max_chars w p k_out c_out =
  if k_out < l_out && c_out < max_chars then begin
    if p > 127 then begin
      let subst = w p in
      let l_subst =  String.length subst in
      if k_out + l_subst <= l_out then begin
	(* Enough space to store 'subst': *)
	String.blit subst 0 s_out (k_out+p_out) l_subst;
	l_subst
      end
      else
	(* Not enough space: Stop this round of recoding *)
	-1
    end
    else begin
      s_out.[p_out + k_out] <- Char.chr p;
      1
    end
  end
  else
    -1   (* End-of-buffer indicator *)
;;


let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out =
  if k_out < l_out && c_out < max_chars then begin
    let p' =
      match Array.unsafe_get from_unicode (p land 255) with
	  Netmappings.U_nil -> -1
	| Netmappings.U_single (p0,q0) ->
	    if p0 = p then q0 else -1
	| Netmappings.U_list l ->
	    (try List.assoc p l with Not_found -> -1)
    in
    if p' < 0 then begin
      let subst = w p in
      let l_subst =  String.length subst in
      if k_out + l_subst <= l_out then begin
	(* Enough space to store 'subst': *)
	String.blit subst 0 s_out (k_out+p_out) l_subst;
	l_subst
      end
      else
	(* Not enough space: Stop this round of recoding *)
	-1
    end
    else begin
      s_out.[p_out + k_out] <- Char.chr p';
      1
    end
  end
  else
    -1   (* End-of-buffer indicator *)
;;


let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out =
  if p <= 127 && (not is_java || p <> 0) then begin
    if k_out < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out] <- Char.chr p;
      1
    end
    else -1
  end
  else if p <= 0x7ff then begin
    if k_out + 1 < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out]     <- Char.chr (0xc0 lor (p lsr 6));
      s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f));
      2
    end
    else -1
  end
  else if p <= 0xffff then begin
    (* Refuse writing surrogate pairs, and fffe, ffff *)
    if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then
      failwith "Netconversion.write_utf8";
    if k_out + 2 < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out]     <- Char.chr (0xe0 lor (p lsr 12));
      s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
      s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f));
      3
    end
    else -1
  end
  else if p <= 0x10ffff then begin
    if k_out + 3 < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out]     <- Char.chr (0xf0 lor (p lsr 18));
      s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f));
      s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6)  land 0x3f));
      s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f));
      4
    end
    else -1
  end
  else
    (* Higher code points are not possible in XML: *)
    failwith "Netconversion.write_utf8"
;;


let write_utf16_le s_out p_out l_out max_chars w p k_out c_out =
  if p >= 0xfffe then begin
    if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le";
    (* Must be written as surrogate pair *)
    if k_out + 3 < l_out && c_out < max_chars then begin
      let high = (p lsr 10) + 0xd800 in
      let low  = (p land 0x3ff) + 0xdc00 in
      s_out.[p_out + k_out    ] <- Char.chr (high land 0xff);
      s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8);
      s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff);
      s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8);
      4
    end
    else -1
  end
  else begin
    if k_out + 1 < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out    ] <- Char.chr (p land 0xff);
      s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8);
      2
    end
    else
      -1
  end
;;


let write_utf16_be s_out p_out l_out max_chars w p k_out c_out =
  if p >= 0xfffe then begin
    if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be";
    (* Must be written as surrogate pair *)
    if k_out + 3 < l_out && c_out < max_chars then begin
      let high = (p lsr 10) + 0xd800 in
      let low  = (p land 0x3ff) + 0xdc00 in
      s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff);
      s_out.[p_out + k_out    ] <- Char.chr (high lsr 8);
      s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff);
      s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8);
      4
    end
    else -1
  end
  else begin
    if k_out + 1 < l_out && c_out < max_chars then begin
      s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff);
      s_out.[p_out + k_out    ] <- Char.chr (p lsr 8);
      2
    end
    else
      -1
  end
;;


let recode ~in_enc
           ~in_buf
	   ~in_pos
	   ~in_len
	   ~out_enc
	   ~out_buf
           ~out_pos
	   ~out_len
	   ~max_chars
	   ~subst =
  if (in_pos < 0  || in_len < 0  || in_pos  + in_len  > String.length in_buf ||
      out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf)
  then
    invalid_arg "Netconversion.recode";

  let reader =
    match in_enc with
	`Enc_iso88591 -> read_iso88591
      | `Enc_usascii  -> read_usascii
      | `Enc_utf8     -> read_utf8 false
      | `Enc_java     -> read_utf8 true
      | `Enc_utf16    -> read_utf16
      | `Enc_utf16_le -> read_utf16_le 0
      | `Enc_utf16_be -> read_utf16_be 0
      | _             -> 
	  (try
	     let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in
	     let to_unicode =
	       Netmappings.lock();
	       Lazy.force to_unicode' in
	     Netmappings.unlock();
	     read_8bit to_unicode in_enc
	   with
	       Not_found ->
		 failwith("Support for the encoding `" ^
			  string_of_encoding in_enc ^ 
			  "' has not been compiled into Netstring")
	  )
  in
  let writer =
    match out_enc with
	`Enc_iso88591 -> write_iso88591  out_buf out_pos out_len max_chars subst
      | `Enc_usascii  -> write_usascii   out_buf out_pos out_len max_chars subst
      | `Enc_utf8     -> write_utf8 false 
                                         out_buf out_pos out_len max_chars subst
      | `Enc_java     -> write_utf8 true out_buf out_pos out_len max_chars subst
      | `Enc_utf16    -> failwith "Netconversion.recode"
      | `Enc_utf16_le -> write_utf16_le  out_buf out_pos out_len max_chars subst
      | `Enc_utf16_be -> write_utf16_be  out_buf out_pos out_len max_chars subst
      | _             -> 
	  (try
	     let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc 
	     in
	     let from_unicode =
	       Netmappings.lock();
	       Lazy.force from_unicode' in
	     Netmappings.unlock();
	     write_8bit from_unicode out_buf out_pos out_len max_chars subst
	   with
	       Not_found ->
		 failwith("Support for the encoding `" ^
			  string_of_encoding out_enc ^ 
			  "' has not been compiled into Netstring")
	  )
  in
  reader writer in_buf in_pos in_len
;;


let makechar enc p =
  match enc with
      `Enc_iso88591 -> 
	if p > 255 then raise Not_found;
	String.make 1 (Char.chr p)
    | `Enc_usascii ->
	if p > 127 then raise Not_found;
	String.make 1 (Char.chr p)
    | `Enc_utf8 ->
	let s = String.create 4 in
	let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
	String.sub s 0 n
    | `Enc_java ->
	let s = String.create 4 in
	let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
	String.sub s 0 n
    | `Enc_utf16_le ->
	let s = String.create 4 in
	let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
	String.sub s 0 n
    | `Enc_utf16_be ->
	let s = String.create 4 in
	let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
	String.sub s 0 n
    | `Enc_utf16 ->
	failwith "Netconversion.makechar"
    | _ ->
	let s = String.create 1 in
	let from_unicode' = 
	  try
	    Hashtbl.find Netmappings.from_unicode enc 
	  with
	      Not_found ->
		failwith("Support for the encoding `" ^
			 string_of_encoding enc ^ 
			 "' has not been compiled into Netstring")
	in
	let from_unicode =
	  Netmappings.lock();
	  Lazy.force from_unicode' in
	Netmappings.unlock();
	let n =
	  write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in
	s
;;


let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s =

  let length = String.length s in
  let size = 1024 in
  let out_buf = String.create size in

  let rec recode_loop k s_done in_enc =
    (* 'k' bytes of 's' have already been processed, and the result is in
     * 's_done'.
     *)
    (* Recode to 'out_buf': *)
    let in_len = length - k in
    let in_done, out_done, in_enc' =
      recode ~in_enc:in_enc   ~in_buf:s        ~in_pos:k     ~in_len:in_len
             ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0    ~out_len:size  
             ~max_chars:size  ~subst:subst in
    (* Collect the results: *)
    let k' = k + in_done in
    let s_done' = String.sub out_buf 0 out_done :: s_done in
    (* Still something to do? *)
    if k' < length then
      recode_loop k' s_done' in_enc'
    else
      (* No: Concatenate s_done' to get the final result. *)
      String.concat "" (List.rev s_done')
  in

  recode_loop 0 [] in_enc
;;


(* ======================================================================
 * History:
 * 
 * $Log: netconversion.ml,v $
 * Revision 1.2  2000/08/29 00:46:41  gerd
 * 	New type for the Unicode to 8 bit translation table.
 * 	The Netmappings tables are now Lazy.t.
 *
 * Revision 1.1  2000/08/13 00:02:57  gerd
 * 	Initial revision.
 *
 *
 * ======================================================================
 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml):
 * 
 * Revision 1.5  2000/07/27 00:41:14  gerd
 * 	new 8 bit codes
 *
 * Revision 1.4  2000/07/04 22:11:41  gerd
 * 	Implemented the enhancements and extensions of
 * rev. 1.4 of pxp_encoding.mli.
 *
 * Revision 1.3  2000/05/29 23:48:38  gerd
 * 	Changed module names:
 * 		Markup_aux          into Pxp_aux
 * 		Markup_codewriter   into Pxp_codewriter
 * 		Markup_document     into Pxp_document
 * 		Markup_dtd          into Pxp_dtd
 * 		Markup_entity       into Pxp_entity
 * 		Markup_lexer_types  into Pxp_lexer_types
 * 		Markup_reader       into Pxp_reader
 * 		Markup_types        into Pxp_types
 * 		Markup_yacc         into Pxp_yacc
 * See directory "compatibility" for (almost) compatible wrappers emulating
 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 *
 * Revision 1.2  2000/05/29 21:14:57  gerd
 * 	Changed the type 'encoding' into a polymorphic variant.
 *
 * Revision 1.1  2000/05/20 20:30:50  gerd
 * 	Initial revision.
 *
 * 
 *)