1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
exception MalFormed
exception InvalidCodepoint of int
type byte_order = Little_endian | Big_endian
let get_byte_order c0 c1 =
match (Char.code c0, Char.code c1) with
| (0xfe,0xff) -> Big_endian
| (0xff,0xfe) -> Little_endian
| _ -> raise MalFormed
let number_of_char_pair bo c1 c2 = match bo with
| Little_endian -> ((Char.code c2) lsl 8) + (Char.code c1)
| Big_endian -> ((Char.code c1) lsl 8) + (Char.code c2)
let char_pair_of_number bo num = match bo with
| Little_endian ->
(Char.chr (num land 0xFF), Char.chr ((num lsr 8) land 0xFF ))
| Big_endian ->
(Char.chr ((num lsr 8) land 0xFF), Char.chr (num land 0xFF))
let next_in_string bo s pos bytes =
if (pos + 1 >= bytes) then raise MalFormed;
number_of_char_pair bo s.[pos] s.[pos+1]
let next_code bo s pos bytes =
let w1 = next_in_string bo s pos bytes in
if w1 = 0xfffe then raise (InvalidCodepoint w1);
if w1 < 0xd800 || 0xdfff < w1 then (w1, pos+2)
else if w1 <= 0xdbff
then
let w2 = next_in_string bo s (pos + 2) bytes in
if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed;
let upper10 = (w1 land 0x3ff) lsl 10
and lower10 = w2 land 0x3ff in
(0x10000 + upper10 + lower10, pos + 4)
else raise MalFormed
let next_in_stream bo s =
let c1 = Stream.next s in
let c2 = Stream.next s in
number_of_char_pair bo c1 c2
let from_stream bo s w1 =
if w1 = 0xfffe then raise (InvalidCodepoint w1);
if w1 < 0xd800 || 0xdfff < w1 then w1
else if w1 <= 0xdbff
then
let w2 = next_in_stream bo s in
if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed;
let upper10 = (w1 land 0x3ff) lsl 10
and lower10 = w2 land 0x3ff in
0x10000 + upper10 + lower10
else raise MalFormed
let stream_from_char_stream opt_bo s =
let bo = ref opt_bo in
Stream.from
(fun _ ->
try
let c1 = Stream.next s in
let c2 = Stream.next s in
let o = match !bo with
| Some o -> o
| None ->
let o = match (Char.code c1, Char.code c2) with
| (0xff,0xfe) -> Little_endian
| _ -> Big_endian in
bo := Some o;
o in
Some (from_stream o s (number_of_char_pair o c1 c2))
with Stream.Failure -> None)
let compute_len opt_bo str pos bytes =
let s = stream_from_char_stream opt_bo
(Stream.from (fun i -> if i + pos >= bytes then None
else Some (str.[i + pos])))
in
let l = ref 0 in
Stream.iter (fun _ -> incr l) s ;
!l
let rec blit_to_int opt_bo s spos a apos bytes =
let s = stream_from_char_stream opt_bo
(Stream.from (fun i -> if i+spos >= bytes then None
else Some (s.[i + spos]))) in
let p = ref apos in
try while true do a.(!p) <- Stream.next s ; incr p done; assert false
with Stream.Failure -> ()
let to_int_array opt_bo s pos bytes =
let len = compute_len opt_bo s pos bytes in
let a = Array.create len 0 in
blit_to_int opt_bo s pos a 0 bytes ;
a
let store bo buf code =
if code < 0x10000
then (
let (c1,c2) = char_pair_of_number bo code in
Buffer.add_char buf c1;
Buffer.add_char buf c2
) else (
let u' = code - 0x10000 in
let w1 = 0xd800 + (u' lsr 10)
and w2 = 0xdc00 + (u' land 0x3ff) in
let (c1,c2) = char_pair_of_number bo w1
and (c3,c4) = char_pair_of_number bo w2 in
Buffer.add_char buf c1;
Buffer.add_char buf c2;
Buffer.add_char buf c3;
Buffer.add_char buf c4
)
let from_int_array bo a apos len bom =
let b = Buffer.create (len * 4) in
if bom then store bo b 0xfeff ; (* first, store the BOM *)
let rec aux apos len =
if len > 0
then (store bo b a.(apos); aux (succ apos) (pred len))
else Buffer.contents b in
aux apos len
let from_stream bo s =
from_stream bo s (next_in_stream bo s)
let from_utf16_stream s opt_bo =
Ulexing.from_stream (stream_from_char_stream opt_bo s)
let from_utf16_channel ic opt_bo =
from_utf16_stream ((Stream.of_channel ic)) opt_bo
let from_utf16_string s opt_bo =
let a = to_int_array opt_bo s 0 (String.length s) in
Ulexing.from_int_array a
let utf16_sub_lexeme lb pos len bo bom =
from_int_array bo (Ulexing.get_buf lb) (Ulexing.get_start lb + pos) len bom
let utf16_lexeme lb bo bom =
utf16_sub_lexeme lb 0 (Ulexing.get_pos lb - Ulexing.get_start lb) bo bom
|