1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
exception Error
exception InvalidCodepoint of int
let eof = -1
(* Absolute position from the beginning of the stream *)
type apos = int
type lexbuf = {
refill: (int array -> int -> int -> int);
mutable buf: int array;
mutable len: int; (* Number of meaningful char in buffer *)
mutable offset: apos; (* Position of the first char in buffer
in the input stream *)
mutable pos : int;
mutable start : int; (* First char we need to keep visible *)
mutable marked_pos : int;
mutable marked_val : int;
mutable finished: bool;
}
let get_buf lb = lb.buf
let get_pos lb = lb.pos
let get_start lb = lb.start
let chunk_size = 512
let empty_lexbuf = {
refill = (fun _ _ _ -> assert false);
buf = [| |];
len = 0;
offset = 0;
pos = 0;
start = 0;
marked_pos = 0;
marked_val = 0;
finished = false;
}
let create f = {
empty_lexbuf with
refill = f;
buf = Array.create chunk_size 0;
}
let from_stream s =
create (fun buf pos len ->
try buf.(pos) <- Stream.next s; 1
with Stream.Failure -> 0)
let from_latin1_stream s =
create (fun buf pos len ->
try buf.(pos) <- Char.code (Stream.next s); 1
with Stream.Failure -> 0)
let from_utf8_stream s =
create (fun buf pos len ->
try buf.(pos) <- Utf8.from_stream s; 1
with Stream.Failure -> 0)
type enc = Ascii | Latin1 | Utf8
exception MalFormed
let from_var_enc_stream enc s =
create (fun buf pos len ->
try
buf.(pos) <- (match !enc with
| Ascii ->
let c = Char.code (Stream.next s) in
if c > 127 then raise (InvalidCodepoint c);
c
| Latin1 -> Char.code (Stream.next s)
| Utf8 -> Utf8.from_stream s);
1
with Stream.Failure -> 0)
let from_var_enc_string enc s =
from_var_enc_stream enc (Stream.of_string s)
let from_var_enc_channel enc ic =
from_var_enc_stream enc (Stream.of_channel ic)
let from_latin1_string s =
let len = String.length s in
{
empty_lexbuf with
buf = Array.init len (fun i -> Char.code s.[i]);
len = len;
finished = true;
}
let from_latin1_channel ic =
from_latin1_stream (Stream.of_channel ic)
let from_utf8_channel ic =
from_stream (Utf8.stream_from_char_stream (Stream.of_channel ic))
let from_int_array a =
let len = Array.length a in
{
empty_lexbuf with
buf = Array.init len (fun i -> a.(i));
len = len;
finished = true;
}
let from_utf8_string s =
from_int_array (Utf8.to_int_array s 0 (String.length s))
let refill lexbuf =
if lexbuf.len + chunk_size > Array.length lexbuf.buf
then begin
let s = lexbuf.start in
let ls = lexbuf.len - s in
if ls + chunk_size <= Array.length lexbuf.buf then
Array.blit lexbuf.buf s lexbuf.buf 0 ls
else begin
let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in
let newbuf = Array.create newlen 0 in
Array.blit lexbuf.buf s newbuf 0 ls;
lexbuf.buf <- newbuf
end;
lexbuf.len <- ls;
lexbuf.offset <- lexbuf.offset + s;
lexbuf.pos <- lexbuf.pos - s;
lexbuf.marked_pos <- lexbuf.marked_pos - s;
lexbuf.start <- 0
end;
let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in
if (n = 0)
then begin
lexbuf.buf.(lexbuf.len) <- eof;
lexbuf.len <- lexbuf.len + 1;
end
else lexbuf.len <- lexbuf.len + n
let next lexbuf =
let i =
if lexbuf.pos = lexbuf.len then
if lexbuf.finished then eof
else (refill lexbuf; lexbuf.buf.(lexbuf.pos))
else lexbuf.buf.(lexbuf.pos)
in
if i = eof then lexbuf.finished <- true else lexbuf.pos <- lexbuf.pos + 1;
i
let start lexbuf =
lexbuf.start <- lexbuf.pos;
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_val <- (-1)
let mark lexbuf i =
lexbuf.marked_pos <- lexbuf.pos;
lexbuf.marked_val <- i
let backtrack lexbuf =
lexbuf.pos <- lexbuf.marked_pos;
lexbuf.marked_val
let rollback lexbuf =
lexbuf.pos <- lexbuf.start
let lexeme_start lexbuf = lexbuf.start + lexbuf.offset
let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset
let loc lexbuf = (lexbuf.start + lexbuf.offset, lexbuf.pos + lexbuf.offset)
let lexeme_length lexbuf = lexbuf.pos - lexbuf.start
let sub_lexeme lexbuf pos len =
Array.sub lexbuf.buf (lexbuf.start + pos) len
let lexeme lexbuf =
Array.sub lexbuf.buf (lexbuf.start) (lexbuf.pos - lexbuf.start)
let lexeme_char lexbuf pos =
lexbuf.buf.(lexbuf.start + pos)
let to_latin1 c =
if (c >= 0) && (c < 256)
then Char.chr c
else raise (InvalidCodepoint c)
let latin1_lexeme_char lexbuf pos =
to_latin1 (lexeme_char lexbuf pos)
let latin1_sub_lexeme lexbuf pos len =
let s = String.create len in
for i = 0 to len - 1 do s.[i] <- to_latin1 lexbuf.buf.(lexbuf.start + pos + i) done;
s
let latin1_lexeme lexbuf =
latin1_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start)
let utf8_sub_lexeme lexbuf pos len =
Utf8.from_int_array lexbuf.buf (lexbuf.start + pos) len
let utf8_lexeme lexbuf =
utf8_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start)
|