1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
|
let loc = (Lexing.dummy_pos,Lexing.dummy_pos)
(* Named regexp *)
let named_regexps =
(Hashtbl.create 13 : (string, Ulex.regexp) Hashtbl.t)
let () =
List.iter (fun (n,c) -> Hashtbl.add named_regexps n (Ulex.chars c))
[
"eof", Cset.eof;
"xml_letter", Cset.letter;
"xml_digit", Cset.digit;
"xml_extender", Cset.extender;
"xml_base_char", Cset.base_char;
"xml_ideographic", Cset.ideographic;
"xml_combining_char", Cset.combining_char;
"xml_blank", Cset.blank;
"tr8876_ident_char", Cset.tr8876_ident_char;
]
(* Decision tree for partitions *)
type decision_tree =
| Lte of int * decision_tree * decision_tree
| Table of int * int array
| Return of int
let decision l =
let l = List.map (fun (a,b,i) -> (a,b,Return i)) l in
let rec merge2 = function
| (a1,b1,d1) :: (a2,b2,d2) :: rest ->
let x =
if b1 + 1 = a2 then d2
else Lte (a2 - 1,Return (-1), d2)
in
(a1,b2, Lte (b1,d1, x)) :: (merge2 rest)
| rest -> rest in
let rec aux = function
| _::_::_ as l -> aux (merge2 l)
| [(a,b,d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1)))
| _ -> Return (-1)
in
aux l
let limit = 8192
let decision_table l =
let rec aux m accu = function
| ((a,b,i) as x)::rem when (b < limit && i < 255)->
aux (min a m) (x::accu) rem
| rem -> m,accu,rem in
match (aux max_int [] l : int * 'a list * 'b list) with
| _,[], _ -> decision l
| min,((_,max,_)::_ as l1), l2 ->
let arr = Array.create (max-min+1) 0 in
List.iter (fun (a,b,i) -> for j = a to b do arr.(j-min) <- i + 1 done) l1;
Lte (min-1, Return (-1), Lte (max, Table (min,arr), decision l2))
let rec simplify min max = function
| Lte (i,yes,no) ->
if i >= max then simplify min max yes
else if i < min then simplify min max no
else Lte (i, simplify min i yes, simplify (i+1) max no)
| x -> x
let tables = Hashtbl.create 31
let tables_counter = ref 0
let get_tables () =
let t = Hashtbl.fold (fun key x accu -> (x,key)::accu) tables [] in
Hashtbl.clear tables;
t
let table_name t =
try Hashtbl.find tables t
with Not_found ->
incr tables_counter;
let n = Printf.sprintf "__ulex_table_%i" !tables_counter in
Hashtbl.add tables t n;
n
let output_byte buf b =
Buffer.add_char buf '\\';
Buffer.add_char buf (Char.chr(48 + b / 100));
Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10));
Buffer.add_char buf (Char.chr(48 + b mod 10))
let output_byte_array v =
let b = Buffer.create (Array.length v * 5) in
for i = 0 to Array.length v - 1 do
output_byte b (v.(i) land 0xFF);
if i land 15 = 15 then Buffer.add_string b "\\\n "
done;
let s = Buffer.contents b in
<:expr< $str:s$ >>
let table (n,t) = <:str_item< value $lid:n$ = $output_byte_array t$ >>
let partition_name i = Printf.sprintf "__ulex_partition_%i" i
let partition (i,p) =
let rec gen_tree = function
| Lte (i,yes,no) ->
<:expr< if (c <= $int: string_of_int i$)
then $gen_tree yes$ else $gen_tree no$ >>
| Return i ->
<:expr< $int: string_of_int i$ >>
| Table (offset, t) ->
let c = if offset = 0 then <:expr< c >>
else <:expr< (c - $int: string_of_int offset$) >> in
<:expr< Char.code ($lid: table_name t$.[$c$]) - 1>>
in
let body = gen_tree (simplify (-1) (Cset.max_code) (decision_table p)) in
let f = partition_name i in
<:str_item< value $lid:f$ = fun c -> $body$ >>
(* Code generation for the automata *)
let best_final final =
let fin = ref None in
Array.iteri
(fun i b -> if b && (!fin = None) then fin := Some i) final;
!fin
let call_state auto state =
match auto.(state) with (_,trans,final) ->
if Array.length trans = 0
then match best_final final with
| Some i -> <:expr< $int:string_of_int i$ >>
| None -> assert false
else
let f = Printf.sprintf "__ulex_state_%i" state in
<:expr< $lid:f$ lexbuf >>
let gen_state auto loc i (part,trans,final) =
let f = Printf.sprintf "__ulex_state_%i" i in
let p = partition_name part in
let cases =
Array.mapi
(fun i j ->
<:patt< $int:string_of_int i$ >>,
None,
call_state auto j
) trans in
let cases = Array.to_list cases in
let cases = cases @ [<:patt< _ >>, None, <:expr< Ulexing.backtrack lexbuf >>] in
let body =
<:expr< match ($lid:p$ (Ulexing.next lexbuf))
with [ $list:cases$ ] >> in
let ret body =
[<:patt< $lid:f$ >>, <:expr< fun lexbuf -> $body$ >>] in
match best_final final with
| None -> ret body
| Some i ->
if Array.length trans = 0 then [] else
ret
<:expr< do { Ulexing.mark lexbuf $int:string_of_int i$; $body$ } >>
let gen_definition loc l =
let brs = Array.of_list l in
let rs = Array.map fst brs in
let auto = Ulex.compile rs in
let cases = Array.mapi (fun i (_,e) -> <:patt< $int:string_of_int i$ >>, None, e) brs in
let cases = Array.to_list cases in
let cases = cases @ [<:patt< _ >>, None, <:expr< raise Ulexing.Error >>] in
let actions = <:expr< match __ulex_state_0 lexbuf with [ $list:cases$ ] >> in
let states = Array.mapi (gen_state auto loc) auto in
let states = List.flatten (Array.to_list states) in
let body = <:expr< let rec $list:states$ in do { Ulexing.start lexbuf; $actions$ } >> in
<:expr< fun lexbuf -> $body$ >>
(* Lexer specification parser *)
let char s =
Char.code (Token.eval_char s)
let char_int s =
let i = int_of_string s in
if (i >=0) && (i <= Cset.max_code) then i
else failwith ("Invalid Unicode code point: " ^ s)
let regexp_for_string s =
let rec aux n =
if n = String.length s then Ulex.eps
else
Ulex.seq (Ulex.chars (Cset.singleton (Char.code s.[n]))) (aux (succ n))
in aux 0
EXTEND
GLOBAL: Pcaml.expr Pcaml.str_item;
Pcaml.expr: [
[ "lexer";
OPT "|"; l = LIST0 [ r=regexp; "->"; a=Pcaml.expr -> (r,a) ] SEP "|" ->
gen_definition loc l ]
];
Pcaml.str_item: [
[ "let"; LIDENT "regexp"; x = LIDENT; "="; r = regexp ->
if Hashtbl.mem named_regexps x then
Printf.eprintf
"pa_ulex (warning): multiple definition of named regexp '%s'\n"
x;
Hashtbl.add named_regexps x r;
<:str_item< declare $list: []$ end >>
]
];
regexp: [
[ r1 = regexp; "|"; r2 = regexp -> Ulex.alt r1 r2 ]
| [ r1 = regexp; r2 = regexp -> Ulex.seq r1 r2 ]
| [ r = regexp; "*" -> Ulex.rep r
| r = regexp; "+" -> Ulex.plus r
| r = regexp; "?" -> Ulex.alt Ulex.eps r
| "("; r = regexp; ")" -> r
| "_" -> Ulex.chars Cset.any
| c = chr -> Ulex.chars (Cset.singleton c)
| s = STRING -> regexp_for_string (Token.eval_string loc s)
| "["; cc = ch_class; "]" -> Ulex.chars cc
| x = LIDENT ->
try Hashtbl.find named_regexps x
with Not_found ->
failwith
("pa_ulex (error): reference to unbound regexp name `"^x^"'")
]
];
chr: [
[ c = CHAR -> char c
| i = INT -> char_int i ]
];
ch_class: [
[ "^"; cc = ch_class -> Cset.difference Cset.any cc]
| [ c1 = chr; "-"; c2 = chr -> Cset.interval c1 c2
| c = chr -> Cset.singleton c
| cc1 = ch_class; cc2 = ch_class -> Cset.union cc1 cc2
| s = STRING ->
let s = Token.eval_string loc s in
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
!c
]
];
END
let () =
let old_parse_implem = !Pcaml.parse_implem in
let new_parse_implem s =
let (items,d) = old_parse_implem s in
let parts = List.map partition (Ulex.partitions ()) in
let tables = List.map table (get_tables ()) in
(<:str_item< declare $list:tables@parts$ end >>, loc) :: items, d
in
Pcaml.parse_implem := new_parse_implem
|