File: pa_ulex.ml

package info (click to toggle)
ulex 0.5-3
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 156 kB
  • ctags: 260
  • sloc: ml: 1,070; makefile: 73; sh: 50
file content (265 lines) | stat: -rw-r--r-- 7,740 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
let loc = (Lexing.dummy_pos,Lexing.dummy_pos)

(* Named regexp *)

let named_regexps =
  (Hashtbl.create 13 : (string, Ulex.regexp) Hashtbl.t)

let () =
  List.iter (fun (n,c) -> Hashtbl.add named_regexps n (Ulex.chars c))
    [
      "eof", Cset.eof;
      "xml_letter", Cset.letter;
      "xml_digit", Cset.digit;
      "xml_extender", Cset.extender;
      "xml_base_char", Cset.base_char;
      "xml_ideographic", Cset.ideographic;
      "xml_combining_char", Cset.combining_char;
      "xml_blank", Cset.blank;

      "tr8876_ident_char", Cset.tr8876_ident_char;
    ] 

(* Decision tree for partitions *)

type decision_tree =
  | Lte of int * decision_tree * decision_tree
  | Table of int * int array
  | Return of int

let decision l =
  let l = List.map (fun (a,b,i) -> (a,b,Return i)) l in
  let rec merge2 = function
    | (a1,b1,d1) :: (a2,b2,d2) :: rest ->
	let x =
	  if b1 + 1 = a2 then d2
	  else Lte (a2 - 1,Return (-1), d2)
	in
	(a1,b2, Lte (b1,d1, x)) :: (merge2 rest)
    | rest -> rest in
  let rec aux = function
    | _::_::_ as l -> aux (merge2 l)
    | [(a,b,d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1)))
    | _ -> Return (-1)
  in
  aux l

let limit = 8192

let decision_table l =
  let rec aux m accu = function
    | ((a,b,i) as x)::rem when (b < limit && i < 255)-> 
	aux (min a m) (x::accu) rem
    | rem -> m,accu,rem in
  match (aux max_int [] l : int * 'a list * 'b list) with
    | _,[], _ -> decision l
    | min,((_,max,_)::_ as l1), l2 ->
	let arr = Array.create (max-min+1) 0 in
	List.iter (fun (a,b,i) -> for j = a to b do arr.(j-min) <- i + 1 done) l1;
	Lte (min-1, Return (-1), Lte (max, Table (min,arr), decision l2))

let rec simplify min max = function
  | Lte (i,yes,no) ->
      if i >= max then simplify min max yes 
      else if i < min then simplify min max no
      else Lte (i, simplify min i yes, simplify (i+1) max no)
  | x -> x
		   

let tables = Hashtbl.create 31
let tables_counter = ref 0
let get_tables () = 
  let t = Hashtbl.fold (fun key x accu -> (x,key)::accu) tables [] in
  Hashtbl.clear tables;
  t
let table_name t =
  try Hashtbl.find tables t
  with Not_found ->
    incr tables_counter;
    let n = Printf.sprintf "__ulex_table_%i" !tables_counter in
    Hashtbl.add tables t n;
    n

let output_byte buf b =
  Buffer.add_char buf '\\';
  Buffer.add_char buf (Char.chr(48 + b / 100));
  Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10));
  Buffer.add_char buf (Char.chr(48 + b mod 10))

let output_byte_array v =
  let b = Buffer.create (Array.length v * 5) in
  for i = 0 to Array.length v - 1 do
    output_byte b (v.(i) land 0xFF);
    if i land 15 = 15 then Buffer.add_string b "\\\n    "
  done;
  let s = Buffer.contents b in
  <:expr< $str:s$ >>

let table (n,t) = <:str_item< value $lid:n$ = $output_byte_array t$ >>

let partition_name i = Printf.sprintf "__ulex_partition_%i" i

let partition (i,p) =
  let rec gen_tree = function 
    | Lte (i,yes,no) ->
	<:expr< if (c <= $int: string_of_int i$) 
	then $gen_tree yes$ else $gen_tree no$ >>
    | Return i ->
	<:expr< $int: string_of_int i$ >>
    | Table (offset, t) ->
	let c = if offset = 0 then <:expr< c >> 
	else <:expr< (c - $int: string_of_int offset$) >> in
	<:expr< Char.code ($lid: table_name t$.[$c$]) - 1>>
  in
  let body = gen_tree (simplify (-1) (Cset.max_code) (decision_table p)) in
  let f = partition_name i in
  <:str_item< value $lid:f$ = fun c -> $body$ >>


(* Code generation for the automata *)

let best_final final =
  let fin = ref None in
  Array.iteri 
    (fun i b -> if b && (!fin = None) then fin := Some i) final;
  !fin

let call_state auto state =
  match auto.(state) with (_,trans,final) ->
    if Array.length trans = 0 
    then match best_final final with
      | Some i -> <:expr< $int:string_of_int i$ >>
      | None -> assert false
    else
      let f = Printf.sprintf "__ulex_state_%i" state in
      <:expr< $lid:f$ lexbuf >>
	

let gen_state auto loc i (part,trans,final) = 
  let f = Printf.sprintf "__ulex_state_%i" i in
  let p = partition_name part in
  let cases =
    Array.mapi 
      (fun i j ->
	 <:patt< $int:string_of_int i$ >>,
	 None,
	 call_state auto j
      ) trans in
  let cases = Array.to_list cases in
  let cases = cases @ [<:patt< _ >>, None, <:expr< Ulexing.backtrack lexbuf >>] in
  let body = 
    <:expr< match ($lid:p$ (Ulexing.next lexbuf)) 
    with [ $list:cases$ ] >> in
  let ret body =
    [<:patt< $lid:f$ >>, <:expr< fun lexbuf -> $body$ >>] in
  match best_final final with
    | None -> ret body
    | Some i -> 
	if Array.length trans = 0 then [] else
	  ret
	  <:expr< do { Ulexing.mark lexbuf $int:string_of_int i$;  $body$ } >>


let gen_definition loc l =
  let brs = Array.of_list l in
  let rs = Array.map fst brs in
  let auto = Ulex.compile rs in

  let cases = Array.mapi (fun i (_,e) -> <:patt< $int:string_of_int i$ >>, None, e) brs in
  let cases = Array.to_list cases in
  let cases = cases @ [<:patt< _ >>, None, <:expr< raise Ulexing.Error >>] in
  let actions = <:expr< match __ulex_state_0 lexbuf with [ $list:cases$ ] >> in
  let states = Array.mapi (gen_state auto loc) auto in
  let states = List.flatten (Array.to_list states) in
  let body = <:expr< let rec $list:states$ in do { Ulexing.start lexbuf; $actions$ } >> in
  <:expr< fun lexbuf -> $body$ >>


(* Lexer specification parser *)

let char s = 
  Char.code (Token.eval_char s)

let char_int s =
  let i = int_of_string s in
  if (i >=0) && (i <= Cset.max_code) then i
  else failwith ("Invalid Unicode code point: " ^ s)

let regexp_for_string s =
  let rec aux n =
    if n = String.length s then Ulex.eps
    else 
      Ulex.seq (Ulex.chars (Cset.singleton (Char.code s.[n]))) (aux (succ n))
  in aux 0

EXTEND
 GLOBAL: Pcaml.expr Pcaml.str_item;

 Pcaml.expr: [
  [ "lexer";
     OPT "|"; l = LIST0 [ r=regexp; "->"; a=Pcaml.expr -> (r,a) ] SEP "|" ->
       gen_definition loc l ]
 ];

 Pcaml.str_item: [
   [ "let"; LIDENT "regexp"; x = LIDENT; "="; r = regexp ->
       if Hashtbl.mem named_regexps x then
         Printf.eprintf 
           "pa_ulex (warning): multiple definition of named regexp '%s'\n"
           x;
     Hashtbl.add named_regexps x r;
       <:str_item< declare $list: []$ end >>
   ]
 ];
 
 regexp: [
   [ r1 = regexp; "|"; r2 = regexp -> Ulex.alt r1 r2 ]
 | [ r1 = regexp; r2 = regexp -> Ulex.seq r1 r2 ]
 | [ r = regexp; "*" -> Ulex.rep r
   | r = regexp; "+" -> Ulex.plus r
   | r = regexp; "?" -> Ulex.alt Ulex.eps r
   | "("; r = regexp; ")" -> r
   | "_" -> Ulex.chars Cset.any
   | c = chr -> Ulex.chars (Cset.singleton c)
   | s = STRING -> regexp_for_string (Token.eval_string loc s)
   | "["; cc = ch_class; "]" ->  Ulex.chars cc
   | x = LIDENT ->
       try  Hashtbl.find named_regexps x
       with Not_found ->
         failwith 
           ("pa_ulex (error): reference to unbound regexp name `"^x^"'")
   ]
 ];

 chr: [ 
   [ c = CHAR -> char c
   | i = INT -> char_int i ]
 ];

 ch_class: [
   [ "^"; cc = ch_class -> Cset.difference Cset.any cc]
 | [ c1 = chr; "-"; c2 = chr -> Cset.interval c1 c2
   | c = chr -> Cset.singleton c
   | cc1 = ch_class; cc2 = ch_class -> Cset.union cc1 cc2
   | s = STRING -> 
       let s = Token.eval_string loc s in
       let c = ref Cset.empty in
       for i = 0 to String.length s - 1 do
	 c := Cset.union !c (Cset.singleton (Char.code s.[i])) 
       done;
       !c
   ]
 ];
END


let () =
  let old_parse_implem = !Pcaml.parse_implem in
  let new_parse_implem s =
    let (items,d) = old_parse_implem s in
    let parts = List.map partition (Ulex.partitions ()) in
    let tables = List.map table (get_tables ()) in
    (<:str_item< declare $list:tables@parts$ end >>, loc) :: items, d
  in
  Pcaml.parse_implem := new_parse_implem