File: regexp_ast.ml

package info (click to toggle)
mikmatch 1.0.9-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 484 kB
  • sloc: ml: 3,898; makefile: 349; sh: 2
file content (269 lines) | stat: -rw-r--r-- 7,418 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
(* $Id$ *)
(* Abstract syntax tree for regular expressions *)

open Camlp4.PreCast

type converter = [ `Int 
		 | `Float
		 | `Option
		 | `Custom of Ast.expr 
		 | `Value of Ast.expr ]

module S = Set.Make (String)

let list_named_groups set = List.sort String.compare (S.elements set)

module Named_groups = 
struct
  module M = Map.Make (String)
  include M
  let list m =
    List.sort 
      (fun (a, _) (b, _) -> String.compare a b)
      (fold 
	 (fun key data accu -> 
	    let positions = 
	      List.sort
		(fun (loc, i, conv1) (loc, j, conv2) -> 
		   Stdlib.compare i j)
		data in
	    (key, positions) :: accu)
	 m [])

  let list_keys m = 
    List.sort String.compare (fold (fun key data accu -> key :: accu) m [])

  let keys m = fold (fun key data accu -> S.add key accu) m S.empty
  let equal m1 m2 = S.equal (keys m1) (keys m2)
  let inter m1 m2 = S.inter (keys m1) (keys m2)
  let union m1 m2 = S.union (keys m1) (keys m2)
  let diff m1 m2 = S.diff (keys m1) (keys m2)
end

type named_groups = (Ast.loc * int * converter option) list Named_groups.t


let add_new loc name conv group_num set =
  if Named_groups.mem name set then
    Messages.multiple_binding loc [name];
  Named_groups.add name [loc, group_num, conv] set

let add_new_group loc name conv group_num (groups, positions) =
  (add_new loc name conv group_num groups, positions)

let add_new_pos loc name group_num (groups, positions) =
  (groups, add_new loc name None group_num positions)

let merge_lists l1 l2 =
  let tbl = Hashtbl.create (List.length l1 + List.length l2) in
  let add l = 
    List.iter (fun ((_, n, conv) as x) -> Hashtbl.replace tbl n x) l in
  add l1;
  add l2;
  let l = Hashtbl.fold (fun _ x l -> x :: l) tbl [] in
  let cmp (_, x, _) (_, y, _) = compare x y in
  List.sort cmp l

let really_add name l2 set =
  try
    let l1 = Named_groups.find name set in
    Named_groups.add 
      name (merge_lists l1 l2)
      (Named_groups.remove name set)
  with Not_found ->
    Named_groups.add name l2 set

let merge set1 set2 =
  Named_groups.fold really_add set1 set2

type repetition_kind = 
    Star 
  | Option 
  | Plus
  | Range of (int * int option option)

type greediness = bool

type ast =
    Epsilon of Ast.loc
  | Characters of Ast.loc * Charset.t
  | Sequence of Ast.loc * ast * ast
  | Alternative of Ast.loc 
      * ast (* choice 1 *)
      * ast (* choice 2 *)
      * S.t (* group names *) 
      * S.t (* position names *)
  | Repetition of Ast.loc * (repetition_kind * greediness) * ast
  | Possessive of Ast.loc * ast
  | Bind of Ast.loc * ast * string * converter option
  | Bind_pos of Ast.loc * string
  | Backref of Ast.loc * string
  | Variable of Ast.loc * Ast.expr
  | Nocase_variable of Ast.loc * Ast.expr
  | Special of Ast.loc * string * (string * int option)
  | Lookahead of Ast.loc * bool * ast
  | Lookbehind of Ast.loc * bool * ast
  | Closed of ast

let rec loc_of_regexp = function
    Epsilon loc
  | Characters (loc, _)
  | Sequence (loc, _, _)
  | Alternative (loc, _, _, _, _)
  | Repetition (loc, _, _)
  | Possessive (loc, _)
  | Bind (loc, _, _, _)
  | Bind_pos (loc, _)
  | Backref (loc, _)
  | Variable (loc, _)
  | Nocase_variable (loc, _)
  | Special (loc, _, _)
  | Lookahead (loc, _, _)
  | Lookbehind (loc, _, _) -> loc
  | Closed ast -> loc_of_regexp ast

let rec bindings : ast -> S.t = function
    Bind (loc, e, s, conv) -> S.add s (bindings e)
  | Bind_pos _
  | Epsilon _
  | Characters _
  | Backref _
  | Variable _
  | Nocase_variable _
  | Special _ -> S.empty
  | Sequence (loc, e1, e2) -> S.union (bindings e1) (bindings e2)
  | Alternative (loc, e1, e2, set, pos_set) -> set
  | Repetition (loc, kind, e) -> bindings e
  | Possessive (loc, e)
  | Lookahead (loc, _, e)
  | Lookbehind (loc, _, e) -> bindings e
  | Closed e -> S.empty

let rec pos_bindings : ast -> S.t = function
    Bind_pos (loc, s) -> S.singleton s
  | Bind _
  | Epsilon _
  | Characters _
  | Backref _
  | Variable _
  | Nocase_variable _
  | Special _ -> S.empty
  | Sequence (loc, e1, e2) -> S.union (pos_bindings e1) (pos_bindings e2)
  | Alternative (loc, e1, e2, set, pos_set) -> pos_set
  | Repetition (loc, kind, e) -> pos_bindings e
  | Possessive (loc, e)
  | Lookahead (loc, _, e) 
  | Lookbehind (loc, _, e) -> pos_bindings e
  | Closed _ -> S.empty



let alternative loc e1 e2 =
  match e1, e2 with
      Characters (loc1, s1), Characters (loc2, s2) -> 
	Characters (loc, Charset.union s1 s2)
    | _ ->
	let b1 = bindings e1
	and b2 = bindings e2 in
	let pb1 = pos_bindings e1 
	and pb2 = pos_bindings e2 in
	Alternative (loc, e1, e2, S.union b1 b2, S.union pb1 pb2)

let rec repeat loc e (mini, maxoptopt) =
  if mini < 0 then
    Messages.invalid_range loc
  else
    match maxoptopt with
	None -> 
	  (match mini with
	       0 -> Epsilon loc
	     | n -> 
		 let rec loop i =
		   if i > 1 then
		     Sequence (loc, e, loop (i-1))
		   else e in
		 loop n)
      | Some (Some maxi) ->
	  let diff = maxi - mini in
	  if diff < 0 then Messages.invalid_range loc
	  else if diff = 0 then e
	  else 
	    let rec loop i =
	      alternative loc (Epsilon loc) 
		(if i > 1 then
		   (Sequence (loc, e, loop (i-1)))
		 else e) in
	    Sequence (loc, (repeat loc e (mini, None)), loop diff)
      | Some None ->
	  Sequence (loc, repeat loc e (mini, None), 
		    Repetition (loc, (Star, true), e))


let rec nocase = function
    Bind (loc, e, s, conv) -> Bind (loc, nocase e, s, conv)
  | Bind_pos _
  | Epsilon _
  | Backref _
  | Nocase_variable _
  | Special _ as e -> e
  | Characters (loc, charset) -> Characters (loc, Charset.nocase charset)
  | Sequence (loc, e1, e2) -> Sequence (loc, nocase e1, nocase e2)
  | Alternative (loc, e1, e2, ids, pos_ids) -> 
     Alternative (loc, nocase e1, nocase e2, ids, pos_ids)
  | Repetition (loc, kind, e) -> Repetition (loc, kind, nocase e)
  | Possessive (loc, e) -> Possessive (loc, nocase e)
  | Lookahead (loc, b, e) -> Lookahead (loc, b, nocase e)
  | Lookbehind (loc, b, e) -> Lookbehind (loc, b, nocase e)
  | Variable (loc, e) -> Nocase_variable (loc, e)
  | Closed ast -> Closed (nocase ast)


(* Miscellaneous functions *)

let explode s =
  let l = ref [] in
  for i = String.length s - 1 downto 0 do
    l := s.[i] :: !l
  done;
  !l

let of_string loc s =
  let l = explode s in
  match l with
      [c] -> Characters (loc, Charset.singleton c)
    | _ ->
	List.fold_right 
	  (fun c re -> 
	     Sequence (loc, (Characters (loc, Charset.singleton c)), re))
	  l (Epsilon loc)

let as_charset _loc msg = function
    Characters (_loc, set) -> set
  | _ -> Messages.failure _loc msg

let rec warn_bindings w = function
    Bind (loc, e, s, conv) ->
      if w then Messages.not_visible loc [s] "context";
      warn_bindings w e
  | Bind_pos (loc, s) -> if w then Messages.not_visible loc [s] "context"
  | Epsilon _
  | Characters _
  | Backref _
  | Variable _
  | Nocase_variable _
  | Special _ -> ()
  | Sequence (loc, e1, e2) -> warn_bindings w e1; warn_bindings w e2
  | Alternative (loc, e1, e2, set, pos_set) -> 
      if w then
	(match list_named_groups (S.union set pos_set) with
	     [] -> ()
	   | ignored -> Messages.not_visible loc ignored "context")
  | Repetition (loc, kind, e) -> warn_bindings w e
  | Possessive (loc, e)
  | Lookahead (loc, _, e)
  | Lookbehind (loc, _, e) -> warn_bindings w e
  | Closed e -> warn_bindings true e

let warnings re =
  warn_bindings false re