File: pa_text_parse.ml

package info (click to toggle)
ocaml-text 0.3-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 440 kB
  • ctags: 496
  • sloc: ml: 2,024; ansic: 203; makefile: 63
file content (223 lines) | stat: -rw-r--r-- 7,493 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
(*
 * pa_text_parse.ml
 * ----------------
 * Copyright : (c) 2010, Jeremie Dimino <jeremie@dimino.org>
 * Licence   : BSD3
 *
 * This file is a part of ocaml-text.
 *)

open Camlp4.PreCast
open Syntax
open Pa_text_types

(* +-----------------------------------------------------------------+
   | Types                                                           |
   +-----------------------------------------------------------------+ *)

type converter =
  | Constant of Ast.expr
  | Function of Ast.expr
  | Position
  | Identity

type charset_atom =
  | Ca_variable of Loc.t * string * bool
  | Ca_range of Loc.t * Text.t * Text.t
  | Ca_literal of Loc.t * Text.t

type charset = charset_atom list

type parse_tree =
  | Literal of Loc.t * Text.t
  | Repeat of Loc.t * parse_tree * int * int option * greediness
  | Concat of Loc.t * parse_tree * parse_tree
  | Alternative of Loc.t * parse_tree * parse_tree
  | Bind of Loc.t * parse_tree * string * converter
  | Charset of Loc.t * charset * bool
  | Meta of Loc.t * Text.t * Text.t option
  | Variable of Loc.t * string * bool
  | Backward_reference of Loc.t * string
  | Mode of Loc.t * mode * bool
  | Look of Loc.t * direction * parse_tree * bool
  | Group of Loc.t * parse_tree
  | Condition of Loc.t * string * parse_tree * parse_tree option

(* +-----------------------------------------------------------------+
   | Grammar of regular expression                                   |
   +-----------------------------------------------------------------+ *)

let regexp_eoi = Gram.Entry.mk "regexp_eoi"

EXTEND Gram
  GLOBAL: regexp_eoi;

  utf8_string:
    [ [ s = STRING ->
          match Text.check s with
            | Some error ->
                Loc.raise _loc (Failure("invalid UTF-8 string: " ^ error))
            | None ->
                s
      ] ];

  range:
    [ [ a = INT ->
          let a = int_of_string a in
          if a < 0 then
            Loc.raise _loc (Failure "range bounds must be positive number")
          else
            (a, Some a)
      | a = INT; "-"; b = INT ->
          let a = int_of_string a and b = int_of_string b in
          if a < 0 || b < a then
            Loc.raise _loc (Failure "invalid range bounds")
          else
            (a, Some b)
      | a = INT; "+" ->
          let a = int_of_string a in
          if a < 0 then
            Loc.raise _loc (Failure "range bounds must be positive number")
          else
            (a, None)
      ] ];

  state:
    [ [ "!" -> false | -> true ] ];

  charset_atom:
    [ [ a = utf8_string; ["-" | ".."]; b = utf8_string ->
          if Text.length a <> 1 || Text.length b <> 1 then
            Loc.raise _loc (Failure("UTF-8 string literals in charset range must contain only one unicode character"))
          else if Text.code a < Text.code b then
            Ca_range(_loc, a, b)
          else
            Loc.raise _loc (Failure "invalid charset: the upper limit must be greater than the lower limit")
      | s = utf8_string ->
          Ca_literal(_loc, s)
      | st = state; id = LIDENT ->
          Ca_variable(_loc, id, st)
      | st = state; id = UIDENT ->
          Ca_variable(_loc, id, st)
      ] ];

  charset:
    [ [ l = LIST0 charset_atom -> l ] ];

  mode:
    [ [ mode = LIDENT ->
          match mode with
            | "i" | "caseless" -> Caseless
            | "m" | "multiline" -> Multiline
            | "s" | "singleline" | "dotall" -> Dot_all
            | _ -> Loc.raise _loc (Failure(Printf.sprintf "invalid mode: '%s'" mode))
      ] ];

  regexp:
    [ [ r = SELF; "as"; i = LIDENT;
        conv =
          OPT [ ":"; s = LIDENT -> Function <:expr< $lid: s ^ "_of_string"$ >>
              | ":="; e = expr -> Function e
              | "="; e = expr -> Constant e ] ->
            Bind(_loc, r, i, match conv with Some c -> c | None -> Identity)
      | r1 = SELF; "|"; r2 = SELF -> Alternative(_loc, r1, r2)
      | r1 = SELF; r2 = SELF -> Concat(_loc, r1, r2) ]

    | "postop" NONA
        [ r = SELF; "*" -> Repeat(_loc, r, 0, None, Greedy)
        | r = SELF; "+" -> Repeat(_loc, r, 1, None, Greedy)
        | r = SELF; "?" -> Repeat(_loc, r, 0, Some 1, Greedy)
        | r = SELF; "{"; (a, b) = range; "}" -> Repeat(_loc, r, a, b, Greedy)
        | r = SELF; "*?" -> Repeat(_loc, r, 0, None, Lazy)
        | r = SELF; "+?" -> Repeat(_loc, r, 1, None, Lazy)
        | r = SELF; "??" -> Repeat(_loc, r, 0, Some 1, Lazy)
        | r = SELF; "{"; (a, b) = range; "}"; "?" -> Repeat(_loc, r, a, b, Lazy)
        | r = SELF; "*+" -> Repeat(_loc, r, 0, None, Possessive)
        | r = SELF; "++" -> Repeat(_loc, r, 1, None, Possessive)
        | r = SELF; "?+" -> Repeat(_loc, r, 0, Some 1, Possessive)
        | r = SELF; "{"; (a, b) = range; "}"; "+" -> Repeat(_loc, r, a, b, Possessive) ]

    | "preop" NONA
        [ "\\"; id = LIDENT -> Backward_reference (_loc, id) ]

    | "simple" NONA
        [ "["; cs = charset; "]" ->
            Charset(_loc, cs, true)
        | "[^"; cs = charset; "]" ->
            Charset(_loc, cs, false)
        | s = utf8_string ->
            Literal(_loc, s)
        | "_" ->
            Meta(_loc, ".", None)
        | st = state; i = LIDENT ->
            Variable(_loc, i, st)
        | st = state; i = UIDENT ->
            Variable(_loc, i, st)
        | "^" ->
            Meta(_loc, "^", None)
        | "$" ->
            Meta(_loc, "$", None)
        | "&+"; mode = mode ->
            Mode(_loc, mode, true)
        | "&-"; mode = mode ->
            Mode(_loc, mode, false)
        | "@"; name = LIDENT ->
            Bind(_loc, Literal(_loc, ""), name, Position)
        | "("; r = SELF; ")" ->
            Group(_loc, r)
        | "<"; r = SELF ->
            Look(_loc, Behind, r, true)
        | "<!"; r = SELF ->
            Look(_loc, Behind, r, false)
        | ">"; r = SELF ->
            Look(_loc, Ahead, r, true)
        | ">!"; r = SELF ->
            Look(_loc, Ahead, r, false)
        | "if"; id = LIDENT; "then"; r_then = SELF; r_else = maybe_else ->
            Condition(_loc, id, r_then, r_else)
        ] ];

  maybe_else:
    [ [ "else"; r = regexp -> Some r
      | -> None ] ];

  regexp_eoi:
    [ [ re = regexp; `EOI -> re ] ];
END

(* +-----------------------------------------------------------------+
   | Manipulation                                                    |
   +-----------------------------------------------------------------+ *)

let collect_regexp_bindings ast =
  let rec loop n acc = function
    | Literal _ | Variable _ | Charset _ | Meta _ | Backward_reference _ | Mode _ ->
        (n, acc)
    | Group(_, r) ->
        loop n acc r
    | Look(_, _, r, _) ->
        loop n acc r
    | Repeat(_, r, _, _, _) ->
        loop n acc r
    | Concat(_, r1, r2) ->
        let n, acc = loop n acc r1 in
        loop n acc r2
    | Alternative(_, r1, r2) ->
        let n, acc = loop n acc r1 in
        loop n acc r2
    | Bind(_loc, r, id, conv) ->
        loop (n + 1) ((_loc, id, n, conv) :: acc) r
    | Condition(_, _, r_then, None) ->
        loop n acc r_then
    | Condition(_, _, r_then, Some r_else) ->
        let n, acc = loop n acc r_then in
        loop n acc r_else
  in
  snd (loop 1 [] ast)

(* +-----------------------------------------------------------------+
   | Parsing                                                         |
   +-----------------------------------------------------------------+ *)

let parse loc contents =
  Gram.parse_string regexp_eoi loc contents