File: plexer.ml

package info (click to toggle)
camlp4 2.04-3
  • links: PTS
  • area: main
  • in suites: potato
  • size: 1,576 kB
  • ctags: 3,108
  • sloc: ml: 26,444; makefile: 736; sh: 203
file content (516 lines) | stat: -rw-r--r-- 17,232 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
(* camlp4r *)
(***********************************************************************)
(*                                                                     *)
(*                             Camlp4                                  *)
(*                                                                     *)
(*        Daniel de Rauglaudre, projet Cristal, INRIA Rocquencourt     *)
(*                                                                     *)
(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
(*  Automatique.  Distributed only by permission.                      *)
(*                                                                     *)
(***********************************************************************)

(* $Id: plexer.ml,v 2.1 1998/12/04 18:29:12 ddr Exp $ *)

open Stdpp;
open Token;

(* The string buffering machinery *)

value buff = ref (String.create 80);
value store len x =
  do if len >= String.length buff.val then
       buff.val := buff.val ^ String.create (String.length buff.val)
     else ();
     buff.val.[len] := x;
  return succ len
;
value mstore len s =
  add_rec len 0 where rec add_rec len i =
    if i == String.length s then len else add_rec (store len s.[i]) (succ i)
;
value get_buff len = String.sub buff.val 0 len;

(* The lexer *)

value valch x = Char.code x - Char.code '0';

value rec ident len =
  parser
  [ [: `('A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246'
      | '\248'..'\255' | '0'..'9' | '_' | ''' as c); s :] ->
      ident (store len c) s
  | [: :] -> len ]
and ident2 len =
  parser
  [ [: `('!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
         '%' | '.' | ':' | '<' | '>' | '|' as
         c);
       s :] ->
      ident2 (store len c) s
  | [: :] -> len ]
and ident3 len =
  parser
  [ [: `('0'..'9' | 'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246'
      | '\248'..'\255' | '_' | '!' | '%' | '&' | '*' | '+' |
         '-' | '.' | '/' | ':' | '<' | '=' | '>' | '?' | '@' | '^' | '|' |
         '~' | ''' | '$' as
         c);
       s :] ->
      ident3 (store len c) s
  | [: :] -> len ]
and base_number len =
  parser
  [ [: `'o' | 'O'; s :] -> octal_digits (store len 'o') s
  | [: `'x' | 'X'; s :] -> hexa_digits (store len 'x') s
  | [: `'b' | 'B'; s :] -> binary_digits (store len 'b') s
  | [: a = number len :] -> a ]
and octal_digits len =
  parser
  [ [: `('0'..'7' as d); s :] -> octal_digits (store len d) s
  | [: :] -> ("INT", get_buff len) ]
and hexa_digits len =
  parser
  [ [: `('0'..'9' | 'a'..'f' | 'A'..'F' as d); s :] ->
      hexa_digits (store len d) s
  | [: :] -> ("INT", get_buff len) ]
and binary_digits len =
  parser
  [ [: `('0'..'1' as d); s :] -> binary_digits (store len d) s
  | [: :] -> ("INT", get_buff len) ]
and number len =
  parser
  [ [: `('0'..'9' as c); s :] -> number (store len c) s
  | [: `'.'; s :] -> decimal_part (store len '.') s
  | [: `'e' | 'E'; s :] -> exponent_part (store len 'E') s
  | [: :] -> ("INT", get_buff len) ]
and decimal_part len =
  parser
  [ [: `('0'..'9' as c); s :] -> decimal_part (store len c) s
  | [: `'e' | 'E'; s :] -> exponent_part (store len 'E') s
  | [: :] -> ("FLOAT", get_buff len) ]
and exponent_part len =
  parser
  [ [: `('+' | '-' as c); s :] -> end_exponent_part (store len c) s
  | [: a = end_exponent_part len :] -> a ]
and end_exponent_part len =
  parser
  [ [: `('0'..'9' as c); s :] -> end_exponent_part (store len c) s
  | [: :] -> ("FLOAT", get_buff len) ]
;

value rec escape =
  parser
  [ [: `'n' :] -> '\n'
  | [: `'r' :] -> '\r'
  | [: `'t' :] -> '\t'
  | [: `'b' :] -> '\b'
  | [: `'\\' :] -> '\\'
  | [: `('0'..'9' as c); cod = escape1 (valch c) :] -> Char.chr cod ]
and escape1 cod =
  parser
  [ [: `('0'..'9' as c); a = escape2 (10 * cod + valch c) :] -> a
  | [: :] -> cod ]
and escape2 cod =
  parser [ [: `('0'..'9' as c) :] -> 10 * cod + valch c | [: :] -> cod ]
;

value escape_string =
  parser
  [ [: c = escape :] -> c
  | [: `'"' :] -> '"' ]
;
value escape_char =
  parser
  [ [: c = escape :] -> c
  | [: `''' :] -> ''' ]
;

value rec skip_spaces =
  parser
  [ [: `' ' | '\n' | '\r' | '\t' | '\026' | '\012'; s :] -> skip_spaces s
  | [: :] -> () ]
;

value rec skip_indent =
  parser
  [ [: `' ' | '\t'; s :] -> skip_indent s
  | [: :] -> () ]
;

value skip_opt_newline =
  parser
  [ [: `'\n' :] -> ()
  | [: :] -> () ]
;

value error_on_unknown_keywords = ref False;

value next_token_fun find_id_kwd find_spe_kwd =
  let err bp ep msg = raise_with_loc (bp, ep) (Token.Error msg) in
  let keyword_or_error (bp, ep) s =
    try ("", find_spe_kwd s) with
    [ Not_found ->
        if error_on_unknown_keywords.val then
          err bp ep ("illegal token: " ^ s)
        else ("", s) ]
  in
  let rec next_token =
    parser bp
    [ [: `('A'..'Z' | '\192'..'\214' | '\216'..'\222' as c); s :] ->
        let id = get_buff (ident (store 0 c) s) in
        try ("", find_id_kwd id) with [ Not_found -> ("UIDENT", id) ]
    | [: `('a'..'z' | '\223'..'\246' | '\248'..'\255' | '_' as c); s :] ->
        let id = get_buff (ident (store 0 c) s) in
        try ("", find_id_kwd id) with [ Not_found -> ("LIDENT", id) ]
    | [: `('1'..'9' as c); s :] -> number (store 0 c) s
    | [: `'0'; s :] -> base_number (store 0 '0') s
    | [: `'''; s :] ep ->
        match Stream.npeek 2 s with
        [ [_; '''] | ['\\'; _] -> ("CHAR", String.make 1 (char bp s))
        | _ -> keyword_or_error (bp, ep) "'" ]
    | [: `'"'; s :] -> ("STRING", string bp 0 s)
    | [: `'$'; s :] -> locate_or_antiquot bp 0 s
    | [: `('!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
           '%' as
           c);
         s :] ->
        let id = get_buff (ident2 (store 0 c) s) in
        keyword_or_error (bp, Stream.count s) id
    | [: `'<'; s :] -> less bp s
    | [: `(':' as c1);
         len =
           parser
           [ [: `(']' | ':' | '=' | '>' as c2) :] -> store (store 0 c1) c2
           | [: :] -> store 0 c1 ] :] ep ->
        let id = get_buff len in keyword_or_error (bp, ep) id
    | [: `('>' | '|' as c1);
         len =
           parser
           [ [: `(']' | '}' as c2) :] -> store (store 0 c1) c2
           | [: a = ident2 (store 0 c1) :] -> a ] :] ep ->
        let id = get_buff len in keyword_or_error (bp, ep) id
    | [: `('[' | '{' as c1); s :] ->
        let len =
          match Stream.npeek 2 s with
          [ ['<'; '<' | ':'] -> store 0 c1
          | _ ->
              match s with parser
              [ [: `('|' | '<' | ':' as c2) :] -> store (store 0 c1) c2
              | [: :] -> store 0 c1 ] ]
        in
        let ep = Stream.count s in
        let id = get_buff len in keyword_or_error (bp, ep) id
    | [: `'.'; id = parser [ [: `'.' :] -> ".." | [: :] -> "." ] :] ep ->
        keyword_or_error (bp, ep) id
    | [: `';'; id = parser [ [: `';' :] -> ";;" | [: :] -> ";" ] :] ep ->
        keyword_or_error (bp, ep) id
    | [: `'\\'; s :] -> ("LIDENT", get_buff (ident3 0 s))
    | [: `c :] ep -> keyword_or_error (bp, ep) (String.make 1 c) ]
  and less bp =
    parser
    [ [: `'<'; s :] -> ("QUOTATION", (":" ^ get_buff (quotation bp 0 s)))
    | [: `':'; i = parser [: len = ident 0 :] -> get_buff len;
         `'<' ? "character '<' expected"; s :] ->
        ("QUOTATION", i ^ ":" ^ get_buff (quotation bp 0 s))
    | [: s :] ep ->
        let id = get_buff (ident2 (store 0 '<') s) in
        keyword_or_error (bp, ep) id ]
  and string bp len =
    parser
    [ [: `'"' :] -> get_buff len
    | [: `'\\';
         len =
           parser
           [ [: `'\n'; _ = skip_indent :] -> len
           | [: `'\r'; _ = skip_opt_newline; _ = skip_indent :] -> len
           | [: c = escape_string :] -> store len c
           | [: `c :] -> store (store len '\\') c ];
         s :] ->
        string bp len s
    | [: `c; s :] -> string bp (store len c) s
    | [: :] ep -> err bp ep "string not terminated" ]
  and char bp =
    parser
    [ [: `'\\'; c = escape_char ? "escape character expected";
         `''' ? "quote expected" :] -> c
    | [: `c; `''' ? "quote expected" :] -> c
    | [: :] ep -> err bp ep "char not terminated" ]
  and locate_or_antiquot bp len =
    parser
    [ [: `'$' :] -> ("ANTIQUOT", ":" ^ get_buff len)
    | [: `('a'..'z' | 'A'..'Z' as c); s :] -> antiquot bp (store len c) s
    | [: `('0'..'9' as c); s :] -> maybe_locate bp (store len c) s
    | [: `':'; s :] ->
        let k = get_buff len in
        ("ANTIQUOT", k ^ ":" ^ locate_or_antiquot_rest bp 0 s)
    | [: `'\\'; `c; s :] ->
        ("ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s)
    | [: `c; s :] ->
        ("ANTIQUOT", ":" ^  locate_or_antiquot_rest bp (store len c) s)
    | [: :] ep -> err bp ep "antiquotation not terminated" ]
  and maybe_locate bp len =
    parser
    [ [: `'$' :] -> ("ANTIQUOT", ":" ^ get_buff len)
    | [: `('0'..'9' as c); s :] -> maybe_locate bp (store len c) s
    | [: `':'; s :] ->
        ("LOCATE", get_buff len ^ ":" ^ locate_or_antiquot_rest bp 0 s)
    | [: `'\\'; `c; s :] ->
        ("ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s)
    | [: `c; s :] ->
        ("ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s)
    | [: :] ep -> err bp ep "antiquotation not terminated" ]
  and antiquot bp len =
    parser
    [ [: `'$' :] -> ("ANTIQUOT", ":" ^ get_buff len)
    | [: `('a'..'z' | 'A'..'Z' | '0'..'9' as c); s :] ->
        antiquot bp (store len c) s
    | [: `':'; s :] ->
        let k = get_buff len in
        ("ANTIQUOT", k ^ ":" ^ locate_or_antiquot_rest bp 0 s)
    | [: `'\\'; `c; s :] ->
        ("ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s)
    | [: `c; s :] ->
        ("ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s)
    | [: :] ep -> err bp ep "antiquotation not terminated" ]
  and locate_or_antiquot_rest bp len =
    parser
    [ [: `'$' :] -> get_buff len
    | [: `'\\'; `c; s :] -> locate_or_antiquot_rest bp (store len c) s
    | [: `c; s :] -> locate_or_antiquot_rest bp (store len c) s
    | [: :] ep -> err bp ep "antiquotation not terminated" ]
  and quotation bp len =
    parser
    [ [: `'>'; s :] -> maybe_end_quotation bp len s
    | [: `'<'; s :] ->
        quotation bp (maybe_nested_quotation bp (store len '<') s) s
    | [: `'\\';
         len =
           parser
           [ [: `('>' | '<' | '\\' as c) :] -> store len c
           | [: :] -> store len '\\' ];
         s :] ->
        quotation bp len s
    | [: `c; s :] -> quotation bp (store len c) s
    | [: :] ep -> err bp ep "quotation not terminated" ]
  and maybe_nested_quotation bp len =
    parser
    [ [: `'<'; s :] -> mstore (quotation bp (store len '<') s) ">>"
    | [: `':'; len = ident (store len ':');
         a =
           parser
           [ [: `'<'; s :] -> mstore (quotation bp (store len '<') s) ">>"
           | [: :] -> len ] :] ->
        a
    | [: :] -> len ]
  and maybe_end_quotation bp len =
    parser [ [: `'>' :] -> len | [: a = quotation bp (store len '>') :] -> a ]
  in
  let rec next_token_loc =
    parser bp
    [ [: `' ' | '\n' | '\r' | '\t' | '\026' | '\012'; s :] -> next_token_loc s
    | [: `'('; s :] -> maybe_comment bp s
    | [: `'#'; _ = spaces_tabs; t = linenum bp :] -> t
    | [: tok = next_token :] ep -> (tok, (bp, ep))
    | [: _ = Stream.empty :] -> (("EOI", ""), (bp, succ bp)) ]
  and maybe_comment bp =
    parser
    [ [: `'*'; s :] -> do comment bp s; return next_token_loc s
    | [: :] ep -> let tok = keyword_or_error (bp, ep) "(" in (tok, (bp, ep)) ]
  and comment bp =
    parser
    [ [: `'('; s :] -> maybe_nested_comment bp s
    | [: `'*'; s :] -> maybe_end_comment bp s
    | [: `c; s :] -> comment bp s
    | [: :] ep -> err bp ep "comment not terminated" ]
  and maybe_nested_comment bp =
    parser
    [ [: `'*'; s :] -> do comment bp s; return comment bp s
    | [: a = comment bp :] -> a ]
  and maybe_end_comment bp =
    parser [ [: `')' :] -> () | [: a = comment bp :] -> a ]
  and linenum bp =
    parser
    [ [: `'0'..'9'; _ = digits; _ = spaces_tabs; `'"'; _ = any_to_nl; s :] ->
        next_token_loc s
    | [: :] -> (keyword_or_error (bp, bp + 1) "#", (bp, bp + 1)) ]
  and spaces_tabs =
    parser
    [ [: `' ' | '\t'; s :] -> spaces_tabs s
    | [: :] -> () ]
  and digits =
    parser
    [ [: `'0'..'9'; s :] -> digits s
    | [: :] -> () ]
  and any_to_nl =
    parser
    [ [: `'\r' | '\n' :] -> ()
    | [: `_; s :] -> any_to_nl s
    | [: :] -> () ]
  in
  fun cstrm ->
    try next_token_loc cstrm with
    [ Stream.Error str ->
        err (Stream.count cstrm) (Stream.count cstrm + 1) str ]
;

value locerr () = invalid_arg "Lexer: location function";
value loct_create () = ref (Array.create 1024 None);
value loct_func loct i =
  match
    if i < 0 || i >= Array.length loct.val then None
    else Array.unsafe_get loct.val i
  with
  [ Some loc -> loc
  | _ -> locerr () ]
;
value loct_add loct i loc =
  do if i >= Array.length loct.val then
       let new_tmax = Array.length loct.val * 2 in
       let new_loct = Array.create new_tmax None in
       do Array.blit loct.val 0 new_loct 0 (Array.length loct.val);
          loct.val := new_loct;
       return ()
     else ();
     loct.val.(i) := Some loc;
  return ()
;

value func kwd_table =
  let find = Hashtbl.find kwd_table in
  let lex cstrm =
    let next_token_loc = next_token_fun find find in
    let loct = loct_create () in
    let ts =
      Stream.from
        (fun i ->
           let (tok, loc) = next_token_loc cstrm in
           do loct_add loct i loc; return Some tok)
    in
    let locf = loct_func loct in
    (ts, locf)
  in
  lex
;

value rec check_keyword_stream =
  parser [: _ = check; _ = Stream.empty :] -> True
and check =
  parser
  [ [: `'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246'
       | '\248'..'\255'; s :] -> check_ident s
  | [: `'!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
        '%' | '.';
       s :] ->
      check_ident2 s
  | [: `'<'; s :] ->
      match Stream.npeek 1 s with
      [ [':' | '<']  -> ()
      | _ -> check_ident2 s ]
  | [: `':';
       _ =
         parser [ [: `']' | ':' | '=' | '>' :] -> () | [: :] -> () ] :] ep ->
      ()
  | [: `'>' | '|';
       _ =
         parser [ [: `']' | '}' :] -> () | [: a = check_ident2 :] -> a ] :] ->
      ()
  | [: `'[' | '{'; s :] ->
      match Stream.npeek 2 s with
      [ ['<'; '<' | ':'] -> ()
      | _ ->
          match s with parser [ [: `'|' | '<' | ':' :] -> () | [: :] -> () ] ]
  | [: `';'; _ = parser [ [: `';' :] -> () | [: :] -> () ] :] -> ()
  | [: `_ :] -> () ]
and check_ident =
  parser
  [ [: `'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
        '\248'..'\255' | '0'..'9' | '_' | '''; s :] -> check_ident s
  | [: :] -> () ]
and check_ident2 =
  parser
  [ [: `'!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
        '%' | '.' | ':' | '<' | '>' | '|';
       s :] ->
      check_ident2 s
  | [: :] -> () ]
;

value check_keyword s =
  try check_keyword_stream (Stream.of_string s) with _ -> False
;

value using_token kwd_table (p_con, p_prm) =
  match p_con with
  [ "" ->
      try
        let _ = Hashtbl.find kwd_table p_prm in ()
      with
      [ Not_found ->
          if check_keyword p_prm then Hashtbl.add kwd_table p_prm p_prm
          else
            raise (Token.Error ("\
the token \"" ^ p_prm ^ "\" does not respect Plexer rules")) ]
  | "LIDENT" | "UIDENT" | "INT"| "FLOAT" | "CHAR" | "STRING" | "QUOTATION"
  | "ANTIQUOT" | "LOCATE" | "EOI" -> ()
  | _ ->
      raise (Token.Error ("\
the constructor \"" ^ p_con ^ "\" is not recognized by Plexer")) ]
;

value removing_token kwd_table (p_con, p_prm) =
  if p_con = "" then Hashtbl.remove kwd_table p_prm else ()
;

value text =
  fun
  [ ("", t) -> "'" ^ t ^ "'"
  | ("LIDENT", "") -> "lowercase identifier"
  | ("LIDENT", t) -> "'" ^ t ^ "'"
  | ("UIDENT", "") -> "uppercase identifier"
  | ("UIDENT", t) -> "'" ^ t ^ "'"
  | ("INT", "") -> "integer"
  | ("INT", s) -> "'" ^ s ^ "'"
  | ("FLOAT", "") -> "float"
  | ("STRING", "") -> "string"
  | ("CHAR", "") -> "char"
  | ("QUOTATION", "") -> "quotation"
  | ("ANTIQUOT", k) -> "antiquot \"" ^ k ^ "\""
  | ("LOCATE", "") -> "locate"
  | ("EOI", "") -> "end of input"
  | (con, "") -> con
  | (con, prm) -> con ^ " \"" ^ prm ^ "\"" ]
;

value eq_before_colon p e =
  loop 0 where rec loop i =
    if i == String.length e then
      failwith "Internal error in Plexer: incorrect ANTIQUOT"
    else if i == String.length p then e.[i] == ':'
    else if p.[i] == e.[i] then loop (i + 1)
    else False
;

value after_colon e =
  try
    let i = String.index e ':' in
    String.sub e (i + 1) (String.length e - i - 1)
  with [ Not_found -> "" ]
;

value tparse =
  fun
  [ ("ANTIQUOT", p_prm) ->
      parser [: `("ANTIQUOT", prm) when eq_before_colon p_prm prm :] ->
        after_colon prm
  | (p_con, "") ->
      parser [: `(con, prm) when con = p_con :] -> prm
  | (p_con, p_prm) ->
      parser [: `(con, prm) when con = p_con && prm = p_prm :] -> prm ]
;

value make () =
  let kwd_table = Hashtbl.create 301 in
  {func = func kwd_table; using = using_token kwd_table;
   removing = removing_token kwd_table; tparse = tparse; text = text}
;