File: xmlscan.mll

package info (click to toggle)
mlglade 0.5-2
  • links: PTS
  • area: main
  • in suites: woody
  • size: 480 kB
  • ctags: 386
  • sloc: ml: 4,519; makefile: 152; sh: 4
file content (197 lines) | stat: -rw-r--r-- 6,652 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
(* -*- indented-text -*- --------------------------------------------


    Copyright (c) 1999 Christian Lindig <lindig@ips.cs.tu-bs.de>. All
    rights reserved. See COPYING for details.
   $Id: xmlscan.mll,v 1.1 2002/03/05 14:23:03 monate Exp $

   ------------------------------------------------------------------ *)

{
  open Xmlparse				(* tokens are defined here *)
  open Error				(* error defined here *)
  open Xmlstate                         (* lexer context state (set/getContext) *)
  open Lc                               (* lexer combinators *)
  
  (* little helpers: shorthands for the [Lexing] accessors.
   * [pos]/[last] give the start/end byte offset of the current lexeme
   * in the input buffer; [get] returns the matched text itself. *)

  let pos   = Lexing.lexeme_start
  let last  = Lexing.lexeme_end
  let get   = Lexing.lexeme
  
  (* Line/column bookkeeping.  [newline] must be called from every
   * lexer action whose pattern consumed a '\n'.  All such patterns in
   * this file END with the newline, so the END of the lexeme is
   * exactly the offset of the first character of the next line. *)

  let line         = ref 1              (* current line number, 1-based *)
  let startOfLine  = ref 0              (* buffer offset where that line starts *)

  (* reset the counters and start scanning in data context *)
  let init () =
    line          := 1;
    startOfLine   := 0;
    setContext DataContext

  (* BUG FIX: the original stored [Lexing.lexeme_start], i.e. the start
   * of the whole lexeme.  For callers such as the data rule's
   * "[^ '<' '\n']* nl" that is the start of the text chunk, not of the
   * new line, so every subsequently reported column was shifted by the
   * chunk length (and by one even for a bare '\n').  [Lexing.lexeme_end]
   * points just past the terminating '\n', i.e. at the first character
   * of the new line, for every caller in this file. *)
  let newline lexbuf =
    line := !line + 1;
    startOfLine := Lexing.lexeme_end lexbuf

  (* actual (line, column) position of the current lexeme; column is 0-based *)
  let position lexbuf = (!line,Lexing.lexeme_start lexbuf - !startOfLine)

  (* Keyword handling: maps the literal names that may occur inside
   * <?xml ... ?> and <!DOCTYPE ... > declarations to their parser
   * tokens.  [keyword] raises [Not_found] for anything else; the
   * [decl] rule catches that and falls back to a plain NAME token. *)

  let keywords = Hashtbl.create 127

  let keyword s = Hashtbl.find keywords s

  let () =
    List.iter
      (fun (name, token) -> Hashtbl.add keywords name token)
      [ "xml",         XMLNAME
      ; "version",     VERSION
      ; "standalone",  STANDALONE
      ; "encoding",    ENCODING
      ; "system",      SYSTEM
      ; "public",      PUBLIC
      ]

  (* One-slot store for the body of the string literal currently being
   * scanned.  [storeStr] REPLACES the previous contents (the string
   * rules reset it with [storeStr ""] before scanning a new literal);
   * [getStr] reads the last stored value back. *)

  let strbuf        = Buffer.create 64
  let storeStr s    = Buffer.clear strbuf; Buffer.add_string strbuf s
  let getStr ()     = Buffer.contents strbuf

  (* Substring extraction from lexemes using the lexer combinators
   * from the [Lc] module.  These cut the element/PI name out of
   * lexemes such as "<name", "</name" or "<?name". *)

  let is_ws c           = c = ' ' || c = '\t' || c = '\r' || c = '\n'
  (* despite its name, [alpha] accepts any NON-whitespace character *)
  let alpha             = satisfy  (fun c -> not (is_ws c))
  (* "<?" followed by a name; [saveStr] records the name *)
  let pi_prefix         = str "<?" *** saveStr (some alpha)
  (* "<" or "</" followed by a name *)
  let elem_prefix       = chr '<'*** opt (chr '/')  *** saveStr (some alpha)
  (* NOTE(review): presumably [Lc.scan] returns the list of substrings
   * captured by [saveStr] — confirm against the Lc module.  Exactly one
   * name is saved by the combinators above, and the callers in the
   * [data] rule only pass lexemes matching these prefixes, so the
   * [assert false] branches should be unreachable. *)
  let pi_extract str    = match scan str pi_prefix with
                          | _,[w]   -> w
                          | _,_     -> assert false
  let elem_extract str  = match scan str elem_prefix with
                          | _,[w]   -> w
                          | _,_     -> assert false
}

(* regexp declarations *)

(* NB: [ws] deliberately excludes '\n' — newlines are matched by
 * separate cases so the actions can call [newline] for the line
 * counter *)
let     ws      = [' ' '\t' '\r']
let     nl      = '\n'
(* rough approximation of an XML Name *)
let     ident   = ['a'-'z' 'A'-'Z' '_' ':'] 
                  ['a'-'z' 'A'-'Z' '0'-'9' '.' '-' '_' ':']*
(* "xml" in any capitalization; NOTE(review): appears unused by the
 * rules visible below — the data rule matches the literal "<?xml" *)
let     xml     = ['x' 'X']['m' 'M']['l' 'L']

(* ------------------------------------------------------------------
   different lexers - the active lexer is controlled by
   the parser!
   ------------------------------------------------------------------ *)

(* inside <element  ... />, i.e. between the tag name and the closing
 * '>' or '/>': returns attribute-level tokens.  Quoted attribute
 * values hand off to the string1/string2 sub-lexers. *)
rule element = parse
      eof			{ EOF }
   
   (* whitespace and newlines are skipped; '\n' also bumps the line counter *)
   | '\n'			{ newline lexbuf;
                                  element lexbuf }
   | ws	+                       { element lexbuf }		

   (* attribute (or tag) name *)
   | ident                      { NAME (get lexbuf) }

   | '='			{ EQ    }
   (* quoted value: reset the string store, then scan the body with the
    * sub-lexer matching the quote character used *)
   | '"'			{ storeStr ""; string1 lexbuf }
   | '\''			{ storeStr ""; string2 lexbuf }
   | '>'			{ CLOSE }   
   | "/>"                       { SLASHCLOSE }

   (* everything else is illegal *)
   | _				{ error "illegal character" }

(* outside of <element ... />, i.e. we scan raw text.  Character data is
 * returned in CHUNK tokens; the various "<..." openers are recognized
 * here.  ocamllex's longest-match rule resolves the overlaps, e.g.
 * "<?xml" vs "<?" ident (ties go to the earlier case). *)
and data = parse
   | eof			{ EOF }
   (* raw text up to the next '<'; the first variant also consumes and
    * counts a terminating newline *)
   | [^ '<' '\n' ] * nl         { newline lexbuf; CHUNK (get lexbuf)    }
   | [^ '<' '\n' ] *            {                 CHUNK (get lexbuf)    }
   | "<?xml" ws+        	{ XMLOPEN                               }
   | "<?xml" nl         	{ newline lexbuf; XMLOPEN               }
   (* start/end tags and PIs: the name is cut out of the lexeme by the
    * Lc-combinator helpers from the header *)
   | '<'  ident 	        { OPEN(elem_extract (get lexbuf))       }
   | "</" ident                 { OPENSLASH(elem_extract (get lexbuf))  }
   | "<?" ident ws+	        { PIOPEN(pi_extract (get lexbuf))       }
   | "<?" ident nl	        { newline lexbuf; 
                                  PIOPEN(pi_extract (get lexbuf))       }
   | "<!--"			{ comment lexbuf                        }
   | "<!DOCTYPE" ws+            { DTDOPEN                               }
   | "<!DOCTYPE" nl             { newline lexbuf; DTDOPEN               }
   
   (* note: the catch-all matches one char, which always beats the
    * zero-length match of the bare "[^ '<' '\n'] *" case *)
   | _				{ error ("illegal character `"
                                         ^ (get lexbuf) ^ "'" ) 
                                }

(* inside <? .. ?>: returns the contents as WORD tokens up to the
 * closing "?>".  NOTE(review): because longest match wins over rule
 * order, "[^ '?' '\n']+" can swallow leading blanks together with the
 * following word, so a WORD may carry embedded whitespace — confirm
 * the parser tolerates that. *)
and pi = parse
   | '\n'			{ newline lexbuf; pi lexbuf }
   | "?>"			{ PICLOSE }
   | ws +                       { pi lexbuf }
   | [^ '?' '\n']+              { WORD(get lexbuf) }
   (* a lone '?' not followed by '>' ("?>"  above wins by longest match) *)
   | '?'	                { WORD(get lexbuf) }

   | eof			{ error "unterminated <?YYY .. ?> "}
   | _				{ error "illegal char in <?YYY .. ?>"}

(* inside <?xml ... ?> (and <!DOCTYPE ... >): identifiers are
 * lower-cased and looked up in the keyword table; anything not in the
 * table comes back as a plain NAME token. *)
and decl = parse
   | '\n'			{ newline lexbuf; 
                                  decl lexbuf }
   | ws +                       { decl lexbuf }
   | "?>"			{ XMLCLOSE }
   | ">"                        { DTDCLOSE }
   | '='                        { EQ }
   | ident                      { let s = get lexbuf in
                                      (* FIX: String.lowercase was
                                       * deprecated in OCaml 4.03 and
                                       * removed in OCaml 5; the _ascii
                                       * variant is equivalent here
                                       * since every keyword is ASCII *)
				      try  keyword (String.lowercase_ascii s) 
                                      with Not_found -> NAME(s)
				}
   | '\''                       { string2 lexbuf }
   | '"'                        { string1 lexbuf }

   | eof			{ error "unterminated declaration "}
   | _				{ error "illegal char declaration "}

(* Scans the body of a double-quoted attribute value; entered after the
 * opening '"' (the caller reset the store with storeStr "").  Returns
 * STRING on the closing quote. *)
and string1 = parse
   (* longest match makes this single case consume the entire body in
    * one lexeme (the next char must be '"', '\n' or eof), so one
    * overwriting storeStr is sufficient *)
   | [^ '"' '\n']+              { storeStr (get lexbuf);
                                  string1 lexbuf
                                }
   | '"'		        { STRING(getStr())              }
   | eof			{ error "unterminated string"   }
   | nl			        { error "newline in string is unsupported"}
   | _				{ error "illegal char in string"}

(* Scans the body of a single-quoted attribute value; mirror image of
 * string1.  Entered after the opening '\'' with a freshly reset store;
 * returns STRING on the closing quote. *)
and string2 = parse
   (* the whole body arrives as a single lexeme (longest match), so the
    * overwriting storeStr is sufficient *)
   | [^ '\'' '\n']+             { storeStr (get lexbuf);
                                  string2 lexbuf 
                                }
   | '\''	                { STRING(getStr())              }
   | eof			{ error "unterminated string"   }
   | nl			        { error "newline in string is unsupported"}
   | _				{ error "illegal char in string"}

(* Skips an XML comment "<!-- ... -->" and returns COMMENT at its end.
 * At "-->" the three-character case beats the single '-' case by
 * longest match; a '-' not starting "-->" is skipped one at a time. *)
and comment = parse   
     '\n'			{ newline lexbuf; comment lexbuf }
   | [^ '-' '\n']+ | '-'	{ comment lexbuf }
   | "-->"			{ COMMENT }
   | eof			{ error "unterminated comment"}
   | _				{ error "illegal char in comment"}

(* From the outside the scanner is always invoked through [scan],
 * which respects the currently active lexer context.
 *)

{
(* Single entry point for the parser: pick the sub-lexer that the
 * current context (maintained in Xmlstate, switched by the parser)
 * calls for, then run it on [buffer]. *)
let scan buffer =
  let lexer =
    match getContext () with
    | DataContext    -> data
    | ElementContext -> element
    | DeclContext    -> decl
    | PiContext      -> pi
  in
  lexer buffer
}