(* -*- indented-text -*- --------------------------------------------
Copyright (c) 1999 Christian Lindig <lindig@ips.cs.tu-bs.de>. All
rights reserved. See COPYING for details.
$Id: xmlscan.mll,v 1.1 2002/03/05 14:23:03 monate Exp $
------------------------------------------------------------------ *)
{
(* Header: OCaml prelude copied verbatim into the generated scanner.
 * Declares helpers shared by the rule actions below. *)
open Xmlparse (* token constructors (NAME, STRING, OPEN, ...) are defined here *)
open Error (* [error] reports a scanner error with a message *)
open Xmlstate (* scanner context: [setContext]/[getContext], DataContext, ... *)
open Lc (* lexer combinators: [satisfy], [str], [chr], [opt], [some], [saveStr], [scan], ( *** ) *)
(* little helpers around the Lexing API *)
let pos = Lexing.lexeme_start
let last = Lexing.lexeme_end
let get = Lexing.lexeme
(* Simple-minded line counting: every rule action that consumes a
 * newline must call [newline] so [line]/[startOfLine] stay accurate.
 *)
let line = ref 1
let startOfLine = ref 0
(* Reset counters and context for a fresh document. *)
let init () =
line := 1;
startOfLine := 0;
setContext DataContext
(* Record a scanned newline: bump the line counter and remember the
 * lexeme start as the reference offset of the new line. *)
let newline lexbuf =
line := !line + 1;
startOfLine := Lexing.lexeme_start lexbuf
(* actual (line, column) position of the current lexeme *)
let position lexbuf = (!line,Lexing.lexeme_start lexbuf - !startOfLine)
(* Keyword lookup for the declaration context. [keyword] raises
 * [Not_found] for non-keywords; the [decl] rule falls back to NAME. *)
let keywords = Hashtbl.create 127
let keyword s = Hashtbl.find keywords s
let _ = Array.iter (fun (x,y) -> Hashtbl.add keywords x y)
[|
("xml", XMLNAME);
("version", VERSION);
("standalone", STANDALONE);
("encoding", ENCODING);
("system", SYSTEM);
("public", PUBLIC);
|]
(* Store for string constants. NOTE(review): [storeStr] overwrites
 * rather than appends; this is safe only because each string body is
 * matched as a single lexeme by the string rules below. *)
let strconst = ref ""
let storeStr s = strconst := s
let getStr () = !strconst
(* Substring extraction from lexemes using the [Lc] combinators:
 * pull the name out of a "<?name", "<name" or "</name" prefix.
 *)
let is_ws c = c = ' ' || c = '\t' || c = '\r' || c = '\n'
let alpha = satisfy (fun c -> not (is_ws c)) (* any non-whitespace character *)
let pi_prefix = str "<?" *** saveStr (some alpha)
let elem_prefix = chr '<'*** opt (chr '/') *** saveStr (some alpha)
(* exactly one saved substring is expected; anything else is a bug *)
let pi_extract str = match scan str pi_prefix with
| _,[w] -> w
| _,_ -> assert false
let elem_extract str = match scan str elem_prefix with
| _,[w] -> w
| _,_ -> assert false
}
(* regexp declarations *)
let ws = [' ' '\t' '\r'] (* intra-line whitespace; '\n' is separate for line counting *)
let nl = '\n'
(* XML-style names: letter/'_'/':' to start, then also digits, '.', '-' *)
let ident = ['a'-'z' 'A'-'Z' '_' ':']
['a'-'z' 'A'-'Z' '0'-'9' '.' '-' '_' ':']*
(* case-insensitive "xml". NOTE(review): defined but not referenced by
 * any rule below — the data rule matches the literal "<?xml" instead. *)
let xml = ['x' 'X']['m' 'M']['l' 'L']
(* ------------------------------------------------------------------
different lexers - the active lexer is controlled by
the parser!
------------------------------------------------------------------ *)
(* inside <element ... /> *)
rule element = parse
(* tokens inside a tag: names, '=', quoted values, tag terminators *)
eof { EOF }
(* whitespace and newlines between attributes are discarded *)
| '\n' { newline lexbuf;
element lexbuf }
| ws + { element lexbuf }
| ident { NAME (get lexbuf) } (* element or attribute name *)
| '=' { EQ }
(* quoted attribute value: reset the string store, then hand off to
 * the matching string rule *)
| '"' { storeStr ""; string1 lexbuf }
| '\'' { storeStr ""; string2 lexbuf }
| '>' { CLOSE }
| "/>" { SLASHCLOSE } (* empty-element tag terminator *)
(* everything else is illegal *)
| _ { error "illegal character" }
(* outside of <element ... />, i.e. we scan raw text *)
and data = parse
| eof { EOF }
(* raw character data up to the next '<'; the newline variant consumes
 * the '\n' so the line counter stays accurate *)
| [^ '<' '\n' ] * nl { newline lexbuf; CHUNK (get lexbuf) }
| [^ '<' '\n' ] * { CHUNK (get lexbuf) }
(* XML declaration opener. NOTE(review): matches lowercase "<?xml"
 * only; the case-insensitive [xml] regexp above is not used here. *)
| "<?xml" ws+ { XMLOPEN }
| "<?xml" nl { newline lexbuf; XMLOPEN }
| '<' ident { OPEN(elem_extract (get lexbuf)) } (* start tag *)
| "</" ident { OPENSLASH(elem_extract (get lexbuf)) } (* end tag *)
(* processing instruction <?target ...; the target name is extracted
 * from the lexeme *)
| "<?" ident ws+ { PIOPEN(pi_extract (get lexbuf)) }
| "<?" ident nl { newline lexbuf;
PIOPEN(pi_extract (get lexbuf)) }
| "<!--" { comment lexbuf }
| "<!DOCTYPE" ws+ { DTDOPEN }
| "<!DOCTYPE" nl { newline lexbuf; DTDOPEN }
| _ { error ("illegal character `"
^ (get lexbuf) ^ "'" )
}
(* inside <? .. ?> *)
and pi = parse
| '\n' { newline lexbuf; pi lexbuf }
| "?>" { PICLOSE }
(* NOTE(review): ws characters also match [^ '?' '\n']+ below, and
 * ocamllex prefers the longest match — so this arm only fires when
 * the whitespace run is immediately followed by '?' or '\n'; WORD
 * tokens may therefore contain embedded whitespace. *)
| ws + { pi lexbuf }
| [^ '?' '\n']+ { WORD(get lexbuf) }
| '?' { WORD(get lexbuf) } (* lone '?' that does not start "?>" *)
| eof { error "unterminated <?YYY .. ?> "}
| _ { error "illegal char in <?YYY .. ?>"}
(* inside <?xml ... ?> *)
and decl = parse
(* inside <?xml ... ?> or <!DOCTYPE ... >: keywords, names, '=',
 * quoted values, and the two closers *)
| '\n' { newline lexbuf;
decl lexbuf }
| ws + { decl lexbuf }
| "?>" { XMLCLOSE }
| ">" { DTDCLOSE }
| '=' { EQ }
| ident { let s = get lexbuf in
(* keywords (version, encoding, ...) match case-insensitively;
 * anything else is a plain NAME. String.lowercase_ascii
 * replaces the deprecated String.lowercase (removed in
 * OCaml 4.06); the keyword table is pure ASCII. *)
try keyword (String.lowercase_ascii s)
with Not_found -> NAME(s)
}
(* Reset the string store before scanning a quoted value, mirroring
 * the element rule. Previously the store was NOT reset here, so an
 * empty quoted value ('' or "") returned the previously stored
 * string constant instead of "". *)
| '\'' { storeStr ""; string2 lexbuf }
| '"' { storeStr ""; string1 lexbuf }
| eof { error "unterminated declaration "}
| _ { error "illegal char declaration "}
and string1 = parse
(* double-quoted string body: everything up to the closing '"' is
 * matched as one lexeme and stored (overwriting the store is safe
 * because the whole body arrives in a single match) *)
| [^ '"' '\n']+ { storeStr (get lexbuf);
string1 lexbuf
}
| '"' { STRING(getStr()) } (* closing quote: emit the stored body *)
| eof { error "unterminated string" }
| nl { error "newline in string is unsupported"}
| _ { error "illegal char in string"}
and string2 = parse
(* single-quoted string body; same scheme as [string1] *)
| [^ '\'' '\n']+ { storeStr (get lexbuf);
string2 lexbuf
}
| '\'' { STRING(getStr()) } (* closing quote: emit the stored body *)
| eof { error "unterminated string" }
| nl { error "newline in string is unsupported"}
| _ { error "illegal char in string"}
and comment = parse
(* skip everything inside <!-- ... -->, still counting newlines *)
'\n' { newline lexbuf; comment lexbuf }
(* a lone '-' is consumed separately so "-->" can still win the
 * longest-match race when it appears *)
| [^ '-' '\n']+ | '-' { comment lexbuf }
| "-->" { COMMENT }
| eof { error "unterminated comment"}
| _ { error "illegal char in comment"}
(* From the outside the scanner is always invoced through [scan]
* which will respect the currently active context
*)
{
(* Trailer: the scanner's single entry point. The parser never calls
 * the rules directly — it sets the context via [Xmlstate] and [scan]
 * dispatches to the sub-lexer for that context. *)
let scan buffer =
  (* select the sub-lexer for the active context, then run it *)
  let lexer =
    match getContext () with
    | DataContext -> data
    | ElementContext -> element
    | DeclContext -> decl
    | PiContext -> pi
  in
  lexer buffer
}