1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
(* $Id$
* ----------------------------------------------------------------------
*
*)
{
  (* Tokens returned by the scanner rules of this file.  Every rule
   * returns only the constructors that are meaningful in its context.
   *)
  type token =
    | Lcomment
        (* <!-- : start of a comment *)
    | Rcomment
        (* --> : end of a comment *)
    | Mcomment
        (* a piece of text within a comment *)
    | Ldoctype
        (* <! : start of a declaration *)
    | Rdoctype
        (* > : end of a declaration *)
    | Mdoctype
        (* a piece of text within a declaration *)
    | Lpi
        (* <? : start of a processing instruction *)
    | Rpi
        (* ?> or > : end of a processing instruction *)
    | Mpi
        (* a piece of text within a processing instruction *)
    | Lelement of string
        (* < followed by a name: start tag; the argument is the name *)
    | Lelementend of string
        (* </ followed by a name: end tag; the argument is the name *)
    | Relement
        (* > : end of a tag *)
    | Relement_empty
        (* />, for XML compat *)
    | Cdata of string
        (* character data outside of tags, as found in the input *)
    | Space of int
        (* whitespace within a tag; the argument is the number of
         * whitespace characters
         *)
    | Name of string
        (* a name within a tag *)
    | Is
        (* the equals sign within a tag *)
    | Literal of string
        (* a value within a tag: either unquoted text as matched, or
         * quoted text without the surrounding quotes
         *)
    | Other
        (* any other character within a tag, including a quote that
         * does not start a well-formed literal
         *)
    | Eof
        (* end of input *)
}
(* Simplified rules: only ASCII is recognized as character set. *)

(* Basic character classes. *)
let letter = ['A'-'Z' 'a'-'z']
let digit = ['0'-'9']
let hexdigit = digit | ['A'-'F' 'a'-'f']

(* Names start with a letter, an underscore or a colon, and continue
 * with name characters.  A name token is any non-empty run of name
 * characters.
 *)
let namechar = letter | digit | ['.' ':' '-' '_']
let name = (letter | ['_' ':']) namechar*
let nmtoken = namechar+

(* Whitespace: space, tab, carriage return, line feed. *)
let ws = [' ' '\t' '\r' '\n']

(* Quoted literals including their delimiters: double-quoted (1) and
 * single-quoted (2).
 *)
let string_literal1 = '"' [^ '"']* '"'
let string_literal2 = '\'' [^ '\'']* '\''

(* Unquoted literals: non-empty runs that contain neither quotes, nor
 * the closing angle bracket, nor whitespace.  Variant 3 also stops at
 * an equals sign, variant 4 accepts it.
 *)
let string_literal3 = [^ '"' '\'' '>' '=' ' ' '\t' '\n' '\r']+
let string_literal4 = [^ '"' '\'' '>' ' ' '\t' '\n' '\r']+
(* The following rules reflect HTML as it is actually used, not the
 * SGML rules.
 *)
(* scan_document: scans ordinary document content.  It recognizes the
 * openers of comments, declarations, processing instructions, start
 * tags and end tags; everything else is returned as Cdata.
 *)
rule scan_document = parse
  | "<!--"
      { Lcomment }
  | "<!"
      { Ldoctype }
  | "<?"
      { Lpi }
  | "<" (name as tag)
      (* Start tag: only the name is passed on, without the bracket. *)
      { Lelement tag }
  | "</" (name as tag)
      (* End tag: only the name is passed on, without the prefix. *)
      { Lelementend tag }
  | "<"
      (* A misplaced "<" that opens none of the constructs above is
       * returned as character data.
       *)
      { Cdata "<" }
  | eof
      { Eof }
  | ([^ '<']+ as text)
      (* The longest run of characters up to the next "<". *)
      { Cdata text }
(* scan_special: like scan_document, but the only markup recognized
 * is an end tag.  Every other "<" and all other text is returned as
 * Cdata.
 *)
and scan_special = parse
  | "</" (name as tag)
      (* End tag: only the name is passed on, without the prefix. *)
      { Lelementend tag }
  | "<"
      { Cdata "<" }
  | eof
      { Eof }
  | ([^ '<']+ as text)
      { Cdata text }
(* scan_comment: scans the inside of a comment.  Returns Rcomment for
 * the terminator, and Mcomment for every other piece of text.
 *)
and scan_comment = parse
  | "-->"
      (* FIXME: There may be any number of ws between -- and > *)
      { Rcomment }
  | ('-' | [^ '-']+)
      (* Either a single hyphen that does not start the terminator, or
       * a run of characters without hyphens.
       *)
      { Mcomment }
  | eof
      { Eof }
(* scan_doctype: scans the inside of a declaration.  The first ">"
 * ends the declaration; occurrences of ">" in strings or within
 * [ ] brackets are not treated specially.
 *)
and scan_doctype = parse
  | '>'
      { Rdoctype }
  | [^ '>']+
      { Mdoctype }
  | eof
      { Eof }
(* scan_pi: scans the inside of a processing instruction.  Both "?>"
 * and a plain ">" end the instruction.
 *)
and scan_pi = parse
  | ("?>" | '>')
      { Rpi }
  | ('?' | [^ '>' '?']+)
      (* Either a question mark that is not followed by ">", or a run
       * of characters containing neither ">" nor a question mark.
       *)
      { Mpi }
  | eof
      { Eof }
(* scan_element: scans the inside of a tag, in a position where no
 * equals sign has just been seen.  Quotes are not interpreted here:
 * a quote character is returned as Other.
 *)
and scan_element = parse
  | '>'
      { Relement }
  | "/>"
      { Relement_empty }
  | (ws+ as blanks)
      { Space (String.length blanks) }
  | (name as ident)
      (* Listed before string_literal3, so that a match of the same
       * length is reported as Name and not as Literal.
       *)
      { Name ident }
  | '='
      { Is }
  | ('"' | '\'')
      { Other }
  | (string_literal3 as lit)
      { Literal lit }
  | eof
      { Eof }
  | _
      { Other }
(* scan_element_after_Is: scans the inside of a tag in the position
 * after an equals sign, i.e. where a value is expected.
 *
 * Returns:
 * - Relement or Relement_empty when the tag ends;
 * - Space n for a run of n whitespace characters;
 * - Literal s for a value.  For quoted values s is the text between
 *   the quotes; for unquoted values s is the matched text, which may
 *   contain an equals sign (string_literal4);
 * - Other for a quote without a matching closing quote, and for any
 *   other character;
 * - Eof at the end of the input.
 *
 * Only Failure is translated into Other.  This is the exception a
 * generated lexer raises when no clause of a rule matches, which is
 * what happens in scan_string_literal1/2 when the closing quote is
 * missing.  Any other exception (e.g. one raised while refilling the
 * lexing buffer) is no longer swallowed and reaches the caller.
 *)
and scan_element_after_Is = parse
  | ">"
      { Relement }
  | "/>"
      { Relement_empty }
  | ws+
      { Space (String.length (Lexing.lexeme lexbuf)) }
  | '"'
      (* The opening double quote is consumed here; the rest of the
       * literal is read by scan_string_literal1.
       *)
      { try
          Literal (scan_string_literal1 lexbuf)
        with
          | Failure _ -> Other
      }
  | "'"
      (* Same for single-quoted literals. *)
      { try
          Literal (scan_string_literal2 lexbuf)
        with
          | Failure _ -> Other
      }
  | string_literal4
      { Literal (Lexing.lexeme lexbuf) }
  | eof
      { Eof }
  | _
      { Other }
(* scan_string_literal1: reads the remainder of a double-quoted
 * literal.  scan_element_after_Is calls it after the opening quote
 * has been consumed.  Returns the text before the closing quote and
 * consumes that quote.  There is no clause for a missing closing
 * quote, so the rule does not match in that case.
 *)
and scan_string_literal1 = parse
  | ([^ '"']* as body) '"'
      { body }
(* scan_string_literal2: reads the remainder of a single-quoted
 * literal.  scan_element_after_Is calls it after the opening quote
 * has been consumed.  Returns the text before the closing quote and
 * consumes that quote.  There is no clause for a missing closing
 * quote, so the rule does not match in that case.
 *)
and scan_string_literal2 = parse
  | ([^ '\'']* as body) '\''
      { body }
|