File: nethtml_scanner.mll

package info (click to toggle)
ocamlnet 4.1.2-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 51,764 kB
  • ctags: 16,446
  • sloc: ml: 148,419; ansic: 10,989; sh: 1,885; makefile: 1,355
file content (168 lines) | stat: -rw-r--r-- 3,442 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
(* $Id$
 * ----------------------------------------------------------------------
 *
 *)

{
  (* The tokens returned by the scanner rules of this file.  The comment
   * of each constructor names the input that produces it.
   *)
  type token =
      Lcomment  (* <!-- : returned by scan_document *)
    | Rcomment  (* --> : returned by scan_comment *)
    | Mcomment  (* a piece of input within a comment *)
    | Ldoctype  (* <! : returned by scan_document *)
    | Rdoctype  (* > : returned by scan_doctype *)
    | Mdoctype  (* a piece of input within a declaration *)
    | Lpi       (* <? : returned by scan_document *)
    | Rpi       (* ?> or > : returned by scan_pi *)
    | Mpi       (* a piece of input within a processing instruction *)
    | Lelement of string      (* < followed by a name; carries the name only *)
    | Lelementend of string   (* </ followed by a name; carries the name only *)
    | Relement  (* > : returned by the scan_element rules *)
    | Relement_empty   (* />, for XML compat *)
    | Cdata of string  (* a run of input without <, or a single misplaced < *)
    | Space of int     (* a run of whitespace; carries the number of characters *)
    | Name of string   (* a name as matched by scan_element *)
    | Is               (* = : returned by scan_element *)
    | Literal of string  (* an unquoted run of characters, or the contents of
                          * a quoted string without the surrounding quotes *)
    | Other            (* input of the scan_element rules that fits no other
                        * token, including a quote without a closing quote *)
    | Eof              (* end of input *)
}

(* Simplified rules: Only ASCII is recognized as character set *)

(* Character classes *)
let letter = ['a'-'z' 'A'-'Z']
let digit = ['0'-'9']
let hexdigit = digit | ['a'-'f' 'A'-'F']
let ws = [' ' '\t' '\n' '\r']

(* Names: start with a letter, underscore or colon, and continue with
 * name characters.  An nmtoken may start with any name character.
 *)
let namechar = letter | digit | ['.' ':' '-' '_']
let name = (letter | ['_' ':']) namechar*
let nmtoken = namechar+

(* Literals: 1 and 2 are complete quoted strings (double and single
 * quotes); 3 and 4 are unquoted runs that stop at quotes, at the closing
 * angle bracket and at whitespace.  Variant 3 additionally stops at the
 * equals sign, variant 4 does not.
 *)
let string_literal1 = '"' [^ '"']* '"'
let string_literal2 = '\'' [^ '\'']* '\''
let string_literal3 = [^ ' ' '\t' '\n' '\r' '"' '\'' '=' '>']+
let string_literal4 = [^ ' ' '\t' '\n' '\r' '"' '\'' '>']+

(* The following rules reflect HTML as it is used, not the SGML
 * rules.
 *)

(* scan_document: recognizes the openers of comments, declarations,
 * processing instructions, start tags and end tags.  All other input is
 * returned as Cdata.  No two cases can match the same input, so their
 * order is not significant.
 *)
rule scan_document = parse
  | eof
      { Eof }
  | "<!--"
      { Lcomment }
  | "<!"
      { Ldoctype }
  | "<?"
      { Lpi }
  | '<' (name as tag)        (* the token carries the name without the < *)
      { Lelement tag }
  | "</" (name as tag)       (* the token carries the name without the </ *)
      { Lelementend tag }
  | '<'                      (* misplaced <, opening none of the above *)
      { Cdata "<" }
  | ([^ '<']+ as text)
      { Cdata text }

(* scan_special: like scan_document, but the only markup recognized is an
 * end tag.  Every other < is returned as character data.
 * NOTE(review): which elements are scanned with this rule is decided by
 * the caller, which is not part of this file.
 *)
and scan_special = parse
  | eof
      { Eof }
  | "</" (name as tag)       (* the token carries the name without the </ *)
      { Lelementend tag }
  | '<'
      { Cdata "<" }
  | ([^ '<']+ as text)
      { Cdata text }


(* scan_comment: scans within a comment.  The comment text is returned in
 * pieces as Mcomment tokens until the closing delimiter is found.
 * FIXME: There may be any number of ws between -- and >
 *)
and scan_comment = parse
  | eof
      { Eof }
  | "-->"
      { Rcomment }
  | '-' | [^ '-']+           (* a single dash, or a run without any dash *)
      { Mcomment }

(* scan_doctype: scans within a declaration.  The first > ends the
 * declaration; occurrences of > in strings and in [ ] brackets are not
 * treated specially.
 *)
and scan_doctype = parse
  | eof
      { Eof }
  | '>'
      { Rdoctype }
  | [^ '>']+
      { Mdoctype }

(* scan_pi: scans within a processing instruction.  Both ?> and a plain >
 * end the instruction; a question mark that is not followed by > is part
 * of the instruction text.
 *)
and scan_pi = parse
  | eof
      { Eof }
  | "?>" | '>'
      { Rpi }
  | '?' | [^ '>' '?']+
      { Mpi }

(* scan_element: scans the part of a tag that follows the element name.
 * Quotes are not accepted as the start of a literal by this rule; they
 * are returned as Other.
 *)
and scan_element = parse
  | eof
      { Eof }
  | '>'
      { Relement }
  | "/>"
      { Relement_empty }
  | (ws+ as blanks)
      { Space (String.length blanks) }
  | (name as ident)          (* must come before string_literal3: when both
                              * match the same input the earlier case wins *)
      { Name ident }
  | '='
      { Is }
  | (string_literal3 as value)
      { Literal value }
  | '"' | '\''
      { Other }
  | _                        (* must stay the last case *)
      { Other }

(* scan_element_after_Is: variant of scan_element that accepts quoted
 * strings and does not recognize names or the equals sign as separate
 * tokens.  As the name suggests it is meant to be called right after an
 * Is token (the callers are not part of this file).
 *
 * Returns:
 *  - Relement / Relement_empty for > and />
 *  - Space n for a run of n whitespace characters
 *  - Literal s for a quoted string (s excludes the quotes) or for an
 *    unquoted run of characters (which may contain the equals sign)
 *  - Other for a quote that is never closed, and for any other character
 *  - Eof at the end of the input
 *
 * A quote hands over to scan_string_literal1/2.  These sub-rules have a
 * single case that requires the closing quote, so when the quote is
 * missing the generated lexer raises Failure.  Only this exception is
 * turned into Other; any other exception (for example one raised by the
 * function that refills the lexbuf) is deliberately not swallowed and
 * propagates to the caller.
 *)
and scan_element_after_Is = parse
  | ">"
      { Relement }
  | "/>"
      { Relement_empty }
  | ws+
      { Space (String.length (Lexing.lexeme lexbuf)) }
  | '"'
      { try
          Literal (scan_string_literal1 lexbuf)
        with
          | Failure _ -> Other     (* no closing double quote *)
      }
  | "'"
      { try
          Literal (scan_string_literal2 lexbuf)
        with
          | Failure _ -> Other     (* no closing single quote *)
      }
  | string_literal4
      { Literal (Lexing.lexeme lexbuf) }
  | eof
      { Eof }
  | _                        (* must stay the last case *)
      { Other }

(* scan_string_literal1: reads the remainder of a double-quoted string,
 * the opening quote having been consumed already.  Returns the characters
 * up to the closing quote; the closing quote is consumed but not
 * returned.  There is no case for input without a closing quote.
 *)
and scan_string_literal1 = parse
  | ([^ '"']* as contents) '"'
      { contents }

(* scan_string_literal2: reads the remainder of a single-quoted string,
 * the opening quote having been consumed already.  Returns the characters
 * up to the closing quote; the closing quote is consumed but not
 * returned.  There is no case for input without a closing quote.
 *)
and scan_string_literal2 = parse
  | ([^ '\'']* as contents) '\''
      { contents }