File: parse.ml

package info (click to toggle)
netclient 0.91-10
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 2,096 kB
  • ctags: 1,539
  • sloc: ml: 8,808; sh: 527; makefile: 203
file content (253 lines) | stat: -rw-r--r-- 5,484 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
open Html;;
open Scan;;

exception End_of_scan;;


let no_end_tag =  (* empty HTML elements *)
  [ "isindex";
    "base";
    "meta";
    "link";
    "p";
    "hr";
    "input";
    "img";
    "param";
    "basefont";
    "br";
    "area";
  ]
;;


let special_tag =   (* other lexical rules *)
  [ "script";
    "style";
  ]
;;


let rec parse_comment buf =
  let t = scan_comment buf in
  match t with
      Mcomment ->
	parse_comment buf
    | Eof ->
	raise End_of_scan
    | _ ->
	()
;;


let rec parse_doctype buf =
  let t = scan_doctype buf in
  match t with
      Mdoctype ->
	parse_doctype buf
    | Eof ->
	raise End_of_scan
    | _ ->
	()
;;


let parse_document buf =
  let current_name = ref "" in
  let current_atts = ref [] in
  let current_subs = ref [] in
  let stack = Stack.create() in

  let rec parse_atts () =
    let rec next_no_space() =
      match scan_element buf with
	  Space _ -> next_no_space()
	| t -> t
    in
    match next_no_space() with
	Relement -> []
      | Name n ->
	  begin match next_no_space() with
	      Is ->
		begin match next_no_space() with
		    Name v ->
		      (String.lowercase n, String.uppercase v) :: parse_atts()
		  | Literal v ->
		      (String.lowercase n,v) :: parse_atts()
		  | Eof ->
		      raise End_of_scan
		  | Relement ->
		      (* Illegal *)
		      []
		  | _ ->
		      (* Illegal *)
		      parse_atts()
		end
	    | Eof ->
		raise End_of_scan
	    | Relement ->
		(* Illegal *)
		[]
	    | _ ->
		(* Illegal *)
		parse_atts()
	  end
      | Eof ->
	  raise End_of_scan
      | _ ->
	  (* Illegal *)
	  parse_atts()
  in

  let rec parse_special name =
    (* Parse until </name> *)
    match scan_special buf with
	Lelementend n ->
	  if n = name then
	    ""
	  else
	    "</" ^ n ^ parse_special name
      | Eof ->
	  raise End_of_scan
      | Cdata s ->
	  s ^ parse_special name
      | _ ->
	  (* Illegal *)
	  parse_special name
  in

  let rec skip_element() =
    (* Skip until ">" *)
    match scan_element buf with
	Relement ->
	  ()
      | Eof ->
	  raise End_of_scan
      | _ ->
	  skip_element()
  in

  let rec parse_next() =
    let t = scan_document buf in
    match t with
	Lcomment -> 
	  parse_comment buf;
	  parse_next()
      | Ldoctype ->
	  parse_doctype buf;
	  parse_next()
      | Lelement name ->
	  let name = String.lowercase name in
	  if List.mem name no_end_tag then begin
	    let atts = parse_atts() in
	    current_subs := (Element(name, atts, [])) :: !current_subs;
	    parse_next()
	  end
	  else if List.mem name special_tag then begin
	    let atts = parse_atts() in
	    let data = parse_special name in
	    (* Read until ">" *)
	    skip_element();
	    current_subs := (Element(name, atts, [Data data])) :: !current_subs;
	    parse_next()
	  end
	  else begin
	    let atts = parse_atts() in
	    Stack.push (!current_name, !current_atts, !current_subs) stack;
	    current_name := name;
	    current_atts := atts;
	    current_subs := [];
	    parse_next()
	  end
      | Cdata data ->
	  current_subs := (Data data) :: !current_subs;
	  parse_next()
      | Lelementend name ->
	  let name = String.lowercase name in
	  (* Read until ">" *)
	  skip_element();
	  (* Search the element to close on the stack: *)
	  let found = ref (name = !current_name) in
	  Stack.iter
	    (fun (old_name, _, _) ->
	       if name = old_name then found := true)
	    stack;
	  (* If not found, the end tag is wrong. Simply ignore it. *)
	  if not !found then
	    parse_next()
	  else begin
	    (* Put the current element on to the stack: *)
	    Stack.push (!current_name, !current_atts, !current_subs) stack;
	    (* If found: Remove the elements from the stack, and append
	     * them to the previous element as sub elements
	     *)
	    let rec remove() =
	      let old_name, old_atts, old_subs = Stack.pop stack in
	        (* or raise Stack.Empty *)
	      if old_name = name then
		old_name, old_atts, old_subs
	      else
		let older_name, older_atts, older_subs = remove() in
		older_name, 
		older_atts,
		(Element (old_name, old_atts, List.rev old_subs) :: older_subs)
	    in
	    let old_name, old_atts, old_subs = remove() in
	    (* Remove one more element: the element containing the element
	     * currently being closed.
	     *)
	    let new_name, new_atts, new_subs = Stack.pop stack in
	    current_name := new_name;
	    current_atts := new_atts;
	    current_subs := (Element (old_name, old_atts, List.rev old_subs)) 
                            :: new_subs;
	    (* Go on *)
	    parse_next()
	  end
      | Eof ->
	  raise End_of_scan
      | _ ->
	  parse_next()
  in
  try
    parse_next();
    List.rev !current_subs
  with
      End_of_scan ->
	(* Close all remaining elements: *)
	Stack.push (!current_name, !current_atts, !current_subs) stack;
	let rec remove() =
	  let old_name, old_atts, old_subs = Stack.pop stack in
	        (* or raise Stack.Empty *)
	  try
	    let older_name, older_atts, older_subs = remove() in
	    older_name, 
	    older_atts,
	    (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
	  with
	      Stack.Empty ->
		old_name, old_atts, old_subs
	in
	let name, atts, subs = remove() in
	List.rev subs
;;


let parse_string s =
  let buf = Lexing.from_string s in
  parse_document buf
;;


let parse_file fname =
  let f = open_in fname in
  try
    let buf = Lexing.from_channel f in
    let doc = parse_document buf in
    close_in f;
    doc
  with
      any ->
	close_in f;
	raise any
;;