File: nethtml.mli

package info (click to toggle)
netstring 0.10.1-3
  • links: PTS
  • area: main
  • in suites: woody
  • size: 1,000 kB
  • ctags: 895
  • sloc: ml: 8,389; xml: 416; makefile: 188; sh: 103
file content (255 lines) | stat: -rw-r--r-- 10,626 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
(* $Id: nethtml.mli,v 1.7 2001/08/31 22:11:56 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)


(* The type 'document' represents parsed HTML documents. 
 * Element (name, args, subnodes): is an element node for an element of
 *   type 'name' (i.e. written <name ...>...</name>) with arguments 'args'
 *   and subnodes 'subnodes' (the material within the element). The arguments
 *   are simply name/value pairs. Entity references (something like &xy;)
 *   occurring in the values are NOT resolved.
 *   Arguments without values (e.g. <select name="x" multiple>: here,
 *   "multiple" is such an argument) are represented as (name,name), i.e. the
 *   name is returned as value.
 *   As argument names are case-insensitive, the names are all lowercase.
 * Data s: is a character data node. Again, entity references are kept
 *   verbatim; they are not replaced by the characters they stand for.
 *)

type document =
  | Element of (string * (string * string) list * document list)
      (* Element (name, args, subnodes): an element node; args are
       * name/value pairs, subnodes is the material within the element *)
  | Data of string
      (* Data s: a character data node *)


(* Now follows the type definition of simplified DTDs. *)

(* The class of an element: what is the class of an element? *)
type element_class =
  [ `Inline | `Block | `Essential_block | `None | `Everywhere ]

(* The class `None means that the tag is an individual tag that is neither
 * block nor inline.
 * The class `Everywhere means that the tag can occur everywhere, regardless
 * of whether the model of the parent element allows it or not.
 * The class `Essential_block means that the end tag of the block element
 * can never be omitted.
 *)


(* The constraint that the subelements of an element must fulfill. *)
type model_constraint =
  [ `Inline | `Block
  | `Flow  (* = `Inline or `Block *)
  | `Empty | `Any | `Special
  | `Elements of string list  (* enumeration of the allowed elements *)
  | `Or of model_constraint * model_constraint
  | `Except of model_constraint * model_constraint
  | `Sub_exclusions of string list * model_constraint ]

(* Model constraints define the possible sub elements of an element:
 * `Inline, `Block:    The sub elements must belong to these classes
 * `Flow:              The sub elements must belong to `Inline or `Block
 * `Empty:             There are no sub elements
 * `Any:               Any sub element is allowed
 * `Special:           The element has special content (<script>).
 *                     Functionally equivalent to `Empty
 * `Elements l:        Only these enumerated elements may occur
 * `Or(m1,m2):         One of the constraints m1 or m2 must hold
 * `Except(m1,m2):     The constraint m1 must hold, and m2 must not hold
 * `Sub_exclusions(l,m):  The constraint m must hold; furthermore, the elements
 *                     enumerated in list l are not allowed as direct or
 *                     indirect subelements, even if m or the model of a
 *                     subelement would allow them. The difference to
 *                     `Except(m, `Elements l) is that the exclusion is
 *                     inherited to the subelements. The `Sub_exclusions
 *                     expression must be toplevel, i.e. it must not occur
 *                     within an `Or, `Except, or another `Sub_exclusions
 *                     expression.
 *
 * Note that certain aspects are not modelled:
 * - #PCDATA: We do not specify where PCDATA is allowed and where not.
 * - Order, Number: We specify neither the order in which the sub elements
 *   must occur nor how often they can occur
 * - Inclusions: DTDs may declare that an element exceptionally allows a
 *   list of additional elements within all of its sub elements.
 * - Optional tags: Whether start or end tags can be omitted
 *)

(* One list entry per element: the element (given as a string) paired with
 * its class and the constraint for its subelements.
 *)
type simplified_dtd = (string * (element_class * model_constraint)) list

(* This list contains the class of every element, and the constraint for
 * the subelements of the element.
 *)

val html40_dtd : simplified_dtd
  (* The simplified DTD for (transitional) HTML 4.0. This is the default
   * value of the ~dtd argument of parse_document, parse_string,
   * parse_file, and write.
   *)

val relaxed_html40_dtd : simplified_dtd
  (* A relaxed version of the HTML 4.0 DTD that matches common practice
   * better. In particular, this DTD additionally allows inline elements
   * to span blocks. For example,
   *   <B>text1 <P>text2
   * is parsed as
   *   <B>text1 <P>text2</P></B>
   * and not as
   *   <B>text1 </B><P>text2</P>
   * The latter is more correct (and is what html40_dtd produces), but it
   * is not what users expect.
   *
   * Note that this is still not what many browsers implement. For example,
   * Netscape treats most inline tags specially: <B> switches bold on,
   * </B> switches bold off. For example,
   *   <A href='a'>text1<B>text2<A href='b'>text3
   * is parsed by Netscape as
   *   <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B>
   * i.e. there is an extra B element around the second anchor! (You can
   * see what Netscape parses by loading a page into the "Composer".)
   * IMHO it is questionable to consider inline tags as switches because
   * this is totally outside of the HTML specification, and browsers may
   * differ in that point.
   *
   * Furthermore, several elements are turned into essential blocks:
   * table, ul, ol, dl. David Fox reported a problem with structures
   * like:
   * <table><tr><td><table><tr><td>x</td></td></tr></table>y</td></tr></table>
   * i.e. the td of the inner table has two end tags. Without additional
   * help, the second </td> would close the outer table cell. Because of
   * this problem, tables are now essential blocks, meaning that a missing
   * </table> is never added implicitly; every table element has to
   * be explicitly ended. This rule seems to be what many browsers implement.
   *)

val parse_document :
  ?dtd:simplified_dtd ->
  ?return_declarations:bool ->
  ?return_pis:bool ->
  ?return_comments:bool ->
  Lexing.lexbuf ->
  document list
  (* Reads an HTML document from the given lexbuf, parses it, and returns
   * the resulting list of nodes.
   *
   * Optional arguments:
   * ~dtd (default: html40_dtd): the DTD to parse with. The default is
   *   based on the transitional HTML 4.0 DTD.
   * ~return_declarations (default: false): if set, declarations <!...>
   *   are returned as Element("!",["contents",c],[]) nodes, where c is
   *   the string between <! and >. Otherwise declarations are skipped.
   * ~return_pis (default: false): if set, processing instructions <?...>
   *   (or <?...?>) are returned as Element("?",["contents",c],[]) nodes,
   *   where c is the string between <? and > (or ?>). Otherwise
   *   processing instructions are skipped.
   * ~return_comments (default: false): if set, comments <!-- .... -->
   *   are returned as Element("--",["contents",c],[]) nodes, where c is
   *   the string between <!-- and -->. Otherwise comments are skipped.
   *)

val parse_string :
  ?dtd:simplified_dtd ->
  ?return_declarations:bool ->
  ?return_pis:bool ->
  ?return_comments:bool ->
  string ->
  document list
  (* Parses the HTML document contained in the given string and returns
   * it.
   * Defaults of the optional arguments: ~dtd is html40_dtd;
   * ~return_declarations, ~return_pis, and ~return_comments are false.
   *)

val parse_file :
  ?dtd:simplified_dtd ->
  ?return_declarations:bool ->
  ?return_pis:bool ->
  ?return_comments:bool ->
  in_channel ->
  document list
  (* Parses the HTML document from a file and returns it. The file is
   * passed as an input channel.
   * Defaults of the optional arguments: ~dtd is html40_dtd;
   * ~return_declarations, ~return_pis, and ~return_comments are false.
   *)


(* NOTE ON XHTML:
 * The parser can read xhtml, as long as the following XML features are not
 * used:
 * - Internal DTD subset, i.e. <!DOCTYPE html ... [ ... ]>
 * - External entities
 * - <![CDATA[
 * - <![INCLUDE[
 * - <![IGNORE[
 * - encodings other than ISO-8859-1
 * The following XML features are ok:
 * - processing instructions
 * - empty elements (e.g. <br/>) as long as the element is declared as EMPTY.
 *)

val decode : document list -> document list
  (* decode: takes a document list and returns a document list in which
   * the entities &name; and &#num; are converted into the corresponding
   * characters.
   * Note: Declarations, processing instructions, and comments are not
   * decoded.
   *)

val encode : document list -> document list
  (* encode: takes a document list and returns a document list in which
   * problematic characters are converted to their corresponding entities.
   * Note: Declarations, processing instructions, and comments are not
   * encoded.
   *)

val write :
  ?dtd:simplified_dtd ->
  [ `Out_buffer of Buffer.t
  | `Out_channel of out_channel
  | `Out_function of (string -> int -> int -> unit) ] ->
  document list ->
  unit
  (* Writes the document to the given destination, which is a buffer, a
   * channel, or a function. The document is written as it is: no encoding
   * or decoding happens.
   *
   * ~dtd: default is html40_dtd.
   * `Out_function f: a call f s pos len must write the len characters of
   *    the string s that start at position pos.
   *)


(* ======================================================================
 * History:
 * 
 * $Log: nethtml.mli,v $
 * Revision 1.7  2001/08/31 22:11:56  gerd
 * 	Added essential blocks.
 *
 * Revision 1.6  2001/07/15 14:18:59  gerd
 * 	New relaxed_html40_dtd.
 * 	New constraint `Sub_exclusions.
 *
 * Revision 1.5  2001/06/10 23:56:50  gerd
 * 	Fix: 'write' no longer writes end tags of empty elements.
 *
 * Revision 1.4  2001/06/08 22:19:55  gerd
 * 	Added functions encode, decode, write for convenience.
 *
 * Revision 1.3  2001/06/08 16:25:27  gerd
 * 	Bugfix: </SCRIPT> is now recognized (thanks to David Fox)
 * 	The parser may now return comments, declarations, and processing
 * instructions if requested to do so
 * 	The parser accepts xhtml to some extent
 * 	Now exported: parse_document.
 *
 * Revision 1.2  2001/04/07 23:38:26  gerd
 * 	Added a simplified representation of the DTD. This improves
 * the quality of the parser drastically. For example,
 * "<p>abc<p>def" is no longer parsed as "<p>abc<p>def</p></p>",
 * but as "<p>abc</p><p>def</p>". However, the representation is not
 * perfect yet. What's definitely missing are the exclusion lists
 * of the DTD. Because of this missing feature, "<a>abc<a>def" is
 * still parsed as "<a>abc<a>def</a></a>" although the DTD states
 * that anchors cannot contain anchors ( - but it also states that
 * end tags of anchors cannot be omitted, so this feature is not
 * priority 1).
 *
 * Revision 1.1  2000/03/03 01:07:25  gerd
 * 	Initial revision.
 *
 * 
 *)