File: nethtml.mli

package info (click to toggle)
netstring 0.10.1-3
links: PTS
area: main
in suites: woody
size: 1,000 kB
ctags: 895
sloc: ml: 8,389; xml: 416; makefile: 188; sh: 103
file content (255 lines) | stat: -rw-r--r-- 10,626 bytes
(* $Id: nethtml.mli,v 1.7 2001/08/31 22:11:56 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)


(* The type 'document' represents one node of a parsed HTML document.
 * A whole document is a list of such nodes; this is what the parse_*
 * functions below return.
 *
 * Element (name, args, subnodes): is an element node for an element of
 *   type 'name' (i.e. written <name ...>...</name>) with arguments 'args'
 *   and subnodes 'subnodes' (the material within the element).
 *   - The arguments are simply name/value pairs. Entity references
 *     (something like &xy;) occurring in the values are NOT resolved.
 *   - Arguments without values (e.g. <select name="x" multiple>: here,
 *     "multiple" is such an argument) are represented as (name,name),
 *     i.e. the name is returned as value.
 *   - As argument names are case-insensitive, the names are all lowercase.
 * Data s: is a character data node. Again, entity references are kept
 *   in s as written and are not replaced by what they mean.
 *)

type document =
    Element of (string  *  (string*string) list  *  document list)
  | Data of string
;;


(* Now follows the type definition of simplified DTDs. *)

type element_class =         (* What is the class of an element? *)
  [ `Inline
  | `Block
  | `Essential_block
  | `None
  | `Everywhere
  ]
;;

(* Every element of a simplified DTD is assigned one of these classes:
 * `Inline, `Block:   The element belongs to the inline class or to the
 *                    block class, respectively. These are the classes the
 *                    model constraints `Inline, `Block and `Flow refer to.
 * `Essential_block:  The element is a block element whose end tag can
 *                    never be omitted.
 * `None:             The tag is an individual tag that is neither block
 *                    nor inline.
 * `Everywhere:       The tag can occur everywhere, regardless of whether
 *                    the model of the parent element allows it or not.
 *)


type model_constraint =      (* The constraint the subelements must fulfill *)
  [ `Inline
  | `Block
  | `Flow                                            (* = `Inline or `Block *)
  | `Empty
  | `Any
  | `Special
  | `Elements of string list             (* Enumeration of allowed elements *)
  | `Or of (model_constraint * model_constraint)
  | `Except of (model_constraint * model_constraint)
  | `Sub_exclusions of (string list * model_constraint)
  ]
;;

(* Model constraints define the possible sub elements of an element:
 * `Inline, `Block:    The sub elements must belong to these classes
 * `Flow:              The sub elements must belong to `Inline or `Block
 * `Empty:             There are no sub elements
 * `Any:               Any sub element is allowed
 * `Special:           The element has special content (<script>).
 *                     Functionally equivalent to `Empty
 * `Elements l:        Only these enumerated elements may occur
 * `Or(m1,m2):         One of the constraints m1 or m2 must hold
 * `Except(m1,m2):     The constraint m1 must hold, and m2 must not hold
 * `Sub_exclusions(l,m):  The constraint m must hold; furthermore, the elements
 *                     enumerated in list l are not allowed as direct or
 *                     indirect subelements, even if m or the model of a
 *                     subelement would allow them. The difference to
 *                     `Except(m, `Elements l) is that the exclusion is
 *                     inherited by the subelements. The `Sub_exclusions
 *                     expression must be toplevel, i.e. it must not occur
 *                     within an `Or, `Except, or another `Sub_exclusions
 *                     expression.
 *
 * Note that certain aspects are not modelled:
 * - #PCDATA: We do not specify where PCDATA is allowed and where not.
 * - Order, Number: We specify neither the order in which the sub elements
 *   must occur nor how often they can occur
 * - Inclusions: DTDs may describe that an element extraordinarily
 *   allows a list of elements in all sub elements.
 * - Optional tags: Whether start or end tags can be omitted
 *)

type simplified_dtd =
    (string * (element_class * model_constraint)) list;;

(* A simplified DTD is an association list. Each entry
 *   (name, (cls, model))
 * gives, for the element called 'name', the class 'cls' of that element
 * and the constraint 'model' for the subelements of the element.
 * The list contains such an entry for every element of the DTD.
 *)

val html40_dtd : simplified_dtd
  (* The (transitional) HTML 4.0 DTD, in the simplified representation
   * described above. This is the default value of the ~dtd argument of
   * parse_document, parse_string, parse_file and write.
   *)

val relaxed_html40_dtd : simplified_dtd
  (* A relaxed version of the HTML 4.0 DTD that better matches common
   * practice. In particular, this DTD additionally allows inline
   * elements to span blocks. For example,
   *   <B>text1 <P>text2
   * is parsed as
   *   <B>text1 <P>text2</P></B>
   * and not as
   *   <B>text1 </B><P>text2</P>
   * - the latter is more correct (and is what html40_dtd produces), but is
   * not what users expect.
   * Note that this is still not what many browsers implement. For example,
   * Netscape treats most inline tags specially: <B> switches bold on,
   * </B> switches bold off. For example,
   *   <A href='a'>text1<B>text2<A href='b'>text3
   * is parsed as
   *   <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B>
   * - there is an extra B element around the second anchor! (You can
   * see what Netscape parses by loading a page into the "Composer".)
   * IMHO it is questionable to consider inline tags as switches because
   * this is totally outside of the HTML specification, and browsers may
   * differ in that point.
   *
   * Furthermore, several elements are turned into essential blocks
   * (class `Essential_block): table, ul, ol, dl. David Fox reported a
   * problem with structures like:
   * <table><tr><td><table><tr><td>x</td></td></tr></table>y</td></tr></table>
   * i.e. the td of the inner table has two end tags. Without additional
   * help, the second </td> would close the outer table cell. Because of
   * this problem, tables are now essential, meaning that a missing
   * </table> is never added implicitly; every table element has to
   * be explicitly ended. This rule seems to be what many browsers implement.
   *)

val parse_document : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                     ?return_declarations:bool ->      (* default: false *)
                     ?return_pis:bool ->               (* default: false *)
                     ?return_comments:bool ->          (* default: false *)
                     Lexing.lexbuf ->
                       document list
  (* Parses the HTML document from a lexbuf and returns it as the list of
   * its nodes.
   * Options:
   * ~dtd: specifies the DTD to use. By default, html40_dtd is used, which
   *   is based on the transitional HTML 4.0 DTD
   * ~return_declarations: if set, the parser returns <!...> declarations
   *   as Element("!",["contents",c],[]) nodes, where c is the string inside
   *   <! and >. - By default, declarations are skipped.
   * ~return_pis: if set, the parser returns <?...> (or <?...?>) processing
   *   instructions as Element("?",["contents",c],[]) nodes, where c is the
   *   string inside <? and > (or ?>). - By default, processing instructions
   *   are skipped.
   * ~return_comments: if set, the parser returns <!-- .... --> comments
   *   as Element("--",["contents",c],[]) nodes, where c is the string inside
   *   <!-- and -->. - By default, comments are skipped.
   *)

val parse_string : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                   ?return_declarations:bool ->      (* default: false *)
                   ?return_pis:bool ->               (* default: false *)
                   ?return_comments:bool ->          (* default: false *)
                   string ->
                     document list
  (* Parses the HTML document from a string and returns it.
   * The optional arguments have the same names, types and defaults as
   * those of parse_document; presumably they have the same meaning, too
   * (see the description there) - confirm against the implementation.
   *)

val parse_file : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                 ?return_declarations:bool ->      (* default: false *)
                 ?return_pis:bool ->               (* default: false *)
                 ?return_comments:bool ->          (* default: false *)
                 in_channel ->
                   document list
  (* Parses the HTML document from a file and returns it.
   * Despite the name, the file is not passed as a file name but as an
   * input channel, so the caller has to open the file first.
   * NOTE(review): it is not stated here whether the channel is closed
   * after parsing; presumably this is left to the caller - confirm.
   * The optional arguments have the same names, types and defaults as
   * those of parse_document; presumably they have the same meaning, too
   * (see the description there).
   *)


(* NOTE ON XHTML:
 * The parser can read xhtml, as long as the following XML features are not
 * used:
 * - Internal DTD subset, i.e. <!DOCTYPE html ... [ ... ]>
 * - External entities
 * - <![CDATA[
 * - <![INCLUDE[
 * - <![IGNORE[
 * - encodings other than ISO-8859-1
 * The following XML features are ok:
 * - processing instructions
 * - empty elements (e.g. <br/>) as long as the element is declared as EMPTY.
 *)

val decode : document list -> document list
  (* decode: converts entities &name; and &#num; into the corresponding
   * characters and returns the converted node list.
   * Note: Declarations, processing instructions, and comments are not
   * decoded.
   * NOTE(review): the character encoding of the result is not stated
   * here - confirm against the implementation.
   *)

val encode : document list -> document list
  (* encode: converts problematic characters to their corresponding
   * entities and returns the converted node list.
   * Note: Declarations, processing instructions, and comments are not
   * encoded.
   * NOTE(review): which characters count as problematic is not specified
   * here - confirm against the implementation.
   *)

val write : ?dtd:simplified_dtd ->            (* default: html40_dtd *) 
            [ `Out_buffer of Buffer.t
	    | `Out_channel of out_channel
	    | `Out_function of (string -> int -> int -> unit)
	    ] ->
            document list ->
	      unit
  (* Writes the document to the buffer/channel/function. No encoding or
   * decoding happens, i.e. apply 'encode' beforehand if needed.
   * Output targets:
   * `Out_buffer b:  the output goes to the buffer b
   * `Out_channel ch:  the output goes to the channel ch
   * `Out_function (fun s pos len -> ()): Must write the len characters at
   *    pos from the string s
   * ~dtd: presumably consulted to find out which elements are empty, as
   *    end tags of empty elements are not written (see the history entry
   *    for revision 1.5) - confirm against the implementation.
   *)


(* ======================================================================
 * History:
 * 
 * $Log: nethtml.mli,v $
 * Revision 1.7  2001/08/31 22:11:56  gerd
 * 	Added essential blocks.
 *
 * Revision 1.6  2001/07/15 14:18:59  gerd
 * 	New relaxed_html40_dtd.
 * 	New constraint `Sub_exclusions.
 *
 * Revision 1.5  2001/06/10 23:56:50  gerd
 * 	Fix: 'write' no longer writes end tags of empty elements.
 *
 * Revision 1.4  2001/06/08 22:19:55  gerd
 * 	Added functions encode, decode, write for convenience.
 *
 * Revision 1.3  2001/06/08 16:25:27  gerd
 * 	Bugfix: </SCRIPT> is now recognized (thanks to David Fox)
 * 	The parser may now return comments, declarations, and processing
 * instructions if requested to do so
 * 	The parser accepts xhtml to some extent
 * 	Now exported: parse_document.
 *
 * Revision 1.2  2001/04/07 23:38:26  gerd
 * 	Added a simplified representation of the DTD. This improves
 * the quality of the parser drastically. For example,
 * "<p>abc<p>def" is no longer parsed as "<p>abc<p>def</p></p>",
 * but as "<p>abc</p><p>def</p>". However, the representation is not
 * perfect yet. What's definitely missing are the exclusion lists
 * of the DTD. Because of this missing feature, "<a>abc<a>def" is
 * still parsed as "<a>abc<a>def</a></a>" although the DTD states
 * that anchors cannot contain anchors ( - but it also states that
 * end tags of anchors cannot be omitted, so this feature is not
 * priority 1).
 *
 * Revision 1.1  2000/03/03 01:07:25  gerd
 * 	Initial revision.
 *
 * 
 *)