module Xmlm:Streaming XML IO.sig
..end
A well-formed sequence of signals represents an XML document tree traversal in depth first order (this has nothing to do with XML well-formedness). Input pulls a well-formed sequence of signals from a data source and output pushes a well-formed sequence of signals to a data destination. Functions are provided to easily transform sequences of signals to/from arborescent data structures.
Consult the features and limitations and examples of use.
Version 1.0.2 - daniel.buenzl i@erratique.ch
References.
Tim Bray. The annotated XML Specification, 1998.
Tim Bray et al. Namespaces in XML 1.1 (2nd ed.), 2006.
type encoding =
[ `ISO_8859_1 | `US_ASCII | `UTF_16 | `UTF_16BE | `UTF_16LE | `UTF_8 ]
type dtd =
string option
type name =
string * string
(uri,local)
. An empty uri
represents a name without a
namespace name, i.e. an unprefixed name
that is not under the scope of a default namespace.
type attribute =
name * string
type tag =
name * attribute list
type signal =
[ `Data of string | `Dtd of dtd | `El_end | `El_start of tag ]
doc
grammar :
doc ::= `Dtd tree
tree ::= `El_start child `El_end
child ::= `Data | tree | epsilon
Input and output deal only with well-formed sequences or
exceptions are raised.
val ns_xml : string
val ns_xmlns : string
type pos =
int * int
type error =
[ `Expected_char_seqs of string list * string
| `Expected_root_element
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Malformed_char_stream
| `Max_buffer_size
| `Unexpected_eoi
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string ]
val error_message : error -> string
exception Error of pos * error
type source =
[ `Channel of Pervasives.in_channel
| `Fun of unit -> int
| `String of int * string ]
`String
starts reading at the
given integer position. For `Fun
the function must return the
next byte as an int
and raise End_of_file
if there is no
such byte.
type input
val make_input : ?enc:encoding option ->
?strip:bool ->
?ns:(string -> string option) ->
?entity:(string -> string option) -> source -> input
enc
, character encoding of the document, details.
Defaults to None
.strip
, strips whitespace in character data, details.
Defaults to false
.ns
is called to bind undeclared namespace prefixes,
details. Default returns always None
.entity
is called to resolve non predefined entity references,
details. Default returns always None
.
val input : input -> signal
Xmlm.Error
is raised. Furthermore there will be no
two consecutive `Data
signals in the sequence and their string
is always non empty. After a well-formed sequence was input another may
be input, see Xmlm.eoi
and details.
Raises Xmlm.Error
on input errors.
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> 'a
`Data
signal, inputs it and invokes data
with the character data.`El_start
signal, inputs the sequence of signals until its
matching `El_end
and invokes el
and data
as follows
el
, is called on each `El_end
signals with the corresponding
`El_start
tag and the result of the callback invocation for the
element's children.data
, is called on each `Data
signals with the character data.
This function won't be called twice consecutively or with the empty
string.Invalid_argument
.
Raises Xmlm.Error
on input errors and Invalid_argument
if the next signal is not `El_start
or `Data
.
val input_doc_tree : el:(tag -> 'a list -> 'a) ->
data:(string -> 'a) -> input -> dtd * 'a
Xmlm.input_tree
but reads a complete well-formed
sequence of signals.
Raises Xmlm.Error
on input errors and Invalid_argument
if the next signal is not `Dtd
.
val peek : input -> signal
Xmlm.input
but doesn't remove the signal from the sequence.
Raises Xmlm.Error
on input errors.
val eoi : input -> bool
val pos : input -> pos
type 'a frag =
[ `Data of string | `El of tag * 'a list ]
'a
.
type dest =
[ `Buffer of Buffer.t
| `Channel of Pervasives.out_channel
| `Fun of int -> unit ]
`Buffer
, the buffer won't
be cleared. For `Fun
the function is called with the output bytes as int
s.
type output
val make_output : ?nl:bool ->
?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output
nl
, if true
a newline is output when the root's element `El_end
signal is output.
Defaults to false
.indent
, indentation behaviour, see details. Defaults to
None
.ns_prefix
, undeclared namespace prefix bindings,
see details. Default returns always None
.
val output : output -> signal -> unit
Raises Invalid_argument
if the resulting signal sequence on
the output abstraction is not well-formed or if a
namespace name could not be bound to a prefix.
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
Raises see Xmlm.output
.
val output_doc_tree : ('a -> 'a frag) -> output -> dtd * 'a -> unit
Xmlm.output_tree
but outputs a complete well-formed
sequence of signals.
Raises see Xmlm.output
.
Xmlm.Make
allows clients to specify types for strings and internal
buffers. Among other things this can be used to perform
hash-consing or to process the character stream, e.g. to normalize
unicode characters or to convert to a custom encoding.
type std_string =
string
type std_buffer =
Buffer.t
module type String =sig
..end
module type Buffer =sig
..end
module type S =sig
..end
Xmlm.Make
.
module Make:
The module assumes strings are immutable, thus strings the client gives or receives during the input and output process must not be modified.
The parser supports ASCII, US-ASCII, UTF-8, UTF-16, UTF-16LE, UTF-16BE and ISO-8859-1 (Latin-1) encoded documents. But strings returned by the library are always UTF-8 encoded (unless you use the functor).
The encoding can be specified explicitly using the optional
argument enc
. Otherwise the parser uses UTF-16 or UTF-8 if there is a
BOM at the
beginning of the document. If there is no BOM it uses the encoding
specified in the XML
declaration. Finally, if there is no XML declaration UTF-8 is assumed.
The parser performs
attribute data
normalization on every attribute data. This means that
attribute data does not have leading and trailing white space and that
any white space is collapsed and transformed to a single space
character (U+0020
).
White space handling of character data depends on the strip
argument. If strip
is true
, character data is treated like
attribute data, white space before and after elements is removed
and any white space is collapsed and transformed to a single
space character (U+0020
), except if the data is under the scope of a xml:space attribute whose value is preserve. If strip
is
false
all white space data is preserved as present in the
document (however all kinds of
line ends are
translated to the newline character (U+000A
)).
Xmlm's names are
expanded names.
The parser automatically handles the document's namespace
declarations. Undeclared namespace prefixes can be bound via the
callback ns
, which must return a namespace name. If ns
returns
None
an `Unknown_ns_prefix
error is raised.
Attributes used for namespace declarations are preserved by the
parser. They are in the Xmlm.ns_xmlns
namespace. Default namespace
declarations made with xmlns have the attribute name
(Xmlm.ns_xmlns, "xmlns")
. Prefix declarations have the prefix as
the local name, for example xmlns:ex results in the attribute name
(Xmlm.ns_xmlns, "ex")
.
Regarding constraints on the usage of the xml and xmlns prefixes by documents, the parser does not report errors on violations of the must constraints listed in this paragraph.
Character references
and predefined
entities are automatically resolved. Other entity references can
be resolved by the callback entity
, which must return a UTF-8
(unless you use the functor) string corresponding to the
replacement character data. The replacement data is not
analysed for further references, it is added to the data as such
modulo white space stripping. If entity
returns None
the error
`Unknown_entity_ref
is raised.
When a well-formed sequence of signals is input, no data is consumed beyond
the closing '>'
of the document's root element.
If you want to parse a document as
defined in the XML
specification, call Xmlm.eoi
after a well-formed sequence of
signals, it must return true
. If you expect another document on
the same input abstraction a new well-formed sequence of signals
can be Xmlm.input
. Use Xmlm.eoi
to check if a document follows (this
may consume data).
Invoking Xmlm.eoi
after a well-formed sequence of signals skips
whitespaces, comments and processing instructions until it gets to
either an XML
declaration or a DTD
or the start of a new element or the end of input (in which case
Xmlm.eoi
returns true
). If there is a new document but there is no
XML declaration or the declaration specifies UTF-16, the same
encoding as for the previous document is used.
':'
because
of namespaces).Sys.max_string_length
(unless you use the functor).
The error `Max_buffer_size
is raised if the limit is hit.
Outputs only UTF-8 encoded documents (even if you use the functor). Strings given to output functions must be UTF-8 encoded (unless you use the functor, but you need to provide a translation), no checks are performed.
Xmlm's names are
expanded names.
Expanded names are automatically converted to
qualified
names by the output abstraction. There is no particular api to specify
prefixes and default namespaces,
the actual result depends solely on the output
of attributes belonging to the Xmlm.ns_xmlns
namespace. For example to set
the default namespace of an element to http://example.org/myns,
use the following attribute :
(* xmlns='http://example.org/myns' *)
let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"
To bind the prefix "ex"
to http://example.org/ex, use the
following attribute :
(* xmlns:ex='http://example.org/ex' *)
let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"
Note that outputting input signals without
touching namespace declaration attributes will preserve existing
prefixes and bindings provided the same namespace name is not
bound to different prefixes in a given context.
The callback ns_prefix
of an output abstraction can be used to
give a prefix to a namespace name lacking a prefix binding in the
current output scope. Given a namespace name the function must return
the prefix to use. Note that this
will not add any namespace declaration attribute to the
output. If the function returns None
, Xmlm.output
will raise
Invalid_argument
. The default function returns always None
.
Output can be indented by specifying the indent
argument when an
output abstraction is created. If indent
is None
(default)
signal output does not introduce any extra white space. If
indent
is Some c
, each Xmlm.signal
is output on its own line
(for empty elements `El_start
and `El_end
are collapsed on a single
line) and nested elements are indented with c
space
characters.
After a well-formed sequence of signals was output, the output abstraction can be reused to output a new well-formed sequence of signals.
'<'
,'>'
,'&'
, and '\"'
are
automatically escaped to
predefined
entities.("","dip d")
will produce
a non well-formed document because of the space character.strip = false
and output with indent = None
.indent = None
and suitable `Data
signals
Sequential processing has the advantage that you don't need to get the whole document tree in memory to process it.
The following function reads a single document on an input channel and outputs it.
(* [id ic oc] copies the single XML document read from channel [ic] to
   channel [oc], one signal at a time, without building a tree.
   Raises [Invalid_argument "document not well-formed"] if the end of
   input is not reached once the root element has been closed. *)
let id ic oc =
  let src = Xmlm.make_input (`Channel ic) in
  let dst = Xmlm.make_output (`Channel oc) in
  (* Write the pending signal to [dst], then consume it and hand it back. *)
  let forward () =
    Xmlm.output dst (Xmlm.peek src);
    Xmlm.input src
  in
  (* [depth] counts the elements currently open; copying stops on the
     end signal that closes the root element. *)
  let rec copy_element depth =
    match forward () with
    | `El_start _ -> copy_element (depth + 1)
    | `El_end when depth = 1 -> ()
    | `El_end -> copy_element (depth - 1)
    | `Data _ -> copy_element depth
    | `Dtd _ -> assert false
  in
  Xmlm.output dst (Xmlm.input src); (* `Dtd *)
  copy_element 0;
  if Xmlm.eoi src then () else invalid_arg "document not well-formed"
The following function reads a sequence of documents on an
input channel and outputs them.
(* [id_seq ic oc] copies every signal of every document found on channel
   [ic] to channel [oc], until [Xmlm.eoi] reports the end of input. The
   output is created with [~nl:true]. *)
let id_seq ic oc =
  let src = Xmlm.make_input (`Channel ic) in
  let dst = Xmlm.make_output ~nl:true (`Channel oc) in
  let rec loop () =
    if Xmlm.eoi src then ()
    else begin
      Xmlm.output dst (Xmlm.input src);
      loop ()
    end
  in
  loop ()
The following function reads a sequence of documents on the
input channel. In each document's tree it prunes non root elements
whose name belongs to prune_list
.
(* [prune_docs prune_list ic oc] copies the sequence of documents read
   from channel [ic] to channel [oc], dropping every non-root element
   whose name is a member of [prune_list], together with its content. *)
let prune_docs prune_list ic oc =
  let src = Xmlm.make_input (`Channel ic) in
  let dst = Xmlm.make_output ~nl:true (`Channel oc) in
  (* Move exactly one signal from the input to the output. *)
  let forward () = Xmlm.output dst (Xmlm.input src) in
  let pruned (name, _) = List.mem name prune_list in
  (* Consume one whole element without outputting anything. Called with
     [0] while the element's start signal is still pending. *)
  let rec drop depth =
    match Xmlm.input src with
    | `El_start _ -> drop (depth + 1)
    | `El_end -> if depth = 1 then () else drop (depth - 1)
    | `Data _ | `Dtd _ -> drop depth
  in
  (* Copy the content of the root element; [depth] is the number of
     elements open below the root. Returns once the root is closed. *)
  let rec walk depth =
    match Xmlm.peek src with
    | `El_start tag when pruned tag -> drop 0; walk depth
    | `El_start _ -> forward (); walk (depth + 1)
    | `El_end -> forward (); if depth <> 0 then walk (depth - 1)
    | `Data _ -> forward (); walk depth
    | `Dtd _ -> assert false
  in
  let rec each_doc () =
    forward (); (* `Dtd *)
    forward (); (* root start *)
    walk 0;
    if not (Xmlm.eoi src) then each_doc ()
  in
  each_doc ()
A document's sequence of signals can be easily converted to an arborescent data structure. Assume your trees are defined by :
(* Document tree: [E (tag, children)] is an element with its tag and
   children, [D s] is character data. *)
type tree = E of Xmlm.tag * tree list | D of string
The following functions input/output xml documents from/to abstractions
as value of type tree
.
(* [in_tree i] inputs a complete document from [i] with
   [Xmlm.input_doc_tree], building an [E] node for each element and a
   [D] node for each piece of character data. *)
let in_tree i =
  Xmlm.input_doc_tree
    ~el:(fun tag children -> E (tag, children))
    ~data:(fun chars -> D chars)
    i
(* [out_tree o t] outputs [t] on [o] with [Xmlm.output_doc_tree],
   mapping [E] nodes to [`El] fragments and [D] nodes to [`Data]
   fragments. *)
let out_tree o t =
  let to_frag node =
    match node with
    | D chars -> `Data chars
    | E (tag, children) -> `El (tag, children)
  in
  Xmlm.output_doc_tree to_frag o t
We show how to process XML data that represents tabular data (some people like to do that).
The file we need to deal with represents nominal data about W3C bureaucrats. There are no namespaces and attributes are ignored. The element structure of the document is :
A bureaucrat contains the following elements, in order.
In OCaml we represent a W3C bureaucrat by this type :
(* One W3C bureaucrat, as read by [in_w3c_bureaucrats] from a
   "bureaucrat" element and written back by [out_w3c_bureaucrats]. *)
type w3c_bureaucrat = {
(* Character data of the "name" element. *)
name : string;
(* Character data of the "surname" element. *)
surname : string;
(* [true] when an "honest" element is present. *)
honest : bool;
(* Content of the "obfuscation_level" element, converted with
   [float_of_string]. *)
obfuscation_level : float;
(* Character data of the "tr" elements, in document order. *)
trs : string list; }
The following functions input and output W3C bureaucrats as lists
of values of type w3c_bureaucrat
.
(* [in_w3c_bureaucrats src] reads from [src] a single document made of a
   "list" root element containing "bureaucrat" elements and returns them
   in document order. Input is created with [~strip:true].
   Raises [Invalid_argument "parse error"] when the signals do not match
   the expected structure or the obfuscation level is not a float, and
   [Invalid_argument "more than one document"] when the end of input is
   not reached after the root element. *)
let in_w3c_bureaucrats src =
  let i = Xmlm.make_input ~strip:true src in
  let fail () = invalid_arg "parse error" in
  (* Tag without namespace and without attributes. *)
  let bare n = ("", n), [] in
  (* Consume the next signal, which must be equal to [signal]. *)
  let expect signal = if Xmlm.input i <> signal then fail () in
  (* Apply [item] for as long as an element start is pending; stop,
     without consuming it, on the enclosing element's end. *)
  let rec many item acc =
    match Xmlm.peek i with
    | `El_start _ -> many item (item () :: acc)
    | `El_end -> List.rev acc
    | `Data _ | `Dtd _ -> fail ()
  in
  (* Read element [n] holding only character data; an empty element
     yields the empty string. *)
  let text_el n () =
    expect (`El_start (bare n));
    let chars =
      match Xmlm.peek i with
      | `Data d -> ignore (Xmlm.input i); d
      | `El_end -> ""
      | `El_start _ | `Dtd _ -> fail ()
    in
    expect `El_end;
    chars
  in
  let bureaucrat () =
    try
      expect (`El_start (bare "bureaucrat"));
      let name = text_el "name" () in
      let surname = text_el "surname" () in
      (* The "honest" element is optional; its presence is the value. *)
      let honest =
        match Xmlm.peek i with
        | `El_start (("", "honest"), []) -> ignore (text_el "honest" ()); true
        | _ -> false
      in
      let obfuscation_level =
        float_of_string (text_el "obfuscation_level" ())
      in
      let trs = many (text_el "tr") [] in
      expect `El_end;
      { name; surname; honest; obfuscation_level; trs }
    with Failure _ -> fail () (* float_of_string *)
  in
  expect (`Dtd None);
  expect (`El_start (bare "list"));
  let bureaucrats = many bureaucrat [] in
  expect `El_end;
  if not (Xmlm.eoi i) then invalid_arg "more than one document";
  bureaucrats
(* [out_w3c_bureaucrats dst bl] writes the bureaucrats [bl] to [dst] as
   a single document: a "list" root element with one "bureaucrat"
   element per value. Output is created with [~nl:true] and
   [~indent:(Some 2)]. *)
let out_w3c_bureaucrats dst bl =
  let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
  let emit signal = Xmlm.output o signal in
  (* Start an element without namespace and without attributes. *)
  let start n = emit (`El_start (("", n), [])) in
  let finish () = emit `El_end in
  (* Element [n] holding [chars]; no data signal for the empty string. *)
  let text_el n chars =
    start n;
    if chars <> "" then emit (`Data chars);
    finish ()
  in
  let bureaucrat b =
    start "bureaucrat";
    text_el "name" b.name;
    text_el "surname" b.surname;
    (* Honesty is encoded by the presence of an empty "honest" element. *)
    if b.honest then text_el "honest" "";
    text_el "obfuscation_level" (string_of_float b.obfuscation_level);
    List.iter (text_el "tr") b.trs;
    finish ()
  in
  emit (`Dtd None);
  start "list";
  List.iter bureaucrat bl;
  finish ()