1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
|
<HTML>
<HEAD>
<TITLE>SGML parse and stream definition for libwww</TITLE></HEAD>
<BODY>
<H1>SGML and Structured streams</H1>The SGML parser is a state machine.
It is called for every character of the input stream. The DTD data
structure contains pointers to functions which are called to
implement the actual effect of the text read. When these functions are
called, the attribute structures pointed to by the DTD are valid,
and the function is passed a pointer to the current tag structure, and
an "element stack" which represents the state of nesting within SGML
elements.<P>
The following aspects are from Dan
Connolly's suggestions: Binary search,
Structured object scheme basically,
SGML content enum type.<P>
The module is a part of the <A HREF="Overview">CERN Common WWW Library</A>
(c) Copyright CERN 1991 - See Copyright.html
<PRE>
#ifndef SGML_H
#define SGML_H
#include "HTUtils.h"
#include "HTStream.h"
</PRE>
<H2>SGML content types</H2>
<PRE>typedef enum _SGMLContent{
/* Content type of an SGML element. A value of this type is stored in
** the `contents` field of each HTTag and tells which markup is
** recognized inside the element.
*/
SGML_EMPTY, /* No content */
SGML_LITERAL, /* Character data. Recognized exact close tag only.
Old www server compatibility only! Not SGML */
SGML_CDATA, /* Character data. Recognize </ only */
SGML_RCDATA, /* Replaceable character data. Recognize </ and &ref; */
SGML_MIXED, /* Elements and parsed character data. Recognize all markup */
SGML_ELEMENT /* Any data found will be returned as an error */
} SGMLContent;
typedef struct {
/* Description of one attribute that a tag accepts. The `attributes`
** field of HTTag points to an array of these.
*/
char * name; /* The (constant) name of the attribute */
/* Could put type info in here */
} attr;
/* A tag structure describes an SGML element.
** -----------------------------------------
**
**
** name is the string which comes after the tag opener "<".
**
** attributes points to a zero-terminated array
** of attribute names.
**
** number_of_attributes is the number of possible attributes.
**
** contents determines how the SGML engine parses the characters
** within the element; it is one of the SGMLContent values.
** For SGML_LITERAL, tag openers are ignored
** except for that which opens a matching closing tag.
**
*/
typedef struct _tag HTTag;
struct _tag{
char * name; /* The name of the tag */
attr * attributes; /* The list of acceptable attributes */
int number_of_attributes; /* Number of possible attributes */
SGMLContent contents; /* Content type, see SGMLContent. Original note: "End only on end tag" @@ */
};
/* DTD Information
** ---------------
**
** Not the whole DTD, but all this parser uses of it.
**
** Both tables must be sorted by name in strcmp order. The introduction
** of this file lists binary search among its design points, which is
** presumably why the order matters -- TODO confirm against the parser.
*/
typedef struct {
HTTag * tags; /* Must be in strcmp order by name */
int number_of_tags; /* Presumably the number of entries in tags -- confirm */
CONST char ** entity_names; /* Must be in strcmp order by name */
int number_of_entities; /* Presumably the number of entries in entity_names -- confirm */
} SGML_dtd;
#define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
/* SGML context passed to parsers
**
** HTSGMLContext is a pointer to struct _HTSGMLContext, which is only
** declared, not defined, here; users of this header cannot look inside it.
*/
typedef struct _HTSGMLContext *HTSGMLContext; /* Hidden */
/*__________________________________________________________________________
*/
</PRE>
<H2>Structured Object definition</H2>A structured object is something
which can reasonably be represented
in SGML. I'll rephrase that. A
structured object is an ordered tree-structured
arrangement of data which is representable
as text. The SGML parser outputs to
a Structured object. A Structured
object can output its contents to
another Structured Object. It's a
kind of typed stream. The architecture
is largely Dan Connolly's. Elements
and entities are passed to the sob
by number, implying a knowledge of
the DTD. Knowledge of the SGML syntax
is not here, though.<P>
Superclass: HTStream<P>
The creation methods will vary on
the type of Structured Object. Maybe
the callerData is enough info to
pass along.
<PRE>typedef struct _HTStructured HTStructured;
/* Method table of a structured object.
**
** Apart from `name`, every member is a pointer to a function whose first
** argument `me` is the structured object being operated on.
** struct _HTStructured itself is only forward-declared above.
**
** The accompanying text says that elements and entities are passed by
** number, implying a knowledge of the DTD; the element_number and
** entity_number arguments below are those numbers.
*/
typedef struct _HTStructuredClass{
char* name; /* Just for diagnostics */
/* NOTE(review): presumably disposes of the object when the stream
** ends normally -- confirm against an implementation. */
void (*_free) PARAMS((
HTStructured* me));
/* NOTE(review): presumably abandons the object because of error e
** -- confirm against an implementation. */
void (*abort) PARAMS((
HTStructured* me,
HTError e));
/* Receives a single character ch. */
void (*put_character) PARAMS((
HTStructured* me,
char ch));
/* Receives a string; no length is passed, so str is presumably
** NUL-terminated -- confirm. */
void (*put_string) PARAMS((
HTStructured* me,
CONST char * str));
/* Receives a buffer str together with an explicit length len. */
void (*write) PARAMS((
HTStructured* me,
CONST char * str,
int len));
/* Reports the start of element number element_number.
** NOTE(review): attribute_present and attribute_value look like
** parallel arrays indexed by attribute number -- confirm, including
** whether they hold MAX_ATTRIBUTES entries. */
void (*start_element) PARAMS((
HTStructured* me,
int element_number,
CONST BOOL* attribute_present,
CONST char** attribute_value));
/* Reports the end of element number element_number. */
void (*end_element) PARAMS((
HTStructured* me,
int element_number));
/* Reports entity number entity_number. */
void (*put_entity) PARAMS((
HTStructured* me,
int entity_number));
}HTStructuredClass;
</PRE>
<H2>Find a Tag by Name</H2>Returns a pointer to the tag within
the DTD.
<PRE>extern HTTag * SGMLFindTag PARAMS((CONST SGML_dtd* dtd, CONST char * string));
/* SGMLFindTag: look up a tag by name.
**	dtd	the DTD whose tags are searched
**	string	the tag name to look for
** Returns a pointer to the tag within the DTD.
** NOTE(review): the result for an unknown name is not stated here;
** presumably NULL -- confirm against the implementation.
*/
</PRE>
<H2>Find an Attribute by Name</H2>Returns the number of the
attribute or -1 on failure.
<PRE>extern int SGMLFindAttribute PARAMS((HTTag* tag, CONST char * string));
/* SGMLFindAttribute: look up an attribute of a tag by name.
**	tag	the tag whose attributes are searched
**	string	the attribute name to look for
** Returns the number of the attribute, or -1 on failure.
*/
</PRE>
<H2>Create an SGML parser</H2>
<PRE>/*
** On entry,
** dtd must point to a DTD structure as defined above
** target must point to the structured object given to the new
** parser; per the text of this file, the SGML parser
** outputs to a structured object.
** On exit,
** The default tag starter has been processed.
** Returns the new parser as an HTStream.
**
** NOTE(review): this comment used to describe "callbacks" and
** "callData"; neither is a parameter of the prototype below.
*/
extern HTStream* SGML_new PARAMS((
CONST SGML_dtd * dtd,
HTStructured * target));
/* Stream class object, declared here and defined elsewhere.
** NOTE(review): presumably the class of the streams returned by
** SGML_new -- confirm against the implementation.
*/
extern CONST HTStreamClass SGMLParser;
#endif /* SGML_H */
</PRE></BODY>
</HTML>
|