File: HTMLparser.h

package info (click to toggle)
libxml2 2.15.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: experimental, forky, sid
  • size: 9,964 kB
  • sloc: ansic: 138,103; python: 6,692; sh: 4,736; xml: 1,476; makefile: 715
file content (397 lines) | stat: -rw-r--r-- 10,612 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/**
 * @file
 * 
 * @brief HTML parser (HTML5 tokenizer, custom non-HTML5 tree construction)
 * 
 * This module originally implemented an HTML parser based on the
 * (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer
 * conforms to HTML5. Tree construction still follows a custom,
 * unspecified algorithm with many differences to HTML5.
 *
 * The parser defaults to ISO-8859-1, the default encoding of
 * HTTP/1.0.
 *
 * @copyright See Copyright for the status of this software.
 *
 * @author Daniel Veillard
 */

#ifndef __HTML_PARSER_H__
#define __HTML_PARSER_H__
#include <libxml/xmlversion.h>
#include <libxml/parser.h>

#ifdef LIBXML_HTML_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Backward compatibility
 *
 * Legacy spellings kept so that older callers continue to compile.
 * Macro arguments are parenthesized so that any pointer expression
 * (for example `table + 1`) can be passed safely.
 */
/* Old name of htmlUTF8ToHtml. */
#define UTF8ToHtml htmlUTF8ToHtml
/* Reads the defaultsubelt member of the element description `elt`. */
#define htmlDefaultSubelement(elt) (elt)->defaultsubelt
/* Forwards to htmlElementAllowedHere() using the name of `elt`. */
#define htmlElementAllowedHereDesc(parent,elt) \
	htmlElementAllowedHere((parent), (elt)->name)
/* Reads the attrs_req member of the element description `elt`. */
#define htmlRequiredAttrs(elt) (elt)->attrs_req

/*
 * Most of the back-end structures from XML and HTML are shared.
 * Every type below is a plain typedef alias of its XML counterpart.
 */
/** Same as xmlParserCtxt */
typedef xmlParserCtxt htmlParserCtxt;
/** Same as xmlParserCtxtPtr */
typedef xmlParserCtxtPtr htmlParserCtxtPtr;
/** Same as xmlParserNodeInfo */
typedef xmlParserNodeInfo htmlParserNodeInfo;
/** Same as xmlSAXHandler */
typedef xmlSAXHandler htmlSAXHandler;
/** Same as xmlSAXHandlerPtr */
typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
/** Same as xmlParserInput */
typedef xmlParserInput htmlParserInput;
/** Same as xmlParserInputPtr */
typedef xmlParserInputPtr htmlParserInputPtr;
/** Same as xmlDocPtr */
typedef xmlDocPtr htmlDocPtr;
/** Same as xmlNodePtr */
typedef xmlNodePtr htmlNodePtr;

/** @cond ignore */

/*
 * Internal record describing a single HTML element. The same layout
 * serves HTML 4.01 and XHTML 1.0, which share their structure.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;       /* tag name */
    char startTag;          /* unused */
    char endTag;            /* whether the end tag can be implied */
    char saveEndTag;        /* unused */
    char empty;             /* whether this is an empty element */
    char depr;              /* unused */
    char dtd;               /* unused */
    char isinline;          /* 0 for a block element, 1 for an inline one */
    const char *desc;       /* description */

    /* The five members below are flagged XML_DEPRECATED_MEMBER. */
    const char **subelts XML_DEPRECATED_MEMBER;
    const char *defaultsubelt XML_DEPRECATED_MEMBER;
    const char **attrs_opt XML_DEPRECATED_MEMBER;
    const char **attrs_depr XML_DEPRECATED_MEMBER;
    const char **attrs_req XML_DEPRECATED_MEMBER;

    /* NOTE(review): meaning not documented in this header - confirm in
       the parser implementation. */
    int dataMode;
};

/*
 * Internal description of an HTML entity.
 *
 * Each record ties a character value to an entity name and a
 * description. htmlEntityLookup(), htmlEntityValueLookup() and
 * htmlParseEntityRef() are declared to return pointers to read-only
 * records of this type.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;	/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;   /* the description */
};

#ifdef LIBXML_SAX1_ENABLED
/**
 * Read-only global of type xmlSAXHandlerV1. It is only declared when
 * LIBXML_SAX1_ENABLED is defined.
 *
 * @deprecated Use #xmlSAX2InitHtmlDefaultSAXHandler
 */
XML_DEPRECATED
XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
#endif /* LIBXML_SAX1_ENABLED */

/** @endcond */

/*
 * There are only a few public functions.
 *
 * Every declaration in this first group is marked XML_DEPRECATED.
 * NOTE(review): the behaviour of these functions is defined in the
 * implementation file, which is not visible from this header.
 */
XML_DEPRECATED
XMLPUBFUN void
			htmlInitAutoClose	(void);
/* Takes a tag name; yields a pointer to a read-only htmlElemDesc. */
XML_DEPRECATED
XMLPUBFUN const htmlElemDesc *
			htmlTagLookup	(const xmlChar *tag);
/* Takes an entity name; yields a pointer to a read-only htmlEntityDesc. */
XML_DEPRECATED
XMLPUBFUN const htmlEntityDesc *
			htmlEntityLookup(const xmlChar *name);
/* Takes a numeric value (compare htmlEntityDesc.value); yields a pointer
   to a read-only htmlEntityDesc. */
XML_DEPRECATED
XMLPUBFUN const htmlEntityDesc *
			htmlEntityValueLookup(unsigned int value);

/* Auto-close queries on a document/node pair; both return int. */
XML_DEPRECATED
XMLPUBFUN int
			htmlIsAutoClosed(xmlDoc *doc,
					 xmlNode *elem);
XML_DEPRECATED
XMLPUBFUN int
			htmlAutoCloseTag(xmlDoc *doc,
					 const xmlChar *name,
					 xmlNode *elem);
/* Low-level parsing entry points operating on a parser context.
   `str` in htmlParseEntityRef is an out-style `const xmlChar **`;
   NOTE(review): presumably receives the entity name - confirm. */
XML_DEPRECATED
XMLPUBFUN const htmlEntityDesc *
			htmlParseEntityRef(htmlParserCtxt *ctxt,
					 const xmlChar **str);
XML_DEPRECATED
XMLPUBFUN int
			htmlParseCharRef(htmlParserCtxt *ctxt);
XML_DEPRECATED
XMLPUBFUN void
			htmlParseElement(htmlParserCtxt *ctxt);

/*
 * Functions returning an htmlParserCtxt pointer.
 * NOTE(review): presumably each creates a new context that the caller
 * later hands to htmlFreeParserCtxt(), and returns NULL on failure -
 * confirm against the implementation.
 */
XMLPUBFUN htmlParserCtxt *
			htmlNewParserCtxt(void);
/* Variant taking a read-only SAX handler and an opaque user pointer. */
XMLPUBFUN htmlParserCtxt *
			htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
					     void *userData);

/* Variant taking a memory buffer and its size (an int). */
XMLPUBFUN htmlParserCtxt *
			htmlCreateMemoryParserCtxt(const char *buffer,
						   int size);

/*
 * Whole-document parsing entry points.
 * The htmlSAX* variants are marked XML_DEPRECATED; the others are not.
 * NOTE(review): the meaning of the int returned by htmlParseDocument and
 * the treatment of a NULL `encoding` are not visible in this header.
 */
XMLPUBFUN int
			htmlParseDocument(htmlParserCtxt *ctxt);
XML_DEPRECATED
XMLPUBFUN xmlDoc *
			htmlSAXParseDoc	(const xmlChar *cur,
					 const char *encoding,
					 htmlSAXHandler *sax,
					 void *userData);
/* Takes a string (`cur`) and an encoding name; returns an xmlDoc pointer. */
XMLPUBFUN xmlDoc *
			htmlParseDoc	(const xmlChar *cur,
					 const char *encoding);
/* Returns a parser context for a file name and an encoding name. */
XMLPUBFUN htmlParserCtxt *
			htmlCreateFileParserCtxt(const char *filename,
	                                         const char *encoding);
XML_DEPRECATED
XMLPUBFUN xmlDoc *
			htmlSAXParseFile(const char *filename,
					 const char *encoding,
					 htmlSAXHandler *sax,
					 void *userData);
/* Takes a file name and an encoding name; returns an xmlDoc pointer. */
XMLPUBFUN xmlDoc *
			htmlParseFile	(const char *filename,
					 const char *encoding);
/*
 * Deprecated helpers; every declaration in this group is marked
 * XML_DEPRECATED.
 *
 * htmlUTF8ToHtml and htmlEncodeEntities share one calling shape: an
 * output byte buffer with an in/out length pointer and a read-only
 * input byte buffer with an in/out length pointer.
 * NOTE(review): presumably *outlen / *inlen are updated to the number of
 * bytes written / consumed - confirm against the implementation.
 */
XML_DEPRECATED
XMLPUBFUN int
			htmlUTF8ToHtml	(unsigned char *out,
					 int *outlen,
					 const unsigned char *in,
					 int *inlen);
XML_DEPRECATED
XMLPUBFUN int
			htmlEncodeEntities(unsigned char *out,
					 int *outlen,
					 const unsigned char *in,
					 int *inlen, int quoteChar);
XML_DEPRECATED
XMLPUBFUN int
			htmlIsScriptAttribute(const xmlChar *name);
XML_DEPRECATED
XMLPUBFUN int
			htmlHandleOmittedElem(int val);

#ifdef LIBXML_PUSH_ENABLED
/*
 * Interfaces for the Push mode.
 *
 * Only declared when LIBXML_PUSH_ENABLED is defined.
 */
/* Returns a parser context. Takes a SAX handler, an opaque user pointer,
   a data chunk with its size, a file name and an xmlCharEncoding value. */
XMLPUBFUN htmlParserCtxt *
			htmlCreatePushParserCtxt(htmlSAXHandler *sax,
						 void *user_data,
						 const char *chunk,
						 int size,
						 const char *filename,
						 xmlCharEncoding enc);
/* Takes a context and a chunk of `size` bytes.
   NOTE(review): `terminate` presumably marks the final chunk - confirm. */
XMLPUBFUN int
			htmlParseChunk		(htmlParserCtxt *ctxt,
						 const char *chunk,
						 int size,
						 int terminate);
#endif /* LIBXML_PUSH_ENABLED */

/* Takes a parser context and returns nothing.
   NOTE(review): presumably releases a context obtained from one of the
   functions returning htmlParserCtxt * - confirm ownership rules. */
XMLPUBFUN void
			htmlFreeParserCtxt	(htmlParserCtxt *ctxt);

/*
 * New set of simpler/more flexible APIs
 */

/**
 * Flags controlling the HTML parser. They are accepted by
 * #htmlReadDoc, #htmlCtxtSetOptions and other functions.
 */
typedef enum {
    /**
     * Has no effect as of 2.14.0.
     */
    HTML_PARSE_RECOVER = 1 << 0,
    /**
     * Don't fall back to a default doctype when none was found.
     */
    HTML_PARSE_NODEFDTD = 1 << 2,
    /**
     * Suppress error and warning reports to the error handlers.
     * Errors remain accessible through xmlCtxtGetLastError().
     */
    HTML_PARSE_NOERROR = 1 << 5,
    /**
     * Suppress warning reports.
     */
    HTML_PARSE_NOWARNING = 1 << 6,
    /**
     * Has no effect.
     */
    HTML_PARSE_PEDANTIC = 1 << 7,
    /**
     * Drop some whitespace-only text nodes from the result document.
     * A conservative heuristic decides which nodes are dropped. The
     * reindenting feature of the serialization code relies on this
     * option having been set when parsing. Use of this option is
     * DISCOURAGED.
     */
    HTML_PARSE_NOBLANKS = 1 << 8,
    /**
     * Has no effect.
     */
    HTML_PARSE_NONET = 1 << 11,
    /**
     * Don't add implied html, head or body elements.
     */
    HTML_PARSE_NOIMPLIED = 1 << 13,
    /**
     * Save memory by storing small strings directly in the node
     * struct.
     */
    HTML_PARSE_COMPACT = 1 << 16,
    /**
     * Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption.
     *
     * @since 2.14.0
     *
     * Use XML_PARSE_HUGE with older versions.
     */
    HTML_PARSE_HUGE = 1 << 19,
    /**
     * Ignore the encoding in the HTML declaration. This option is
     * mostly unneeded these days. Its only effect is to enforce
     * ISO-8859-1 decoding of ASCII-like data.
     */
    HTML_PARSE_IGNORE_ENC = 1 << 21,
    /**
     * Enable reporting of line numbers larger than 65535.
     *
     * @since 2.14.0
     *
     * Use XML_PARSE_BIG_LINES with older versions.
     */
    HTML_PARSE_BIG_LINES = 1 << 22,
    /**
     * Have the tokenizer emit a SAX callback for each token. This
     * results in unbalanced invocations of startElement and endElement.
     *
     * For now, this is only usable to tokenize HTML5 with custom SAX
     * callbacks. A tree builder isn't implemented yet.
     *
     * @since 2.14.0
     */
    HTML_PARSE_HTML5 = 1 << 26
} htmlParserOption;

/*
 * Parser context maintenance.
 *
 * `options` is an int; the htmlParserOption documentation names
 * htmlCtxtSetOptions as a consumer of those flags.
 * NOTE(review): how htmlCtxtUseOptions differs from htmlCtxtSetOptions,
 * and what the int results mean, is not visible in this header.
 */
XMLPUBFUN void
		htmlCtxtReset		(htmlParserCtxt *ctxt);
XMLPUBFUN int
		htmlCtxtSetOptions	(htmlParserCtxt *ctxt,
					 int options);
XMLPUBFUN int
		htmlCtxtUseOptions	(htmlParserCtxt *ctxt,
					 int options);
/*
 * htmlRead* family: each takes an input source, a URL, an encoding name
 * and an int `options`, and returns an xmlDoc pointer. None of them
 * takes a parser context.
 *
 * The htmlParserOption documentation names htmlReadDoc as a consumer of
 * those flags. NOTE(review): presumably `options` has the same meaning
 * for the whole family, and NULL is returned on failure - confirm.
 */
/* Input source: a string (`cur`). */
XMLPUBFUN xmlDoc *
		htmlReadDoc		(const xmlChar *cur,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: named by `URL`; there is no separate source argument. */
XMLPUBFUN xmlDoc *
		htmlReadFile		(const char *URL,
					 const char *encoding,
					 int options);
/* Input source: a memory buffer of `size` bytes. */
XMLPUBFUN xmlDoc *
		htmlReadMemory		(const char *buffer,
					 int size,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: a file descriptor. */
XMLPUBFUN xmlDoc *
		htmlReadFd		(int fd,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: read/close callbacks plus an opaque `ioctx` pointer. */
XMLPUBFUN xmlDoc *
		htmlReadIO		(xmlInputReadCallback ioread,
					 xmlInputCloseCallback ioclose,
					 void *ioctx,
					 const char *URL,
					 const char *encoding,
					 int options);
/*
 * htmlCtxt* family: parsing entry points that take the parser context as
 * their first argument and return an xmlDoc pointer.
 *
 * The context parameter is spelled htmlParserCtxt throughout, matching
 * htmlCtxtReset, htmlCtxtSetOptions and htmlCtxtUseOptions. Because
 * htmlParserCtxt is a typedef of xmlParserCtxt, the parameter type is
 * unchanged for callers.
 *
 * NOTE(review): presumably `options` is a combination of
 * htmlParserOption flags and NULL is returned on failure - confirm
 * against the implementation.
 */
/* Input source: an xmlParserInput object. */
XMLPUBFUN xmlDoc *
		htmlCtxtParseDocument	(htmlParserCtxt *ctxt,
					 xmlParserInput *input);
/* Input source: a string (`cur`). */
XMLPUBFUN xmlDoc *
		htmlCtxtReadDoc		(htmlParserCtxt *ctxt,
					 const xmlChar *cur,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: a file name. */
XMLPUBFUN xmlDoc *
		htmlCtxtReadFile	(htmlParserCtxt *ctxt,
					 const char *filename,
					 const char *encoding,
					 int options);
/* Input source: a memory buffer of `size` bytes. */
XMLPUBFUN xmlDoc *
		htmlCtxtReadMemory	(htmlParserCtxt *ctxt,
					 const char *buffer,
					 int size,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: a file descriptor. */
XMLPUBFUN xmlDoc *
		htmlCtxtReadFd		(htmlParserCtxt *ctxt,
					 int fd,
					 const char *URL,
					 const char *encoding,
					 int options);
/* Input source: read/close callbacks plus an opaque `ioctx` pointer. */
XMLPUBFUN xmlDoc *
		htmlCtxtReadIO		(htmlParserCtxt *ctxt,
					 xmlInputReadCallback ioread,
					 xmlInputCloseCallback ioclose,
					 void *ioctx,
					 const char *URL,
					 const char *encoding,
					 int options);

/**
 * deprecated content model
 *
 * HTML_REQUIRED includes the HTML_VALID bit, so testing a status with
 * `& HTML_VALID` is true for both.
 */
typedef enum {
    HTML_NA         = 0,                /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0x8 | HTML_VALID  /* 0xc */
} htmlStatus;

/* Using htmlElemDesc rather than name here, to emphasise the fact
   that otherwise there's a lookup overhead
*/
/*
 * Deprecated status queries; each declaration is marked XML_DEPRECATED.
 * Parameters are named, like in every other prototype of this header.
 *
 * For htmlElementAllowedHere the order (parent description, element
 * name) follows the htmlElementAllowedHereDesc macro, which passes
 * `(parent), (elt)->name`.
 * NOTE(review): the remaining parameter names, in particular `legacy`,
 * are the reviewer's choice - confirm against the implementation.
 */
XML_DEPRECATED
XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc *elt,
				     const xmlChar *attr,
				     int legacy);
XML_DEPRECATED
XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc *parent,
				     const xmlChar *elt);
XML_DEPRECATED
XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc *parent,
					   const htmlElemDesc *elt);
XML_DEPRECATED
XMLPUBFUN htmlStatus htmlNodeStatus(xmlNode *node, int legacy);

#ifdef __cplusplus
}
#endif

#endif /* LIBXML_HTML_ENABLED */
#endif /* __HTML_PARSER_H__ */