File: xml_ecm.h

package info (click to toggle)
virtuoso-opensource 7.2.5.1%2Bdfsg1-0.3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 285,240 kB
  • sloc: ansic: 641,220; sql: 490,413; xml: 269,570; java: 83,893; javascript: 79,900; cpp: 36,927; sh: 31,653; cs: 25,702; php: 12,690; yacc: 10,227; lex: 7,601; makefile: 7,129; jsp: 4,523; awk: 1,697; perl: 1,013; ruby: 1,003; python: 326
file content (394 lines) | stat: -rw-r--r-- 17,700 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/*
 *  xml_ecm.h
 *
 *  $Id$
 *
 *  This file is part of the OpenLink Software Virtuoso Open-Source (VOS)
 *  project.
 *
 *  Copyright (C) 1998-2018 OpenLink Software
 *
 *  This project is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; only version 2 of the License, dated June 1991.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef _XML_ECM_H
#define _XML_ECM_H

#include "Dk.h"
#include "libutil.h"
#include "langfunc.h"

/* XML element-content model (ECM) processor */

extern unsigned char ecm_utf8props[0x100];
#define ECM_ISZERO	0x01
#define ECM_ISSPACE	0x02
#define ECM_ISNAME	0x04
#define ECM_ISNAMEBEGIN 0x08

/* Part 1. Data types for compiled DTD description */

/*! All element names should be enumerated: negative are special values,
0...N-1 are N known element names. */
typedef ptrlong ecm_el_idx_t;

#define ECM_EL_EOS		(-1)		/* end of sequence (a closing tag) */
#define ECM_EL_UNKNOWN		(-2)		/* unknown element */
#define ECM_EL_PCDATA		(-3)		/* non-element (#PCDATA) */
#define ECM_EL_NULL_T		(-4)		/* "null-transportation" (unconditional jump in FSA graph) */
#define ECM_EL_OFFSET		4		/* offset in arrays for known element names */
#define ECM_EL_UNDEF		(-21)
#define ECM_EL_ERROR		(-22)


/*! A basic unit of ECM description is a term. Element name and #PCDATA
are terms, repetition of term is term, select among two terms is term.
*/
struct ecm_term_s
{
  ecm_el_idx_t	et_value;			/*!< Value of type ecm_el_idx, or ECM_CHAIN or ECM_CHOICE or ECM_LOOP */
  struct ecm_term_s *	et_subterm1;		/*!< \c NULL for a leaf, or left side of ECM_TERM_CHAIN, or body of ECM_LOOP, or first case of ECM_CHOICE */
  struct ecm_term_s *	et_subterm2;		/*!< \c NULL for a leaf or ECM_LOOP, or right side of ECM_TERM_CHAIN, or second case of ECM_CHOICE */
  ptrlong		et_may_be_empty;	/*!< Flags if empty sequence may match this term, thus producing glue between start and finish states */
};

typedef struct ecm_term_s ecm_term_t;

#define ECM_TERM_CHAIN		(-31)	/*!< Sequence of two given subterms should present */
#define ECM_TERM_CHOICE		(-32)	/*!< One of two given subterms should present */
#define ECM_TERM_LOOP_ZERO	(-33)	/*!< Given subterm should be repeated zero or more times */
#define ECM_TERM_LOOP_ONE	(-34)	/*!< Given subterm should be repeated one or more times */

#define ECM_EMPTY	((ecm_term_t *)(-1))
#define ECM_ANY		((ecm_term_t *)(-2))

void ecm_term_free (ecm_term_t* term);

/*! All states should be enumerated, too. -1 is an error, 0 is initial,
-2 is a terminal, others are intermediates. */
typedef ptrlong ecm_st_idx_t;

#define ECM_ST_START	0
#define ECM_ST_UNDEF	(-41)
#define ECM_ST_ERROR	(-42)
#define ECM_ST_ANY	(-51)
#define ECM_ST_EMPTY	(-52)

/*! element state */
struct ecm_st_s
{
  ecm_st_idx_t	*es_nexts;		/*!< \c es_targets[el_idx] is a index of resulting state */
  ecm_el_idx_t	es_conflict;		/*!< Sample of element which causes jump/jump conflict */
  ptrlong	es_is_frozen;		/*!< Flags if the state should not be reused */
};

typedef struct ecm_st_s ecm_st_t;

/* Index of an attribute in the array of element's attributes */
typedef ptrlong ecm_attr_idx_t;

/*! All DTD types of attributes are enumerated */
/* Do not change order of attributes' types in this enumeration! see definition of
   index_hashed_types
*/
typedef ptrlong ecm_attr_type_t;
#define ECM_AT_CDATA		0

#define ECM_AT_ENUM_NAMES	1
#define ECM_AT_ENUM_NOTATIONS	2
#define ECM_AT_ID		3
#define ECM_AT_IDREF		4
#define ECM_AT_IDREFS		5
#define ECM_AT_ENTITY		6
#define ECM_AT_ENTITIES		7
#define ECM_AT_NMTOKEN		8
#define ECM_AT_NMTOKENS		9
#define ECM_COUNTOFATTS		10

#define ECM_MESSAGE_LEN	 512

/*! Description of an attribute */
struct ecm_attr_s
{
  char *		da_name;	/*!< Name of attribute */
  ecm_attr_type_t	da_type;	/*!< Data type for attribute's values */
  ptrlong		da_is_required;	/*!< Flags if #REQUIRED specified */
  ptrlong		da_is_implied;	/*!< Flags if #IMPLIED specified */
  ptrlong		da_is_fixed;    /*!< Flags if #FIXED specified */
  char **		da_values;	/*!< Array of \c da_values_no pointers to normalized values */
  ptrlong		da_values_no;	/*!< Number of enumerated values, default value will come first */
  union
    {
      caddr_t		boxed_value;	/*!< Pointer to memory with boxed default value, valid if (NULL == da_values) */
      ptrlong		index;		/*!< Index in da_values array, valid if (NULL == da_values) */
    } da_default;
};

typedef struct ecm_attr_s ecm_attr_t;

/*! Description of an element */
struct ecm_el_s
{
  char *		ee_name;	/*!< Name of element */
  char *		ee_grammar;	/*!< String which represents grammar of order of element's children */
  char *		ee_errmsg;	/*!< Boxed string with compilation error message or NULL */
  ecm_attr_t *		ee_attrs;	/*!< Array of definitions of element's attributes */
  ecm_attr_idx_t	ee_attrs_no;	/*!< Number of defined attributes */
  ecm_st_t *		ee_states;	/*!< Array of FSA states */
  ecm_st_idx_t		ee_state_no;	/*!< Number of FSA states */
  ecm_el_idx_t		ee_conflict;	/*!< Sample of jump/jump conflict to prove that grammar is non-LARL(1) */
  ptrlong		ee_has_id_attr;	/*!< Indicates that the element has an ID attribute */
  ecm_attr_idx_t	ee_id_attr_idx;	/*!< Index of element's ID attribute, if \c ee_has_attr is set */
  ptrlong		ee_is_empty;	/*!< Indicates that element must be empty */
  ptrlong		ee_is_any;	/*!< Indicates that element could be any */
  ptrlong		ee_has_notation; /*!< Indicates that element contains notation attribute */
  ptrlong		ee_req_no;	/*!< Number of required attributes */
  ptrlong		ee_is_defbyatt; /*!< Element is defined by attribute declaration */
};

typedef struct ecm_el_s ecm_el_t;

/*! Description of XML DTD, it should be stored as box of type DV_CUSTOM */
struct dtd_s
{
  char *		ed_puburi;	/*!< PUBLIC URI of external DTD */
  char *		ed_sysuri;	/*!< SYSTEM URI of external DTD */
  int			ed_refctr;	/*!< Reference counter */
  int			ed_is_global;	/*!< Flags if the object is global and refcounting should be mutexed */
  int			ed_is_filled;	/*!< Flags if the DTD is filled by reading DTD from an XML file so it contains some actual data */
  ecm_el_t *		ed_els;		/*!< Array of elements' descriptions */
  ecm_el_idx_t		ed_el_no;	/*!< Number of described elements (0 while DTD is not parsed) */
  ecm_attr_idx_t	ed_max_attr_no;	/*!< Maximum possible number of attributes per element */
  struct id_hash_s *	ed_notations;	/*!< Hash table of all notations' definitions */
  struct id_hash_s *	ed_params;	/*!< Hash table of all parameter-entities' definitions */
  struct id_hash_s *	ed_generics;	/*!< Hash table of all generic-entities' definitions */
/*! Length of string (w/o markup header) in XPER, -1 if empty, 0 while not calculated */
  int			ed_xper_text_length;
};

#ifndef DTD_T_DECLARED
#define DTD_T_DECLARED
typedef struct dtd_s dtd_t;
#endif

/* Macro defs and functions for handling of dtd_t in a refcountable box */
#ifdef UNIT_DEBUG
#define dtd_alloc()		(dtd_t *)(calloc (1, sizeof (dtd_t)))
#define dtd_free(dtd)		free (dtd)
#define dtd_addref(dtd,global)	(dtd->ed_refctr++)
#define dtd_release(dtd)	((--(dtd->ed_refctr) > 0) ? 0 : (dtd_free(dtd), 1))
#else
#define dtd_alloc()	(dtd_t *)(dk_alloc_box_zero (sizeof (dtd_t), DV_CUSTOM))
extern void dtd_free (dtd_t *dtd);
extern void dtd_addref (dtd_t *dtd, int make_global);
extern int dtd_release (dtd_t *dtd);
#endif

/* Part 2. Data types for validator */

/*! Maximal depth for checking. If this depth reached, either fatal error
 or error or warning will be reported, and children-order-validation
will be suspended temporary. */
#define ECM_MAX_DEPTH 1024

typedef struct dc_attr_dict_item_s
{
  caddr_t dcad_attr_name;
  VXmlAttrParser dcad_handler;
  caddr_t *dcad_elements;
  ptrlong dcad_elements_count;  
}
dc_attr_dict_item_t;


/* There's a trick with storing of dictionaries. Dictionary can be DV_CUSTOM or of fake type 0.
DV_CUSTOM if made on the fly from app-specific vector or 0 if it's a static structure that is built into the server.
Attribute and element names are not copied/freed when placed to dc_attr_dict_item_t.
*/
typedef struct dc_attr_dict_s
{
  ptrlong dcad_count;
  dc_attr_dict_item_t *dcad_items;
} dc_attr_dict_t;

/* Configuration of validator. Keep static ecm_cfgparam_t ecm_cfgparams in
sync after changes of this structure. */

struct dtd_config_s
{
  ptrlong dc_attr_completion;	/*!< If validator should complete XML document with default values of attributes. */
  ptrlong dc_attr_misformat;	/*!< Check of attributes' values for matching their format with declared types: [VC: ID] [VC: IDREF] [VC: Entity Name] [VC: Name Token] */
  ptrlong dc_attr_missing;	/*!< Check if every element contain all 'required' attributes */
  ptrlong dc_attr_unknown;	/*!< Check if every attribute is declared in DTD */
  ptrlong dc_bad_recursion;	/*!< Check if entity's replacement text references to itself */
  ptrlong dc_build_standalone;	/*!< If texts of external documents should be included instead of references to GEs */
  ptrlong dc_error_context;	/*!< If error mesages should have lines taht list context in soruce text. Should be disabled if precise byte offsets are needed. */
  ptrlong dc_fsa;		/*!< Partial children-order-validation via FSA: [VC: Element Valid] */
  ptrlong dc_fsa_bad_ws;	/*!< Check if spaces are in place where no PCDATA allowed by FSA: [VC: Element Valid] */
  ptrlong dc_fsa_sgml;		/*!< Partial check if a content model is deterministic */
  ptrlong dc_ge_redef;		/*!< Check if no one GE is defined more than once */
  ptrlong dc_ge_unknown;	/*!< Report if a generic entity is defined (an expanded). This check has no effect if BuildStandalone=DISABLE. */
  ptrlong dc_id_cache;		/*!< Build a hashtable whose keys are IDs and values are locations of identified elements */
  ptrlong dc_id_dupe;		/*!< Check that ID attribute is unique: [VC: ID] */
  ptrlong dc_idref_dangling;	/*!< Check that IDREF attribute is not dangling: [VC: IDREF] */
  ptrlong dc_include;		/*!< If validator should include referenced subdocuments. */
  ptrlong dc_max_errors;	/*!< Maximal number of errors to detect before "too many errors" fatal error */
  ptrlong dc_max_warnings;	/*!< Maximal number of warnings to detect & report before before "too many warnings" */
  ptrlong dc_names_unknown;	/*!< Check if every element name used in whole document is declared in DTD */
  ptrlong dc_names_unordered;	/*!< Check if every element name is declared before use in DTD */
  ptrlong dc_names_unresolved;	/*!< Check if every element name used in DTD is declared, maybe not before */
  ptrlong dc_ns_redef;		/*!< Check if no one namespace prefix is used for different URIs in same document */
  ptrlong dc_ns_unknown;	/*!< Check if every namespace prefix is declared before use */
  ptrlong dc_pe_redef;		/*!< Check if no one PE is defined more than once */
  ptrlong dc_signal_on_error;	/*!< Signal if an XCFG_ERROR or XCFG_FATAL is detected */
  ptrlong dc_sgml;		/*!< Check for those things which are important _solely_ for SGML-compatibility */
  ptrlong dc_too_many_warns;	/*!< Status of "too many warnings" message */
  ptrlong dc_trace_loading;	/*!< Report names of used resources for validation of non-standalone documents */
  ptrlong dc_validation;	/*!< Overall mode of validator, used to choose one of preset configurations. */
  ptrlong dc_vc_data;		/*!< Check for violations of validity constraints in data (main) part of XML. */
  ptrlong dc_vc_dtd;		/*!< Check for violations of validity constraints in DTD. */
  ptrlong dc_namespaces;	/*!< Enable / disable namespace processing -- UNUSED */
  /* schema related configuration parameters */
  ptrlong dc_xs_counter;	/*!< Element appearence counter support */
  ptrlong dc_xs_decl;		/*!< Current document is schema declaration */
  ptrlong dc_xs_namespaces;	/*!< Turn off needness of using namespace prefixes -- UNUSED */
  /* postprocessing */
  dc_attr_dict_t *dc_shadow_attr_names;	/*!< List of names of attributes that should have shadow copies (with parsed values) */
};

typedef struct dtd_config_s dtd_config_t;

/*! Description of actual state of validator at one level of nesting */
struct dtd_astate_s
{
  ecm_el_t *	da_el;		/*!< Description of current element */
  ecm_st_idx_t	da_state;	/*!< Index of current state at this level */
  ptrlong	da_sstate;	/*!< XML Schema	state */
  struct xs_tag_s*	da_tag;	/*!< Schema related data */
};

typedef struct dtd_astate_s dtd_astate_t;

struct dtd_validator_s
{
  dtd_t *	dv_dtd;				/*!< DTD of the document */
  char*     dv_root;			/*!< Root element's name for DTD to be applied */
  ptrlong	dv_depth;			/*!< Current number of not-yet-closed elements */
  dtd_astate_t	dv_stack[ECM_MAX_DEPTH];	/*!< Stack of actual states, one item per one not-yet-closed element */
  dtd_config_t	dv_init_config;			/*!< Initial configuration, as set by XML parser caller */
  dtd_config_t	dv_curr_config;			/*!< Current configuration, equal to \c dv_init_config at start and possibly changed after errors' detection */
  char *	dv_attr_uses;			/*!< Buffer for collecting counters of attributes' occurrences in current element */
};

typedef struct dtd_validator_s dtd_validator_t;

typedef struct ecm_refid_logitem_s
{
  ccaddr_t			li_filename; /* boxed string of URI */
  ptrlong			li_line_no;
} ecm_refid_logitem_t;

typedef struct ecm_id_s
{
  basket_t			id_log;
  ptrlong			id_defined;
} ecm_id_t;


/* Dictionary-in-array support */

#define ECM_MEM_NOT_FOUND   ((ptrlong ) -1)
#define ECM_MEM_UNIQ_ERROR  ((ptrlong ) -1)
/* for future... */
#define ECM_MEM_ERROR       ((ptrlong ) -2)

/* number of items in initial array */
#define ECM_MEM_PREALLOCATE_ITEMS 4

#define utf8len(zz) strlen((const char*)(zz))
#define utf8cmp(zz1,zz2) strcmp((const char*)(zz1),(const char*)(zz2))

/* Functions */

struct vxml_parser_config_s;

extern ecm_term_t *ecm_term_ctor (
  ecm_el_idx_t et_value_, ecm_term_t *et_subterm1_, ecm_term_t *et_subterm2_ );

extern ecm_term_t *ecm_term_compose (
  utf8char *ee_grammar_string, dtd_t *dv_dtd, char **errmsg);

/*! Translates term tree into graph of FSA.
\c tree contains expression for path;
\c states is a pointer to array to be edited/appended, must be pointer to NULL!;
\c state_no is a pointer to current number of items in \c states, must be pointer to zero! */
extern void ecm_term_to_fsa (
  ecm_term_t *tree, ecm_st_t **states, ptrlong *state_no,
  dtd_t *dv_dtd );

/* Compiles grammar of element with index \c el_idx, filling \c ee_states and \c ee_state_no */
extern void ecm_grammar_to_fsa (ecm_el_idx_t el_idx, dtd_t *dv_dtd);

/* Returns a box with listing of FSM states */
extern char *ecm_print_fsm (ecm_el_idx_t el_idx, dtd_t *dtd);

/*! Inserts new item into array of named objs. New item is zero-filled and only
its name is set to new_obj_name.
\return index of newly added object or -1 if name is already defined */
#ifdef MALLOC_DEBUG
extern ptrlong dbg_ecm_add_name (DBG_PARAMS const char *new_obj_name, void **objs, ptrlong *obj_no, size_t size_of_obj);
#define ecm_add_name(new_obj_name,objs,obj_no,size_of_obj) dbg_ecm_add_name(__FILE__,__LINE__,new_obj_name,objs,obj_no,size_of_obj)
#else
extern ptrlong ecm_add_name (const char *new_obj_name, void **objs, ptrlong *obj_no, size_t size_of_obj);
#endif

/*! Inserts new item into hash of named objs if it does not exist. New item is zero-filled
If this item is exist locates this one.
\return pointer to the allocated or found object */
extern caddr_t ecm_try_add_name (const char *new_obj_name, struct id_hash_s *hash, size_t size_of_obj);

/*! Deletes nth item from array of named objs.
\return 1 if deleted 0 if \c obj_idx is out of bounds */
extern int ecm_delete_nth (ptrlong obj_idx, void *objs, ptrlong *obj_no, size_t size_of_obj);

/*! Locates an existing item in array of named objs.
\return index of found object or -1 if name is already defined */
extern ptrlong ecm_find_name (const char *new_obj_name, void *objs, ptrlong obj_no, size_t size_of_obj);


/*! Replacement for 2 calls find and add,
  locates item in any case, if there is no item with such name add new one (zero filled)
  \return index of object */
extern ptrlong ecm_map_name (const char *new_obj_name, void **objs, ptrlong *obj_no, size_t size_of_obj);

/*! fuse to sorted arrays,
    \return -1 if there are two items with equal names,
    otherwise returns number of items in new array */
extern int ecm_fuse_arrays(caddr_t* target, ptrlong *target_no,
		      caddr_t branch, ptrlong branch_no, size_t size_of_obj, int copy, mem_pool_t *pool);

/*! functions for rare arrayes */
struct xecm_big_array_s;
struct xecm_big_el_s;
extern struct xecm_big_array_s* xecm_create_big_array(size_t valsz);
extern void xecm_ba_set_defval(struct xecm_big_array_s* ar, ptrlong defval);
extern struct xecm_big_el_s* xecm_ba_get_elem(struct xecm_big_array_s* array, ptrlong idx);
extern void xecm_ba_set_val(struct xecm_big_array_s* array, ptrlong idx, ptrlong val);
extern ptrlong xecm_ba_get_val(struct xecm_big_array_s* array, ptrlong idx);
extern void xecm_ba_delete(struct xecm_big_array_s* array);
extern void xecm_ba_copy(struct xecm_big_array_s* target, struct xecm_big_array_s* source);

#endif /* _XML_ECM_H */