File: xml.c

package info (click to toggle)
golf 601.4.41-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,824 kB
  • sloc: ansic: 20,020; sh: 1,171; makefile: 292
file content (349 lines) | stat: -rw-r--r-- 14,643 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
// SPDX-License-Identifier: Apache-2.0
// Copyright 2018-2025 Gliim LLC. 
// Licensed under Apache License v2. See LICENSE file.
// On the web http://golf-lang.com/ - this file is part of Golf framework.
//
// gcc -o xml3 xml3.c $(pkg-config --cflags libxml-2.0) -lxml2 -g -std=gnu99


//
//
// This is XML module. At this point it's inoperative, and it's just a first cut for the future XML feature.
//
//
// https://github.com/tiran/defusedxml SECURITY OF XML, NO DTD, NO ENTITY EXPANSION, NO EXTERNAL RESOLUTION, LIMIT PARSE DEPTH, LIMIT INPUT SIZE, LIMIT PARSE TIME, SAX, NO XPATH NO XSL 
//
// Note that XML, unlike JSON, can be encoded in any way (while JSON is *always* UTF8), so any decodings must be done once data is extracted! (meaning text-utf)
#include "golf.h"



#ifdef DEBUG
#define GG_XML_NODES 32
#else
#define GG_XML_NODES 1024
#endif

#define GG_XML_MAX_DEPTH 32
#define GG_XML_MAX_NAME_LEN 500

// Internal structure that holds data needed to parse a single XML document
// The parser walks the documents in order of appearance of the closing tag and provides name/value pairs
// Meaning it's a fast SAX parsing, there's no document building (no DOM)
typedef struct s_gg_ixml {
    xmlParserCtxtPtr xml_parser; // parsing context from the library
    char ename[GG_XML_MAX_DEPTH][GG_XML_MAX_NAME_LEN]; // normalized names
    char *eval[GG_XML_MAX_DEPTH]; // values
    gg_num eval_len[GG_XML_MAX_DEPTH]; // values lengths in bytes
    gg_num eval_alloc[GG_XML_MAX_DEPTH]; // how much memory actually allocated for values
    gg_num ename_len[GG_XML_MAX_DEPTH];  // normalized name lengths
    int dep;
} gg_ixml;


static gg_xml *xloc = NULL; // final result, super structure for the whole xml document
static gg_num node_tot = 0; //node total alloc'd
static gg_num node_c = 0; //node counter (current)
static gg_xmln *nodes = NULL; // nodes of normalized xml (name, value)
static gg_ixml xp; // context of a single parse
static gg_num char_ec = -1; // char within line where error happened
static gg_num line_ec = -1; // line where error happened (1..)
static char *errm; // error message
static bool is_error; // if true, there was an error
static void gg_add_xml (); // add more memory for XML nodes

// add name/value pair to the list of nodes (names are normalized)
#define GG_ADD_XML(x_name, x_str) { gg_add_xml(); nodes[node_c].name = x_name; nodes[node_c].str = x_str; node_c++; }; 


//
// Set local static variable xloc to Golf's variable x, which is created here
// This local variable is used in parsing, since it's entirely encapsulated here
//
void gg_set_xml (gg_xml **x)
{
    GG_TRACE("");
    // get json object
    *x = (gg_xml*)gg_malloc (sizeof(gg_xml));

    xloc = *x; // set local processing object

}

// prototypes
void gg_xml_beg( void *ctx, const xmlChar *localname, const xmlChar *prefix, const xmlChar *URI, int nb_namespaces, const xmlChar **namespaces, int nb_attributes, int nb_defaulted, const xmlChar **attributes);
void gg_xml_seterr(void *userData, const struct _xmlError *error);
void gg_xml_end( void* ctx, const xmlChar* localname, const xmlChar* prefix, const xmlChar* URI);
void gg_xml_data(void* ctx, const xmlChar * ch, int len);

// 
// Called when there's a a beginning of XML node or attribute. See libxml2 doc for input parameters. Some are ignored b/c
// we don't do validation
//
void gg_xml_beg( void *ctx, const xmlChar *localname, const xmlChar *prefix, const xmlChar *URI, int nb_namespaces, const xmlChar **namespaces, int nb_attributes, int nb_defaulted, const xmlChar **attributes)
{
    GG_UNUSED(ctx);
    GG_UNUSED(prefix);
    GG_UNUSED(URI);
    GG_UNUSED(nb_namespaces);
    GG_UNUSED(namespaces);
    GG_UNUSED(nb_defaulted);

    // Copy previous name to the current one (one deeper in the hierarchy), so that normalized name path is ready 
    memcpy (xp.ename[xp.dep+1], xp.ename[xp.dep], (xp.ename_len[xp.dep+1]=xp.ename_len[xp.dep])+1);
    xp.dep++; // a new node is one level deeper in XML hierarchy of nodes
              //
    // bufsize is the the space left for this node (and all deeper ones that may follow)
    gg_num bufsize = sizeof(xp.ename[xp.dep])-xp.ename_len[xp.dep];
    gg_num localen = strlen ((char*)localname); // localname is the name of the xml node
    // check 1+1 for / and null byte, to see if the normalized name is too long
    if (localen + 1 + 1 > bufsize) gg_report_error ("XML element is too long [%s...]", xp.ename[xp.dep]);
    // add this node to ../../.. normalized name and finish with /
    memcpy (xp.ename[xp.dep]+xp.ename_len[xp.dep], localname,  localen);
    memcpy (xp.ename[xp.dep]+xp.ename_len[xp.dep]+localen, "/", 2);
    xp.ename_len[xp.dep] += localen+1; // +1 is for /

    unsigned int index = 0;
    // now process any attributes associated with this node
    int i_att;
    for (i_att = 0; i_att < nb_attributes; ++i_att, index += 5)
    {
        const xmlChar *localname = attributes[index];
        // beginning and ending of attribute value
        const xmlChar *begv = attributes[index + 3];
        const xmlChar *endv = attributes[index + 4];
        // attribute's name is one deeper in hierarchy (see comment above)
        memcpy (xp.ename[xp.dep+1], xp.ename[xp.dep], (xp.ename_len[xp.dep+1]=xp.ename_len[xp.dep])+1);
        xp.dep++;
        gg_num bufsize = sizeof(xp.ename[xp.dep])-xp.ename_len[xp.dep]; // bytes left for a normalized name
        gg_num att_len = endv-begv; // attribute value length
        gg_num localen = strlen ((char*)localname); // name of attribute and it's length
        // check 1+1 for / and null byte, if too long a normalized name
        if (localen + 1 + 1 > bufsize) gg_report_error ("XML element name is too long [%s...]", xp.ename[xp.dep]);
        // create full normalized name for this attribute and finish it with @
        memcpy (xp.ename[xp.dep]+xp.ename_len[xp.dep], localname, localen);
        memcpy (xp.ename[xp.dep]+xp.ename_len[xp.dep]+localen, "/@", 3);
        xp.ename_len[xp.dep]+=localen+2; // +2 is for /@

        // attributes and node values are allocated, so copy and finish attribute with a null-char
        // the size of attribute is exactly correct (as it includes +1 byte for null) for the purpose of memory safety
        char *att = gg_malloc(att_len+1);
        memcpy (att, (char*)begv, att_len);
        att[att_len] = 0;
        // add node to the list
        GG_ADD_XML(gg_strdupl(xp.ename[xp.dep],0,xp.ename_len[xp.dep]), att);
        xp.dep--; // go one level shallower in hierarchy, since we just finished processing an attribute
    }
}

//
// Process the end of a node
//
void gg_xml_end( void* ctx, const xmlChar* localname, const xmlChar* prefix, const xmlChar* URI
)
{
    GG_UNUSED(ctx);
    GG_UNUSED(prefix);
    GG_UNUSED(URI);
    GG_UNUSED(localname);

    if (xp.eval[xp.dep] == NULL) { xp.dep--;return;} // node may have been without a value
                                                     // so skip it (typically a node that only contains other nodes)
    // At this point, value has already been filled in gg_xml_data() callback,
    // so nothing to do other than to trim data (which updates its length with Golmem for memory safety)
    // It is IMPORTANT not to miss setting length, if gg_trim is not used, then MUST USE gg_mem_set_len()!
    //
    gg_trim (xp.eval[xp.dep], &(xp.eval_len[xp.dep]), true); // trim the value of a node (on left and right), if not used, MUST use gg_mem_set_len !!
    // realloc mem so we don't use +256 etc for super small blocks of data! This should
    // fast, since we're effectively shrinking the last memory we realloced
    // 
    //
    // and for that reason, to avoid fragmentation, we must do this BEFORE gg_maloc below!!
    //
    //
    xp.eval[xp.dep] = gg_realloc(gg_mem_get_id(xp.eval[xp.dep]), xp.eval_len[xp.dep]+1); 
                                                             //
    gg_num namel = xp.ename_len[xp.dep]; // length of normalized name
    // allocate memory for name to return to Golf and place null at the end
    char *name = gg_malloc (namel+1);
    memcpy (name, xp.ename[xp.dep], namel);
    name[namel] = 0;
    // add name/value to list of xml nodes
    GG_ADD_XML(name, xp.eval[xp.dep]);
    // clear out this node (at depth xp.dep) so the next node/attribute that comes along at that depth in hierarchy doesn't
    // concatenate to some old random values
    xp.eval[xp.dep] = NULL;
    xp.eval_len[xp.dep] = 0;
    xp.dep--; // processed node, go down in depth
}


//
// Callback from libxml2 that processes data. It's often delivered in chunks, so even the very same continuous node value
// could be delivered as more than one chunk.
//
void gg_xml_data(void* ctx, const xmlChar * ch, int len)
{
    GG_UNUSED(ctx);
    int alen = (len < 256 ? 256:len+256); // allocate more memory so we don't realloc when (and if) another chunk comes in
    // Because this XML parser generally returns data in chunks (and not a whole), we have to realloc if and when more data comes along
    // but we do allocate more to minimize realloc's
    // Being NULL is the sign we need to malloc, otherwise realloc
    if (xp.eval[xp.dep] == NULL) {xp.eval_alloc[xp.dep] = alen; xp.eval[xp.dep] = gg_malloc (xp.eval_alloc[xp.dep]);}
    else 
    {
        // if not enough space, add a chunk and then reall
        if (xp.eval_len[xp.dep]+len >= xp.eval_alloc[xp.dep]) xp.eval_alloc[xp.dep] += alen; 
        xp.eval[xp.dep] = gg_realloc(gg_mem_get_id(xp.eval[xp.dep]), xp.eval_alloc[xp.dep]); 
    }
    // Copy data to further complete the node value
    memcpy (xp.eval[xp.dep]+xp.eval_len[xp.dep], ch, len);
    xp.eval_len[xp.dep] += len; // update its length
    xp.eval[xp.dep][xp.eval_len[xp.dep]]= 0; // set length
    // length will be set in gg_xml_end()
}

//
// Called when there's an error. Set error message (errm), line_ec/char_ec and is_error static flags
// to be given to the caller in v1.c
//
void gg_xml_seterr(void *userData, const struct _xmlError *error)
{
    GG_UNUSED(userData);
    errm = gg_strdup(error->message);
    line_ec = error->line;
    char_ec = error->int2;
    is_error = true;
    //xmlSetStructuredErrorFunc(NULL, NULL);
}

//
// Golf code gets error message this way, since errm is static
//
char *gg_xml_err()
{
    return errm;
}

//
// Add another chunk of XML nodes
//
void gg_add_xml ()
{
    GG_TRACE("");
    static gg_num incby;
    
    if (node_tot == 0) incby = GG_XML_NODES/2; // must start with half, so that initial block below is GG_XML_NODES, since 
                                    // malloc/realloc choice depends on it
    if (node_c >= node_tot)
    {
        if (incby < 4096) incby *= 2; // initial block is GG_XML_NODES
        node_tot += incby;
        if (node_tot == GG_XML_NODES) nodes = gg_malloc (node_tot*sizeof (gg_xmln));
        else 
        {
            gg_num id = gg_mem_get_id(nodes);
            nodes = gg_realloc (id, node_tot*sizeof (gg_xmln));
        }
        // initialize nodes to prevent program crashing if developer fails to check the status
        gg_num i;
        for (i = node_c; i < node_tot; i++) {nodes[i].name = GG_EMPTY_STRING; nodes[i].str = GG_EMPTY_STRING; }
    }
}

//
// Delete XML x, first all its key/value pairs and then the whole nodes structure
//
void gg_del_xml (gg_xml **x)
{
    GG_TRACE("");
    gg_num i;
    for (i = 0; i < (*x)->node_c; i++)
    {
        gg_free ((*x)->nodes[i].name);
        gg_free ((*x)->nodes[i].str);
    }
    if ((*x)->node_c != 0) gg_free ((*x)->nodes);
    (*x)->node_c = 0;
    gg_free (*x); // delete the entire xml structure
    *x = NULL;
}


//
// Parse XML. val is data to parse, len is its length, errc/errl and char and line where error may have happened
// Returns 0 if error, -1 if okay 
//
gg_num gg_xml_new (char *val, gg_num len, gg_num *errc, gg_num *errl)
{
    GG_TRACE("");

    // create initial block of normalized nodes
    node_c = 0;
    node_tot = 0; // both node_c and node_tot must be 0 for allocation to work properly, see gg_add_json
    gg_add_xml(); // add initial chunk of nodes
    is_error = false; // no error initially
    errm = GG_EMPTY_STRING; // no error text initially

    // Initialize hierarchy of XML nodes up to maximum depth. We reuse this set of depth data, so once a node
    // is processed say in depth 5 (xp.ename[4], xp.eval[4] etc.), some other node who comes later at the same
    // depth will use it again. 
    int i;
    for (i = 0; i < GG_XML_MAX_DEPTH; i ++)
    {
        xp.ename[i][0] = 0;
        xp.eval[i] = NULL;
        xp.eval_len[i] = 0;
        xp.eval_alloc[i] = 0;
        xp.ename_len[i] = 0;
    }
    xp.dep = 0; // initially depth 0

    // Init SAX XML handler
    xmlSAXHandler handler;
    memset(&handler, 0, sizeof(handler));

    // Set up handler functions
    handler.startElementNs = gg_xml_beg; // beginning of node
    handler.endElementNs = gg_xml_end; // end of done
    handler.characters = gg_xml_data; // data of node
    handler.initialized = XML_SAX2_MAGIC; 

    xmlSetStructuredErrorFunc(NULL, (xmlStructuredErrorFunc)gg_xml_seterr); // set error callback

    // use either length of data, or passed length, but passed length cannot be greater than
    // the actually data length (memory safety)
    gg_num vlen = gg_mem_get_len(gg_mem_get_id( val) );
    if (len == -1) len = vlen; // len is -1 only in root invocation
    else if (len > vlen) len = vlen; // do not cause segmentation violation

    // Do XML parsing
    // We do NOT ever use any network validations. All parsing is strictly local.
    // We allow large XML.
    xmlDocPtr doc;
    xp.xml_parser = xmlCreatePushParserCtxt( &handler, NULL, val, len, NULL);
    xmlCtxtUseOptions		(xp.xml_parser, XML_PARSE_NONET|XML_PARSE_HUGE|XML_PARSE_IGNORE_ENC);

    // finish parsing
    xmlParseChunk(xp.xml_parser, "", 0, 1);

    // Since we don't use DOM, release it
    doc = xp.xml_parser->myDoc; //destroy the tree since not used
    xmlFreeParserCtxt(xp.xml_parser); // release resources for parsing
    if (doc != NULL) xmlFreeDoc(doc);

    // this is once per program, it should be at the end. Since it may (or may not?) cause sigsegv, we're omitting it as exiting will release all resources
    // xmlCleanupParser();

    // Get error info, if requested
    if (errl) *errl = line_ec;
    if (errc) *errc = char_ec;

    // Set actual nodes to be with the XML variable defined in Golf program, so that developer can use it there
    xloc->nodes = nodes;
    xloc->node_c = node_c;
    xloc->node_r = 0;  // iteration set to first node

    return is_error ? 0:-1; // success or failure
}