File: eperl_parse.c

package info (click to toggle)
eperl 2.2.16-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 608 kB
  • sloc: ansic: 1,692; perl: 252; makefile: 139; sh: 10
file content (397 lines) | stat: -rw-r--r-- 15,172 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
/* SPDX-License-Identifier: Artistic-1.0-Perl OR GPL-2.0-only
**  Copyright (c) 1996,1997,1998 Ralf S. Engelschall <rse@engelschall.com>
*/


#include "eperl.h"
#include <errno.h>


/* When true, ePerl_Sprinkled2Plain() treats a backslash directly in front
   of a newline in literal text as a line continuation: the backslash and
   the "\n" are left out of the generated print statement. */
bool        ePerl_line_continuation         = false;
/* Message stored by the latest ePerl_SetError() call; NULL initially and
   whenever formatting that message failed. */
char       *ePerl_ErrorString               = NULL;

/*
**  save ePerl error string
**
**  Formats `str` printf-style into a freshly allocated buffer and stores
**  it in ePerl_ErrorString, releasing the previous message.
**  ePerl_ErrorString ends up NULL if the message cannot be formatted.
*/
void ePerl_SetError(const char *str, ...)
{
    char *formatted = NULL;
    va_list ap;

    va_start(ap, str);
    /* the output pointer is not reliable after a failed vasprintf() */
    if (vasprintf(&formatted, str, ap) == -1)
        formatted = NULL;
    va_end(ap);

    /* release the old message only now: the arguments just formatted
       may have pointed into it */
    free(ePerl_ErrorString);
    ePerl_ErrorString = formatted;
}

/*
**  fwrite for internal buffer WITH character escaping
**
**  Writes the nBuf * cNum bytes at cpI to output so that they can sit
**  inside a Perl double-quoted string: '"', '@', '$' and '\' are
**  prefixed with a backslash, tab and newline become "\t" and "\n".
**  The input buffer is only read, never modified.
*/
static void ePerl_Efwrite(const char *cpI, size_t nBuf, size_t cNum, FILE *output)
{
    const char *const end = cpI + nBuf * cNum;
    const char *pending = cpI;      /* start of the literal run not yet written */

    for (const char *cp = cpI; cp < end; ++cp) {
        const char *escaped;

        switch (*cp) {
            case '"':  escaped = "\\\""; break;
            case '@':  escaped = "\\@";  break;
            case '$':  escaped = "\\$";  break;
            case '\\': escaped = "\\\\"; break;
            case '\t': escaped = "\\t";  break;
            case '\n': escaped = "\\n";  break;
            default:   continue;    /* ordinary byte: keep extending the run */
        }

        /* flush the literal run in front of this byte, then its escape */
        fwrite(pending, 1, (size_t)(cp - pending), output);
        fputs(escaped, output);
        pending = cp + 1;
    }

    /* trailing literal run (possibly empty) */
    fwrite(pending, 1, (size_t)(end - pending), output);
}


/*
**  fwrite for internal buffer WITH HTML entity conversion to CP-1252
**
**  The known entity names are grouped by length (HTML2CHAR_MIN up to
**  HTML2CHAR_MAX characters), one table per length.  Every table is
**  sorted in memcmp() order because it is searched with bsearch().
*/
#define HTML2CHAR_MIN 2
#define HTML2CHAR_MAX 6
struct html2char {
    char h[HTML2CHAR_MAX];    /* entity name, zero-padded, NOT NUL-terminated at full length */
    char c;                   /* replacement byte */
};
static const struct html2char html2char_2[] = {
    { "gt",      '>'    },    /* Greater than */
    { "lt",      '<'    },    /* Less than */
    { "um",      '\xA8' },    /* Diæresis / Umlaut */
};
static const struct html2char html2char_3[] = {
    { "ETH",     '\xD0' },    /* Capital Eth, Icelandic */
    { "amp",     '&'    },    /* Ampersand */
    { "deg",     '\xB0' },    /* Degree sign */
    { "die",     '\xA8' },    /* Diæresis / Umlaut */
    { "eth",     '\xF0' },    /* Small eth, Icelandic */
    { "not",     '\xAC' },    /* Not sign */
    { "reg",     '\xAE' },    /* Registered trademark */
    { "shy",     '\xAD' },    /* Soft hyphen */
    { "yen",     '\xA5' },    /* Yen sign */
};
static const struct html2char html2char_4[] = {
    { "Auml",    '\xC4' },    /* Capital A, diæresis / umlaut */
    { "Euml",    '\xCB' },    /* Capital E, diæresis / umlaut */
    { "Iuml",    '\xCF' },    /* Capital I, diæresis / umlaut */
    { "Ouml",    '\xD6' },    /* Capital O, diæresis / umlaut */
    { "Uuml",    '\xDC' },    /* Capital U, diæresis / umlaut */
    { "auml",    '\xE4' },    /* Small a, diæresis / umlaut */
    { "cent",    '\xA2' },    /* Cent sign */
    { "copy",    '\xA9' },    /* Copyright */
    { "euml",    '\xEB' },    /* Small e, diæresis / umlaut */
    { "iuml",    '\xEF' },    /* Small i, diæresis / umlaut */
    { "macr",    '\xAF' },    /* Macron accent */
    { "nbsp",    '\x20' },    /* Non-breaking space, emitted as a plain space */
    { "ordf",    '\xAA' },    /* Feminine ordinal */
    { "ordm",    '\xBA' },    /* Masculine ordinal */
    { "ouml",    '\xF6' },    /* Small o, diæresis / umlaut */
    { "para",    '\xB6' },    /* Paragraph sign */
    { "quot",    '"'    },    /* Quotation mark */
    { "sect",    '\xA7' },    /* Section sign */
    { "sup1",    '\xB9' },    /* Superscript one */
    { "sup2",    '\xB2' },    /* Superscript two */
    { "sup3",    '\xB3' },    /* Superscript three */
    { "uuml",    '\xFC' },    /* Small u, diæresis / umlaut */
    { "yuml",    '\xFF' },    /* Small y, diæresis / umlaut */
};
static const struct html2char html2char_5[] = {
    { "AElig",   '\xC6' },    /* Capital AE ligature */
    { "Acirc",   '\xC2' },    /* Capital A, circumflex */
    { "Aring",   '\xC5' },    /* Capital A, ring */
    { "Ecirc",   '\xCA' },    /* Capital E, circumflex */
    { "Icirc",   '\xCE' },    /* Capital I, circumflex */
    { "Ocirc",   '\xD4' },    /* Capital O, circumflex */
    { "THORN",   '\xDE' },    /* Capital Thorn, Icelandic */
    { "Ucirc",   '\xDB' },    /* Capital U, circumflex */
    { "acirc",   '\xE2' },    /* Small a, circumflex */
    { "acute",   '\xB4' },    /* Acute accent */
    { "aelig",   '\xE6' },    /* Small ae ligature */
    { "aring",   '\xE5' },    /* Small a, ring */
    { "cedil",   '\xB8' },    /* Cedilla */
    { "ecirc",   '\xEA' },    /* Small e, circumflex */
    { "hibar",   '\xAF' },    /* Macron accent */
    { "icirc",   '\xEE' },    /* Small i, circumflex */
    { "iexcl",   '\xA1' },    /* Inverted exclamation */
    { "laquo",   '\xAB' },    /* Left angle quote, guillemet left */
    { "micro",   '\xB5' },    /* Micro sign */
    { "ocirc",   '\xF4' },    /* Small o, circumflex */
    { "pound",   '\xA3' },    /* Pound sterling */
    { "raquo",   '\xBB' },    /* Right angle quote, guillemet right */
    { "szlig",   '\xDF' },    /* Small sharp s, German sz */
    { "thorn",   '\xFE' },    /* Small thorn, Icelandic */
    { "times",   '\xD7' },    /* Multiply sign */
    { "ucirc",   '\xFB' },    /* Small u, circumflex */
};
static const struct html2char html2char_6[] = {
    { "Aacute",  '\xC1' },    /* Capital A, acute accent */
    { "Agrave",  '\xC0' },    /* Capital A, grave accent */
    { "Atilde",  '\xC3' },    /* Capital A, tilde */
    { "Ccedil",  '\xC7' },    /* Capital C, cedilla */
    { "Eacute",  '\xC9' },    /* Capital E, acute accent */
    { "Egrave",  '\xC8' },    /* Capital E, grave accent */
    { "Iacute",  '\xCD' },    /* Capital I, acute accent */
    { "Igrave",  '\xCC' },    /* Capital I, grave accent */
    { "Ntilde",  '\xD1' },    /* Capital N, tilde */
    { "Oacute",  '\xD3' },    /* Capital O, acute accent */
    { "Ograve",  '\xD2' },    /* Capital O, grave accent */
    { "Oslash",  '\xD8' },    /* Capital O, slash */
    { "Otilde",  '\xD5' },    /* Capital O, tilde */
    { "Uacute",  '\xDA' },    /* Capital U, acute accent */
    { "Ugrave",  '\xD9' },    /* Capital U, grave accent */
    { "Yacute",  '\xDD' },    /* Capital Y, acute accent */
    { "aacute",  '\xE1' },    /* Small a, acute accent */
    { "agrave",  '\xE0' },    /* Small a, grave accent */
    { "atilde",  '\xE3' },    /* Small a, tilde */
    { "brkbar",  '\xA6' },    /* Broken vertical bar */
    { "brvbar",  '\xA6' },    /* Broken vertical bar */
    { "ccedil",  '\xE7' },    /* Small c, cedilla */
    { "curren",  '\xA4' },    /* General currency sign */
    { "divide",  '\xF7' },    /* Division sign */
    { "eacute",  '\xE9' },    /* Small e, acute accent */
    { "egrave",  '\xE8' },    /* Small e, grave accent */
    { "frac12",  '\xBD' },    /* Fraction one-half */
    { "frac14",  '\xBC' },    /* Fraction one-fourth */
    { "frac34",  '\xBE' },    /* Fraction three-fourths */
    { "iacute",  '\xED' },    /* Small i, acute accent */
    { "igrave",  '\xEC' },    /* Small i, grave accent */
    { "iquest",  '\xBF' },    /* Inverted question mark */
    { "middot",  '\xB7' },    /* Middle dot */
    { "ntilde",  '\xF1' },    /* Small n, tilde */
    { "oacute",  '\xF3' },    /* Small o, acute accent */
    { "ograve",  '\xF2' },    /* Small o, grave accent */
    { "oslash",  '\xF8' },    /* Small o, slash */
    { "otilde",  '\xF5' },    /* Small o, tilde */
    { "plusmn",  '\xB1' },    /* Plus or minus */
    { "uacute",  '\xFA' },    /* Small u, acute accent */
    { "ugrave",  '\xF9' },    /* Small u, grave accent */
    { "yacute",  '\xFD' },    /* Small y, acute accent */
};
/* per-length tables, indexed by (name length - HTML2CHAR_MIN) */
static const struct {
    const struct html2char *h2c;
    size_t                  h2cn;
} html2chars[] = {
    {html2char_2, sizeof(html2char_2) / sizeof(*html2char_2)},
    {html2char_3, sizeof(html2char_3) / sizeof(*html2char_3)},
    {html2char_4, sizeof(html2char_4) / sizeof(*html2char_4)},
    {html2char_5, sizeof(html2char_5) / sizeof(*html2char_5)},
    {html2char_6, sizeof(html2char_6) / sizeof(*html2char_6)},
};
/* bsearch() comparator: orders entries by their zero-padded name bytes */
static int html2char_cmp(const void *lhs, const void *rhs)
{
    return memcmp(((const struct html2char *)lhs)->h, ((const struct html2char *)rhs)->h, HTML2CHAR_MAX);
}

/*
**  Writes the nBuf * cNum bytes at cpBuf to output, replacing every
**  "&name;" whose name is listed in the tables above by its single
**  replacement byte.  Anything else, including an '&' that does not
**  start a known entity, is copied unchanged.
*/
static void ePerl_Cfwrite(const char *cpBuf, size_t nBuf, size_t cNum, FILE *output)
{
    nBuf *= cNum;

    const char *amp;
    while ((amp = memchr(cpBuf, '&', nBuf)) != NULL) {
        /* copy everything in front of the ampersand verbatim */
        fwrite(cpBuf, 1, (size_t)(amp - cpBuf), output);
        nBuf -= (size_t)(amp - cpBuf);
        cpBuf = amp;

        /* The ';' of a known entity follows the '&' after at most
           HTML2CHAR_MAX name bytes.  Looking no further keeps an
           unrelated '&' (e.g. Perl's "&&") from being paired with the
           ';' of some later statement or entity. */
        size_t window = nBuf - 1;
        if (window > HTML2CHAR_MAX + 1)
            window = HTML2CHAR_MAX + 1;
        const char *semi = memchr(amp + 1, ';', window);

        const struct html2char *ent = NULL;
        if (semi) {
            size_t namelen = (size_t)(semi - amp) - 1;
            if (namelen >= HTML2CHAR_MIN && namelen <= HTML2CHAR_MAX) {
                struct html2char key = {0};
                memcpy(key.h, amp + 1, namelen);
                ent = bsearch(&key, html2chars[namelen - HTML2CHAR_MIN].h2c, html2chars[namelen - HTML2CHAR_MIN].h2cn, sizeof(struct html2char), html2char_cmp);
            }
        }

        size_t consumed;
        if (ent) {
            /* known entity: emit its replacement, skip "&name;" */
            fputc(ent->c, output);
            consumed = (size_t)(semi - amp) + 1;
        } else {
            /* not an entity: emit just the '&' and rescan right behind it */
            fputc('&', output);
            consumed = 1;
        }
        cpBuf += consumed;
        nBuf  -= consumed;
    }
    /* remainder without any ampersand */
    fwrite(cpBuf, 1, nBuf, output);
}


/*
**  memmem() but case-insensitive; basically equivalent to strncasestr()
**
**  Returns a pointer to the first position in the n bytes at buf where
**  the len bytes at str match ignoring case, or NULL if there is none
**  (in particular when str is longer than buf).  Candidates are compared
**  with strncasecmp(), so a NUL byte shared by both sides ends the
**  comparison early.
*/
void *memcasemem(const void *buf, size_t n, const void *str, size_t len)
{
    const char *hay = buf;

    /* a needle longer than the haystack cannot match; rejecting it here
       also keeps `n - len` from wrapping around */
    if (len > n)
        return NULL;

    for (const char *cp = hay, *last = hay + (n - len); cp <= last; ++cp)
        if (strncasecmp(cp, str, len) == 0)
            return (void *)cp;
    return NULL;
}


/*
**  convert buffer from sprinkled format to plain format
**
**  cpBuf is a NUL-terminated template: literal text with Perl blocks
**  enclosed in ePerl_begin_delimiter / ePerl_end_delimiter.  Literal
**  text is turned into escaped `print "...";` statements, the blocks are
**  passed through (with HTML entities converted first when
**  ePerl_convert_entities is set), and newlines are kept in place so
**  that line numbers of the result match those of the template.
**  Delimiters are matched ignoring case unless
**  ePerl_case_sensitive_delimiters is set.
**
**  Returns a malloc()ed NUL-terminated script which the caller has to
**  free(), or NULL on failure.  A missing end delimiter and memstream
**  failures are reported through ePerl_SetError().
*/
char *ePerl_Sprinkled2Plain(const char *cpBuf, const char *ePerl_begin_delimiter, const char *ePerl_end_delimiter, bool ePerl_case_sensitive_delimiters, bool ePerl_convert_entities)
{
    char *cpOutBuf = NULL;
    size_t nOutBuf = 0;
    const char *cps, *cpe;
    const char *cps2, *cpe2;
    size_t ePerl_begin_delimiter_len = strlen(ePerl_begin_delimiter);
    size_t ePerl_end_delimiter_len   = strlen(ePerl_end_delimiter);
    /* delimiter search routine, selected once for the whole run */
    void *(*find_delimiter)(const void *, size_t, const void *, size_t) =
        ePerl_case_sensitive_delimiters ? memmem : memcasemem;

    if (!*cpBuf) {
        /* make sure we return a buffer which the caller can free() */
        return strdup("");
    }

    const char *cpEND = cpBuf+strlen(cpBuf);

    /* the script is assembled in a growing in-memory stream */
    FILE *output = open_memstream(&cpOutBuf, &nOutBuf);
    if (!output) {
        ePerl_SetError("Cannot allocate memstream: %s", strerror(errno));
        return NULL;
    }

    /* now step through the file and convert it to legal Perl code.
       This is a bit complicated because we have to make sure that
       we parse the correct delimiters while the delimiter
       characters could also occur inside the Perl code! */
    cps = cpBuf;
    while (cps < cpEND) {
        cpe = find_delimiter(cps, (size_t)(cpEND - cps), ePerl_begin_delimiter, ePerl_begin_delimiter_len);
        if (cpe == NULL)
            cpe = cpEND;

        /* first, encapsulate the content from current pos
           up to the begin of the ePerl block as print statements */
        cps2 = cps;
        /* first, do all complete lines */
        while ((cpe2 = memchr(cps2, '\n', cpe-cps2)) != NULL) {
            if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                /* continued line: print it without the backslash and
                   without "\n", but keep the newline in the script */
                if (cpe2-1-cps2 > 0) {
                    fputs("print \"", output);
                    ePerl_Efwrite(cps2, cpe2-1-cps2, 1, output);
                    fputs("\";", output);
                }
                fputc('\n', output);
            }
            else {
                fputs("print \"", output);
                ePerl_Efwrite(cps2, cpe2-cps2, 1, output);
                fputs("\\n\";\n", output);
            }
            cps2 = cpe2+1;
        }
        /* then do the remainder which is not
           finished by a newline */
        if (cpe > cps2) {
            fputs("print \"", output);
            ePerl_Efwrite(cps2, cpe-cps2, 1, output);
            fputs("\";", output);
        }

        /* done when no further ePerl block follows */
        if (cpe == cpEND)
            break;

        /* just output a leading space to make
           the -x display more readable.
           (fflush() brings cpOutBuf/nOutBuf up to date so that the
           last byte written so far can be inspected) */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* skip the start delimiter */
        cps = cpe+ePerl_begin_delimiter_len;

        /* recognize the 'print' shortcut with '=',
         * e.g. <:=$var:>
         */
        if (*cps == '=') {
            fputs("print ", output);
            cps++;
        }

        /* skip all following whitespaces.
           Be careful: we could skip newlines too, but then the
           error output will give wrong line numbers!!! */
        while (cps < cpEND) {
            if (*cps != ' ' && *cps != '\t')
                break;
            cps++;
        }

        /* move forward to end of ePerl block. */
        cpe = find_delimiter(cps, (size_t)(cpEND - cps), ePerl_end_delimiter, ePerl_end_delimiter_len);
        if (cpe == NULL) {
            ePerl_SetError("Missing end delimiter");
            goto CUS;
        }

        /* step again backward over whitespaces */
        for (cpe2 = cpe;
             cpe2 > cps && (*(cpe2-1) == ' ' || *(cpe2-1) == '\t' || *(cpe2-1) == '\n');
             cpe2--)
            ;

        /* pass through the ePerl block without changes! */
        if (cpe2 > cps) {
            if (ePerl_convert_entities)
                ePerl_Cfwrite(cps, cpe2-cps, 1, output);
            else
                fwrite(cps, cpe2-cps, 1, output);

            /* be smart and automatically add a semicolon
               if not provided at the end of the ePerl block.
               But know the continuation indicator "_". */
            if ((*(cpe2-1) != ';') &&
                (*(cpe2-1) != '_')   )
                fputc(';', output);
            /* step back over the "_" so that the next write replaces it */
            if (*(cpe2-1) == '_')
                fseek(output, -1, SEEK_CUR);
        }

        /* end preserve newlines for correct line numbers */
        for ( ; cpe2 <= cpe; cpe2++)
            if (*cpe2 == '\n')
                fputc('\n', output);

        /* output a trailing space to make
           the -x display more readable when
           no newlines have finished the block. */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* and adjust the current position to the first character
           after the end delimiter */
        cps = cpe+ePerl_end_delimiter_len;

        /* finally just one more feature: when an end delimiter
           is directly followed by "//" this discards all
           data up to and including the following newline
           (or up to the end of the input if there is none) */
        if (cpEND - cps >= 2 && cps[0] == '/' && cps[1] == '/') {
            /* skip characters */
            cps += 2;
            for ( ; cps < cpEND && *cps != '\n'; cps++)
                ;
            if (cps < cpEND)
                cps++;
            /* but preserve the newline in the script */
            fputc('\n', output);
        }
    }

    /* a failed write (e.g. the buffer could not grow) would otherwise
       hand back a silently truncated script */
    if (ferror(output)) {
        ePerl_SetError("Cannot write to memstream");
        goto CUS;
    }
    if (fclose(output) != 0) {
        ePerl_SetError("Cannot close memstream: %s", strerror(errno));
        free(cpOutBuf);
        return NULL;
    }
    cpOutBuf[nOutBuf] = '\0';
    return cpOutBuf;

CUS:
    fclose(output);
    free(cpOutBuf);
    return NULL;
}