File: eperl_parse.c

package info (click to toggle)
eperl 2.2.16-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 608 kB
sloc: ansic: 1,692; perl: 252; makefile: 139; sh: 10
file content (397 lines) | stat: -rw-r--r-- 15,172 bytes
parent folder | download | duplicates (2)
/* SPDX-License-Identifier: Artistic-1.0-Perl OR GPL-2.0-only
**  Copyright (c) 1996,1997,1998 Ralf S. Engelschall <rse@engelschall.com>
*/


#include "eperl.h"
#include <errno.h>


/* When true, a backslash directly in front of a newline in literal
   (non-Perl) text acts as a line continuation: neither the backslash nor
   the newline is printed by the generated script. */
bool        ePerl_line_continuation         = false;
/* Message stored by the most recent ePerl_SetError() call; NULL when no
   message has been stored or when formatting the message failed. */
char       *ePerl_ErrorString               = NULL;

/*
**  save ePerl error string
**
**  Formats `str` with its printf-style arguments into a newly allocated
**  string and stores it in ePerl_ErrorString, releasing the message that
**  was stored before.  If formatting fails, ePerl_ErrorString ends up NULL.
*/
void ePerl_SetError(const char *str, ...)
{
    char *msg = NULL;
    va_list ap;

    /* Format first and release the old message afterwards, so that the
       current ePerl_ErrorString may safely be passed as an argument. */
    va_start(ap, str);
    if (vasprintf(&msg, str, ap) == -1)
        msg = NULL;
    va_end(ap);

    free(ePerl_ErrorString);
    ePerl_ErrorString = msg;
}

/*
**  fwrite for internal buffer WITH character escaping
**
**  Writes the nBuf*cNum bytes starting at cpI to output.  The characters
**  '"', '@', '$' and '\' are written with a leading backslash, a TAB is
**  written as the two characters \t and a newline as \n; every other byte
**  is copied unchanged.  The input buffer is only read, never modified,
**  and no byte outside the given range is touched.
*/
static void ePerl_Efwrite(const char *cpI, size_t nBuf, size_t cNum, FILE *output)
{
    const char *const end = cpI + nBuf * cNum;
    const char *run = cpI;          /* start of the pending unescaped run */

    for (const char *p = cpI; p < end; ++p) {
        const char *esc;

        switch (*p) {
            case '"':  esc = "\\\""; break;
            case '@':  esc = "\\@";  break;
            case '$':  esc = "\\$";  break;
            case '\\': esc = "\\\\"; break;
            case '\t': esc = "\\t";  break;
            case '\n': esc = "\\n";  break;
            default:   continue;    /* ordinary byte: just extend the run */
        }

        /* flush the literal run in one go, then the escape sequence */
        fwrite(run, 1, (size_t)(p - run), output);
        fputs(esc, output);
        run = p + 1;
    }
    fwrite(run, 1, (size_t)(end - run), output);
}


/*
**  fwrite for internal buffer WITH HTML entity conversion to CP-1252
**
**  Lookup tables: one table per entity-name length (HTML2CHAR_MIN to
**  HTML2CHAR_MAX characters).  Each table maps an entity name (without
**  the surrounding '&' and ';') to its single-byte character.  The tables
**  are searched with bsearch() and html2char_cmp(), so the entries of
**  every table MUST stay sorted in memcmp() (plain byte) order.
*/
#define HTML2CHAR_MIN 2
#define HTML2CHAR_MAX 6
struct html2char {
    char h[HTML2CHAR_MAX];  /* entity name, zero padded, NOT NUL-terminated at full length */
    char c;                 /* replacement character */
};
static const struct html2char html2char_2[] = {
    { "gt",      '>'    },    /* Greater than */
    { "lt",      '<'    },    /* Less than */
    { "um",      '\xA8' },    /* Diæresis / Umlaut */
};
static const struct html2char html2char_3[] = {
    { "ETH",     '\xD0' },    /* Capital Eth, Icelandic */
    { "amp",     '&'    },    /* Ampersand */
    { "deg",     '\xB0' },    /* Degree sign */
    { "die",     '\xA8' },    /* Diæresis / Umlaut */
    { "eth",     '\xF0' },    /* Small eth, Icelandic */
    { "not",     '\xAC' },    /* Not sign */
    { "reg",     '\xAE' },    /* Registered trademark */
    { "shy",     '\xAD' },    /* Soft hyphen */
    { "yen",     '\xA5' },    /* Yen sign */
};
static const struct html2char html2char_4[] = {
    { "Auml",    '\xC4' },    /* Capital A, diæresis / umlaut */
    { "Euml",    '\xCB' },    /* Capital E, diæresis / umlaut */
    { "Iuml",    '\xCF' },    /* Capital I, diæresis / umlaut */
    { "Ouml",    '\xD6' },    /* Capital O, diæresis / umlaut */
    { "Uuml",    '\xDC' },    /* Capital U, diæresis / umlaut */
    { "auml",    '\xE4' },    /* Small a, diæresis / umlaut */
    { "cent",    '\xA2' },    /* Cent sign */
    { "copy",    '\xA9' },    /* Copyright */
    { "euml",    '\xEB' },    /* Small e, diæresis / umlaut */
    { "iuml",    '\xEF' },    /* Small i, diæresis / umlaut */
    { "macr",    '\xAF' },    /* Macron accent */
    { "nbsp",    '\x20' },    /* Non-breaking Space (emitted as a plain space) */
    { "ordf",    '\xAA' },    /* Feminine ordinal */
    { "ordm",    '\xBA' },    /* Masculine ordinal */
    { "ouml",    '\xF6' },    /* Small o, diæresis / umlaut */
    { "para",    '\xB6' },    /* Paragraph sign */
    { "quot",    '"'    },    /* Quotation mark */
    { "sect",    '\xA7' },    /* Section sign */
    { "sup1",    '\xB9' },    /* Superscript one */
    { "sup2",    '\xB2' },    /* Superscript two */
    { "sup3",    '\xB3' },    /* Superscript three */
    { "uuml",    '\xFC' },    /* Small u, diæresis / umlaut */
    { "yuml",    '\xFF' },    /* Small y, diæresis / umlaut */
};
static const struct html2char html2char_5[] = {
    { "AElig",   '\xC6' },    /* Capital AE ligature */
    { "Acirc",   '\xC2' },    /* Capital A, circumflex */
    { "Aring",   '\xC5' },    /* Capital A, ring */
    { "Ecirc",   '\xCA' },    /* Capital E, circumflex */
    { "Icirc",   '\xCE' },    /* Capital I, circumflex */
    { "Ocirc",   '\xD4' },    /* Capital O, circumflex */
    { "THORN",   '\xDE' },    /* Capital Thorn, Icelandic */
    { "Ucirc",   '\xDB' },    /* Capital U, circumflex */
    { "acirc",   '\xE2' },    /* Small a, circumflex */
    { "acute",   '\xB4' },    /* Acute accent */
    { "aelig",   '\xE6' },    /* Small ae ligature */
    { "aring",   '\xE5' },    /* Small a, ring */
    { "cedil",   '\xB8' },    /* Cedilla */
    { "ecirc",   '\xEA' },    /* Small e, circumflex */
    { "hibar",   '\xAF' },    /* Macron accent */
    { "icirc",   '\xEE' },    /* Small i, circumflex */
    { "iexcl",   '\xA1' },    /* Inverted exclamation */
    { "laquo",   '\xAB' },    /* Left angle quote, guillemot left */
    { "micro",   '\xB5' },    /* Micro sign */
    { "ocirc",   '\xF4' },    /* Small o, circumflex */
    { "pound",   '\xA3' },    /* Pound sterling */
    { "raquo",   '\xBB' },    /* Right angle quote, guillemot right */
    { "szlig",   '\xDF' },    /* Small sharp s, German sz */
    { "thorn",   '\xFE' },    /* Small thorn, Icelandic */
    { "times",   '\xD7' },    /* Multiply sign */
    { "ucirc",   '\xFB' },    /* Small u, circumflex */
};
static const struct html2char html2char_6[] = {
    { "Aacute",  '\xC1' },    /* Capital A, acute accent */
    { "Agrave",  '\xC0' },    /* Capital A, grave accent */
    { "Atilde",  '\xC3' },    /* Capital A, tilde */
    { "Ccedil",  '\xC7' },    /* Capital C, cedilla */
    { "Eacute",  '\xC9' },    /* Capital E, acute accent */
    { "Egrave",  '\xC8' },    /* Capital E, grave accent */
    { "Iacute",  '\xCD' },    /* Capital I, acute accent */
    { "Igrave",  '\xCC' },    /* Capital I, grave accent */
    { "Ntilde",  '\xD1' },    /* Capital N, tilde */
    { "Oacute",  '\xD3' },    /* Capital O, acute accent */
    { "Ograve",  '\xD2' },    /* Capital O, grave accent */
    { "Oslash",  '\xD8' },    /* Capital O, slash */
    { "Otilde",  '\xD5' },    /* Capital O, tilde */
    { "Uacute",  '\xDA' },    /* Capital U, acute accent */
    { "Ugrave",  '\xD9' },    /* Capital U, grave accent */
    { "Yacute",  '\xDD' },    /* Capital Y, acute accent */
    { "aacute",  '\xE1' },    /* Small a, acute accent */
    { "agrave",  '\xE0' },    /* Small a, grave accent */
    { "atilde",  '\xE3' },    /* Small a, tilde */
    { "brkbar",  '\xA6' },    /* Broken vertical bar */
    { "brvbar",  '\xA6' },    /* Broken vertical bar */
    { "ccedil",  '\xE7' },    /* Small c, cedilla */
    { "curren",  '\xA4' },    /* General currency sign */
    { "divide",  '\xF7' },    /* Division sign */
    { "eacute",  '\xE9' },    /* Small e, acute accent */
    { "egrave",  '\xE8' },    /* Small e, grave accent */
    { "frac12",  '\xBD' },    /* Fraction one-half */
    { "frac14",  '\xBC' },    /* Fraction one-fourth */
    { "frac34",  '\xBE' },    /* Fraction three-fourths */
    { "iacute",  '\xED' },    /* Small i, acute accent */
    { "igrave",  '\xEC' },    /* Small i, grave accent */
    { "iquest",  '\xBF' },    /* Inverted question mark */
    { "middot",  '\xB7' },    /* Middle dot */
    { "ntilde",  '\xF1' },    /* Small n, tilde */
    { "oacute",  '\xF3' },    /* Small o, acute accent */
    { "ograve",  '\xF2' },    /* Small o, grave accent */
    { "oslash",  '\xF8' },    /* Small o, slash */
    { "otilde",  '\xF5' },    /* Small o, tilde */
    { "plusmn",  '\xB1' },    /* Plus or minus */
    { "uacute",  '\xFA' },    /* Small u, acute accent */
    { "ugrave",  '\xF9' },    /* Small u, grave accent */
    { "yacute",  '\xFD' },    /* Small y, acute accent */
};
/* Table directory: the table for names of length L sits at index
   L - HTML2CHAR_MIN. */
static const struct {
    const struct html2char *h2c;    /* sorted table */
    size_t                  h2cn;   /* number of entries in the table */
} html2chars[] = {
    {html2char_2, sizeof(html2char_2) / sizeof(*html2char_2)},
    {html2char_3, sizeof(html2char_3) / sizeof(*html2char_3)},
    {html2char_4, sizeof(html2char_4) / sizeof(*html2char_4)},
    {html2char_5, sizeof(html2char_5) / sizeof(*html2char_5)},
    {html2char_6, sizeof(html2char_6) / sizeof(*html2char_6)},
};
/* bsearch() comparator: orders two struct html2char by the raw bytes of
   their zero-padded names; the replacement character is not compared. */
static int html2char_cmp(const void *lhs, const void *rhs)
{
    return memcmp(((const struct html2char *)lhs)->h, ((const struct html2char *)rhs)->h, HTML2CHAR_MAX);
}

static void ePerl_Cfwrite(const char *cpBuf, size_t nBuf, size_t cNum, FILE *output)
{
    nBuf *= cNum;

    for (char *amp; (amp = memchr(cpBuf, '&', nBuf)); ) {
        fwrite(cpBuf, 1, amp - cpBuf, output);
        nBuf -= amp - cpBuf;
        cpBuf = amp;

        char *semi = memchr(cpBuf, ';', nBuf);
        if (!semi)
            break;
        ++semi;
        nBuf -= semi - cpBuf;
        cpBuf = semi;
        size_t namelen = semi - amp - 2;
        if (namelen < HTML2CHAR_MIN || namelen > HTML2CHAR_MAX) {
        nomatch:
            fwrite(amp, 1, semi - amp, output);
            continue;
        }

        struct html2char key = {0};
        memcpy(key.h, amp + 1, namelen);
        struct html2char *ent =
            bsearch(&key, html2chars[namelen - HTML2CHAR_MIN].h2c, html2chars[namelen - HTML2CHAR_MIN].h2cn, sizeof(struct html2char), html2char_cmp);
        if (!ent)
            goto nomatch;
        fputc(ent->c, output);
    }
    fwrite(cpBuf, 1, nBuf, output);
}


/*
**  memmem() but case-insensitive; basically equivalent to strncasestr()
**
**  Returns a pointer to the first position inside the n-byte buffer buf
**  at which the len-byte string str matches when case is ignored, or NULL
**  when there is no such position (in particular when str is longer than
**  the buffer).
*/
void *memcasemem(const void *buf, size_t n, const void *str, size_t len)
{
    /* Without this guard n - len would wrap around and the scan bound
       below would point outside the buffer. */
    if (len > n)
        return NULL;

    const char *hay  = buf;
    const char *last = hay + (n - len);     /* last possible match start */

    for (const char *cp = hay; cp <= last; ++cp)
        if (strncasecmp(cp, str, len) == 0)
            return (void *)cp;
    return NULL;
}


/*
**  convert buffer from sprinkled format to plain format
**
**  cpBuf is a NUL-terminated text in which Perl code is embedded between
**  ePerl_begin_delimiter and ePerl_end_delimiter.  The result is a Perl
**  script in which
**    - text outside the delimiters is turned into print "..." statements
**      (escaped through ePerl_Efwrite),
**    - code between the delimiters is copied through, optionally with
**      HTML entities converted (ePerl_convert_entities),
**    - newlines are kept so that line numbers match the input.
**
**  ePerl_case_sensitive_delimiters selects memmem() or memcasemem() for
**  locating the delimiters.
**
**  Returns a NUL-terminated buffer which the caller must free(), or NULL
**  on failure, in which case a message has been stored with
**  ePerl_SetError().
*/
char *ePerl_Sprinkled2Plain(const char *cpBuf, const char *ePerl_begin_delimiter, const char *ePerl_end_delimiter, bool ePerl_case_sensitive_delimiters, bool ePerl_convert_entities)
{
    char *cpOutBuf = NULL;
    size_t nOutBuf = 0;
    const char *cps, *cpe;
    const char *cps2, *cpe2;
    size_t ePerl_begin_delimiter_len = strlen(ePerl_begin_delimiter);
    size_t ePerl_end_delimiter_len   = strlen(ePerl_end_delimiter);

    if (!*cpBuf) {
        /* make sure we return a buffer which the caller can free() */
        return strdup("");
    }

    const char *cpEND = cpBuf+strlen(cpBuf);

    FILE *output = open_memstream(&cpOutBuf, &nOutBuf);
    if (!output) {
        ePerl_SetError("Cannot allocate memstream: %s", strerror(errno));
        return NULL;
    }

    /* now step through the file and convert it to legal Perl code.
       This is a bit complicated because we have to make sure that
       we parse the correct delimiters while the delimiter
       characters could also occur inside the Perl code! */
    cps = cpBuf;
    while (cps < cpEND) {
        cpe = (ePerl_case_sensitive_delimiters ? memmem : memcasemem)(cps, cpEND - cps, ePerl_begin_delimiter, ePerl_begin_delimiter_len);
        if (cpe == NULL)
            cpe = cpEND;

        /* first, encapsulate the content from current pos
           up to the begin of the ePerl block as print statements */
        cps2 = cps;
        /* first, do all complete lines */
        while ((cpe2 = memchr(cps2, '\n', cpe-cps2)) != NULL) {
            if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                /* line continuation: print the text in front of the
                   backslash only; the bare newline written below keeps
                   the script's line count in sync with the input */
                if (cpe2-1-cps2 > 0) {
                    fputs("print \"", output);
                    ePerl_Efwrite(cps2, cpe2-1-cps2, 1, output);
                    fputs("\";", output);
                }
                fputc('\n', output);
            }
            else {
                fputs("print \"", output);
                ePerl_Efwrite(cps2, cpe2-cps2, 1, output);
                fputs("\\n\";\n", output);
            }
            cps2 = cpe2+1;
        }
        /* then do the remainder which is not
           finished by a newline */
        if (cpe > cps2) {
            fputs("print \"", output);
            ePerl_Efwrite(cps2, cpe-cps2, 1, output);
            fputs("\";", output);
        }

        /* no further begin delimiter: the rest was plain text, done */
        if (cpe == cpEND)
            break;

        /* Ok, there is at least one more ePerl block.
           Just output a leading space to make
           the -x display more readable.
           (fflush() brings cpOutBuf/nOutBuf up to date before the
           last written character is inspected.) */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* skip the start delimiter */
        cps = cpe+ePerl_begin_delimiter_len;

        /* recognize the 'print' shortcut with '=',
         * e.g. <:=$var:>
         */
        if (*cps == '=') {
            fputs("print ", output);
            cps++;
        }

        /* skip all following whitespaces.
           Be careful: we could skip newlines too, but then the
           error output will give wrong line numbers!!! */
        while (cps < cpEND) {
            if (*cps != ' ' && *cps != '\t')
                break;
            cps++;
        }
        cpe = cps;

        /* move forward to end of ePerl block. */
        cpe = (ePerl_case_sensitive_delimiters ? memmem : memcasemem)(cpe, cpEND-cpe, ePerl_end_delimiter, ePerl_end_delimiter_len);
        if (cpe == NULL) {
            ePerl_SetError("Missing end delimiter");
            goto CUS;
        }

        /* step again backward over whitespaces */
        for (cpe2 = cpe;
             cpe2 > cps && (*(cpe2-1) == ' ' || *(cpe2-1) == '\t' || *(cpe2-1) == '\n');
             cpe2--)
            ;

        /* pass through the ePerl block without changes! */
        if (cpe2 > cps) {
            if (ePerl_convert_entities)
                ePerl_Cfwrite(cps, cpe2-cps, 1, output);
            else
                fwrite(cps, cpe2-cps, 1, output);

            /* be smart and automatically add a semicolon
               if not provided at the end of the ePerl block.
               But know the continuation indicator "_". */
            if ((*(cpe2-1) != ';') &&
                (*(cpe2-1) != '_')   )
                fputc(';', output);
            /* drop the "_" itself: step back one byte so that the
               next write overwrites it */
            if (*(cpe2-1) == '_')
                fseek(output, -1, SEEK_CUR);
        }

        /* end preserve newlines for correct line numbers */
        for ( ; cpe2 <= cpe; cpe2++)
            if (*cpe2 == '\n')
                fputc('\n', output);

        /* output a trailing space to make
           the -x display more readable when
           no newlines have finished the block. */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* and adjust the current position to the first character
           after the end delimiter */
        cps = cpe+ePerl_end_delimiter_len;

        /* finally just one more feature: when an end delimiter
           is directly followed by "//" this discards all
           data up to and including the following newline.
           (>= 2 so that a "//" forming the very last two bytes of
           the input is recognized as well.) */
        if (cpEND - cps >= 2 && cps[0] == '/' && cps[1] == '/') {
            /* skip characters */
            cps += 2;
            for ( ; cps < cpEND && *cps != '\n'; cps++)
                ;
            if (cps < cpEND)
                cps++;
            /* but preserve the newline in the script */
            fputc('\n', output);
        }
    }

    /* do not hand back a silently truncated script when a write
       to the memory stream failed */
    if (ferror(output)) {
        ePerl_SetError("Cannot write to memstream");
        goto CUS;
    }
    if (fclose(output) != 0) {
        ePerl_SetError("Cannot finish memstream: %s", strerror(errno));
        free(cpOutBuf);
        return NULL;
    }
    cpOutBuf[nOutBuf] = '\0';
    return cpOutBuf;

CUS:
    fclose(output);
    free(cpOutBuf);
    return NULL;
}