1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
|
/* SPDX-License-Identifier: Artistic-1.0-Perl OR GPL-2.0-only
** Copyright (c) 1996,1997,1998 Ralf S. Engelschall <rse@engelschall.com>
*/
#include "eperl.h"
#include <errno.h>
/* When true, ePerl_Sprinkled2Plain() treats a backslash directly in front of
   a newline in the text outside of ePerl blocks as a line continuation:
   neither the backslash nor a "\n" escape is put into the generated print
   statement.  Disabled by default. */
bool ePerl_line_continuation = false;
/* Message stored by the most recent ePerl_SetError() call (heap-allocated by
   vasprintf), or NULL if none was stored yet or formatting failed. */
char *ePerl_ErrorString = NULL;
/*
** save ePerl error string
**
** Formats the printf-style template `str` with the variadic arguments into
** a newly allocated string and stores it in ePerl_ErrorString.  The
** previously stored message is released.  If formatting fails,
** ePerl_ErrorString is set to NULL.
*/
void ePerl_SetError(const char *str, ...)
{
    char *msg = NULL;
    va_list ap;

    /* Build the new message BEFORE releasing the old one, so that a call
       whose template or arguments refer to the current ePerl_ErrorString
       does not read freed memory. */
    va_start(ap, str);
    if (vasprintf(&msg, str, ap) == -1)
        msg = NULL;
    va_end(ap);

    free(ePerl_ErrorString);
    ePerl_ErrorString = msg;
}
/*
** fwrite for internal buffer WITH character escaping
**
** Writes the nBuf*cNum bytes starting at cpI to `output`, escaped for use
** inside a double-quoted Perl string:
**   "  @  $  \   are prefixed with a backslash,
**   TAB and NEWLINE are written as the two-character sequences \t and \n,
**   every other byte is copied unchanged.
** The input is only read: it is never modified and no byte beyond the given
** length is accessed, so read-only and non-NUL-terminated buffers are fine.
*/
static void ePerl_Efwrite(const char *cpI, size_t nBuf, size_t cNum, FILE *output)
{
    const char *const end = cpI + nBuf * cNum;
    const char *run = cpI;  /* start of the pending run of literal bytes */

    for (const char *p = cpI; p < end; ++p) {
        const char *esc;

        switch (*p) {
        case '"':  esc = "\\\""; break;
        case '@':  esc = "\\@";  break;
        case '$':  esc = "\\$";  break;
        case '\\': esc = "\\\\"; break;
        case '\t': esc = "\\t";  break;
        case '\n': esc = "\\n";  break;
        default:   continue;    /* literal byte: keep extending the run */
        }

        /* flush the literal bytes collected so far, then the escape */
        fwrite(run, 1, (size_t)(p - run), output);
        fputs(esc, output);
        run = p + 1;
    }
    fwrite(run, 1, (size_t)(end - run), output);
}
/*
** fwrite for internal buffer WITH HTML entity conversion to CP-1252
**
** The definitions below describe the lookup tables used for that
** conversion: one table per entity-name length.
*/
/* shortest and longest entity name (without the leading '&' and the
   trailing ';') present in the tables */
#define HTML2CHAR_MIN 2
#define HTML2CHAR_MAX 6
/* One table entry: `h` is the entity name, zero-padded to HTML2CHAR_MAX
   bytes (a name of exactly HTML2CHAR_MAX bytes has no terminating NUL);
   `c` is the single byte the entity is replaced with. */
struct html2char {
char h[HTML2CHAR_MAX];
char c;
};
/* Entities with a two-byte name, in ascending byte order of the name. */
static const struct html2char html2char_2[] = {
    { .h = "gt", .c = '>'    },  /* greater-than sign */
    { .h = "lt", .c = '<'    },  /* less-than sign */
    { .h = "um", .c = '\xA8' },  /* diaeresis / umlaut */
};
/*
** Entities with a three-byte name.  The entries must stay in ascending
** byte order of the name (uppercase sorts before lowercase).
** "uml" is the standard name of the diaeresis entity; "die" is kept as
** the older alias for the same character.
*/
static const struct html2char html2char_3[] = {
    { "ETH", '\xD0' },  /* Capital Eth, Icelandic */
    { "amp", '&'    },  /* Ampersand */
    { "deg", '\xB0' },  /* Degree sign */
    { "die", '\xA8' },  /* Diaeresis / Umlaut (old name) */
    { "eth", '\xF0' },  /* Small eth, Icelandic */
    { "not", '\xAC' },  /* Not sign */
    { "reg", '\xAE' },  /* Registered trademark */
    { "shy", '\xAD' },  /* Soft hyphen */
    { "uml", '\xA8' },  /* Diaeresis / Umlaut */
    { "yen", '\xA5' },  /* Yen sign */
};
/* Entities with a four-byte name, in ascending byte order of the name. */
static const struct html2char html2char_4[] = {
    { .h = "Auml", .c = '\xC4' },  /* A with diaeresis, capital */
    { .h = "Euml", .c = '\xCB' },  /* E with diaeresis, capital */
    { .h = "Iuml", .c = '\xCF' },  /* I with diaeresis, capital */
    { .h = "Ouml", .c = '\xD6' },  /* O with diaeresis, capital */
    { .h = "Uuml", .c = '\xDC' },  /* U with diaeresis, capital */
    { .h = "auml", .c = '\xE4' },  /* a with diaeresis, small */
    { .h = "cent", .c = '\xA2' },  /* cent sign */
    { .h = "copy", .c = '\xA9' },  /* copyright sign */
    { .h = "euml", .c = '\xEB' },  /* e with diaeresis, small */
    { .h = "iuml", .c = '\xEF' },  /* i with diaeresis, small */
    { .h = "macr", .c = '\xAF' },  /* macron accent */
    { .h = "nbsp", .c = '\x20' },  /* non-breaking space */
    { .h = "ordf", .c = '\xAA' },  /* feminine ordinal indicator */
    { .h = "ordm", .c = '\xBA' },  /* masculine ordinal indicator */
    { .h = "ouml", .c = '\xF6' },  /* o with diaeresis, small */
    { .h = "para", .c = '\xB6' },  /* paragraph sign */
    { .h = "quot", .c = '"'    },  /* quotation mark */
    { .h = "sect", .c = '\xA7' },  /* section sign */
    { .h = "sup1", .c = '\xB9' },  /* superscript one */
    { .h = "sup2", .c = '\xB2' },  /* superscript two */
    { .h = "sup3", .c = '\xB3' },  /* superscript three */
    { .h = "uuml", .c = '\xFC' },  /* u with diaeresis, small */
    { .h = "yuml", .c = '\xFF' },  /* y with diaeresis, small */
};
/* Entities with a five-byte name, in ascending byte order of the name. */
static const struct html2char html2char_5[] = {
    { .h = "AElig", .c = '\xC6' },  /* AE ligature, capital */
    { .h = "Acirc", .c = '\xC2' },  /* A with circumflex, capital */
    { .h = "Aring", .c = '\xC5' },  /* A with ring, capital */
    { .h = "Ecirc", .c = '\xCA' },  /* E with circumflex, capital */
    { .h = "Icirc", .c = '\xCE' },  /* I with circumflex, capital */
    { .h = "Ocirc", .c = '\xD4' },  /* O with circumflex, capital */
    { .h = "THORN", .c = '\xDE' },  /* Thorn (Icelandic), capital */
    { .h = "Ucirc", .c = '\xDB' },  /* U with circumflex, capital */
    { .h = "acirc", .c = '\xE2' },  /* a with circumflex, small */
    { .h = "acute", .c = '\xB4' },  /* acute accent */
    { .h = "aelig", .c = '\xE6' },  /* ae ligature, small */
    { .h = "aring", .c = '\xE5' },  /* a with ring, small */
    { .h = "cedil", .c = '\xB8' },  /* cedilla */
    { .h = "ecirc", .c = '\xEA' },  /* e with circumflex, small */
    { .h = "hibar", .c = '\xAF' },  /* macron accent */
    { .h = "icirc", .c = '\xEE' },  /* i with circumflex, small */
    { .h = "iexcl", .c = '\xA1' },  /* inverted exclamation mark */
    { .h = "laquo", .c = '\xAB' },  /* left angle quote (guillemet) */
    { .h = "micro", .c = '\xB5' },  /* micro sign */
    { .h = "ocirc", .c = '\xF4' },  /* o with circumflex, small */
    { .h = "pound", .c = '\xA3' },  /* pound sterling */
    { .h = "raquo", .c = '\xBB' },  /* right angle quote (guillemet) */
    { .h = "szlig", .c = '\xDF' },  /* sharp s (German sz), small */
    { .h = "thorn", .c = '\xFE' },  /* thorn (Icelandic), small */
    { .h = "times", .c = '\xD7' },  /* multiplication sign */
    { .h = "ucirc", .c = '\xFB' },  /* u with circumflex, small */
};
/*
** Entities with a six-byte name (the name fills the `h` field completely,
** so it carries no terminating NUL).  The entries must stay in ascending
** byte order of the name (uppercase sorts before lowercase).
*/
static const struct html2char html2char_6[] = {
    { "Aacute", '\xC1' },  /* Capital A, acute accent */
    { "Agrave", '\xC0' },  /* Capital A, grave accent */
    { "Atilde", '\xC3' },  /* Capital A, tilde */
    { "Ccedil", '\xC7' },  /* Capital C, cedilla */
    { "Eacute", '\xC9' },  /* Capital E, acute accent */
    { "Egrave", '\xC8' },  /* Capital E, grave accent */
    { "Iacute", '\xCD' },  /* Capital I, acute accent */
    { "Igrave", '\xCC' },  /* Capital I, grave accent */
    { "Ntilde", '\xD1' },  /* Capital N, tilde */
    { "Oacute", '\xD3' },  /* Capital O, acute accent */
    { "Ograve", '\xD2' },  /* Capital O, grave accent */
    { "Oslash", '\xD8' },  /* Capital O, slash */
    { "Otilde", '\xD5' },  /* Capital O, tilde */
    { "Uacute", '\xDA' },  /* Capital U, acute accent */
    { "Ugrave", '\xD9' },  /* Capital U, grave accent */
    { "Yacute", '\xDD' },  /* Capital Y, acute accent */
    { "aacute", '\xE1' },  /* Small a, acute accent (0xDF would be sharp s) */
    { "agrave", '\xE0' },  /* Small a, grave accent */
    { "atilde", '\xE3' },  /* Small a, tilde */
    { "brkbar", '\xA6' },  /* Broken vertical bar */
    { "brvbar", '\xA6' },  /* Broken vertical bar */
    { "ccedil", '\xE7' },  /* Small c, cedilla */
    { "curren", '\xA4' },  /* General currency sign */
    { "divide", '\xF7' },  /* Division sign */
    { "eacute", '\xE9' },  /* Small e, acute accent */
    { "egrave", '\xE8' },  /* Small e, grave accent */
    { "frac12", '\xBD' },  /* Fraction one-half */
    { "frac14", '\xBC' },  /* Fraction one-fourth */
    { "frac34", '\xBE' },  /* Fraction three-fourths */
    { "iacute", '\xED' },  /* Small i, acute accent */
    { "igrave", '\xEC' },  /* Small i, grave accent */
    { "iquest", '\xBF' },  /* Inverted question mark */
    { "middot", '\xB7' },  /* Middle dot */
    { "ntilde", '\xF1' },  /* Small n, tilde */
    { "oacute", '\xF3' },  /* Small o, acute accent */
    { "ograve", '\xF2' },  /* Small o, grave accent */
    { "oslash", '\xF8' },  /* Small o, slash */
    { "otilde", '\xF5' },  /* Small o, tilde */
    { "plusmn", '\xB1' },  /* Plus or minus */
    { "uacute", '\xFA' },  /* Small u, acute accent */
    { "ugrave", '\xF9' },  /* Small u, grave accent */
    { "yacute", '\xFD' },  /* Small y, acute accent */
};
/*
** Directory of the per-length entity tables: slot i holds the table for
** names of HTML2CHAR_MIN + i bytes together with its number of entries.
*/
#define HTML2CHAR_TABLE(tab) { (tab), sizeof(tab) / sizeof((tab)[0]) }
static const struct {
    const struct html2char *h2c;  /* first entry of the table */
    size_t h2cn;                  /* number of entries in the table */
} html2chars[] = {
    HTML2CHAR_TABLE(html2char_2),
    HTML2CHAR_TABLE(html2char_3),
    HTML2CHAR_TABLE(html2char_4),
    HTML2CHAR_TABLE(html2char_5),
    HTML2CHAR_TABLE(html2char_6),
};
#undef HTML2CHAR_TABLE
/*
** bsearch() comparison callback for struct html2char: orders two entries
** by the raw bytes of their complete name field (all HTML2CHAR_MAX bytes).
*/
static int html2char_cmp(const void *lhs, const void *rhs)
{
    const struct html2char *left = lhs;
    const struct html2char *right = rhs;

    return memcmp(left->h, right->h, sizeof left->h);
}
static void ePerl_Cfwrite(const char *cpBuf, size_t nBuf, size_t cNum, FILE *output)
{
nBuf *= cNum;
for (char *amp; (amp = memchr(cpBuf, '&', nBuf)); ) {
fwrite(cpBuf, 1, amp - cpBuf, output);
nBuf -= amp - cpBuf;
cpBuf = amp;
char *semi = memchr(cpBuf, ';', nBuf);
if (!semi)
break;
++semi;
nBuf -= semi - cpBuf;
cpBuf = semi;
size_t namelen = semi - amp - 2;
if (namelen < HTML2CHAR_MIN || namelen > HTML2CHAR_MAX) {
nomatch:
fwrite(amp, 1, semi - amp, output);
continue;
}
struct html2char key = {0};
memcpy(key.h, amp + 1, namelen);
struct html2char *ent =
bsearch(&key, html2chars[namelen - HTML2CHAR_MIN].h2c, html2chars[namelen - HTML2CHAR_MIN].h2cn, sizeof(struct html2char), html2char_cmp);
if (!ent)
goto nomatch;
fputc(ent->c, output);
}
fwrite(cpBuf, 1, nBuf, output);
}
/*
** memmem() but case-insensitive; basically equivalent to strncasestr()
**
** Returns a pointer to the first position in the n bytes at `buf` where the
** `len` bytes at `str` match when compared with strncasecmp(), or NULL if
** there is none.  An empty needle (len == 0) matches at `buf`; a needle
** longer than the buffer never matches.
*/
void *memcasemem(const void *buf, size_t n, const void *str, size_t len)
{
    const char *hay = buf;
    const char *needle = str;

    /* without this guard n - len would wrap around below */
    if (len > n)
        return NULL;

    for (size_t off = 0; off <= n - len; ++off)
        if (strncasecmp(hay + off, needle, len) == 0)
            return (void *)(hay + off);
    return NULL;
}
/*
** convert buffer from sprinkled format to plain format
**
** cpBuf is a NUL-terminated text in which Perl code is enclosed between
** ePerl_begin_delimiter and ePerl_end_delimiter (matched case-insensitively
** unless ePerl_case_sensitive_delimiters is set).  The result is a script in
** which the text outside of the delimiters is wrapped into print "...";
** statements and the code inside of them is passed through, with HTML
** entities converted first when ePerl_convert_entities is set.  Newlines
** are kept so that line numbers of the script match those of the input.
**
** Returns a NUL-terminated buffer which the caller has to free(), or NULL
** when the conversion failed; in the failure cases detected here the reason
** is stored via ePerl_SetError().
*/
char *ePerl_Sprinkled2Plain(const char *cpBuf, const char *ePerl_begin_delimiter, const char *ePerl_end_delimiter, bool ePerl_case_sensitive_delimiters, bool ePerl_convert_entities)
{
    char *cpOutBuf = NULL;
    size_t nOutBuf = 0;
    const char *cps, *cpe;
    const char *cps2, *cpe2;
    size_t ePerl_begin_delimiter_len = strlen(ePerl_begin_delimiter);
    size_t ePerl_end_delimiter_len = strlen(ePerl_end_delimiter);

    if (!*cpBuf) {
        /* make sure we return a buffer which the caller can free() */
        return strdup("");
    }

    const char *cpEND = cpBuf+strlen(cpBuf);
    FILE *output = open_memstream(&cpOutBuf, &nOutBuf);
    if (!output) {
        ePerl_SetError("Cannot allocate memstream: %s", strerror(errno));
        return NULL;
    }

    /* now step through the file and convert it to legal Perl code.
       This is a bit complicated because we have to make sure that
       we parse the correct delimiters while the delimiter
       characters could also occur inside the Perl code! */
    cps = cpBuf;
    while (cps < cpEND) {
        cpe = (ePerl_case_sensitive_delimiters ? memmem : memcasemem)(cps, cpEND - cps, ePerl_begin_delimiter, ePerl_begin_delimiter_len);
        if (cpe == NULL)
            cpe = cpEND;

        /* first, encapsulate the content from current pos
           up to the begin of the ePerl block as print statements */
        cps2 = cps;
        /* first, do all complete lines */
        while ((cpe2 = memchr(cps2, '\n', cpe-cps2)) != NULL) {
            if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                /* continued line: print it without the backslash and
                   without "\n", but keep the newline in the script */
                if (cpe2-1-cps2 > 0) {
                    fputs("print \"", output);
                    ePerl_Efwrite(cps2, cpe2-1-cps2, 1, output);
                    fputs("\";", output);
                }
                fputc('\n', output);
            }
            else {
                fputs("print \"", output);
                ePerl_Efwrite(cps2, cpe2-cps2, 1, output);
                fputs("\\n\";\n", output);
            }
            cps2 = cpe2+1;
        }
        /* then do the remainder which is not
           finished by a newline */
        if (cpe > cps2) {
            fputs("print \"", output);
            ePerl_Efwrite(cps2, cpe-cps2, 1, output);
            fputs("\";", output);
        }

        /* no further begin delimiter: the rest was plain text, done */
        if (cpe == cpEND)
            break;

        /* just output a leading space to make
           the -x display more readable. */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* skip the start delimiter */
        cps = cpe+ePerl_begin_delimiter_len;

        /* recognize the 'print' shortcut with '=',
         * e.g. <:=$var:>
         */
        if (*cps == '=') {
            fputs("print ", output);
            cps++;
        }

        /* skip all following whitespaces.
           Be careful: we could skip newlines too, but then the
           error output will give wrong line numbers!!! */
        while (cps < cpEND) {
            if (*cps != ' ' && *cps != '\t')
                break;
            cps++;
        }
        cpe = cps;

        /* move forward to end of ePerl block. */
        cpe = (ePerl_case_sensitive_delimiters ? memmem : memcasemem)(cpe, cpEND-cpe, ePerl_end_delimiter, ePerl_end_delimiter_len);
        if (cpe == NULL) {
            ePerl_SetError("Missing end delimiter");
            goto CUS;
        }

        /* step again backward over whitespaces */
        for (cpe2 = cpe;
             cpe2 > cps && (*(cpe2-1) == ' ' || *(cpe2-1) == '\t' || *(cpe2-1) == '\n');
             cpe2--)
            ;

        /* pass through the ePerl block without changes! */
        if (cpe2 > cps) {
            char last = *(cpe2-1);

            if (ePerl_convert_entities) {
                ePerl_Cfwrite(cps, cpe2-cps, 1, output);
                /* a block ending in a converted entity (e.g. &quot;) ends
                   with ';' in the input but not in the script, so decide
                   on the byte which was actually written */
                fflush(output);
                if (nOutBuf > 0)
                    last = cpOutBuf[nOutBuf - 1];
            }
            else
                fwrite(cps, cpe2-cps, 1, output);

            /* be smart and automatically add a semicolon
               if not provided at the end of the ePerl block.
               But know the continuation indicator "_", which is
               removed from the script again instead. */
            if (last == '_')
                fseek(output, -1, SEEK_CUR);
            else if (last != ';')
                fputc(';', output);
        }

        /* end preserve newlines for correct line numbers */
        for ( ; cpe2 <= cpe; cpe2++)
            if (*cpe2 == '\n')
                fputc('\n', output);

        /* output a trailing space to make
           the -x display more readable when
           no newlines have finished the block. */
        if (ftell(output) && (fflush(output), cpOutBuf[nOutBuf - 1] != '\n'))
            fputc(' ', output);

        /* and adjust the current position to the first character
           after the end delimiter */
        cps = cpe+ePerl_end_delimiter_len;

        /* finally just one more feature: when an end delimiter
           is directly followed by "//" this discards all
           data up to and including the following newline.
           (>= 2 so that a "//" at the very end of the input counts too) */
        if (cpEND - cps >= 2 && *cps == '/' && *(cps+1) == '/') {
            /* skip characters */
            cps += 2;
            for ( ; cps < cpEND && *cps != '\n'; cps++)
                ;
            if (cps < cpEND)
                cps++;
            /* but preserve the newline in the script */
            fputc('\n', output);
        }
    }

    /* do not hand out a script which was truncated by a failed write */
    {
        bool write_failed = ferror(output) != 0;

        if (fclose(output) != 0)
            write_failed = true;
        if (write_failed) {
            ePerl_SetError("Cannot write to memstream");
            free(cpOutBuf);
            return NULL;
        }
    }
    cpOutBuf[nOutBuf] = '\0';
    return cpOutBuf;

CUS:
    fclose(output);
    free(cpOutBuf);
    return NULL;
}
|