1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
/*
* Copyright (c) 2012 Tim Ruehsen
* Copyright (c) 2015-2024 Free Software Foundation, Inc.
*
* This file is part of libwget.
*
* Libwget is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Libwget is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with libwget. If not, see <https://www.gnu.org/licenses/>.
*
*
* css parsing routines
*
* Changelog
* 03.07.2012 Tim Ruehsen created
*
* A parser using the flex tokenizer, created with flex tokens from
* https://www.w3.org/TR/css3-syntax/
*
* TODO:
* - since we are just interested in @import ... and url(...), we could use
* a simplistic hand-written parser which might be much smaller and faster
*/
#include <config.h>
#include <limits.h>
#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <c-ctype.h>
#include <fcntl.h>
#include <sys/stat.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif
#include <wget.h>
#include "private.h"
#include "css_tokenizer.h"
// see css_tokenizer.c
// Hand-written declarations of the scanner functions that this file calls.
// NOTE(review): they have to stay in sync with the definitions in
// css_tokenizer.c - confirm after regenerating the tokenizer.

// opaque scanner handle, passed to every scanner call below
typedef void* yyscan_t;
// length of the text of the token returned by the last yylex() call
int yyget_leng(yyscan_t yyscanner);
// text of the token returned by the last yylex() call
char *yyget_text(yyscan_t yyscanner);
typedef struct yy_buffer_state *YY_BUFFER_STATE;
// creates a scanner and stores its handle in *scanner
int yylex_init(yyscan_t* scanner);
// declared for completeness, not called in this file's visible code
YY_BUFFER_STATE yy_scan_string(const char * yystr, yyscan_t yyscanner);
// sets up scanning of 'len' bytes starting at 'yystr'
YY_BUFFER_STATE yy_scan_bytes(const char * yystr, int len, yyscan_t yyscanner);
// returns the id of the next token (CSSEOF at the end of the input)
int yylex(yyscan_t yyscanner);
// releases the scanner created by yylex_init()
int yylex_destroy(yyscan_t yyscanner);
// prototypes for the two allocator functions defined right below
void *yyalloc(size_t size);
void *yyrealloc(void *p, size_t size);
// Allocator under the name yyalloc(): forwards the request to wget_malloc().
// NOTE(review): presumably picked up by the generated scanner in
// css_tokenizer.c - confirm there.
void *yyalloc(size_t size)
{
	void *mem = wget_malloc(size);

	return mem;
}
// Reallocator under the name yyrealloc(): forwards the request to wget_realloc().
// NOTE(review): presumably picked up by the generated scanner in
// css_tokenizer.c - confirm there.
void *yyrealloc(void *p, size_t size)
{
	void *resized = wget_realloc(p, size);

	return resized;
}
/**
 * Tokenize a CSS buffer and report the URIs and the charset name found in it.
 *
 * \param[in] buf CSS data, handed to the scanner together with \p len
 * \param[in] len Number of bytes in \p buf; buffers larger than INT_MAX are
 *   rejected because the scanner takes the length as int
 * \param[in] callback_uri Called for the target of each `@import` and for
 *   each `url(...)` with the URI (surrounding quotes, `url(`, `)` and
 *   whitespace stripped), its length and its offset. May be NULL.
 * \param[in] callback_encoding Called with the name following `@charset`
 *   (surrounding quotes stripped) and its length. May be NULL.
 * \param[in] user_ctx Passed as first argument to both callbacks
 *
 * The offset given to \p callback_uri is the sum of the lengths of all tokens
 * scanned before the URI plus the number of stripped leading characters.
 *
 * The strings given to the callbacks point into the token text returned by
 * yyget_text() and are not terminated at the reported length. The scanner is
 * destroyed before this function returns, so callbacks should copy what they
 * want to keep.
 */
void wget_css_parse_buffer(
	const char *buf,
	size_t len,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	int token;
	size_t length, pos = 0;
	char *text;
	yyscan_t scanner;

	// yy_scan_bytes() takes the length as int, the cast below must not truncate
	if (len > (size_t) INT_MAX) {
		error_printf(_("CSS buffer too large to parse\n"));
		return;
	}

	// yylex_init() returns non-zero on failure, there is no usable scanner then
	if (yylex_init(&scanner)) {
		error_printf(_("Failed to init CSS scanner\n"));
		return;
	}

	yy_scan_bytes(buf, (int) len, scanner);

	while ((token = yylex(scanner)) != CSSEOF) {
		if (token == IMPORT_SYM) {
			// e.g. @import "https://example.com/index.html"
			pos += yyget_leng(scanner);

			// skip whitespace before URI/STRING
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING or URI
			// a STRING after @import is a URI as well, handle both below
			if (token == STRING)
				token = URI;
		}

		if (token == URI && callback_uri) {
			// e.g. url(https://example.com/index.html)
			text = yyget_text(scanner);
			length = yyget_leng(scanner);

			if (*text == '\'' || *text == '\"') {
				// a string - remove the quotes
				callback_uri(user_ctx, text + 1, length - 2, pos + 1);
			} else {
				// extract URI from url(...)
				if (!wget_strncasecmp_ascii(text, "url(", 4)) {
					char *otext = text;

					// remove trailing ) and any spaces before
					for (length--; c_isspace(text[length - 1]); length--);

					// remove leading url( and any spaces after
					for (length -= 4, text += 4; length && c_isspace(*text); text++, length--);

					// remove quotes
					if (length && (*text == '\'' || *text == '\"')) {
						text++;
						length--;
					}

					if (length && (text[length - 1] == '\'' || text[length - 1] == '\"'))
						length--;

					// text - otext is the number of characters stripped from the front
					callback_uri(user_ctx, text, length, pos + (text - otext));
				}
			}
		} else if (token == CHARSET_SYM && callback_encoding) {
			// e.g. @charset "UTF-8"
			pos += yyget_leng(scanner);

			// skip whitespace before charset name
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING
			if (token == STRING) {
				text = yyget_text(scanner);
				length = yyget_leng(scanner);

				if (*text == '\'' || *text == '\"') {
					// a string - remove the quotes
					callback_encoding(user_ctx, text + 1, length - 2);
				} else {
					// a string without quotes
					callback_encoding(user_ctx, text, length);
				}
			} else {
				error_printf(_("Unknown token after @charset: %d\n"), token);
			}
		}

		// The whitespace skipping above may already have consumed the end of
		// the input. Calling yylex() again after that is not defined by flex.
		if (token == CSSEOF)
			break;

		pos += yyget_leng(scanner);
	}

	yylex_destroy(scanner);
}
/**
 * Read CSS from a file (or from STDIN) and run wget_css_parse_buffer() on it.
 *
 * \param[in] fname Name of the file to parse, or "-" to read from STDIN
 * \param[in] callback_uri Passed on to wget_css_parse_buffer()
 * \param[in] callback_encoding Passed on to wget_css_parse_buffer()
 * \param[in] user_ctx Passed on to wget_css_parse_buffer()
 *
 * A file is parsed in one go, using mmap() if HAVE_MMAP is defined and
 * malloc + read() otherwise. Empty input is not parsed at all.
 * Failures to open or read the file are reported via error_printf().
 */
void wget_css_parse_file(
	const char *fname,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	if (strcmp(fname, "-")) {
		int fd;

		if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
			struct stat st;

			// an empty file has nothing to parse (and mmap() rejects a zero length)
			if (fstat(fd, &st) == 0 && st.st_size > 0) {
				size_t size = (size_t) st.st_size;
#ifdef HAVE_MMAP
				// Map exactly the file size, read-only. No terminating 0 byte is
				// appended: wget_css_parse_buffer() works with an explicit length,
				// and a byte behind a file that ends on a page boundary is not accessible.
				char *buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

				if (buf != MAP_FAILED) {
					wget_css_parse_buffer(buf, size, callback_uri, callback_encoding, user_ctx);
					munmap(buf, size);
				} else
					error_printf(_("Failed to read %s\n"), fname);
#else
				char *buf = wget_malloc(size);

				if (buf) {
					size_t nread = 0;
					ssize_t rc = 0;

					// read() may return less than requested, so collect until done
					while (nread < size && (rc = read(fd, buf + nread, size - nread)) > 0)
						nread += (size_t) rc;

					if (rc < 0)
						error_printf(_("Failed to read %s\n"), fname);
					else if (nread) // parse only what has really been read
						wget_css_parse_buffer(buf, nread, callback_uri, callback_encoding, user_ctx);

					xfree(buf);
				}
#endif
			}

			close(fd);
		} else
			error_printf(_("Failed to open %s\n"), fname);
	} else {
		// read data from STDIN.
		// maybe should use yy_scan_bytes instead of buffering into memory.
		char tmp[4096];
		ssize_t nbytes;
		wget_buffer buf;

		wget_buffer_init(&buf, NULL, 4096);

		// collect everything up to EOF (or a read error) before parsing
		while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
			wget_buffer_memcat(&buf, tmp, nbytes);
		}

		if (buf.length)
			wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx);

		wget_buffer_deinit(&buf);
	}
}
|