File: css.c

package info (click to toggle)
wget2 2.2.0%2Bds-3
links: PTS, VCS
area: main
in suites: forky, sid
size: 22,468 kB
sloc: ansic: 121,166; sh: 11,559; makefile: 878; xml: 182; sed: 16
file content (213 lines) | stat: -rw-r--r-- 5,739 bytes
parent folder | download | duplicates (2)
/*
 * Copyright (c) 2012 Tim Ruehsen
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * css parsing routines
 *
 * Changelog
 * 03.07.2012  Tim Ruehsen  created
 *
 * A parser using the flex tokenizer, created with flex tokens from
 *   https://www.w3.org/TR/css3-syntax/
 *
 * TODO:
 *  - since we are just interested in @import ... and url(...), we could use
 *    a simplistic hand-written parser which might be much smaller and faster
 */

#include <config.h>

#include <stddef.h>
#include <limits.h>
#include <unistd.h>
#include <string.h>
#include <c-ctype.h>
#include <fcntl.h>
#include <sys/stat.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif

#include <wget.h>
#include "private.h"

#include "css_tokenizer.h"

// see css_tokenizer.c
//
// Hand-written prototypes for the tokenizer functions used in this file.
// The file header describes the tokenizer as flex-based, so these are
// presumably the entry points of the flex-generated reentrant scanner.

// Opaque scanner handle passed to every tokenizer call below.
typedef void* yyscan_t;
// Length of the text of the token most recently returned by yylex().
int yyget_leng(yyscan_t yyscanner);
// Text of the token most recently returned by yylex().
char *yyget_text(yyscan_t yyscanner);
typedef struct yy_buffer_state *YY_BUFFER_STATE;
// Creates a scanner; the handle is stored through 'scanner'.
int yylex_init(yyscan_t* scanner);
// Feeds a string to the scanner (not called in this file).
YY_BUFFER_STATE yy_scan_string(const char * yystr, yyscan_t yyscanner);
// Feeds 'len' bytes starting at 'yystr' to the scanner. Note the int length.
YY_BUFFER_STATE yy_scan_bytes(const char * yystr, int len, yyscan_t yyscanner);
// Returns the next token id (compared against CSSEOF, URI, STRING, ... below).
int yylex(yyscan_t yyscanner);
// Releases a scanner created by yylex_init().
int yylex_destroy(yyscan_t yyscanner);
// Allocation hooks, defined right below in terms of wget_malloc()/wget_realloc().
void *yyalloc(size_t size);
void *yyrealloc(void *p, size_t size);

// Allocation hook (presumably called by the scanner in css_tokenizer.c):
// hands the request over to wget_malloc() and returns its result unchanged.
void *yyalloc(size_t size)
{
	void *mem = wget_malloc(size);

	return mem;
}
// Reallocation hook (presumably called by the scanner in css_tokenizer.c):
// hands the request over to wget_realloc() and returns its result unchanged.
void *yyrealloc(void *p, size_t size)
{
	void *resized = wget_realloc(p, size);

	return resized;
}

/*
 * Tokenize a CSS buffer and report the URLs and the charset found in it.
 *
 * buf               CSS data, does not need to be NUL-terminated
 * len               number of bytes in buf; at most INT_MAX - 2 bytes are
 *                   scanned because the tokenizer takes an int length
 * callback_uri      if not NULL, called for each `@import "..."` string and
 *                   each `url(...)` token as (user_ctx, url, url_length, pos).
 *                   Quotes, the `url(` / `)` wrapper and surrounding
 *                   whitespace are stripped. `pos` is the sum of the lengths
 *                   of all tokens seen before the URL plus the offset of the
 *                   URL inside its token.
 * callback_encoding if not NULL, called for the string following `@charset`
 *                   as (user_ctx, name, name_length), quotes stripped
 * user_ctx          passed through to both callbacks untouched
 *
 * The url/name pointers handed to the callbacks point into the tokenizer's
 * current token text and are not NUL-terminated at the reported length.
 */
void wget_css_parse_buffer(
	const char *buf,
	size_t len,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	int token;
	size_t length, pos = 0;
	char *text;
	yyscan_t scanner;

	// yy_scan_bytes() takes an int: clamp instead of letting the cast wrap.
	// Two bytes of headroom are kept for the scanner's own end-of-buffer
	// markers (assumption about the flex implementation).
	if (len > (size_t) INT_MAX - 2)
		len = (size_t) INT_MAX - 2;

	// without a scanner there is nothing we can do
	if (yylex_init(&scanner))
		return;

	yy_scan_bytes(buf, (int) len, scanner);

	while ((token = yylex(scanner)) != CSSEOF) {
		if (token == IMPORT_SYM) {
			// e.g. @import "https://example.com/index.html"
			pos += yyget_leng(scanner);

			// skip whitespace before URI/STRING
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// input ended right after @import: never call yylex() again after EOF
			if (token == CSSEOF)
				break;

			// now token should be STRING or URI
			if (token == STRING)
				token = URI;
		}

		if (token == URI && callback_uri) {
			// e.g. url(https://example.com/index.html)
			text = yyget_text(scanner);
			length = yyget_leng(scanner);

			if (*text == '\'' || *text == '\"') {
				// a string - remove the quotes (needs room for both of them)
				if (length >= 2)
					callback_uri(user_ctx, text + 1, length - 2, pos + 1);
			} else if (length > 4 && !wget_strncasecmp_ascii(text, "url(", 4)) {
				// extract URI from url(...)
				char *otext = text;

				// remove trailing ) and any spaces before, never eating into "url("
				length--;
				while (length > 4 && c_isspace(text[length - 1]))
					length--;

				// remove leading url( and any spaces after
				length -= 4;
				text += 4;
				while (length && c_isspace(*text)) {
					text++;
					length--;
				}

				// remove quotes
				if (length && (*text == '\'' || *text == '\"')) {
					text++;
					length--;
				}

				if (length && (text[length - 1] == '\'' || text[length - 1] == '\"'))
					length--;

				callback_uri(user_ctx, text, length, pos + (text - otext));
			}
		} else if (token == CHARSET_SYM && callback_encoding) {
			// e.g. @charset "UTF-8"
			pos += yyget_leng(scanner);

			// skip whitespace before charset name
			while ((token = yylex(scanner)) == S)
				pos += yyget_leng(scanner);

			// now token should be STRING
			if (token == STRING) {
				text = yyget_text(scanner);
				length = yyget_leng(scanner);

				if (*text == '\'' || *text == '\"') {
					// a string - remove the quotes (needs room for both of them)
					if (length >= 2)
						callback_encoding(user_ctx, text + 1, length - 2);
				} else {
					// a string without quotes
					callback_encoding(user_ctx, text, length);
				}
			} else {
				error_printf(_("Unknown token after @charset: %d\n"), token);
			}

			// input ended right after @charset: never call yylex() again after EOF
			if (token == CSSEOF)
				break;
		}
		pos += yyget_leng(scanner);
	}

	yylex_destroy(scanner);
}

/*
 * Read CSS from a file (or from STDIN if fname is "-") and feed it to
 * wget_css_parse_buffer().
 *
 * fname             path of the CSS file, or "-" for STDIN
 * callback_uri      passed on to wget_css_parse_buffer()
 * callback_encoding passed on to wget_css_parse_buffer()
 * user_ctx          passed on to wget_css_parse_buffer()
 *
 * Nothing is parsed if the file is empty or cannot be opened or read;
 * open and read failures are reported via error_printf().
 */
void wget_css_parse_file(
	const char *fname,
	wget_css_parse_uri_callback *callback_uri,
	wget_css_parse_encoding_callback *callback_encoding,
	void *user_ctx)
{
	if (strcmp(fname, "-")) {
		int fd;

		if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
			struct stat st;

			if (fstat(fd, &st) == 0 && st.st_size > 0) {
				size_t size = (size_t) st.st_size;
#ifdef HAVE_MMAP
				// The parser is length-based, so no terminating 0 byte is needed.
				// Mapping exactly the file size read-only avoids touching a page
				// beyond EOF (SIGBUS when the size is a multiple of the page size).
				char *buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

				if (buf != MAP_FAILED) {
					wget_css_parse_buffer(buf, size, callback_uri, callback_encoding, user_ctx);
					munmap(buf, size);
				} else
					error_printf(_("Failed to read %s\n"), fname);
#else
				char *buf = wget_malloc(size);

				if (buf) {
					size_t nread = 0;
					ssize_t rc = 0;

					// read() may return less than requested: loop until done, EOF or error
					while (nread < size && (rc = read(fd, buf + nread, size - nread)) > 0)
						nread += (size_t) rc;

					if (rc < 0)
						error_printf(_("Failed to read %s\n"), fname);
					else if (nread)
						wget_css_parse_buffer(buf, nread, callback_uri, callback_encoding, user_ctx); // only the bytes actually read

					xfree(buf);
				}
#endif
			}
			close(fd);
		} else
			error_printf(_("Failed to open %s\n"), fname);
	} else {
		// read data from STDIN.
		// maybe should use yy_scan_bytes instead of buffering into memory.
		char tmp[4096];
		ssize_t nbytes;
		wget_buffer buf;

		wget_buffer_init(&buf, NULL, 4096);

		while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
			wget_buffer_memcat(&buf, tmp, nbytes);
		}

		if (buf.length)
			wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx);

		wget_buffer_deinit(&buf);
	}
}