1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
|
/*
* Copyright (c) 2013 Tim Ruehsen
* Copyright (c) 2015-2024 Free Software Foundation, Inc.
*
* This file is part of libwget.
*
* Libwget is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Libwget is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with libwget. If not, see <https://www.gnu.org/licenses/>.
*
*
* Advanced example for CSS parsing using libwget
*
* Changelog
* 15.01.2013 Tim Ruehsen created
*
* Demonstrate how to extract URIs from CSS files, converting them to UTF-8
* if needed, converting relative URIs to absolute.
*
* We ignore the BOM (Byte Order Mark) here.
* BOM see: https://www.w3.org/International/questions/qa-byte-order-mark
*
*/
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <wget.h>
// use the helper routines provided by libwget
#define info_printf wget_info_printf
#define error_printf wget_error_printf
#define error_printf_exit wget_error_printf_exit
// State shared between css_parse_localfile() and the CSS parser callbacks.
struct css_context {
	// Base for resolving relative URIs; when NULL, URIs are printed as found.
	wget_iri *base;
	// Current character encoding name: the caller supplied default (may be NULL)
	// until an @charset rule replaces it with a heap copy.
	const char *encoding;
	// Scratch buffer that receives the absolute form of each resolved URI.
	wget_buffer uri_buf;
	// Non-zero once 'encoding' points to memory allocated by the @charset
	// callback, i.e. memory that has to be freed after parsing.
	char encoding_allocated;
};
// Print the command line help through error_printf_exit().
// The function is declared WGET_GCC_NORETURN, so callers must not expect
// control to come back.
//
// myname: program name inserted into the help text (main() passes argv[0]).
static void WGET_GCC_NORETURN usage(const char *myname)
{
	// Adjacent string literals are concatenated by the compiler,
	// so no line continuation characters are needed.
	error_printf_exit(
		"\nUsage: %s [options] file...\n"
		" --base <URI> Default base for relative URIs, default: http://www.example.com\n"
		" --encoding <Encoding> Default file character encoding, default: iso-8859-1\n"
		"\n"
		" Examples:\n"
		" %s --base http://www.mydomain.com x.css\n"
		" cat x.css | %s --base http://www.mydomain.com -\n"
		"\n"
		" Print URIs as found (without a base):\n"
		" %s --base \"\" x.css\n\n",
		myname,  // Usage line
		myname,  // file example
		myname,  // STDIN example
		myname); // example without a base
}
// Callback function, called from CSS parser for each @charset found.
//
// context:  the struct css_context handed to wget_css_parse_file()
// encoding: charset name from the @charset rule; only the first 'len' bytes
//           are used (it is printed with %.*s and copied with wget_strmemdup)
// len:      number of bytes of 'encoding' that make up the name
//
// Only the first @charset rule is honored. Its name is copied into the
// context and 'encoding_allocated' is set, so css_parse_localfile() knows
// it has to free the copy.
static void css_parse_encoding(void *context, const char *encoding, size_t len)
{
	struct css_context *ctx = context;

	// take only the first @charset rule
	if (ctx->encoding_allocated || !encoding)
		return;

	if (!ctx->encoding) {
		info_printf("Encoding set to '%.*s'\n", (int)len, encoding);
	} else if (strlen(ctx->encoding) != len || wget_strncasecmp_ascii(ctx->encoding, encoding, len)) {
		// The length check matters: comparing just 'len' bytes would treat
		// e.g. "iso-8859-1" as equal to a current encoding of "iso-8859-15".
		info_printf("Encoding changed from '%s' to '%.*s'\n", ctx->encoding, (int)len, encoding);
	}

	// Record the rule even if it names the encoding we already had.
	// Otherwise 'encoding_allocated' would stay 0 and a later @charset rule
	// could still override the first one.
	ctx->encoding = wget_strmemdup(encoding, len);
	ctx->encoding_allocated = 1;
}
// Callback function, called from CSS parser for each URI found.
//
// context: the struct css_context handed to wget_css_parse_file()
// url:     the URI as found in the CSS; only the first 'len' bytes are used
// len:     number of bytes of 'url' that make up the URI
// pos:     unused
//
// Without a base the URI is printed as found. Otherwise it is resolved
// against the base and printed together with the result, or an error is
// reported if it can't be resolved.
static void css_parse_uri(void *context, const char *url, size_t len, size_t pos WGET_GCC_UNUSED)
{
	struct css_context *css = context;

	// No base: nothing to resolve, just show the URI.
	if (!css->base) {
		wget_info_printf(" %.*s\n", (int)len, url);
		return;
	}

	// On success the absolute URI has been written into css->uri_buf.
	if (!wget_iri_relative_to_abs(css->base, url, len, &css->uri_buf)) {
		error_printf("Cannot resolve relative URI %.*s\n", (int)len, url);
		return;
	}

	wget_info_printf(" %.*s -> %s\n", (int)len, url, css->uri_buf.data);
}
// Parse one CSS file and print every URI found in it.
//
// fname:    name of the file, passed on to wget_css_parse_file()
// base:     base for resolving relative URIs, or NULL to print URIs as found
// encoding: default character encoding of the file content, may be NULL
static void css_parse_localfile(const char *fname, wget_iri *base, const char *encoding)
{
	// Members not named here (uri_buf, encoding_allocated) start out zeroed.
	struct css_context ctx = {
		.base = base,
		.encoding = encoding
	};

	// Buffer for the resolved URIs, reused for each callback invocation.
	wget_buffer_init(&ctx.uri_buf, NULL, 128);

	wget_css_parse_file(fname, css_parse_uri, css_parse_encoding, &ctx);

	// The encoding has to be freed only if the @charset callback replaced
	// the caller's string with its own copy.
	if (ctx.encoding_allocated)
		wget_xfree(ctx.encoding);

	wget_buffer_deinit(&ctx.uri_buf);
}
// Program entry: parse the options, then print the URIs of each CSS file
// named on the command line.
//
// Options:
//   --base <URI>           base for converting relative URIs to absolute ones
//   --encoding <Encoding>  default character encoding of the CSS files
//   --                     end of options, everything after it is a filename
//
// A lone '-' is not an option but a filename (STDIN).
// Any other argument starting with '-' prints the usage and exits.
int main(int argc, const char *const *argv)
{
	// Base URI for converting relative to absolute URIs
	const char *base = "http://www.example.com";

	// We assume that base is encoded in the local charset.
	const char *local_encoding = wget_local_charset_encoding();

	// parsed 'base'
	wget_iri *base_uri;

	// Character encoding of CSS file content
	// An HTTP response may contain the encoding in the Content-Type header,
	// but a local file comes without such a header, so a default can be
	// given with --encoding. An @charset rule within the file is reported
	// by the parser.
	// see https://stackoverflow.com/questions/2526033/why-specify-charset-utf-8-in-your-css-file
	const char *css_encoding = NULL;

	int argpos;

	// We want the libwget error messages be printed to STDERR.
	// From here on, we can call wget_error_printf, etc.
	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_ERROR), stderr);

	// We want the libwget info messages be printed to STDOUT.
	// From here on, we can call wget_info_printf, etc.
	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_INFO), stdout);

	// parse options
	for (argpos = 1; argpos < argc; argpos++) {
		const char *arg = argv[argpos];

		if (!strcmp(arg, "--base") && argc - argpos > 1) {
			base = argv[++argpos];
			info_printf("Base URL encoding = '%s'\n", local_encoding);
		} else if (!strcmp(arg, "--encoding") && argc - argpos > 1) {
			css_encoding = argv[++argpos];
		} else if (!strcmp(arg, "--")) {
			argpos++;
			break;
		} else if (arg[0] == '-' && arg[1] != '\0') {
			// Unknown option, or a known option lacking its value.
			// A lone '-' must not end up here: it is the filename for
			// STDIN, as shown in the usage examples.
			usage(argv[0]);
		} else
			break;
	}

	// All URIs are converted into UTF-8 charset.
	// That's why we need the local encoding (aka 'encoding of base URI') here.
	base_uri = wget_iri_parse(base, local_encoding);

	for (; argpos < argc; argpos++) {
		// use '-' as filename for STDIN
		css_parse_localfile(argv[argpos], base_uri, css_encoding);
	}

	wget_iri_free(&base_uri);

	return 0;
}
|