File: print_css_urls2.c

package info (click to toggle)
wget2 2.2.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 22,248 kB
  • sloc: ansic: 121,144; sh: 11,559; makefile: 878; xml: 182; sed: 16
file content (178 lines) | stat: -rw-r--r-- 5,386 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/*
 * Copyright (c) 2013 Tim Ruehsen
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * Advanced example for CSS parsing using libwget
 *
 * Changelog
 * 15.01.2013  Tim Ruehsen  created
 *
 * Demonstrate how to extract URIs from CSS files, converting them to UTF-8
 * if needed, converting relative URIs to absolute.
 *
 * We ignore the BOM (Byte Order Mark) here.
 * BOM see: https://www.w3.org/International/questions/qa-byte-order-mark
 *
 */

#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#include <wget.h>

// use the helper routines provided by libwget
#define info_printf        wget_info_printf
#define error_printf       wget_error_printf
#define error_printf_exit  wget_error_printf_exit

struct css_context {
	wget_iri
		*base;
	const char
		*encoding;
	wget_buffer
		uri_buf;
	char
		encoding_allocated;
};

static void WGET_GCC_NORETURN usage(const char *myname)
{
	error_printf_exit(
		"\nUsage: %s [options] file...\n"\
		"  --base <URI>          Default base for relative URIs, default: http://www.example.com\n"\
		"  --encoding <Encoding> Default file character encoding, default: iso-8859-1\n"\
		"\n"\
		"  Examples:\n"\
		"    %s --base http://www.mydomain.com x.css\n"\
		"    cat x.css | %s --base http://www.mydomain.com -\n"\
		"\n"\
		"  Print URIs as found (without a base):\n"\
		"    %s --base \"\" x.css\n\n",
		myname, myname, myname, myname);
}

// Callback function, called from CSS parser for each @charset found.
static void css_parse_encoding(void *context, const char *encoding, size_t len)
{
	struct css_context *ctx = context;

	// take only the first @charset rule
	if (!ctx->encoding_allocated && wget_strncasecmp_ascii(ctx->encoding, encoding, len)) {
		if (ctx->encoding)
			info_printf("Encoding changed from '%s' to '%.*s'\n", ctx->encoding, (int)len, encoding);
		else
			info_printf("Encoding set to '%.*s'\n", (int)len, encoding);

		ctx->encoding = wget_strmemdup(encoding, len);
		ctx->encoding_allocated = 1;
	}
}

// Callback function, called from CSS parser for each URI found.
static void css_parse_uri(void *context, const char *url, size_t len, size_t pos WGET_GCC_UNUSED)
{
	struct css_context *ctx = context;

	// ignore e.g. href='#'
	if (!ctx->base) {
		wget_info_printf("  %.*s\n", (int)len, url);
	} else if (wget_iri_relative_to_abs(ctx->base, url, len, &ctx->uri_buf)) {
		wget_info_printf("  %.*s -> %s\n", (int)len, url, ctx->uri_buf.data);
	} else {
		error_printf("Cannot resolve relative URI %.*s\n", (int)len, url);
	}
}

static void css_parse_localfile(const char *fname, wget_iri *base, const char *encoding)
{
	struct css_context context = { .base = base, .encoding = encoding };

	wget_buffer_init(&context.uri_buf, NULL, 128);

	wget_css_parse_file(fname, css_parse_uri, css_parse_encoding, &context);

	if (context.encoding_allocated)
		wget_xfree(context.encoding);

	wget_buffer_deinit(&context.uri_buf);
}

int main(int argc, const char *const *argv)
{
	// Base URI for converting relative to absolute URIs
	const char *
		base = "http://www.example.com";

	// We assume that base is encoded in the local charset.
	const char *
		local_encoding = wget_local_charset_encoding();

	// parsed 'base'
	wget_iri
		*base_uri;

	// Character encoding of CSS file content
	// An HTTP response may contain the encoding in the Content-Type header,
	// but if
	// see https://stackoverflow.com/questions/2526033/why-specify-charset-utf-8-in-your-css-file
	const char *
		css_encoding = NULL;

	int
		argpos;

	// We want the libwget error messages be printed to STDERR.
	// From here on, we can call wget_error_printf, etc.
	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_ERROR), stderr);

	// We want the libwget info messages be printed to STDOUT.
	// From here on, we can call wget_info_printf, etc.
	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_INFO), stdout);

	// parse options
	for (argpos = 1; argpos < argc; argpos++) {
		if (!strcmp(argv[argpos], "--base") && argc - argpos > 1) {
			base = argv[++argpos];
			info_printf("Base URL encoding = '%s'\n", local_encoding);
		} else if (!strcmp(argv[argpos], "--encoding") && argc - argpos > 1) {
			css_encoding = argv[++argpos];
		} else if (!strcmp(argv[argpos], "--")) {
			argpos++;
			break;
		} else if (argv[argpos][0] == '-') {
			usage(argv[0]);
		} else
			break;
	}

	// All URIs are converted into UTF-8 charset.
	// That's why we need the local encoding (aka 'encoding of base URI') here.
	base_uri = wget_iri_parse(base, local_encoding);

	for (;argpos < argc; argpos++) {
		// use '-' as filename for STDIN
		css_parse_localfile(argv[argpos], base_uri, css_encoding);
	}

	wget_iri_free(&base_uri);

	return 0;
}