File: html.c

package info (click to toggle)
swish++ 1.1b3-3
  • links: PTS
  • area: main
  • in suites: slink
  • size: 416 kB
  • ctags: 409
  • sloc: ansic: 2,842; makefile: 247; sh: 48
file content (437 lines) | stat: -rw-r--r-- 11,105 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
/*
**	SWISH++
**	html.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cctype>
#include <cstring>

// local
#include "config.h"
#include "entities.h"
#include "fake_ansi.h"
#include "html.h"
#include "util.h"

#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

//*****************************************************************************
//
// SYNOPSIS
//
	char convert_entity(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Convert either a numeric or character entity reference to its ASCII
//	character equivalent (if it has one).  A numeric reference is a
//	character sequence having the form:
//
//		&#d;
//		&#xh;
//		&#Xh;
//
//	where 'd' is a sequence of 1 or more decimal digits [0-9] and 'h' is
//	a sequence of 1 or more hexadecimal digits [0-9A-Fa-f].  A character
//	entity reference is a character sequence having the form:
//
//		&ref;
//
//	Character entity references are looked up in a function-local static
//	entity_map instance.
//
// PARAMETERS
//
//	c	This iterator is to be positioned at the character past the
//		'&'; if a terminating ';' is found, it is left after the ';'.
//		If the reference is longer than Entity_Max_Size characters or
//		the end of the file is reached first, it is left wherever the
//		scan stopped.
//
//	end	The iterator marking the end of the file.
//
// RETURN VALUE
//
//	Returns the ASCII equivalent of the entity or ' ' (space) if either
//	there is no equivalent or the entity is malformed.  A numeric
//	reference whose value is greater than 0xFF, or that is not a
//	printable character according to isprint(), yields ' '.
//
// EXAMPLE
//
//	The references in "r&eacute;sum&eacute;" will be converted to the
//	letter 'e' resulting in the "resume" string.
//
// SEE ALSO
//
//	"Character entity references in HTML 4.0," HTML 4.0 Specification,
//	http://www.w3.org/
//
//	ISO 8859-1: "Information Processing -- 8-bit single-byte coded graphic
//	character sets -- Part 1: Latin alphabet No. 1," 1987.
//
//	ISO 8879: "Information Processing -- Text and Office Systems --
//	Standard Generalized Markup Language (SGML)," 1986.
//
//*****************************************************************************
{
	////////// See if it's a numeric character reference //////////////////

	bool const is_num = (c != end && *c == '#');
	bool const is_hex = (is_num && ++c != end && (*c == 'x' || *c == 'X'));
	if ( is_hex ) ++c;

	////////// Find the terminating ';' ///////////////////////////////////

	char entity_buf[ Entity_Max_Size + 1 ];
	int entity_len = 0;

	while ( c != end && *c != ';' ) {
		if ( entity_len >= Entity_Max_Size )
			return ' ';			// give up looking
		entity_buf[ entity_len++ ] = *c++;
	}
	if ( c == end )					// didn't find it
		return ' ';
	++c;						// put past ';'

	entity_buf[ entity_len ] = '\0';

	////////// Look up character entity reference /////////////////////////

	if ( !is_num ) {
		static entity_map entities;
		return entities[ entity_buf ];
	}

	////////// Parse a numeric character reference ////////////////////////

	unsigned n = 0;
	for ( char const *e = entity_buf; *e; ++e ) {
		//
		// The <cctype> functions require a value representable as an
		// unsigned char (or EOF); a plain char may be negative.
		//
		unsigned char const digit = (unsigned char)*e;
		if ( is_hex ) {
			if ( !isxdigit( digit ) )	// bad hex num
				return ' ';
			n = (n << 4) | ( isdigit( digit ) ?
				digit - '0' : tolower( digit ) - 'a' + 10
			);
		} else {
			if ( !isdigit( digit ) )	// bad dec num
				return ' ';
			n = n * 10 + ( digit - '0' );
		}
		//
		// Anything beyond a single byte has no char equivalent.
		// Stopping here also keeps 'n' from wrapping around and keeps
		// the isprint() argument below within its valid range.
		//
		if ( n > 0xFF )
			return ' ';
	}

	return isprint( int( n ) ) ? char( n ) : ' ';
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool tag_cmp(
		file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end,
		register char const *s
	)
//
// DESCRIPTION
//
//	Compares the characters starting at the given iterator to the given
//	string (case insensitive).  Only as many characters as are in the
//	string are compared, i.e., this is a prefix comparison: no check is
//	made that the tag name ends where the string does.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to be positioned at the
//		first character after the '<'.  It is left at the first
//		character past the matched text only if the tag matches;
//		otherwise, it is not touched.
//
//	end	The iterator marking the end of the file.
//
//	s	The null-terminated string to compare against.
//
// RETURN VALUE
//
//	Returns true only if every character of the string matches.  Returns
//	false if the end of the file is reached before the whole string has
//	been compared.
//
//*****************************************************************************
{
	file_vector<char>::const_iterator d = c;
	//
	// Advance only after a character has matched.  Advancing as part of
	// the comparison itself would step past a mismatching final character
	// and make the string look fully consumed.
	//
	for ( ; *s; ++s, ++d )
		if ( d == end || to_upper( *s ) != to_upper( *d ) )
			return false;
	c = d;
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline bool is_html_comment(
		file_vector<char>::const_iterator &c,
		file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Checks to see if the current HTML tag is the start of a comment, i.e.,
//	whether the text at the iterator begins with "!--".
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to be positioned at the
//		first character after the '<'.  It is passed on to tag_cmp(),
//		which moves it past the "!--" only if that text is present.
//
//	end	The iterator marking the end of the file.
//
// RETURN VALUE
//
//	Returns true only if the current tag is the beginning of a comment.
//
// SEE ALSO
//
//	"On SGML and HTML: Comments," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	char const *const comment_start = "!--";
	return tag_cmp( c, end, comment_start );
}

//*****************************************************************************
//
// SYNOPSIS
//
	void skip_html_comment(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Skip an HTML comment scanning for the closing "-->" character
//	sequence.  The HTML specification permits whitespace between the "--"
//	and the ">" (for some strange reason).  Unlike skipping an ordinary
//	HTML tag, quotes are not significant and no attempt must be made
//	either to "balance" them or to ignore what is in between them.
//
//	This function is more lenient than the HTML 4.0 specification in that
//	it allows for a string of hyphens within a comment; the specification
//	considers this to be an error.  As implemented, any run of one or
//	more hyphens, optionally followed by whitespace and then a '>', ends
//	the comment.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to start at any position
//		after the '<' and before the '>'; it is left after the '>'
//		or at the end of the file if the comment is never closed.
//
//	end	The iterator marking the end of the file.
//
// SEE ALSO
//
//	"On SGML and HTML: Comments," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	while ( c != end ) {
		if ( *c++ != '-' )
			continue;
		while ( c != end && *c == '-' )
			++c;
		//
		// isspace() requires a value representable as an unsigned
		// char; a plain char may be negative for 8-bit text.
		//
		while ( c != end && isspace( (unsigned char)*c ) )
			++c;
		if ( c != end && *c++ == '>' )
			break;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	char const* grep_title( file_vector<char> const &file )
//
// DESCRIPTION
//
//	Scan ("grep") through the first Title_Lines lines in an HTML file
//	looking for <TITLE>...</TITLE> tags to extract the title.  Every
//	non-space whitespace character in the title is converted to a space;
//	leading and trailing spaces are removed.  A title longer than
//	Title_Max_Size characters is truncated to that length with its last
//	three characters replaced by "...".
//
//	Only newlines seen outside of tags and comments count toward the
//	Title_Lines limit.
//
// PARAMETERS
//
//	file	The file presumed to be an HTML file.
//
// RETURN VALUE
//
//	Returns the title string or null if no title can be found.  The
//	string lives in a static buffer that is overwritten by the next call.
//
// EXAMPLE
//
//	Given:
//
//		<TITLE>
//		This is
//		a title
//		</TITLE>
//
//	returns:
//
//		This is a title
//
// SEE ALSO
//
//	"The global structure of an HTML document," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	static char const *const title_tag[] = {	// tag_index
		"TITLE",				//	0
		"/TITLE"				//	1
	};
	int tag_index = 0;
	int lines = 0;

	//
	// <TITLE>This is a title</TITLE>
	//        |              |
	//        after          before
	//
	// Mark the positions after the closing '>' of the start tag and before
	// the opening '<' of the end tag.  What's in between is the title.
	//
	file_vector<char>::const_iterator after, before;

	file_vector<char>::const_iterator c = file.begin();
	while ( c != file.end() ) {
		if ( *c == '\n' && ++lines > Title_Lines ) {
			//
			// Didn't find <TITLE> within first Title_Lines lines
			// of file: forget it.
			//
			return 0;
		}

		if ( *c != '<' ) {			// not a tag: forget it
			++c;
			continue;
		}

		//
		// Found the start of an HTML tag: mark the position before it
		// in case it turns out to be the </TITLE> tag.
		//
		before = c++;

		if ( is_html_comment( c, file.end() ) ) {
			skip_html_comment( c, file.end() );
			continue;
		}

		//
		// Is the HTML tag a TITLE tag?
		//
		bool const found_title_tag =
			tag_cmp( c, file.end(), title_tag[ tag_index ] );
		skip_html_tag( c, file.end() );		// skip until '>'
		if ( !found_title_tag )			// wrong tag
			continue;

		if ( tag_index == 0 ) {
			//
			// Found the <TITLE> tag: mark the position after it
			// and begin looking for the </TITLE> tag.
			//
			after = c;
			++tag_index;
			continue;
		}

		////////// Found the entire title /////////////////////////////

		//
		// The <cctype> functions require a value representable as an
		// unsigned char, hence the casts below.
		//

		// Remove leading spaces
		while ( after < before && isspace( (unsigned char)*after ) )
			++after;

		// Remove trailing spaces
		while ( after < --before && isspace( (unsigned char)*before ) )
			;
		++before;

		static char title[ Title_Max_Size + 1 ];
		int len = before - after;
		bool const too_long = len > Title_Max_Size;
		if ( too_long )
			len = Title_Max_Size;
		::copy( after, after + len, title );
		title[ len ] = '\0';
		if ( too_long ) {
			// Show that the title was cut short.
			::strcpy( title + Title_Max_Size - 3, "..." );
		}

		// Normalize all whitespace chars to space chars
		for ( char *p = title; *p; ++p )
			if ( isspace( (unsigned char)*p ) )
				*p = ' ';

		return title;
	}

	//
	// Reached the end of the file without finding a complete
	// <TITLE>...</TITLE> pair.
	//
	return 0;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void skip_html_tag(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Advance past the remainder of an HTML tag, i.e., up to and including
//	its closing '>'.  A '>' that appears inside a single- or double-quoted
//	string does not end the tag.  If the tag turns out to be the start of
//	a comment, the whole comment is skipped instead.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to start at any position
//		after the '<' and before the '>'; it is left at the first
//		character after the '>' or at the end of the file if no
//		closing '>' is found.
//
//	end	The iterator marking the end of the file.
//
// SEE ALSO
//
//	"The global structure of an HTML document," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	if ( is_html_comment( c, end ) ) {
		skip_html_comment( c, end );
		return;
	}

	char open_quote = '\0';			// quote char we're inside of
	while ( c != end ) {
		char const ch = *c++;		// every char is consumed
		if ( open_quote ) {
			// Inside quotes: only the matching quote matters.
			if ( ch == open_quote )
				open_quote = '\0';
		} else if ( ch == '\"' || ch == '\'' ) {
			open_quote = ch;	// start ignoring stuff
		} else if ( ch == '>' ) {
			return;			// found it
		}
	}
}