File: html.c

package info (click to toggle)
swish%2B%2B 1.1b3-3
links: PTS
area: main
in suites: slink
size: 416 kB
ctags: 409
sloc: ansic: 2,842; makefile: 247; sh: 48
file content (437 lines) | stat: -rw-r--r-- 11,105 bytes
parent folder | download | duplicates (2)
/*
**	SWISH++
**	html.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cctype>
#include <cstring>

// local
#include "config.h"
#include "entities.h"
#include "fake_ansi.h"
#include "html.h"
#include "util.h"

#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

//*****************************************************************************
//
// SYNOPSIS
//
	char convert_entity(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Convert either a numeric or character entity reference to its ASCII
//	character equivalent (if it has one).  A numeric reference is a
//	character sequence having the form:
//
//		&#d;
//		&#xh;
//		&#Xh;
//
//	where 'd' is a sequence of 1 or more decimal digits [0-9] and 'h' is
//	a sequence of 1 or more hexadecimal digits [0-9A-Fa-f].  A character
//	entity reference is a character sequence having the form:
//
//		&ref;
//
//	The character entities converted are listed in the entities.c file.
//
// PARAMETERS
//
//	c	This iterator is to be positioned at the character past the
//		'&'; if an entity is found, it is left after the ';'.  If no
//		terminating ';' is found, the iterator has still been advanced
//		past the characters that were examined.
//
//	end	The iterator marking the end of the file.
//
// RETURN VALUE
//
//	Returns the ASCII equivalent of the entity or ' ' (space) if either
//	there is no equivalent or the entity is malformed.  A numeric
//	reference whose value does not fit in an unsigned char, or that is
//	not printable according to isprint(), also yields ' '.
//
// EXAMPLE
//
//	The references in "r&eacute;sum&eacute;" will be converted to the
//	letter 'e' resulting in the "resume" string.
//
// SEE ALSO
//
//	"Character entity references in HTML 4.0," HTML 4.0 Specification,
//	http://www.w3.org/
//
//	ISO 8859-1: "Information Processing -- 8-bit single-byte coded graphic
//	character sets -- Part 1: Latin alphabet No. 1," 1987.
//
//	ISO 8879: "Information Processing -- Text and Office Systems --
//	Standard Generalized Markup Language (SGML)," 1986.
//
//*****************************************************************************
{
	////////// See if it's a numeric character reference //////////////////

	//
	// Note that evaluating is_hex has a side effect: when is_num is true,
	// the "++c" steps over the '#' regardless of whether an 'x' follows.
	//
	bool const is_num = (c != end && *c == '#');
	bool const is_hex = (is_num && ++c != end && (*c == 'x' || *c == 'X'));
	if ( is_hex ) ++c;

	////////// Find the terminating ';' ///////////////////////////////////

	char entity_buf[ Entity_Max_Size + 1 ];
	int entity_len = 0;

	while ( c != end && *c != ';' ) {
		if ( ++entity_len >= sizeof( entity_buf ) )
			return ' ';			// give up looking
		entity_buf[ entity_len - 1 ] = *c++;
	}
	if ( c == end )					// didn't find it
		return ' ';
	++c;						// put past ';'

	entity_buf[ entity_len ] = '\0';

	////////// Look up character entity reference /////////////////////////

	if ( !is_num ) {
		static entity_map entities;
		return entities[ entity_buf ];
	}

	////////// Parse a numeric character reference ////////////////////////

	//
	// The <cctype> functions are defined only for values representable as
	// an unsigned char (or EOF), so every character is converted before
	// being classified and the accumulated value is never allowed to grow
	// past the largest unsigned char.  Stopping early also keeps 'n' from
	// wrapping around on absurdly long digit strings.
	//
	unsigned const char_max = (unsigned char)~0u;

	register unsigned n = 0;
	for ( char const *e = entity_buf; *e; ++e ) {
		int const ch = (unsigned char)*e;
		if ( is_hex ) {
			if ( !isxdigit( ch ) )		// bad hex num
				return ' ';
			n = (n << 4) | ( isdigit( ch ) ?
				ch - '0' : tolower( ch ) - 'a' + 10
			);
		} else {
			if ( !isdigit( ch ) )		// bad dec num
				return ' ';
			n = n * 10 + ( ch - '0' );
		}
		if ( n > char_max )			// no char equivalent
			return ' ';
	}

	return isprint( int( n ) ) ? char( n ) : ' ';
}

//*****************************************************************************
//
// SYNOPSIS
//
	bool tag_cmp(
		file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end,
		register char const *s
	)
//
// DESCRIPTION
//
//	Compares the tag starting at the given iterator to the given string
//	(case insensitive).  Only as many characters as are in the string are
//	compared: no check is made that the tag name ends there.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to be positioned at the
//		first character after the '<'.  It is left at the first
//		character past the tag only if the tag matches; otherwise, it
//		is not touched.
//
//	end	The iterator marking the end of the file.
//
//	s	The string to compare against.
//
// RETURN VALUE
//
//	Returns true only if the tag matches.
//
//*****************************************************************************
{
	file_vector<char>::const_iterator d = c;

	//
	// Advance only after a character is known to match.  Incrementing as
	// part of the comparison itself would step 's' onto its terminating
	// null upon a mismatch in the last character, making a failed
	// comparison indistinguishable from a successful one.
	//
	for ( ; *s; ++s, ++d ) {
		if ( d == end )				// ran out of file
			return false;
		if ( to_upper( *s ) != to_upper( *d ) )	// chars differ
			return false;
	}

	c = d;						// put past the tag
	return true;
}

//*****************************************************************************
//
// SYNOPSIS
//
	inline bool is_html_comment(
		file_vector<char>::const_iterator &c,
		file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Checks to see if the current HTML tag is the start of a comment, i.e.,
//	whether the characters at the iterator are "!--".
//
// PARAMETERS
//
//	c	The iterator to use.  It is handed to tag_cmp() as is, so it
//		is moved past the "!--" only if that sequence is present.
//
//	end	The iterator marking the end of the file.
//
// RETURN VALUE
//
//	Returns true only if the current tag is the beginning of a comment.
//
// SEE ALSO
//
//	"On SGML and HTML: Comments," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	static char const comment_open[] = "!--";
	bool const is_comment = tag_cmp( c, end, comment_open );
	return is_comment;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void skip_html_comment(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Skip an HTML comment scanning for the closing "-->" character
//	sequence.  The HTML specification permits whitespace between the "--"
//	and the ">" (for some strange reason).  Unlike skipping an ordinary
//	HTML tag, quotes are not significant and no attempt must be made
//	either to "balance" them or to ignore what is in between them.
//
//	This function is more lenient than the HTML 4.0 specification in that
//	it allows for a string of hyphens within a comment; the specification
//	considers this to be an error.  As implemented, a run of one or more
//	hyphens, optional whitespace, and a '>' ends the comment, so a lone
//	"->" is accepted as a terminator as well.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to start at any position
//		after the '<' and before the '>'; it is left after the '>',
//		or at 'end' if the comment is never closed.
//
//	end	The iterator marking the end of the file.
//
// SEE ALSO
//
//	"On SGML and HTML: Comments," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	while ( c != end ) {
		if ( *c++ != '-' )			// not a hyphen: keep going
			continue;
		while ( c != end && *c == '-' )		// skip rest of hyphens
			++c;
		//
		// isspace() is defined only for values representable as an
		// unsigned char: convert first so that 8-bit characters in
		// the file are not passed as negative values.
		//
		while ( c != end && isspace( (unsigned char)*c ) )
			++c;
		if ( c != end && *c++ == '>' )		// found the end
			break;
	}
}

//*****************************************************************************
//
// SYNOPSIS
//
	char const* grep_title( file_vector<char> const &file )
//
// DESCRIPTION
//
//	Scan ("grep") through the first Title_Lines lines in an HTML file
//	looking for <TITLE>...</TITLE> tags to extract the title.  Every
//	non-space whitespace character in the title is converted to a space;
//	leading and trailing spaces are removed.  A title longer than
//	Title_Max_Size characters is truncated and made to end in "...".
//
// PARAMETERS
//
//	file	The file presumed to be an HTML file.
//
// RETURN VALUE
//
//	Returns the title string or null if no title can be found, including
//	when the end of the file is reached before a complete
//	<TITLE>...</TITLE> pair has been seen.  The string lives in a static
//	buffer that is overwritten by the next call.
//
// EXAMPLE
//
//	Given:
//
//		<TITLE>
//		This is
//		a title
//		</TITLE>
//
//	returns:
//
//		This is a title
//
// SEE ALSO
//
//	"The global structure of an HTML document," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	static char const *const title_tag[] = {	// tag_index
		"TITLE",				//	0
		"/TITLE"				//	1
	};
	int tag_index = 0;
	int lines = 0;

	//
	// <TITLE>This is a title</TITLE>
	//        |              |
	//        after          before
	//
	// Mark the positions after the closing '>' of the start tag and before
	// the opening '<' of the end tag.  What's in between is the title.
	//
	file_vector<char>::const_iterator after, before;

	register file_vector<char>::const_iterator c = file.begin();
	while ( c != file.end() ) {
		if ( *c == '\n' && ++lines > Title_Lines ) {
			//
			// Didn't find <TITLE> within first Title_Lines lines
			// of file: forget it.
			//
			return 0;
		}

		if ( *c != '<' ) {			// not a tag: forget it
			++c;
			continue;
		}

		//
		// Found the start of an HTML tag: mark the position before it
		// in case it turns out to be the </TITLE> tag.
		//
		before = c++;

		if ( is_html_comment( c, file.end() ) ) {
			skip_html_comment( c, file.end() );
			continue;
		}

		//
		// Is the HTML tag a TITLE tag?
		//
		bool const found_title_tag =
			tag_cmp( c, file.end(), title_tag[ tag_index ] );
		skip_html_tag( c, file.end() );		// skip until '>'
		if ( !found_title_tag )			// wrong tag
			continue;

		if ( tag_index == 0 ) {
			//
			// Found the <TITLE> tag: mark the position after it
			// and begin looking for the </TITLE> tag.
			//
			after = c;
			++tag_index;
			continue;
		}

		////////// Found the entire title /////////////////////////////

		//
		// isspace() is defined only for values representable as an
		// unsigned char, hence the conversions below: titles may well
		// contain 8-bit characters.
		//

		// Remove leading spaces
		while ( after < before && isspace( (unsigned char)*after ) )
			++after;

		// Remove trailing spaces: "before" is backed up one character
		// too far by the loop, so it is put forward again afterwards.
		while ( after < --before && isspace( (unsigned char)*before ) )
			;
		++before;

		static char title[ Title_Max_Size + 1 ];
		int len = before - after;
		if ( len > Title_Max_Size ) {
			//
			// Too long: keep the first Title_Max_Size characters
			// and overwrite the last 3 of them with an ellipsis.
			// NOTE(review): assumes Title_Max_Size >= 3; it is
			// defined outside this file -- confirm.
			//
			::strncpy( title, after, Title_Max_Size );
			::strcpy( title + Title_Max_Size - 3, "..." );
			len = Title_Max_Size;
		} else {
			::copy( after, before, title );
		}
		title[ len ] = '\0';

		// Normalize all whitespace chars to space chars
		for ( register char *p = title; *p; ++p )
			if ( isspace( (unsigned char)*p ) )
				*p = ' ';

		return title;
	}

	//
	// Reached the end of the file without having seen a complete title.
	// Without this, control would flow off the end of a function that
	// must return a value.
	//
	return 0;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void skip_html_tag(
		register file_vector<char>::const_iterator &c,
		register file_vector<char>::const_iterator end
	)
//
// DESCRIPTION
//
//	Move past the remainder of an HTML tag by scanning for its closing
//	'>'.  A '>' occurring between a pair of single or a pair of double
//	quotation marks does not count as the close of the tag.  If the tag
//	turns out to be the start of a comment, the comment is skipped
//	instead using the rules for comments.
//
// PARAMETERS
//
//	c	The iterator to use.  It is presumed to start at any position
//		after the '<' and before the '>'; it is left at the first
//		character after the '>', or at 'end' if there is no '>'.
//
//	end	The iterator marking the end of the file.
//
// SEE ALSO
//
//	"The global structure of an HTML document," HTML 4.0 Specification,
//	http://www.w3.org/
//
//*****************************************************************************
{
	// Comments are terminated differently: hand them off entirely.
	if ( is_html_comment( c, end ) ) {
		skip_html_comment( c, end );
		return;
	}

	char open_quote = '\0';		// the quote char we're inside, if any

	while ( c != end ) {
		char const ch = *c++;	// every path consumes exactly one char

		if ( open_quote ) {
			// Inside quotes: only the matching quote matters.
			if ( ch == open_quote )
				open_quote = '\0';
		} else if ( ch == '"' || ch == '\'' ) {
			open_quote = ch;	// start ignoring stuff
		} else if ( ch == '>' ) {
			return;			// found the end of the tag
		}
	}
}