File: validate.cgi

package info (click to toggle)
wdg-html-validator 1.0-6
links: PTS
area: contrib
in suites: potato
size: 1,404 kB
ctags: 103
sloc: perl: 2,912; ansic: 1,228; makefile: 48
file content (1356 lines) | stat: -rwxr-xr-x 37,342 bytes
#!/usr/bin/perl

#####################################################################
#
# WDG HTML Validator <http://www.htmlhelp.com/tools/validator/>
# by Liam Quinn <liam@htmlhelp.com>
#
# Copyright (c) 1998-99 by Liam Quinn
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# Debian modification (c) 1999, Jaldhar H. Vyas <jaldhar@debian.org>
# Licensing terms as above.
#####################################################################

#####################################################################
# Required libraries #
######################

use lib ('/usr/lib/wdg-html-validator');
use CGI;
use LWP::UserAgent;
use HTMLLinkExtractor;
use URI::URL;
use WWW::RobotRules;
use I18N::Charset;
use Unicode::Map8;
use Unicode::String qw(utf8 ucs2);

#####################################################################

#####################################################################
# Variables to define #
#######################

# Location of jconv Japanese character encoding converter
my $jconv = '/usr/lib/wdg-html-validator/jconv';

# Location of cjkvconv.pl CJK character encoding converter
my $cjkvconv = '/usr/lib/wdg-html-validator/cjkvconv.pl';

# SGML directory (catalog, DTDs, SGML declarations)
my $sgmlDir = '/usr/lib/sgml';

# Directory containing templates
my $templates = '/usr/lib/wdg-html-validator/html/templates';

# nsgmls command line
# The SGML declaration and name of the temporary file storing the retrieved
# document will be appended to this string
my $nsgmls = "/usr/bin/nsgmls -E0 -s -c $sgmlDir/catalog";

# Location of temporary file in which to store the retrieved document
my $tempfile = "/tmp/validator-$$.html";

# Text preceding identification of the document checked
my %documentChecked = (
  # English
  'en' => 'Document Checked'
);

# Text preceding identification of the character encoding
my %characterEncoding = (
  # English
  'en' => '<a href="/wdg-html-validator/charset.html">Character encoding</a>:'
);

# Text preceding the level of HTML checked
my %levelOfHTML = (
  # English
  'en' => '<a href="/wdg-html-validator/doctype.html">Level of HTML</a>:'
);

# Default DOCTYPE for forgetful users
my $defaultDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">';

# Default DOCTYPE if the document contains frames
my $defaultFramesetDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN">';

# Error for missing DOCTYPE
my %noDoctype = (
  # English error message
  'en' => "missing _document type declaration_; assuming _HTML 4.0 Transitional_"
);

# Error for missing DOCTYPE in a Frameset document
my %noFramesetDoctype = (
  # English error message
  'en' => "missing _document type declaration_; assuming _HTML 4.0 Frameset_"
);

# Message if the document is valid
my %noErrors = (
  # English
  'en' => 'Congratulations, no errors!'
);

# Text to precede an error message
my %preError = (
  # English
  'en' => 'Error:'
);

# Heading for errors
my %errorsHeading = (
  # English
  'en' => 'Errors'
);

# Heading for input listing
my %inputHeading = (
  # English
  'en' => 'Input'
);

# Text to precede line number
my %lineNumberText = (
  # English
  'en' => 'Line '
);

# Text to precede character number
my %characterNumberText = (
  # English
  'en' => 'character '
);

# Mapping from IANA charset name to preferred MIME name
my %MIMECharset = (
  'ISO_8859-1:1987' => 'ISO-8859-1',
  'ISO-8859-1' => 'ISO-8859-1',
  'ISO_8859-2:1987' => 'ISO-8859-2',
  'ISO_8859-3:1988' => 'ISO-8859-3',
  'ISO_8859-4:1988' => 'ISO-8859-4',
  'ISO_8859-5:1988' => 'ISO-8859-5',
  'ISO_8859-6:1987' => 'ISO-8859-6',
  'ISO_8859-6-E' => 'ISO-8859-6-e',
  'ISO_8859-6-I' => 'ISO-8859-6-i',
  'ISO_8859-7:1987' => 'ISO-8859-7',
  'ISO_8859-8:1988' => 'ISO-8859-8',
  'ISO_8859-8-E' => 'ISO-8859-8-e',
  'ISO_8859-8-I' => 'ISO-8859-8-i',
  'ISO_8859-9:1989' => 'ISO-8859-9',
  'ISO_8859-10:1993' => 'ISO-8859-10',
  'ISO_8859-14:1998' => 'ISO-8859-14',
  'ISO_8859-15:1998' => 'ISO-8859-15',
  'UTF-8' => 'UTF-8',
  'ISO-2022-JP' => 'ISO-2022-JP',
  'Extended_UNIX_Code_Packed_Format_for_Japanese' => 'EUC-JP',
  'EUC-KR' => 'EUC-KR',
  'GB2312' => 'GB2312',
  'Shift_JIS' => 'Shift_JIS',
  'Big5' => 'Big5',
  'windows-1250' => 'windows-1250',
  'windows-1251' => 'windows-1251',
  'windows-1252' => 'ISO-8859-1-Windows-3.1-Latin-1',
  'windows-1253' => 'windows-1253',
  'windows-1254' => 'windows-1254',
  'windows-1255' => 'windows-1255',
  'windows-1256' => 'windows-1256',
  'windows-1257' => 'windows-1257',
  'windows-1258' => 'windows-1258',
  'KOI8-R' => 'KOI8-R',
  'KOI8-U' => 'KOI8-U',
  'IBM866' => 'cp866',
  'IBM874' => 'cp874',
  'TIS-620' => 'TIS-620',
  'VISCII' => 'VISCII',
  'VPS' => 'x-viet-vps',
  'TCVN-5712:1993' => 'x-viet-tcvn',
  'ANSI_X3.4-1968' => 'US-ASCII'
);

# Mapping from preferred MIME name to name required by nsgmls
my %encodings = (
  'US-ASCII' => 'ISO-8859-1',
  'ISO-8859-1' => 'ISO-8859-1',
  'ISO-8859-2' => 'ISO-8859-2',
  'ISO-8859-3' => 'ISO-8859-3',
  'ISO-8859-4' => 'ISO-8859-4',
  'ISO-8859-5' => 'ISO-8859-5',
  'ISO-8859-6' => 'ISO-8859-6',
  'ISO-8859-6-e' => 'ISO-8859-6',
  'ISO-8859-6-i' => 'ISO-8859-6',
  'ISO-8859-7' => 'ISO-8859-7',
  'ISO-8859-8' => 'ISO-8859-8',
  'ISO-8859-8-e' => 'ISO-8859-8',
  'ISO-8859-8-i' => 'ISO-8859-8',
  'ISO-8859-9' => 'ISO-8859-9',
  'UTF-8' => 'UTF-8',
  'EUC-JP' => 'EUC-JP',
  'EUC-KR' => 'EUC-KR',
  'GB2312' => 'GB2312',
  'Big5' => 'Big5',
  'Shift_JIS' => 'Shift_JIS',

  # The following character encodings will be converted to UTF-8 for
  # parsing by nsgmls
  'ISO-8859-10' => 'UTF-8',
  'ISO-8859-14' => 'UTF-8',
  'ISO-8859-15' => 'UTF-8',
  'windows-1250' => 'UTF-8',
  'windows-1251' => 'UTF-8',
  'ISO-8859-1-Windows-3.1-Latin-1' => 'UTF-8',
  'windows-1253' => 'UTF-8',
  'windows-1254' => 'UTF-8',
  'windows-1255' => 'UTF-8',
  'windows-1256' => 'UTF-8',
  'windows-1257' => 'UTF-8',
  'windows-1258' => 'UTF-8',
  'KOI8-R' => 'UTF-8',
  'KOI8-U' => 'UTF-8',
  'cp866' => 'UTF-8',
  'cp874' => 'UTF-8',
  'TIS-620' => 'UTF-8',
  'VISCII' => 'UTF-8',
  'x-viet-vps' => 'UTF-8',
  'x-viet-tcvn' => 'UTF-8'
);


# Hash table of multibyte character encodings supported
# The value is a regular expression representing a single character
# in the encoding.
my %multibyte = (
  'UTF-8' => '[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]',
  'EUC-JP' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]|\x8E[\xA0-\xDF]|\x8F[\xA1-\xFE][\xA1-\xFE]',
  'EUC-KR' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
  'GB2312' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
  'Big5' => '[\x00-\x7E]|[\xA1-\xFE][\x40-\x7E\xA1-\xFE]',
  'Shift_JIS' => '[\x00-\x7F]|[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]|[\xA0-\xDF]'
);


# Hash table of character encodings that must be converted before validation
my %conversionNeeded = (
  'ISO-8859-10' => 1,
  'ISO-8859-14' => 1,
  'ISO-8859-15' => 1,
  'windows-1250' => 1,
  'windows-1251' => 1,
  'ISO-8859-1-Windows-3.1-Latin-1' => 1,
  'windows-1253' => 1,
  'windows-1254' => 1,
  'windows-1255' => 1,
  'windows-1256' => 1,
  'windows-1257' => 1,
  'windows-1258' => 1,
  'KOI8-R' => 1,
  'KOI8-U' => 1,
  'cp866' => 1,
  'cp874' => 1,
  'TIS-620' => 1,
  'VISCII' => 1,
  'x-viet-vps' => 1,
  'x-viet-tcvn' => 1
);

# Versions of HTML associated with a given FPI
my %HTMLversion = (
  'PUBLIC "-//W3C//DTD HTML 4.0//EN"' => 'HTML 4.0 Strict',
  'PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"' => 'HTML 4.0 Transitional',
  'PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"' => 'HTML 4.0 Frameset',
  'PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML 3.2 Draft//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML 3.2//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML Experimental 970421//EN"' => 'HTML 3.2 + Style',
  'PUBLIC "-//W3O//DTD W3 HTML 3.0//EN"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML 3.0//EN//"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML 3.0//EN"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML 2.0//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML Level 2//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML 2.0 Level 2//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML Level 1//EN"' => 'HTML 2.0 Level 1',
  'PUBLIC "-//IETF//DTD HTML 2.0 Level 1//EN"' => 'HTML 2.0 Level 1',
  'PUBLIC "-//IETF//DTD HTML Strict//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML Strict Level 2//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 2//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1'
);

# SGML declarations for a given level of HTML
my %sgmlDecl = (
  'HTML 4.0 Strict' => "$sgmlDir/sgml/html-4.decl",
  'HTML 4.0 Transitional' => "$sgmlDir/sgml/html-4.decl",
  'HTML 4.0 Frameset' => "$sgmlDir/sgml/html-4.decl",
# CHANGE THIS
  'HTML 3.2' => "/usr/lib/html-valdator/sgml/html-32.decl",
  'HTML 3.2 + Style' => "$sgmlDir/sgml/html-970421.decl",
#CHANGE THIS
  'HTML 3.0 Draft' => "/usr/lib/html-validator/sgml/html-3.dcl",
  'HTML 2.0' => "$sgmlDir/sgml/html.decl",
  'HTML 2.0 Strict' => "$sgmlDir/sgml/html.decl",
  'HTML 2.0 Level 1' => "$sgmlDir/sgml/html.decl",
  'HTML 2.0 Strict Level 1' => "$sgmlDir/sgml/html.decl",
  'Unknown' => "$sgmlDir/html-4.dcl"
);

# Files of links for a given level of HTML and a specified language
my %linksFile = (
  # English links
  'en-HTML 4.0 Strict' => "$templates/html40links.txt.en",
  'en-HTML 4.0 Transitional' => "$templates/html40links.txt.en",
  'en-HTML 4.0 Frameset' => "$templates/html40links.txt.en",
  'en-HTML 3.2' => "$templates/html32links.txt.en",
  'en-HTML 3.2 + Style' => "$templates/html32links.txt.en",
  'en-HTML 3.0 Draft' => "$templates/html30links.txt.en",
  'en-HTML 2.0' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Strict' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Level 1' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Strict Level 1' => "$templates/html20links.txt.en",
  'en-Unknown' => "$templates/html40links.txt.en"
);

# Full link text to HTML references
my %htmlLink = (
  # English links
  'en-HTML 4.0 Strict' => '<a href="/doc/wdg-html-reference/html40/">HTML 4.0</a> Strict',
  'en-HTML 4.0 Transitional' => '<a href="/doc/wdg-html-reference/html40/">HTML 4.0</a> Transitional',
  'en-HTML 4.0 Frameset' => '<a href="/doc/wdg-html-reference/html40/">HTML 4.0</a> Frameset',
  'en-HTML 3.2' => '<a href="/doc/wdg-html-reference/wilbur/">HTML 3.2</a>',
  'en-HTML 3.2 + Style' => '<a href="http://www.w3.org/TR/NOTE-html-970421">HTML 3.2 + Style</a>',
  'en-HTML 3.0 Draft' => '<a href="http://www.w3.org/MarkUp/html3/">HTML 3.0 Draft</a>',
  'en-HTML 2.0' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a>',
  'en-HTML 2.0 Strict' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Strict',
  'en-HTML 2.0 Level 1' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Level 1',
  'en-HTML 2.0 Strict Level 1' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Strict Level 1',
  'en-Unknown' => 'Unknown'
);

# User-Agent string for spidering validation
my $spiderUA = 'WDG_SiteValidator/1.0 (Debian)';

# User-Agent string for normal validation
my $normalUA = 'WDG_Validator/1.0 (Debian)';

# Location of HTML fragments
# Each file name must end with ".xx" where "xx" is the two-letter language
# code, but the ".xx" extension must not be given in this variable
my $beginningHTML = "$templates/header.htmlf";
my $beginningHTMLwithInput = "$templates/headerWithInput.htmlf";
my $endingHTML = "$templates/footer.htmlf";

# Maximum number of extra characters to include in the HTML extract on
# either side of the source of the error
my $extraChars = 30;

# Maximum number of URLs to check in batch mode
my $maxURLs = 60;

#####################################################################

#####################################################################
#
# The rest of the script...
#
#####################################################################

# Flush output buffer
$| = 1;

### Get user input ###

my $query = new CGI;

# URL of document to check
my $url = &trim($query->param('url'));

# Whether or not to spider the site
my $spider = $query->param('spider');

# URLs of documents to check (for batch validation)
my $urls = &trim($query->param('urls'));

# Uploaded file
my $file = &trim($query->param('file'));

# Direct HTML input
my $area = &trim($query->param('area'));

# Whether or not to show user's input
my $input = $query->param('input');

# Character encoding of uploaded file
my $charset = $query->param('charset');

#############

# Only English is currently supported
my $lang = 'en';

# Determine user's browser to later avoid browser bugs
my $browser = $ENV{'HTTP_USER_AGENT'};

my $document;
my $lastModified = "";
my $outputURL = "";
my %links;
my %linksFound;
my %linksChecked;
my @documentLinks = ();
my @urls = ();
my $startURL;
my $urlBasePath = '/';
my $parser;
my $multipleURLs;
my $robotsRules;
my $startURLFetched;

# It's hard to spider a site without a URL
if ($spider && !$url) {
	undef $spider;
}

# Check which input method was used
if ($url) { # URL input

	# Check if the URL to be validated is the referring URL
	if ($url =~ /^referr?er$/i) {
		my $referer = $ENV{'HTTP_REFERER'};
		if (length($referer) > 0) {
			$url = $referer;
		} else {
			&printHeader($browser, 'ISO-8859-1', $lang);
			&printFile("$beginningHTML.$lang");
			&error('Your browser did not send a referring <abbr class=initialism title="Uniform Resource Locator">URL</abbr>.');
		}
	}

	# Check if URL is valid
	if ($url =~ m#(?:ht|f)tp://\S#i) {

		if ($spider) {
			$multipleURLs = 1;
			$startURL = url($url);
			$linksFound{$startURL->full_path} = 1;
			@urls = ($startURL);
		} else {

			# Fetch document
			($url, $charset, $lastModified) = getDocument($url);
			$outputURL = $query->escapeHTML($url);

		}

	} else {
		&printHeader($browser, 'ISO-8859-1', $lang);
		&printFile("$beginningHTML.$lang");
		&error("Invalid <abbr class=initialism title=\"Uniform Resource Locator\">URL</abbr>: " . $query->escapeHTML($url));
	}

} elsif ($area) { # HTML input directly

	$document = $area;

} elsif ($file) { # HTML file uploaded

	while (<$file>) {
		$document .= $_;
	}
	close $file || &badUploadError;

	# Determine local file URL unless the browser is Netscape for X11,
	# which fails to give the full path of the uploaded file.
	unless ($browser =~ /X11/io && $browser =~ /^Mozilla/o
		&& $browser !~ /compatible/io)
	{
		# Netscape 2.x requires an extra slash in file URLs
		my $extraSlash = "";
		if ($browser =~ m#^Mozilla/2\.0#o && $browser !~ /compatible/io) {
			$extraSlash = "/";
		}
		$outputURL = "file://$extraSlash" . $query->escapeHTML(&encodeFileURL($file));
	}

} elsif (!($urls && $urls =~ /\S/o)) { # No input

	&printHeader($browser, 'ISO-8859-1', $lang);
	&printFile("$beginningHTML.$lang");
	&error("No input");

}

# Check for batch validation
if ($urls) { # batch validation of URLs
	# Adjust newlines
	if ($urls =~ /\n/o) {
		$urls =~ s/\r//go;
	} else {
		$urls =~ s/\r/\n/go;
	}

	$multipleURLs = 1;
	@urls = split(/\n/, $urls);

} elsif (!$spider) {
	# Add dummy URL so that the validation loop executes once
	push(@urls, 1);
}

if ($multipleURLs) {

	# Print beginning of output page
	&printHeader($browser, 'UTF-8', $lang);
	if ($input) {
		&printFile("$beginningHTMLwithInput.$lang");
	} else {
		&printFile("$beginningHTML.$lang");
	}

	# The spider must check if it's allowed to play
	if ($spider && $startURL->scheme eq 'http') {
		my $userAgent = new LWP::UserAgent;
		$userAgent->agent("$spiderUA");
		my $robotsURL = 'http://' . $startURL->host . ':' . $startURL->port
			. '/robots.txt';
		my $request = new HTTP::Request 'GET' => $robotsURL;
		my $response = $userAgent->request($request);
		if ($response->is_success && $response->content_type eq 'text/plain') {
			$robotsRules = new WWW::RobotRules "$spiderUA";
			$robotsRules->parse($robotsURL, $response->content);
		}
	}
}

my $urlsChecked = 0;
while (@urls) {

	my $url = shift @urls;

	if ($multipleURLs) {

		$document = "";
		$url = &trim($url);
		next unless $url;

		# Check if we're over our limit of URLs
		if ($urlsChecked >= $maxURLs) {
			print "<div class=checkedDocument>\n<p><em>Batch validation is limited to $maxURLs URLs at one time. The remaining URLs were not checked.</em></p>\n</div>\n";
			last;
		}

		# Check if URL is valid
		if ($url =~ m#^(?:ht|f)tp://\S#i) {

			# Make sure we're allowed to access the URL
			if (!$robotsRules || $robotsRules->allowed($url)) {

				my $success;

				# Create an HTML link extractor
				if ($spider) {
					@documentLinks = ();
					$parser = HTMLLinkExtractor->new(\&linkExtractorCallback);
				}

				# Fetch document
				($url, $charset, $lastModified, $success) = getDocument($url);
				$outputURL = $query->escapeHTML($url);

				# If this is the first URL of a site validation, re-initialize
				# our information about the starting URL to account for any
				# redirection.
				if ($spider && !$startURLFetched) {
					$startURL = url($url);
					$linksFound{$startURL->full_path} = 1;
					if ($startURL->epath =~ m#(.*/)#os) {
						$urlBasePath = $1;
					}
					$startURLFetched = 1;
				}

				next unless $success;

			} else {
				$outputURL = $query->escapeHTML($url);
				print "<div class=checkedDocument><p>Unable to access <a href=\"$outputURL\">$outputURL</a> according to the <a href=\"http://info.webcrawler.com/mak/projects/robots/exclusion.html\#robotstxt\">Robots Exclusion Protocol</a></p></div>";
				next;
			}

		} else {
			print "<div class=checkedDocument><p>Invalid <abbr class=initialism title=\"Uniform Resource Locator\">URL</abbr>: " . $query->escapeHTML($url) . "</p></div>";
			next;
		}

	}


	# Determine character encoding of output page
	my $encodingMsg = "	<li>" . $characterEncoding{$lang};

	unless ($charset) {
		# Check for a META element specifying the character encoding
		if ($document =~ m#<META(\s[^>]*http\-equiv\s*=\s*["']?Content\-Type["']?[^>]*)>#iso) {
			my $metaAttributes = $1;
			if ($metaAttributes =~ m#\scontent\s*=\s*["']?.*[\s;]charset\s*=\s*['"]?([^"']+)#iso) {
				$charset = $1;
			}
		}
	}

	if ($charset) {

		my $enteredCharset = $charset;

		# Get IANA name for character encoding
		my $ianaCharset = iana_charset_name($charset);

		# Get preferred MIME name
		if ($ianaCharset) {
			$charset = $MIMECharset{$ianaCharset};
		}

		if ($ianaCharset && $charset) {
			$encodingMsg .= " $charset</li>\n";

			# This is a quick hack to add basic support for ISO-2022-JP
			if ($charset eq 'ISO-2022-JP') {

				# Write document to temporary file for conversion
				$convTempFile = "$tempfile.conv";
				open(CTMP, ">$convTempFile") || &error("Server error");
				print CTMP "$document";
				close(CTMP);

				# Convert ISO-2022-JP document to Shift_JIS
				system("$jconv -ij -os < $convTempFile > $tempfile");

				# Read document back in
				$document = "";
				open(DOC, "$tempfile") || &error("Server error");
				while (<DOC>) {
					$document .= $_;
				}
				close(DOC);

				unlink("$convTempFile");

				$charset = 'Shift_JIS';

			}

		} else {
			$encodingMsg .= " " . $query->escapeHTML($enteredCharset)
				. "--<strong>not supported, assuming ISO-8859-1</strong></li>\n";
			$charset = 'ISO-8859-1';
		}

	} else {

		$encodingMsg .= " Unknown; assuming ISO-8859-1</li>\n";
		$charset = 'ISO-8859-1';

	}

	# Print beginning of output page
	unless ($multipleURLs) {
		&printHeader($browser, $charset, $lang);
		if ($input) {
			&printFile("$beginningHTMLwithInput.$lang");
		} else {
			&printFile("$beginningHTML.$lang");
		}
	}

	# Print the URL of the document checked
	print "<div class=checkedDocument>\n" if $multipleURLs;
	print "<h2>$documentChecked{$lang}</h2>\n<ul>\n";
	print "	<li>URL: <a href=\"$outputURL\">$outputURL</a></li>\n" if $outputURL;
	print "	<li>Last modified: " . $query->escapeHTML($lastModified) . "</li>\n" if $lastModified;

	# Print character encoding information
	print "$encodingMsg";

	my @errors; # queue of errors
	my @externalErrors; # queue of errors in an external DTD

	# Amount to decrease line count by (i.e., if we add a DOCTYPE)
	my $lineAdjust = 0;

	my $validatorInput;

	# Adjust line-endings if necessary
	if ($document !~ /\n/) {
		$document =~ s/\r/\n/g;
	}

	# If we're doing a batch validation, convert the input to UTF-8
	my $convTempFile;
	if ($multipleURLs) {
		if ($multibyte{$charset} && $charset ne 'UTF-8') {

			# Write document to temporary file for conversion
			$convTempFile = "$tempfile.conv";
			open(CTMP, ">$convTempFile") || &error("Server error");
			print CTMP "$document";
			close(CTMP);

			SWITCH: {
				if ($charset eq 'Shift_JIS') {
					system("$cjkvconv -ij -ou8 < $convTempFile > $tempfile");
					last SWITCH;
				}
				if ($charset eq 'GB2312') {
					system("$cjkvconv -is -ou8 < $convTempFile > $tempfile");
					last SWITCH;
				}
				if ($charset eq 'Big5') {
					system("$cjkvconv -it -ou8 < $convTempFile > $tempfile");
					last SWITCH;
				}
				if ($charset eq 'EUC-JP') {
					system("$cjkvconv -ie -ou8 < $convTempFile > $tempfile");
					last SWITCH;
				}
				if ($charset eq 'EUC-KR') {
					system("$cjkvconv -ik -ou8 < $convTempFile > $tempfile");
					last SWITCH;
				}
			}

			# Read document back in
			$document = "";
			open(DOC, "$tempfile") || &error("Server error");
			while (<DOC>) {
				$document .= $_;
			}
			close(DOC);

			unlink("$convTempFile");

		} elsif ($charset ne 'UTF-8' && $charset ne 'US-ASCII') {

			my $batchmap = Unicode::Map8->new($charset)
				|| &error("Server error<!--Unable to create character encoding map for $charset-->");

			# Pass through invalid characters
			$batchmap->nostrict;

			# Convert document to UTF-8
			$document = $batchmap->tou($document)->utf8;

		}

		$charset = 'UTF-8';
	}

	# Determine the level of HTML
	my $htmlLevel = 'Unknown';
	if ($document =~ /<!DOCTYPE([^>]*)>/iso) {

		my $doctypeMeat = $1;
		if ($doctypeMeat =~ /PUBLIC\s+("[^"]*")/iso) {
			$htmlLevel = $HTMLversion{"PUBLIC $1"} || 'Unknown';
		}

		$validatorInput = $document;

	} else { # Missing DOCTYPE

		# Add a default DOCTYPE
		my ($insertedDoctype, $doctypeError);
		if ($document =~ /<FRAMESET/io) {
			$insertedDoctype = $defaultFramesetDoctype;
			$doctypeError = $noFramesetDoctype{$lang};
		} else {
			$insertedDoctype = $defaultDoctype;
			$doctypeError = $noDoctype{$lang};
		}

		$validatorInput = "$insertedDoctype" . "\n$document";

		# Calculate line adjustment
		$lineAdjust = 2;

		# Add error message
		push(@errors, "::" . (1 + $lineAdjust) . ":0:E: $doctypeError");

	}

	# Print level of HTML checked
	print "	<li>$levelOfHTML{$lang} <strong>" . $htmlLink{"${lang}-$htmlLevel"} . "</strong></li>\n</ul>\n";

	# Prepare links for the specified level of HTML and language
	&loadLinks($htmlLevel, $lang);

	# Prepare an array of lines in the document for easy access to a given line
	my @lines = split(/\n/, $document);

	# If necessary, convert to a character encoding (UTF-8) recognized by nsgmls
	my $map;
	if (!($multipleURLs) && $conversionNeeded{$charset}) {

		$map = Unicode::Map8->new($charset)
			|| &error("Server error<!--Unable to create character encoding map for $charset-->");

		# Pass through invalid characters
		$map->nostrict;

		# Convert document to UTF-8
		$validatorInput = $map->tou($validatorInput)->utf8;

	}

	# Put the document in a temporary file
	open(TMP, ">$tempfile") || &error("Server error");
	print TMP "$validatorInput";
	close(TMP);

	# Run the validator
	$ENV{'SP_CHARSET_FIXED'} = 1;
	$ENV{'SP_ENCODING'} = $encodings{$charset};
	open(NSGMLS, "$nsgmls -b$encodings{$charset} $sgmlDecl{$htmlLevel} $tempfile 2>&1 |")
		|| &error("Server error");

	# Create a queue of errors
	while (<NSGMLS>) {

		# Convert character encodings, if necessary
		if (defined($map)) {
			$_ = $map->to8(utf8($_)->ucs2);
		}

		my @error = split(/:/, $_, 6);

		if ($error[4] eq 'E') {

			push(@errors, $_);

			# If the DOCTYPE is bad, bail out
			last if ($error[5] eq " unrecognized _DOCTYPE_; unable to check document\n");

		} elsif ($error[1] =~ /^<URL>/o) { # error from external DTD

			push(@externalErrors, $_);

		} elsif (length($error[4]) > 1) { # Allow secondary messages about preceding error

			push(@errors, $_);

		}

	}
	close(NSGMLS);

	# Delete temporary file
	unlink $tempfile;

	# Report errors
	if ($#errors > -1 || $#externalErrors > -1) {

		print "<h2>$errorsHeading{$lang}</h2>\n";
		print "<ul>\n";

		foreach (@externalErrors) {
			my @error = split(/:/, $_, 7);

			# Determine URL containing the error
			my $errorURL;
			if ($error[1] =~ /<URL>(.+)/o) {
				$errorURL = "$1:$error[2]";
			}

			my $lineNumber = $error[3];
			my $character = $error[4] + 1;

			my $escapedURL = $query->escapeHTML($errorURL);
			print "<li><a href=\"$escapedURL\">$escapedURL</a>, " .
			  lc($lineNumberText{$lang}) .
			  "$lineNumber, $characterNumberText{$lang}$character: ";

			if ($error[6]) {
				print $query->escapeHTML($error[6]);
			} else {
				print $query->escapeHTML($error[5]);
			}
			print "</li>\n";
		}

		foreach (@errors) {
			my @error = split(/:/, $_, 6);

			# I don't think this should happen, but I'm not sure
			next if $#error < 4;

			# Determine line number and character of error
			my $lineNumber = $error[2] - $lineAdjust;
			next unless $lineNumber > 0;
			my $character = $error[3] + 1;

			if ($input) {
				my $urlNumber = "";
				if ($multipleURLs) {
					$urlNumber = "${urlsChecked}-";
				}
				print "<li><a href=\"\#L$urlNumber$lineNumber\" onclick=\"highlight('$urlNumber$lineNumber')\">$lineNumberText{$lang}$lineNumber</a>";
			} else {
				print "<li>$lineNumberText{$lang}$lineNumber";
			}
			print ", $characterNumberText{$lang}$character:\n";

			# Extract relevant section of HTML source.
			my ($line, $preMatch, $maxMatch, $spacesToAdd, $extract, $insertedSpaces, $tabcount, $lineLength, $oneChar);
			$oneChar = ($multibyte{$charset} || '.');

			$line = superChomp($lines[$lineNumber-1]);
			$lineLength = ulength($line, $oneChar);
			$preMatch = max(0, $character - $extraChars);
			$maxMatch = 2 * $extraChars;
			($extract) = ($line =~ /
				(?:$oneChar)
				{$preMatch}
				((?:$oneChar)
				{1,$maxMatch})/x);
			$spacesToAdd = $error[3];

			# Expand tabs in the first part of the string to ensure that our character
			# pointer lines up correctly
			($insertedSpaces, $tabcount) = (0, 0);
			if ($extract =~ /\t/o) {
				my ($firstPart, $secondPart) =
					($extract =~ /^(
					(?:$oneChar)
					{0,$spacesToAdd})
					(.*)$/sx);
				($insertedSpaces, $tabcount, $firstPart) = tabExpand($firstPart);
				$extract = "$firstPart$secondPart";
				$spacesToAdd = $spacesToAdd - $tabcount + $insertedSpaces;
			}

			if (length($extract) > 0) {

				$extract = "<code class=html>" . $query->escapeHTML($extract) . "</code>";

				# Check if the line was truncated for the extract
				if ($preMatch > 0) {
					$extract = "... $extract";
					$spacesToAdd = $extraChars + 3 - $tabcount + $insertedSpaces;
				}
				if ($preMatch + $maxMatch < ulength($line, $oneChar)) {
					$extract = "$extract ...";
				}

				# Link element names in extract
				$extract = linkElements($extract);

				print "<pre>$extract\n";
				print ' ' x $spacesToAdd;
				print "<strong>^</strong></pre>\n";
			}

			# Prepare error message, adding emphasis and links where appropriate
			my $errorMsg;
			if ($error[5]) {
				$errorMsg = superChomp($query->escapeHTML($error[5]));
			} else {
				$errorMsg = superChomp($query->escapeHTML($error[4]));
			}
			while ($errorMsg =~ m#_(?:&quot;)?(.+?)(?:&quot;)?_#gos) {
				my $linkText = $1;
				my $lcLinkText = lc($linkText);
				if ($links{$lcLinkText}) {
					$errorMsg =~ s#_(&quot;)?$linkText(&quot;)?_#$1<a href="$links{$lcLinkText}">$linkText</a>$2#;
				} else {
					$errorMsg =~ s#_(&quot;)?$linkText(&quot;)?_#$1$linkText$2#;
				}
			}
			$errorMsg =~ s#&quot;(.+?)&quot;#<strong class=html>$1</strong>#g;

			print "<p>";
			if ($error[4] eq 'E') { # Error message
				print "$preError{$lang}";
			}
			print "$errorMsg</p></li>\n";

		}
		print "</ul>\n";

	} else { # no errors

		print "<p class=congratulations><strong>$noErrors{$lang}</strong></p>\n";

	}

	# Show input if desired
	if ($input) {
		my $cite = "";
		$cite = " cite=\"$outputURL\"" if $outputURL;
		print "<h2>$inputHeading{$lang}</h2>\n<blockquote$cite><pre>";

		my $line;
		my $lineCount = 1;

		# Determine maximum number of digits for a line number
		my $maxNumLength = length($#lines + 1);

		foreach $line (@lines) {

			$line = superChomp($line);

			# Add spaces to right-align line numbers
			my $addedSpaces = $maxNumLength - length($lineCount);
			print ' ' x $addedSpaces;

			my $urlNumber = "";
			if ($multipleURLs) {
				$urlNumber = "${urlsChecked}-";
			}

			print "<span id=line$urlNumber$lineCount><a name=L$urlNumber$lineCount>$lineCount</a>   <code class=html>" . linkElements($query->escapeHTML($line)) . "</code></span>\n";
			$lineCount++;
		}
		print "</pre></blockquote>\n";
	}

	if ($multipleURLs) {
		$urlsChecked++;
		print "</div>\n";

		# Update list of links to spider
		foreach $link (@documentLinks) {
			if ($link->scheme =~ /^(?:ht|f)tp$/i &&
				lc($link->host) eq lc($startURL->host) &&
				$link->port eq $startURL->port &&
				$link->epath =~ /^$urlBasePath/ &&
				!$linksFound{$link->full_path})
			{
				$linksFound{$link->full_path} = 1;
				push(@urls, $link);
			}
		}

	}
}

# Output footer
&printFile("$endingHTML.$lang");


# Fetch a document and return it
# Takes the URL as the first argument
# The URL is assumed to have been checked for basic validity (e.g., that it
# begins with "http://" or "ftp://").
# Calls &error if the document cannot be retrieved
sub getDocument {

	my $url = shift;

	my $userAgent = new LWP::UserAgent;
	if ($spider) {
		$userAgent->agent("$spiderUA");
	} else {
		$userAgent->agent("$normalUA");
	}

	# Send request
	my $request = new HTTP::Request 'GET' => $url;

	# Receive response
	my $response;
	if ($spider) {
		$response = $userAgent->request($request, \&requestCallback);
	} else {
		$response = $userAgent->request($request);
	}

	# Determine URL of document.  This may be different from the original
	# request URL if we were redirected.
	$url = $response->request->url;

	# Bail out if we've already checked this URL
	if ($spider) {
		my $fullPath = $url->full_path;
		return 0 if $linksChecked{$fullPath};
		$linksChecked{$fullPath} = 1;
		$linksFound{$fullPath} = 1;
	}

	# Check return status
	if ($response->is_success) {

		# Bail out if we're spidering and we found a non-HTML document
		return 0 if $spider && $response->content_type ne 'text/html';

		# Determine character encoding of document
		my $contentType = $response->header('Content-Type');
		my $charset = "";

		if ($contentType && $contentType =~ /[\s;]charset\s*=\s*"?([^,"\s]+)/io) {
			$charset = $1;
		}

		# Grab Last-Modified header
		my $lastModified = $response->header('Last-Modified');

		# Expand found links' URLs to absolute ones if spidering
		if ($spider) {
			my $base = $response->base;
			@documentLinks = map { $_ = url($_, $base)->abs; } @documentLinks;
		} else {
			# If we're not spidering, set the document to the content held by
			# the response object.  If we are spidering, the content is stored
			# as it's received and parsed.
			$document = $response->content;
		}

		return ($url, $charset, $lastModified, 1);

	} else {
		&printHeader($browser, 'ISO-8859-1', $lang) unless $multipleURLs;
		&printFile("$beginningHTML.$lang") unless $multipleURLs;
		my $outputURL = $query->escapeHTML($url);
		&error("Error retrieving <a href=\"$outputURL\">$outputURL</a>: " . $response->message);
	}

	return 0;

}

# Return an error message
# Exit unless we have multiple URLs to validate
# The error message must be given as the first argument
sub error {

	my $error_message = shift;

	if ($multipleURLs) {
		print "<div class=checkedDocument><p>$error_message</p></div>\n";
	} else {
		print "<p>$error_message</p>\n";
		&printFile("$endingHTML.$lang");
		exit;
	}
}

# Trim leading and trailing whitespace from a string
# Takes a string as the first argument and returns the new string
sub trim {
	my $str = shift || return;
	$str =~ s/^\s+//go;
	$str =~ s/\s+$//go;
	return $str;
}

# Print HTTP headers
# Optional first argument is the HTTP_USER_AGENT, which is used
# to keep the charset parameter from confusing certain old browsers
# Optional second argument is the character encoding of the HTML
# document; the default is ISO-8859-1
# Optional third argument is the language of the response
sub printHeader {

	my $browser = shift;
	my $characterEncoding = shift || 'ISO-8859-1';
	my $language = shift;

	print "Content-Type: text/html";
	unless ($browser && $browser =~ m#(NCSA[ _]Mosaic)|(Mozilla/(1\.)?0)#i) {
		print "; charset=$characterEncoding";
	}
	print "\015\012"; # CRLF

	print "Content-Language: $language\015\012" if $language;

	print "\015\012"; # CRLF

}

# Print the contents of a file
# The file name must be specified as the first argument
sub printFile {

	my $file = shift || return;

	open(FILE, "$file") || die("Unable to open file $file: $!");
	while (<FILE>) {
		print "$_";
	}
	close FILE;

}


# Encode unsafe characters in a file URL
sub encodeFileURL {

  my $url = shift || return;

  $url =~ s/ /\%20/go;
  $url =~ s/"/\%22/go;
  $url =~ s/\#/\%23/go;

  return $url;

}

# Return the maximum of two numbers
sub max {
	if ($_[0] > $_[1]) {
		return $_[0];
	} else {
		return $_[1];
	}
}

# Expand tabs in a string
# Return a list of the number of spaces inserted, the number of
# tabs removed, and the expanded string
# (This is a modified version of Text::Tabs::expand.)
sub tabExpand
{
	my @l = @_;

	my $tabstop = 8;
	my $totalSpacesAdded = 0;
	my $totalTabCount = 0;

	for $_ (@l) {
		my $spacesAdded;
		my $tabCount;
		while (s/(^|\n)([^\t\n]*)(\t+)/
			$1. $2 . (" " x 
				($spacesAdded = ($tabstop * ($tabCount = length($3))
				- (length($2) % $tabstop))))
			/gsex)
		{
			$totalSpacesAdded += $spacesAdded;
			$totalTabCount += $tabCount;
		}
	}

	return ($totalSpacesAdded, $totalTabCount, @l);
}


# Populate the global hash table of links based on the level of HTML
# specified in the first argument and the language specified in the
# second argument
sub loadLinks {

	my $htmlLevel = shift || 'HTML 4.0 Transitional';
	my $lang = shift || 'en';

	my $lfile = $linksFile{"${lang}-$htmlLevel"};
	if ($lfile) {

		open(LINKS, "$lfile") || return;
		# Read links in and populate the hash table
		while (<LINKS>) {
			chomp;
			if (/(.+)\t(.+)/o) {
				$links{$1} = $2;
			}
		}

		close(LINKS);

	}

}


# Link element names in HTML code to the appropriate reference page
# The first argument is the input string
# Returns the string with links inserted
sub linkElements {

	my $code = shift || return;

	while ($code =~ m#&lt;([^\s&]+)#go) {
		my $linkText = $1;
		my $lcLinkText = lc($linkText);
		if ($links{$lcLinkText}) {
			$code =~ s#&lt;$linkText([\s&])#&lt;<a href="$links{$lcLinkText}">$linkText</a>$1#;
		}
	}

	return $code;
}


# Remove any newline characters (\r or \n) at the end of a string
# First argument is the string
# Returns the new string
sub superChomp {

	my $str = shift || return;
	$str =~ s/[\r\n]+$//o;
	return $str;

}


# Return an error message after a bad file upload
sub badUploadError {

	&printHeader($browser, 'ISO-8859-1', $lang);
	&printFile("$beginningHTML.$lang");
	&error('Your browser does not appear to support form-based file upload. Please try one of our <a href="/tools/validator/">alternate methods of validation</a>.');

}


# Return the number of characters in a potentially-multibyte character string
# First argument is the string
# Second argument is a regular expression denoting a single character
# If the string is single-byte, the second argument should be '.' or omitted
sub ulength {

	my $str = shift || return 0;
	my $oneChar = (shift || '.');

	my $length = 0;

	if ($oneChar eq '.') {	# single-byte

		$length = length(str);

	} else {			# multibyte

		while ($str =~ /$oneChar/gos) {
			$length++;
		}

	}

	return $length;
}

# Callback to sift through extracted links in spider mode
sub linkExtractorCallback {
	foreach (@_) {
		# Remove fragment identifiers
		s/\#.*$//os;
		if ($_) {
			push(@documentLinks, $_);
		}
	}
}

# Callback to check if a response is an HTML document when spidering
sub requestCallback {
	# Check if request is text/html
	my $contentType = $_[1]->content_type;
	if ($contentType) {
		if ($contentType eq 'text/html') {
			$document .= $_[0];
			$parser->parse($_[0]);
		} else {
			die "Resource is not an HTML document";
		}
	}
}