1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
|
#!perl -w
use strict;
use Test::More tests => 16;
{ package H;
sub new { bless {}, shift; }
sub header {
my $self = shift;
my $key = uc(shift);
my $old = $self->{$key};
if (@_) { $self->{$key} = shift; }
$old;
}
sub push_header {
my($self, $k, $v) = @_;
$k = uc($k);
if (exists $self->{$k}) {
$self->{$k} = [ $self->{$k} ] unless ref $self->{$k};
push(@{$self->{$k}}, $v);
} else {
$self->{$k} = $v;
}
}
sub as_string {
my $self = shift;
my $str = "";
for (sort keys %$self) {
if (ref($self->{$_})) {
my $v;
for $v (@{$self->{$_}}) {
$str .= "$_: $v\n";
}
} else {
$str .= "$_: $self->{$_}\n";
}
}
$str;
}
}
my $HTML = <<'EOT';
<title>Å vre eller å ikke vre</title>
<meta http-equiv="Expires" content="Soon">
<meta http-equiv="Foo" content="Bar">
<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas">
<script>
ignore this
</script>
<noscript> ... and this </noscript>
<object classid="foo">
<base href="http://www.sn.no">
<meta name="Keywords" content="test, test, test,...">
<meta name="Keywords" content="more">
<meta charset="ISO-8859-1"><!-- HTML 5 -->
Dette er vanlig tekst. Denne teksten definerer ogs slutten p
<head> delen av dokumentet.
<style>
ignore this too
</style>
<isindex>
Dette er ogs vanlig tekst som ikke skal blir parset i det hele tatt.
EOT
$| = 1;
#$HTML::HeadParser::DEBUG = 1;
require HTML::HeadParser;
my $p = HTML::HeadParser->new( H->new );
if ($p->parse($HTML)) {
fail("Need more data which should not happen");
} else {
#diag $p->as_string;
pass();
}
like($p->header('Title'), qr/ vre eller ikke vre/);
is($p->header('Expires'), 'Soon');
is($p->header('Content-Base'), 'http://www.sn.no');
is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']);
is($p->header('X-Meta-Charset'), 'ISO-8859-1');
like($p->header('Link'), qr/<mailto:gisle\@aas.no>/);
# This header should not be present because the head ended
ok(!$p->header('Isindex'));
# Try feeding one char at a time
my $expected = $p->as_string;
my $nl = 1;
$p = HTML::HeadParser->new(H->new);
while ($HTML =~ /(.)/sg) {
#print STDERR '#' if $nl;
#print STDERR $1;
$nl = $1 eq "\n";
$p->parse($1) or last;
}
is($p->as_string, $expected);
# Try reading it from a file
my $file = "hptest$$.html";
die "$file already exists" if -e $file;
open(FILE, ">$file") or die "Can't create $file: $!";
binmode(FILE);
print FILE $HTML;
print FILE "<p>This is more content...</p>\n" x 2000;
print FILE "<title>Buuuh!</title>\n" x 200;
close FILE or die "Can't close $file: $!";
$p = HTML::HeadParser->new(H->new);
$p->parse_file($file);
unlink($file) or warn "Can't unlink $file: $!";
is($p->header("Title"), " vre eller ikke vre");
# We got into an infinite loop on data without tags and no EOL.
# This was actually a HTML::Parser bug.
open(FILE, ">$file") or die "Can't create $file: $!";
print FILE "Foo";
close(FILE);
$p = HTML::HeadParser->new(H->new);
$p->parse_file($file);
unlink($file) or warn "Can't unlink $file: $!";
ok(!$p->as_string);
SKIP: {
skip "Need Unicode support", 5 if $] < 5.008;
# Test that the Unicode BOM does not confuse us?
$p = HTML::HeadParser->new(H->new);
ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
$p->eof;
is($p->header("title"), "Hi <foo>");
$p = HTML::HeadParser->new(H->new);
$p->utf8_mode(1);
$p->parse(<<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522
\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>
Parkinson's disease</title>
<meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
</meta>
\t
\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
\t
\t </head>
<body>
EOT
$p->eof;
is($p->header("title"), "Parkinson's disease");
is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');
$p = HTML::HeadParser->new(H->new);
$p->utf8_mode(1);
$p->parse(<<"EOT"); # example from http://www.mjw.com.pl/
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
\r
<head profile="http://gmpg.org/xfn/11">\r
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
\r
<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa – MJW</title>\r
<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
EOT
$p->eof;
is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW");
}
|