File: tokeparser.t

package info (click to toggle)
libhtml-parser-perl 3.56-1%2Blenny1
links: PTS
area: main
in suites: lenny
size: 536 kB
ctags: 192
sloc: ansic: 1,985; perl: 1,935; makefile: 43
file content (164 lines) | stat: -rw-r--r-- 3,204 bytes
parent folder | download | duplicates (7)
use Test::More tests => 17;

use strict;
use HTML::TokeParser;

# First we create an HTML document to test.  The file name embeds the
# process id ($$).

my $file = "ttest$$.htm";
die "$file already exists" if -e $file;

# True only once this script has created $file itself.  The END block below
# is registered at compile time and therefore also runs when the check above
# dies; the flag keeps it from removing a file this script does not own.
my $file_created = 0;

# Three-argument open with a lexical handle: the mode is never parsed out of
# the file name, and the handle cannot clash with the bareword F used later.
open(my $out, ">", $file) or die "Can't create $file: $!";
$file_created = 1;

# Single-quoted heredoc: the markup is written verbatim, without interpolation.
print {$out} <<'EOT';

<!--This is a test-->
<html><head><title>
  This is the &lt;title&gt;
</title>

  <base href="http://www.perl.com">
</head>

<body background="bg.gif">

    <h1>This is the <b>title</b> again
    </h1>

    And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a>

   <br/><? process instruction >

</body>
</html>

EOT

# Buffered write errors only surface at close, so check it.
close($out) or die "Can't close $file: $!";

# Remove the fixture when the script exits, but only if we created it.
END {
    if ($file_created) {
        unlink($file) || warn "Can't unlink $file: $!";
    }
}


# Parser object reused (and reset with undef) by every section below.
my $p;


# Parse from a plain file name and check the text inside <title>.
$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
ok($p->unbroken_text, 'unbroken_text is true on a new parser');

# Run the title assertion unconditionally: if neither tag is found, $title
# stays undef and is() fails with a clear diagnostic instead of the test
# being skipped and only showing up as a plan mismatch.
my $title = $p->get_tag("foo", "title") ? $p->get_trimmed_text : undef;
#diag "Title: $title";
is($title, "This is the <title>", 'trimmed <title> text has its entities decoded');
undef($p);

# Test with reference to glob.  F stays a bareword handle so that \*F is a
# reference to a named glob; the three-argument open keeps the mode separate
# from the file name.
open(F, "<", $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(\*F);

# Token tallies, keyed on the type code in element 0 of each token.
# $tcount is not touched here; the get_tag loops further down fill it in.
my $scount = 0;    # "S" tokens
my $ecount = 0;    # "E" tokens
my $tcount = 0;    # tags returned by get_tag
my $pcount = 0;    # "PI" tokens
while (my $token = $p->get_token) {
    my $type = $token->[0];
    if    ($type eq "S")  { $scount++ }
    elsif ($type eq "E")  { $ecount++ }
    elsif ($type eq "PI") { $pcount++ }
}
undef($p);
close F;

# Test with glob.  F is passed as a bare glob (*F), not a reference, so the
# bareword handle is kept; only the open is switched to the three-argument
# form so the mode is never parsed out of the file name.
open(F, "<", $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(*F);
# Each successful get_tag call adds one to the running total.
$tcount++ while $p->get_tag;
undef($p);
close F;

# Test with plain file name.  This second pass adds to the same $tcount, so
# the total ends up covering the document twice.
$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
$tcount++ while $p->get_tag;
undef($p);

# Check the tallies gathered above.  $tcount was accumulated over two
# complete get_tag passes, hence the division by two in the last comparison.
#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
is($tcount, 34, 'get_tag returned 34 tags over two passes');
is($scount, 10, 'get_token returned 10 "S" tokens');
is($ecount, 7,  'get_token returned 7 "E" tokens');
is($pcount, 1,  'get_token returned 1 "PI" token');
is($tcount/2, $scount + $ecount, 'tags per get_tag pass equal "S" plus "E" tokens');

# The constructor is expected to return a false value for an unopenable path.
ok(!HTML::TokeParser->new("/noT/thEre/$$"), 'new() is false for a path that cannot be opened');


# Read the text that follows the first <a> tag of the fixture file.  The
# expected value is the image's alt text, a \240 character for &nbsp;, and
# the link text, with the comment in between left out.
$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
$p->get_tag("a");
my $atext = $p->get_text;
undef($p);

is($atext, "Perl\240Institute", 'get_text after <a> gives alt text, nbsp and link text');

# Test parsing of an embedded document, handed to the constructor as a
# reference to a string (here a heredoc).
$p = HTML::TokeParser->new(\<<HTML);
<title>Title</title>
<H1>
Heading
</h1>
HTML

# The document spells the tag as <H1>; the lookup uses lower case.
ok($p->get_tag("h1"), 'get_tag("h1") finds the <H1> tag');
is($p->get_trimmed_text, "Heading", 'trimmed heading text has no surrounding newlines');
undef($p);

# Test parsing of large embedded documents: 2022 copies of a one-link snippet.
my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;

# $start is only read by the commented-out timing diagnostics below.
#use Time::HiRes qw(time);
my $start = time;
$p = HTML::TokeParser->new(\$doc);
#diag "Construction time: ", time - $start;

# Start from 0 so that a run which finds no "S" tokens reports "got: 0"
# rather than an undefined value.
my $count = 0;
while (my $token = $p->get_token) {
    $count++ if $token->[0] eq "S";
}
#diag "Parse time: ", time - $start;

is($count, 2022, 'one "S" token per repeated <a> element');

# get_trimmed_text with explicit stop tags ("br", "p").
$p = HTML::TokeParser->new(\<<'EOT');
<H1>This is a heading</H1>
This is s<b>o</b>me<hr>text.
<br />
This is some more text.
<p>
This is even some more.
EOT

# Move past the closing </H1> so the text read starts on the second line.
$p->get_tag("/h1");

# Expected: the <b> markup joins into "some", <hr> separates "some" from
# "text.", and reading stops at the <br />.
my $t = $p->get_trimmed_text("br", "p");
is($t, "This is some text.", 'text up to <br /> is collected and trimmed');

# Consume the next tag before reading on.
$p->get_tag;

$t = $p->get_trimmed_text("br", "p");
is($t,"This is some more text.", 'text between <br /> and <p> is collected and trimmed');

undef($p);

# get_phrase: text collection across inline markup.
$p = HTML::TokeParser->new(\<<'EOT');
<H1>This is a <b>bold</b> heading</H1>
This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
<p>
This is even some more.
EOT

$p->get_tag("h1");

# Expected: the <b> element's text is folded into the heading phrase.
$t = $p->get_phrase;
is($t, "This is a bold heading", 'phrase inside <H1> includes the <b> text');

# Asking again without consuming a tag in between is expected to give "".
$t = $p->get_phrase;
is($t, "", 'second get_phrase with no tag consumed returns the empty string');

# Consume the next tag so the following phrase starts on the second line.
$p->get_tag;

# Expected: <i>, <br /> and <span> do not end the phrase; the <br /> shows
# up as a single space between the two sentences.
$t = $p->get_phrase;
is($t, "This is some italic text. This is some more text.", 'phrase spans <i>, <br /> and <span>');

undef($p);