1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
#!perl -T
use 5.010001;
use warnings;
use strict;
use Test::More tests => 2;
use HTML::Tidy5;
use Encode ();
use Carp;
use lib 't';
use TidyTestUtils;
# Load the input document that both subtests hand to HTML::Tidy5.  The
# :encoding(UTF-8) layer decodes it on read, so $html holds characters rather
# than octets.  The filehandle is confined to the do-block.
my $html = do {
    open my $fh, '<:encoding(UTF-8)', 't/unicode.html'
        or Carp::croak( "Can't read unicode.html: $!" );
    my $document = join '', <$fh>;
    close $fh or die $!;
    $document;
};

# The expected, already-cleaned document is stored after __DATA__.  Decode it
# the same way so it can be compared directly against the cleaned output.
my $reference = do {
    binmode DATA, ':encoding(UTF-8)';
    join '', <DATA>;
};
# Runs the decoded (character) document through clean() and then through
# parse() on a fresh object, checking that the text stays a character string,
# matches the reference, and that no messages are reported either time.
subtest 'utf8 testing' => sub {
    plan tests => 8;

    # One options hashref, shared by both HTML::Tidy5 objects built below.
    my $tidy_options = { newline => 'LF', wrap => 0 };

    my $tidy = HTML::Tidy5->new( $tidy_options );
    $tidy->ignore( type => TIDY_INFO );

    # Both inputs must be character strings, not UTF-8 octets.
    ok( utf8::is_utf8( $html ),      'html is utf8' );
    ok( utf8::is_utf8( $reference ), 'reference is utf8' );

    my $cleaned = $tidy->clean( $html );
    ok( utf8::is_utf8( $cleaned ), 'cleaned output is also unicode' );

    # remove_specificity() is not defined in this file (presumably it comes
    # from TidyTestUtils and normalises tidy's output before comparison --
    # TODO confirm against that module).
    my $normalized = remove_specificity( $cleaned );
    is( $normalized, $reference, q{Cleanup didn't break anything} );

    is_deeply( [ $tidy->messages ], [], q{There still shouldn't be any errors} );

    # Start over with a fresh object (no ignore() this time) and exercise
    # parse() on the same document.
    $tidy = HTML::Tidy5->new( $tidy_options );
    isa_ok( $tidy, 'HTML::Tidy5' );

    my $parse_ok = $tidy->parse( '', $html );
    ok( $parse_ok, 'Parsed OK' );

    is_deeply( [ $tidy->messages ], [], q{There still shouldn't be any errors} );
};
# Hands clean() UTF-8 encoded octets instead of a decoded character string and
# checks that the cleaned result still comes back as a character string equal
# to the reference document.
subtest 'Try send bytes to clean method.' => sub {
    plan tests => 3;

    my $tidy_constructor_args = { newline => 'LF', wrap => 0 };
    my $tidy = HTML::Tidy5->new( $tidy_constructor_args );
    $tidy->ignore( type => TIDY_INFO );

    # Encode with the strict 'UTF-8' encoding rather than Perl's lax 'utf8':
    # these octets are handed to an external library, so they should be
    # well-formed UTF-8.  For the fixture document the bytes are the same.
    my $encoded_html = Encode::encode( 'UTF-8', $html );
    ok( !utf8::is_utf8( $encoded_html ), 'html is raw bytes' );

    my $clean = $tidy->clean( $encoded_html );
    ok( utf8::is_utf8( $clean ), 'but cleaned output is string' );

    # remove_specificity() is not defined in this file (presumably it comes
    # from TidyTestUtils -- TODO confirm what it normalises).
    $clean = remove_specificity( $clean );
    is( $clean, $reference, q{Cleanup didn't break anything} );
};
# Finish with an explicit success status.  Everything after __DATA__ is not
# code: it is the expected cleaned document, read through the DATA handle.
exit 0;
__DATA__
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
<head>
<meta name="generator" content="TIDY">
<title>日本語のホムページ</title>
</head>
<body>
<p>Unicodeが好きですか?</p>
</body>
</html>
|