File: unicode.t

package info (click to toggle)
libhtml-tidy5-perl 1.06-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 368 kB
  • sloc: perl: 1,853; makefile: 14
file content (82 lines) | stat: -rw-r--r-- 2,259 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!perl -T

use 5.010001;
use warnings;
use strict;

use Test::More tests => 2;

use HTML::Tidy5;
use Encode ();
use Carp;

use lib 't';

use TidyTestUtils;

# Load the HTML document under test, decoding it from UTF-8 so that $html
# holds characters rather than octets.
my $html_path = 't/unicode.html';
open( my $html_in, '<:encoding(UTF-8)', $html_path )
    or Carp::croak( "Can't read unicode.html: $!" );
my $html = do { local $/; <$html_in> };    # slurp the whole file in one read
close $html_in or die $!;

# Load the expected, already-cleaned document from the DATA section,
# decoded the same way.
binmode DATA, ':encoding(UTF-8)';
my $reference = do { local $/; <DATA> };

# Run the character-string document through clean() and parse(), checking
# that the output stays a character string, matches the reference document,
# and that no messages are reported.
subtest 'utf8 testing' => sub {
    plan tests => 8;

    # The same constructor options are used for both objects built below.
    my %tidy_options = ( newline => 'LF', wrap => 0 );

    my $tidy = HTML::Tidy5->new( \%tidy_options );
    $tidy->ignore( type => TIDY_INFO );

    # Both inputs must carry Perl's UTF-8 flag, i.e. be decoded characters
    # rather than encoded octets.
    ok( utf8::is_utf8($html),      'html is utf8' );
    ok( utf8::is_utf8($reference), 'reference is utf8' );

    # clean() must hand back a character string as well.
    my $cleaned = $tidy->clean($html);
    ok( utf8::is_utf8($cleaned), 'cleaned output is also unicode' );

    # Pass the output through remove_specificity() before comparing it with
    # the reference document.
    my $normalized = remove_specificity($cleaned);
    is( $normalized, $reference, q{Cleanup didn't break anything} );

    is_deeply( [ $tidy->messages ], [], q{There still shouldn't be any errors} );

    # Start over with a fresh object (no ignore() filter this time) and
    # exercise parse() on the same document.
    $tidy = HTML::Tidy5->new( \%tidy_options );
    isa_ok( $tidy, 'HTML::Tidy5' );

    my $parsed_ok = $tidy->parse( '', $html );
    ok( $parsed_ok, 'Parsed OK' );

    is_deeply( [ $tidy->messages ], [], q{There still shouldn't be any errors} );
};

# Feed clean() UTF-8 encoded octets instead of a character string and check
# that it still returns a character string equal to the reference document.
subtest 'Try send bytes to clean method.' => sub {
    plan tests => 3;

    my $tidy_constructor_args = { newline => 'LF', wrap => 0 };
    my $tidy = HTML::Tidy5->new( $tidy_constructor_args );
    $tidy->ignore( type => TIDY_INFO );

    # Encode with the strict 'UTF-8' encoding, matching the ':encoding(UTF-8)'
    # layers used to read the fixtures, rather than Perl's lax 'utf8'.
    my $encoded_html = Encode::encode( 'UTF-8', $html );
    ok( !utf8::is_utf8($encoded_html), 'html is raw bytes' );

    my $clean = $tidy->clean( $encoded_html );
    ok( utf8::is_utf8($clean), 'but cleaned output is string' );

    # Pass the output through remove_specificity() before comparing it with
    # the reference document.
    $clean = remove_specificity( $clean );
    is( $clean, $reference, q{Cleanup didn't break anything} );
};

# Both subtests have run; end the script explicitly before the __DATA__ section.
exit 0;


__DATA__
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
<head>
<meta name="generator" content="TIDY">
<title>日本語のホムページ</title>
</head>
<body>
<p>Unicodeが好きですか?</p>
</body>
</html>