File: scrape_text_by_id.t

package info (click to toggle)
libtest-www-mechanize-perl 1.60-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 380 kB
  • sloc: perl: 2,725; makefile: 4
file content (197 lines) | stat: -rw-r--r-- 6,821 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/perl -T

use strict;
use warnings;

use Test::Builder::Tester;

use Test::More tests => 3;

use URI::file ();

use Test::WWW::Mechanize ();

# Exercise scrape_text_by_id() directly, in both list and scalar context,
# against a series of small hand-written HTML documents.
#
# Plan of 8 = isa_ok + get_ok + the six nested subtests below.
subtest scrape_text_by_id => sub {
    plan tests => 8;

    my $mech = Test::WWW::Mechanize->new( autolint => 0 );
    isa_ok( $mech, 'Test::WWW::Mechanize' );

    # Fetch a real page first so the mech has a current response; every
    # nested subtest then replaces the content with its own markup via
    # update_html().
    my $uri = URI::file->new_abs( 't/goodlinks.html' )->as_string;
    $mech->get_ok( $uri, 'Get a dummy page just to have one' );

    # No element carries the requested ID at all.
    subtest 'nothing to find' => sub {
        plan tests => 2;
        $mech->update_html( '<html><head><title></title></head><body></body></html>' );

        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], [], 'empty list returned in list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), undef, 'undef returned in scalar context' );
    };

    # Exactly one element carries the ID.
    subtest 'find one' => sub {
        plan tests => 2;
        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">contents</p></body></html>' );
        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], ['contents'], 'list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), 'contents', 'scalar context' );
    };

    # Two elements share the ID: list context is expected to yield both
    # texts in document order, scalar context only the first.
    subtest 'find multiple' => sub {
        plan tests => 2;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">contents</p><p id="asdf">further</p></body></html>' );
        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], ['contents', 'further'], 'all strings returned in list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), 'contents', 'first string returned in scalar context' );
    };

    # The element exists but has no text: expect an empty string, which is
    # distinct from the "not found" results checked in 'nothing to find'.
    subtest 'present but empty' => sub {
        plan tests => 2;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf"></p></body></html>' );
        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], [''], 'list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), '', 'scalar context' );
    };

    # Same as above, but written as a self-closing tag.
    subtest 'present but emptier' => sub {
        plan tests => 2;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf" /></body></html>' );
        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], [''], 'list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), '', 'scalar context' );
    };

    # Text inside a child element is expected to be included in the result.
    subtest 'nested tag' => sub {
        plan tests => 2;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">Bob and <b>Bongo!</b></p></body></html>' );
        is_deeply( [$mech->scrape_text_by_id( 'asdf' )], ['Bob and Bongo!'], 'list context' );
        is( $mech->scrape_text_by_id( 'asdf' ), 'Bob and Bongo!', 'scalar context' );
    };
};


# Exercise the scraped_id_is() and scraped_id_like() assertion methods,
# covering both passing cases and the diagnostics emitted on failure.
#
# Plan of 5 = isa_ok + get_ok + the three nested subtests below.
subtest 'scraped_id_is and scraped_id_like' => sub {
    plan tests => 5;

    my $mech = Test::WWW::Mechanize->new( autolint => 0 );
    isa_ok( $mech, 'Test::WWW::Mechanize' );

    # Fetch a real page first so the mech has a current response; every
    # nested subtest then replaces the content with its own markup via
    # update_html().
    my $uri = URI::file->new_abs( 't/goodlinks.html' )->as_string;
    $mech->get_ok( $uri, 'Get a dummy page just to have one' );

    subtest 'find one' => sub {
        plan tests => 2;
        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">contents</p></body></html>' );
        $mech->scraped_id_is( 'asdf', 'contents', 'Works in scalar context' );
        $mech->scraped_id_like( 'asdf', qr/con.+s/, 'Works on regexes' );
    };

    # These calls deliberately pass no description argument.
    subtest 'nested tag' => sub {
        plan tests => 2;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">Bob and <b>Bongo!</b></p></body></html>' );
        $mech->scraped_id_is( 'asdf', 'Bob and Bongo!' );
        $mech->scraped_id_like( 'asdf', qr/Bob.+Bongo/ );
    };

    # Plan of 6 = two plain successes + two test_test() results + two
    # checks on the empty element.
    subtest 'failures' => sub {
        plan tests => 6;

        $mech->update_html( '<html><head><title></title></head><body><p id="asdf">Bob and <b>Bongo!</b></p><p id="empty"></p></body></html>' );

        # Test standard successes.
        $mech->scraped_id_is( 'asdf', 'Bob and Bongo!' );
        $mech->scraped_id_like( 'asdf', qr/Bob.+Bongo/ );

        # Test failures.
        #
        # Test::Builder::Tester captures the output of the assertion under
        # test: test_out() declares the expected TAP line, test_diag() the
        # expected diagnostic, and test_test() compares and reports.
        # test_fail( +2 ) expects the failure to be reported two lines
        # below itself, so nothing may be inserted between test_fail() and
        # the assertion call that follows test_diag().
        test_out( 'not ok 1 - Trying to match nonexistent ID to a string' );
        test_fail( +2 );
        test_diag( q{Can't find ID "nonexistent" to compare to "foo"} );
        $mech->scraped_id_is( 'nonexistent', 'foo', 'Trying to match nonexistent ID to a string' );
        test_test( 'Fails when trying to find nonexistent ID' );

        # The expected diagnostic interpolates the stringified regex, so
        # it is built with qq{} from the same $regex passed to the method.
        my $regex = qr/Dave/ism;
        test_out( 'not ok 1 - Trying to match nonexistent ID to a regex' );
        test_fail( +2 );
        test_diag( qq{Can't find ID "nonexistent" to match against $regex} );
        $mech->scraped_id_like( 'nonexistent', $regex, 'Trying to match nonexistent ID to a regex' );
        test_test( 'Fails when trying to match nonexistent ID against a regex' );

        # Make sure that empty tags don't get seen as non-existent.
        $mech->scraped_id_is( 'empty', '' );
        $mech->scraped_id_like( 'empty', qr/^$/ );

    };
};


# Previous versions would miss a search for id="foo" if it was not
# exactly id="foo".  Here we test for variants.
subtest 'scrape_text_by_id optimization' => sub {
    plan tests => 6;

    # One [ label, markup ] pair per way of writing the id attribute.
    # Every page holds a single element with the ID "fish" whose text is
    # "chips", surrounded by decoy text that must not be picked up.
    my @variants = (
        [ 'Double-quoted ID', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p id="fish">chips</p>also not chips</body></html>
HTML
        [ 'Single-quoted ID', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p id='fish'>chips</p>also not chips</body></html>
HTML
        [ 'Unquoted ID', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p id=fish>chips</p>also not chips</body></html>
HTML
        [ 'Abnormal spacing', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p id = fish >chips</p>also not chips</body></html>
HTML
        [ 'Unquoted broken across lines', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p id
            =
            fish >chips</p>also not chips</body></html>
HTML
        [ 'Quoted broken across lines', <<'HTML' ],
        <html>
            <head><title>Bongo</title></head>
            <body>not chips<p
            id
            =
            "fish"
            >
            chips
            </p>
            also not chips</body></html>
HTML
    );

    # Run the variants in declaration order.  The label becomes part of
    # the subtest name, which is what identifies a failing variant.
    for my $variant ( @variants ) {
        my ( $label, $markup ) = @{$variant};
        _find_the_chips( $markup, $label );
    }
};

# Helper for the ID-variant checks: loads $html into a brand-new mech and
# asserts that the element with the ID "fish" scrapes to "chips".
#
#   $html - markup to install with update_html()
#   $msg  - label embedded in the subtest name; dies if missing or false
#
# Returns whatever subtest() returns.
sub _find_the_chips {
    my ( $html, $msg ) = @_;
    $msg or die;

    # Attribute any failure to the caller's line, not to this helper.
    local $Test::Builder::Level = $Test::Builder::Level + 1;

    my $checks = sub {
        plan tests => 2;

        my $agent = Test::WWW::Mechanize->new( autolint => 0 );
        isa_ok( $agent, 'Test::WWW::Mechanize' );

        $agent->update_html( $html );
        $agent->scraped_id_is( 'fish', 'chips' );
    };

    return subtest( "_find_the_chips( $msg )", $checks );
}


# Mark the end of the test run.  The fixed plan of three, declared on the
# `use Test::More` line, corresponds to the three top-level subtests above.
done_testing();

exit 0;