1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#!/usr/bin/perl -w
# Copyright (C) 2002 Nigel Horne <njh@bandsman.co.uk>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
use strict;
use WWW::RobotRules::AnyDBM_File;
use LWP::RobotUA; # for spidering and cache lookup
use HTTP::Cache::Transparent;
HTTP::Cache::Transparent::init( { BasePath => '/tmp/cache',
Verbose => 1 } );
my $url = "http://www.bandsman.co.uk";
my $rules = WWW::RobotRules::AnyDBM_File->new('www.bandsman.co.uk/Spider', '/tmp/robots.cache');
my $robot = LWP::RobotUA->new('www.bandsman.co.uk/Spider', 'njh@despammed.com', $rules);
$robot->timeout(20);
# $robot->delay(1/60); # wait 1 second between accesses to same site
$robot->delay(0);
$robot->protocols_allowed(['http']); # disabling all others
my $request = new HTTP::Request 'GET' => $url;
$request->header('Accept' => 'text/html');
$robot->max_size(2048);
$request->header('Accept-Encoding' => 'gzip; deflate',
'Referer' => 'http://www.bandsman.co.uk');
my $webdoc = $robot->simple_request($request);
if(!$webdoc->is_success) {
die $webdoc->status_line . "\n";
exit 1;
}
my $content = $webdoc->content;
if(my $encoding = $webdoc->content_encoding) {
if($encoding =~ /gzip/i) {
$content = Compress::Zlib::memGunzip($content);
if(!defined $content) {
die "$url can't be gunziped\n";
}
} elsif($encoding =~ /deflate/i) {
$content = Compress::Zlib::uncompress($content);
if(!defined $content) {
die "$url can't be uncompressed\n";
}
}
}
print "$url: " . length($content) . " bytes\n";
print $content . "\n";
|