package SpiderUA;

use strict;

require LWP::RobotUA;
require HTTP::Response;
require HTTP::Status;
use URI ();

use vars qw(@ISA $VERSION);
@ISA = qw(LWP::RobotUA);
$VERSION = "0.8";

=head1 NAME

SpiderUA - A class for Web spiders

=head1 SYNOPSIS

 use SpiderUA;
 $ua = SpiderUA->new('my-spider/0.1', 'me@foo.com');
 ...

 # use it just like LWP::RobotUA
 $res = $ua->request($req);

=head1 DESCRIPTION

This class implements a user agent that is suitable for spidering Web
sites. The class inherits from I<LWP::RobotUA> so that it behaves as a
Web robot should. In addition, I<SpiderUA> never visits the same URL
twice, and it never visits a URL outside the scope of the original
URL--that is, a URL on a different host or port, or one whose path does
not fall under the original URL's base directory.

For example, if the original URL requested is
"http://www.example.com/foo/bar.html", then I<SpiderUA> will return a
403 response--"Forbidden by spider rules"--for
"http://offsite.example.com/" and "http://www.example.com/baz.html",
while allowing "http://www.example.com/foo/" and
"http://www.example.com/foo/bar/baz.html".

=head1 METHODS

I<SpiderUA> is a subclass of I<LWP::RobotUA> and implements the same
methods. Only the constructor differs slightly:

=over 4

=cut

=item $ua = SpiderUA->new($agent_name, $from)

Your spider's name and the email address of the human responsible for
the spider (i.e. you) are required by the constructor.

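Because the object returned is also an I<LWP::RobotUA>, the usual robot
settings are available on it. The values below are only illustrative:

 my $ua = SpiderUA->new('my-spider/0.1', 'me@foo.com');
 $ua->delay(0.5);   # LWP::RobotUA setting: wait half a minute between requests
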
=cut
sub new
{
    my ($class, $name, $from) = @_;
    my $self = LWP::RobotUA->new($name, $from);
    return bless $self, $class;
}
sub request
{
    my ($self, $request, $arg, $size) = @_;
    my $res = $self->SUPER::request($request, $arg, $size);
    unless (exists $self->{'start_uri'}) {
        # The first time that the request method is called, a
        # recursive call will be made to fetch robots.txt; skip
        # that URL when recording the starting URI.
        if ($request->url->path ne '/robots.txt') {
            if (defined $res->request) {
                # Use the URL of the request actually performed,
                # in case we were redirected.
                $self->{'start_uri'} = $res->request->url->canonical;
            } else {
                $self->{'start_uri'} = $request->url->canonical;
            }
            # Remember everything up to the last slash as the base path.
            if ($self->{'start_uri'}->path =~ m#(.*/)#s) {
                $self->{'start_uri_base_path'} = $1;
            }
        }
    }
    return $res;
}
sub simple_request
{
    my ($self, $request, $arg, $size) = @_;
    my $uri = $request->url->canonical;
    if (exists $self->{'start_uri'}) {
        # Check if our spider should visit this URI.  We visit a
        # URI only if each of the following is true:
        #   1. the host and port match the starting URI;
        #   2. the URI's path begins with the starting URI's base path;
        #   3. we have not previously visited the URI.
        unless ($uri->host_port eq $self->{'start_uri'}->host_port
                && index($uri->path, $self->{'start_uri_base_path'}) == 0
                && !exists $self->{'visited'}{$uri->as_string})
        {
            return HTTP::Response->new(
                &HTTP::Status::RC_FORBIDDEN, 'Forbidden by spider rules');
        }
    }

    # Perform the request and record the URI as visited.
    my $res = $self->SUPER::simple_request($request, $arg, $size);
    $self->{'visited'}{$uri->as_string} = 1;
    return $res;
}
1;
=back

=head1 SEE ALSO

L<LWP::RobotUA>, L<LWP::UserAgent>

=head1 COPYRIGHT

Copyright 2001 Liam Quinn.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut