1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
package HTMLLinkExtractor;
# HTML link extractor
#
# Extracts links from an HTML document. Only links that may
# be HTML documents (e.g., no IMG SRCs) are extracted.
#
# Based on HTML::LinkExtor and used in a similar way.
require HTML::Parser;
@ISA = qw(HTML::Parser);
use strict;
use vars qw(%LINK_ELEMENT);
# Elements that might contain HTML links and the name of the link attribute
%LINK_ELEMENT =
(
'a' => 'href',
'img' => [qw(longdesc usemap)],
'base' => 'href',
'link' => 'href',
'area' => 'href',
'frame' => [qw(src longdesc)],
'iframe' => [qw(src longdesc)],
'object' => [qw(data usemap)],
'input' => 'usemap',
'blockquote' => 'cite',
'q' => 'cite',
'del' => 'cite',
'ins' => 'cite',
'head' => 'profile',
'meta' => 'content',
);
sub new
{
my($class, $cb) = @_;
my $self = $class->SUPER::new;
$self->{extractlink_cb} = $cb;
$self;
}
sub start
{
my($self, $tag, $attr) = @_; # $attr is reference to a HASH
return unless exists $LINK_ELEMENT{$tag};
my $links = $LINK_ELEMENT{$tag};
$links = [$links] unless ref $links;
my @links;
my $a;
if ($tag eq 'meta') {
next unless exists $attr->{'http-equiv'};
next unless exists $attr->{'content'};
if ($attr->{'http-equiv'} =~ /^refresh$/oi
&& $attr->{'content'} =~ /.+?;\s*url\s*=\s*(.+)$/ois)
{
push(@links, $1);
}
} else {
for $a (@$links) {
next unless exists $attr->{$a};
push(@links, $attr->{$a});
}
}
return unless @links;
$self->_found_link(@links);
}
sub _found_link
{
my $self = shift;
my $cb = $self->{extractlink_cb};
if ($cb) {
&$cb(@_);
}
}
1;
|