File: HTMLLinkExtractor.pm

package info (click to toggle)

wdg-html-validator 1.0-6

links: PTS
area: contrib
in suites: potato
size: 1,404 kB
ctags: 103
sloc: perl: 2,912; ansic: 1,228; makefile: 48

file content (82 lines) | stat: -rw-r--r-- 1,690 bytes

package HTMLLinkExtractor;

# HTML link extractor
#
# Extracts links from an HTML document.  Only links that may
# be HTML documents (e.g., no IMG SRCs) are extracted.
#
# Based on HTML::LinkExtor and used in a similar way.

require HTML::Parser;
@ISA = qw(HTML::Parser);

use strict;
use vars qw(%LINK_ELEMENT);

# Elements that might contain HTML links and the name of the link attribute
%LINK_ELEMENT =
(
'a'      => 'href',
'img'    => [qw(longdesc usemap)],
'base'   => 'href',
'link'   => 'href',
'area'   => 'href',
'frame'  => [qw(src longdesc)],
'iframe' => [qw(src longdesc)],
'object' => [qw(data usemap)],
'input'  => 'usemap',
'blockquote' => 'cite',
'q'      => 'cite',
'del'    => 'cite',
'ins'    => 'cite',
'head'   => 'profile',
'meta'   => 'content',
);

sub new
{
    my($class, $cb) = @_;
    my $self = $class->SUPER::new;
    $self->{extractlink_cb} = $cb;
    $self;
}

sub start
{
    my($self, $tag, $attr) = @_;  # $attr is reference to a HASH
    return unless exists $LINK_ELEMENT{$tag};

    my $links = $LINK_ELEMENT{$tag};
    $links = [$links] unless ref $links;

    my @links;
    my $a;

    if ($tag eq 'meta') {
      next unless exists $attr->{'http-equiv'};
      next unless exists $attr->{'content'};
      if ($attr->{'http-equiv'} =~ /^refresh$/oi
          && $attr->{'content'} =~ /.+?;\s*url\s*=\s*(.+)$/ois)
      {
        push(@links, $1);
      }
    } else {
      for $a (@$links) {
		next unless exists $attr->{$a};
		push(@links, $attr->{$a});
      }
    }
    return unless @links;
    $self->_found_link(@links);
}

sub _found_link
{
    my $self = shift;
    my $cb = $self->{extractlink_cb};
    if ($cb) {
	&$cb(@_);
    }
}

1;