File: HTMLLinkExtractor.pm

package info (click to toggle)
wdg-html-validator 1.0-6
  • links: PTS
  • area: contrib
  • in suites: potato
  • size: 1,404 kB
  • ctags: 103
  • sloc: perl: 2,912; ansic: 1,228; makefile: 48
file content (82 lines) | stat: -rw-r--r-- 1,690 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package HTMLLinkExtractor;

# HTML link extractor
#
# Extracts links from an HTML document.  Only links that may
# be HTML documents (e.g., no IMG SRCs) are extracted.
#
# Based on HTML::LinkExtor and used in a similar way.

require HTML::Parser;
@ISA = qw(HTML::Parser);

use strict;
use vars qw(%LINK_ELEMENT);

# Elements that might contain HTML links and the name of the link attribute
%LINK_ELEMENT =
(
'a'      => 'href',
'img'    => [qw(longdesc usemap)],
'base'   => 'href',
'link'   => 'href',
'area'   => 'href',
'frame'  => [qw(src longdesc)],
'iframe' => [qw(src longdesc)],
'object' => [qw(data usemap)],
'input'  => 'usemap',
'blockquote' => 'cite',
'q'      => 'cite',
'del'    => 'cite',
'ins'    => 'cite',
'head'   => 'profile',
'meta'   => 'content',
);

sub new
{
    my($class, $cb) = @_;
    my $self = $class->SUPER::new;
    $self->{extractlink_cb} = $cb;
    $self;
}

sub start
{
    my($self, $tag, $attr) = @_;  # $attr is reference to a HASH
    return unless exists $LINK_ELEMENT{$tag};

    my $links = $LINK_ELEMENT{$tag};
    $links = [$links] unless ref $links;

    my @links;
    my $a;

    if ($tag eq 'meta') {
      next unless exists $attr->{'http-equiv'};
      next unless exists $attr->{'content'};
      if ($attr->{'http-equiv'} =~ /^refresh$/oi
          && $attr->{'content'} =~ /.+?;\s*url\s*=\s*(.+)$/ois)
      {
        push(@links, $1);
      }
    } else {
      for $a (@$links) {
		next unless exists $attr->{$a};
		push(@links, $attr->{$a});
      }
    }
    return unless @links;
    $self->_found_link(@links);
}

sub _found_link
{
    my $self = shift;
    my $cb = $self->{extractlink_cb};
    if ($cb) {
	&$cb(@_);
    }
}

1;