File: AutoPagerize.pm

package info (click to toggle)
libhtml-autopagerize-perl 0.02-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 264 kB
  • sloc: perl: 4,318; makefile: 6
file content (125 lines) | stat: -rw-r--r-- 2,808 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package HTML::AutoPagerize;

use strict;
use 5.8.1;
our $VERSION = '0.02';

use Carp;
use HTML::TreeBuilder::XPath;
use URI;

# Constructor: builds an instance whose list of registered sites starts empty.
sub new {
    my ($class) = @_;

    my $self = { sites => [] };
    return bless $self, $class;
}

# Combined getter/setter for the list of registered sites.
# With an argument, that value replaces the stored list (even if it is
# undef); the stored value is returned in either case.
sub sites {
    my ($self, @args) = @_;

    if (@args) {
        $self->{sites} = $args[0];
    }

    return $self->{sites};
}

# Returns a new array reference holding the registered sites ordered by the
# length of their url value, longest first. The stored list is not modified.
sub sorted_sites {
    my ($self) = @_;

    my @longest_first = sort {
        length($b->{url}) <=> length($a->{url})
    } @{ $self->sites };

    return \@longest_first;
}

# Registers one SITEINFO entry given as key/value pairs.
# 'url' and 'nextLink' are mandatory; croaks if either is undefined.
# The url value is compiled into a regexp before the entry is appended to
# the list of sites. The return value is whatever push returns.
sub add_site {
    my ($self, %site) = @_;

    foreach my $key ('url', 'nextLink') {
        croak "key '$key' needed for SITEINFO" if !defined $site{$key};
    }

    # Store the pattern precompiled so matching does not recompile it.
    my $pattern = $site{url};
    $site{url} = qr/$pattern/;

    push @{ $self->{sites} }, \%site;
}

# Extracts paging information from a fetched page.
#
# Arguments:
#   $uri  - address of the page; used to select the SITEINFO entry and as
#           the base for resolving the next link.
#   $html - HTML source of the page.
#
# Returns early (bare return) when no registered site matches $uri.
# Otherwise returns $res, which is a hash reference that may contain:
#   next_link    - URI object built from the href of the first node matched
#                  by the site's nextLink XPath, made absolute against $uri.
#   page_element - the result of findnodes() for the site's pageElement
#                  XPath (only looked up when the site defines pageElement).
# A key is present only when its lookup produced a usable result; when
# neither did, $res is still undef.
sub handle {
    my($self, $uri, $html) = @_;

    my $siteinfo = $self->site_info_for($uri) or return;

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse($html);
    # parse() alone leaves the parser waiting for more input; eof() marks the
    # end of the document so buffered trailing content reaches the tree.
    $tree->eof;
    # NOTE(review): the tree is not delete()d because the nodes handed back
    # in page_element belong to it -- confirm whether callers must clean up.

    my $res;

    my $next_link = $siteinfo->{nextLink};
    if (my $nodes = $tree->findnodes($next_link)) {
        # Only the first match is used. Guard against an empty result (no
        # node to call attr() on) and against an element without an href,
        # which would otherwise be resolved as a link to the current page.
        my $node = $nodes->shift;
        my $href = defined $node ? $node->attr('href') : undef;
        if (defined $href) {
            $res->{next_link} = URI->new_abs($href, $uri);
        }
    }

    if (my $page_element = $siteinfo->{pageElement}) {
        if (my $nodes = $tree->findnodes($page_element)) {
            $res->{page_element} = $nodes;
        }
    }

    return $res;
}

# Looks up the SITEINFO entry for $uri: walks the sites in the order given by
# sorted_sites and returns the first one whose url pattern matches.
# Returns nothing (bare return) when no site matches.
sub site_info_for {
    my ($self, $uri) = @_;

    my $candidates = $self->sorted_sites;
    foreach my $info (@$candidates) {
        next unless $uri =~ $info->{url};
        return $info;
    }

    return;
}

1;
__END__

=for stopwords AutoPagerize SITEINFO userscript

=head1 NAME

HTML::AutoPagerize - Utility to load AutoPagerize SITEINFO stuff

=head1 SYNOPSIS

  use HTML::AutoPagerize;

  my $autopager = HTML::AutoPagerize->new;
  $autopager->add_site(
      url         => 'http://.+.tumblr.com/',
      nextLink    => '//div[@id="content" or @id="container"]/div[last()]/a[last()]',
      pageElement => '//div[@id="content" or @id="container"]/div[@class!="footer" or @class!="navigation"]',
  );

  my $uri  = 'http://otsune.tumblr.com/';
  my $html = LWP::Simple::get($uri);

  my $res = $autopager->handle($uri, $html);
  if ($res) {
      my $next_link = $res->{next_link};    # URI object
      my $content   = $res->{page_element}; # XML::XPathEngine::NodeSet object. may be empty
  }

=head1 DESCRIPTION

HTML::AutoPagerize is a utility module to load SITEINFO defined in
AutoPagerize. AutoPagerize is a userscript that automatically figures
out the I<next link> of the current page, then fetches that page and
inserts its content by extracting the I<page element>.

=head1 AUTHOR

Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<WWW::Mechanize::AutoPager>, L<http://swdyh.infogami.com/autopagerize>

=cut