1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
package SHARYANTO::HTML::Extract::ImageLinks;
our $DATE = '2015-09-04'; # DATE
our $VERSION = '0.77'; # VERSION
use 5.010;
use strict;
use warnings;
use HTML::Parser;
use URI::URL;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(extract_image_links);
# Function metadata, keyed by function name: a one-line summary, a longer
# description and the schema of each named argument.
our %SPEC;
$SPEC{extract_image_links} = {
    v => 1.1,
    summary => 'Extract image links from HTML document',
    description => <<'_',

Return the `src` URL of each `<img>` element found in `html`. Relative URLs are
resolved against `base`; when `base` is not given, the document's own
`<base href>` is used if present. Fragments (`#...`) are stripped and each
distinct URL is returned only once.

_
    args => {
        # The document to scan; the only required argument.
        html => {
            schema => 'str*',
            req => 1,
            summary => 'HTML document to extract from',
            # NOTE(review): presumably lets a command-line wrapper feed the
            # document from stdin or from files -- confirm against the wrapper.
            cmdline_src => 'stdin_or_files',
        },
        # Optional; falls back to the document's <base href> when absent.
        base => {
            schema => 'str',
            summary => 'base URL for images',
        },
    },
};
# Extract the URLs of all <img> elements in an HTML document.
#
# Arguments (named):
#   html - HTML document as a string (required)
#   base - base URL used to make relative image URLs absolute (optional);
#          when false, the first <base href=...> found in the document is used
#
# Returns an enveloped result: [200, "OK", \@urls] on success, where @urls is
# in document order with fragments stripped and duplicates removed, or
# [400, $message] when html is not supplied.
sub extract_image_links {
    my %args = @_;
    my $html = $args{html};
    my $base = $args{base};

    return [400, "Please specify html"] unless defined $html;

    # get base from <BASE HREF> if exists
    #
    # The value may be double-quoted, single-quoted or bare. Each form has its
    # own alternative so that a bare value is captured whole rather than being
    # cut off after its first character.
    if (!$base
        && $html =~ /<base\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/is) {
        my $found = $1 // $2 // $3;
        $base = $found if length $found;
    }

    my %seen;
    my @res;
    my $p = HTML::Parser->new(
        api_version => 3,
        start_h => [
            sub {
                my ($tagname, $attr) = @_;
                return unless lc($tagname) eq 'img';

                # Work on a copy so the parser's attribute hash is left alone.
                my $src = $attr->{src};
                return unless defined $src;     # <img> without src: nothing to report

                $src =~ s/#.*//;                # the fragment does not identify a different image
                return unless length $src;      # empty or fragment-only src is not an image link
                return if $seen{$src}++;        # report each distinct URL once

                push @res, URI::URL->new($src, $base)->abs()->as_string;
            }, "tagname, attr"],
    );
    $p->parse($html);
    $p->eof;    # signal end of document so the parser flushes anything buffered
    [200, "OK", \@res];
}
1;
# ABSTRACT: Extract image links from HTML document
__END__
=pod
=encoding UTF-8
=head1 NAME
SHARYANTO::HTML::Extract::ImageLinks - Extract image links from HTML document
=head1 VERSION
This document describes version 0.77 of SHARYANTO::HTML::Extract::ImageLinks (from Perl distribution SHARYANTO-Utils), released on 2015-09-04.
=head1 SEE ALSO
L<SHARYANTO>
=head1 FUNCTIONS
=head2 extract_image_links(%args) -> [status, msg, result, meta]
Extract image links from HTML document.
The C<html> argument is required; C<base> is optional and is used to resolve relative image URLs.
Arguments ('*' denotes required arguments):
=over 4
=item * B<base> => I<str>
base URL for images.
=item * B<html>* => I<str>
HTML document to extract from.
=back
Returns an enveloped result (an array).
First element (status) is an integer containing HTTP status code
(200 means OK, 4xx caller error, 5xx function error). Second element
(msg) is a string containing error message, or 'OK' if status is
200. Third element (result) is optional, the actual result. Fourth
element (meta) is called result metadata and is optional, a hash
that contains extra information.
Return value: (any)
=head1 HOMEPAGE
Please visit the project's homepage at L<https://metacpan.org/release/SHARYANTO-Utils>.
=head1 SOURCE
Source repository is at L<https://github.com/perlancar/perl-SHARYANTO-Utils>.
=head1 BUGS
Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=SHARYANTO-Utils>.
When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.
=head1 AUTHOR
perlancar <perlancar@cpan.org>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2015 by perlancar@cpan.org.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
|