File: Scrubber.pm

package info (click to toggle)
libperlanet-perl 2.2.1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 444 kB
  • sloc: xml: 1,177; perl: 757; sh: 6; makefile: 5
file content (137 lines) | stat: -rw-r--r-- 2,942 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package Perlanet::Trait::Scrubber;

use 5.10.0;
use strict;
use warnings;

use Moose::Role;
use namespace::autoclean;

use HTML::Scrubber;

=head1 NAME

Perlanet::Trait::Scrubber - clean posts with HTML::Scrubber before aggregating

=head1 DESCRIPTION

Before a post is added to the aggregated feed, it is first cleaned with
L<HTML::Scrubber>.

=head1 ATTRIBUTES

=head2 scrubber

An instance of L<HTML::Scrubber> used to remove unwanted content from
the feed entries. For the default settings, see C<_build_scrubber> in the
source of this module.

=cut

# Read-write attribute holding the HTML::Scrubber object used by clean_html.
# lazy_build asks Moose to populate it on first access via the conventional
# builder name, _build_scrubber (see Moose::Manual::Attributes).
has scrubber => (
  lazy_build => 1,
  is         => 'rw',
);

# Builder for the 'scrubber' attribute.
#
# Creates an HTML::Scrubber object, loads it with the per-tag rules in
# %scrub_rules and the default attribute rules in %scrub_def, disables
# <script> processing, and returns the configured object.
#
# Takes only the invocant (unused) and returns the HTML::Scrubber instance.
sub _build_scrubber {
  my $self = shift;

  # Attribute values that start with "script" or "javascript" (any case)
  # are rejected; everything else is accepted.
  my $not_script = qr{^(?!(?:java)?script)}i;

  # Per-tag rules. A hash value lists attribute rules for that tag;
  # a plain 0 denies the tag.
  my %scrub_rules = (
    img => {
      src   => qr{^https?://},  # only http:// or https:// URLs
      alt   => 1,               # alt attributes allowed
      align => 1,               # allow align on images
      style => 1,
      width => 1,
      height => 1,
      '*'   => 0,               # deny all others
    },
    style => 0,
    script => 0,
    span => {
      id => 0,                  # blogger(?) includes spans with id attribute
    },
    a => {
      href => 1,
      '*'  => 0,
    },
    iframe => {
      frameborder => 0,
      scrolling   => 0,
      seamless    => 0,
      '*'         => 1,         # every other iframe attribute is allowed
    },
    # NOTE(review): "centre" is not an HTML element name; "center" may have
    # been intended. Left unchanged to preserve current output - confirm.
    centre => 0,
  );

  # Default attribute rules for HTML::Scrubber, used for tags that have no
  # attribute hash of their own above.
  #
  # 'src' must appear exactly once here: a second "'src' => 1" entry would
  # replace the regex (the last duplicate key in a hash list wins) and let
  # script: / javascript: URLs through.
  my %scrub_def = (
    '*'           => 1,
    'href'        => $not_script,
    'src'         => $not_script,
    'cite'        => $not_script,
    'language'    => 0,
    'name'        => 1,
    'value'       => 1,
    'onblur'      => 0,
    'onchange'    => 0,
    'onclick'     => 0,
    'ondblclick'  => 0,
    'onerror'     => 0,
    'onfocus'     => 0,
    'onkeydown'   => 0,
    'onkeypress'  => 0,
    'onkeyup'     => 0,
    'onload'      => 0,
    'onmousedown' => 0,
    'onmousemove' => 0,
    'onmouseout'  => 0,
    'onmouseover' => 0,
    'onmouseup'   => 0,
    'onreset'     => 0,
    'onselect'    => 0,
    'onsubmit'    => 0,
    'onunload'    => 0,
    'type'        => 1,
    'style'       => 1,
    'class'       => 0,
    'id'          => 0,
    'frameborder' => 0,
    'border'      => 0,
  );

  my $scrub = HTML::Scrubber->new;
  $scrub->rules(%scrub_rules);
  $scrub->default(1, \%scrub_def);   # allow unknown tags, filter attributes
  $scrub->script(0);

  return $scrub;
}

# Wrap clean_html: run the wrapped implementation first, then pass its
# result through this object's HTML::Scrubber instance and return that.
# Emits a debug warning naming this package when PERLANET_DEBUG is set.
around 'clean_html' => sub {
  my ($next, $self, $html) = @_;

  if ($ENV{PERLANET_DEBUG}) {
    warn __PACKAGE__, '::clean_html';
  }

  # Both calls are kept in scalar context.
  my $cleaned = $self->$next($html);

  return scalar $self->scrubber->scrub($cleaned);
};

=head1 AUTHOR

Dave Cross, <dave@perlhacks.com>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2010 by Magnum Solutions Ltd.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.10.0 or,
at your option, any later version of Perl 5 you may have available.

=cut

1;