1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
# Body filter that runs message body data through a caller-supplied
# HTML::Parser object (see the POD after __END__ for usage).
package HTTP::Proxy::BodyFilter::htmlparser;
$HTTP::Proxy::BodyFilter::htmlparser::VERSION = '0.304';
use strict;
use Carp;    # provides croak(), used by init() to reject a bad parser argument
use HTTP::Proxy::BodyFilter;
use vars qw( @ISA );
# Inherit from the generic body filter class.
@ISA = qw( HTTP::Proxy::BodyFilter );
# Initialise the filter with the arguments that were given to the constructor.
#
# Parameters:
#   $parser  - a blessed object that isa HTML::Parser (mandatory)
#   @options - optional key/value pairs; only "rw" is looked at. Its value
#              is stored as-is and later reported by will_modify().
#
# Croaks with "First parameter must be a HTML::Parser object" when the parser
# argument is missing, undefined, not a blessed reference, or not derived
# from HTML::Parser.
sub init {
    my ( $self, $parser, @options ) = @_;

    # blessed() guards the isa() call: calling ->isa on undef or on an
    # unblessed reference would die with an unrelated internal error, and a
    # plain class-name string such as 'HTML::Parser' would wrongly be accepted
    # because isa() would then run as a class method.
    require Scalar::Util;
    croak "First parameter must be a HTML::Parser object"
        unless Scalar::Util::blessed($parser)
        && $parser->isa('HTML::Parser');

    $self->{_parser} = $parser;

    my %args = @options;
    $self->{rw} = delete $args{rw};
}
# Push one chunk of body data through the wrapped HTML::Parser object.
#
# Parameters:
#   $dataref  - reference to the current chunk of body data
#   $message  - made available to parser handlers as $parser->{message}
#   $protocol - made available to parser handlers as $parser->{protocol}
#   $buffer   - when undefined, this chunk is treated as the last one
#
# When the filter was created with a true "rw" value, the chunk is replaced
# by whatever ended up in $parser->{output} while parsing it.
sub filter {
    my ( $self, $dataref, $message, $protocol, $buffer ) = @_;
    my $parser = $self->{_parser};

    # Every chunk starts from an empty output string and is given the
    # message and protocol objects of the current call.
    $parser->{output}   = "";
    $parser->{message}  = $message;
    $parser->{protocol} = $protocol;

    $parser->parse($$dataref);

    # No buffer means no more data will follow: tell the parser so.
    $parser->eof unless defined $buffer;

    $$dataref = $parser->{output} if $self->{rw};
}
# Report whether this filter rewrites the data passing through it: returns
# the "rw" value that init() stored on the filter.
sub will_modify {
    my ($self) = @_;
    return $self->{rw};
}
1;
__END__
=head1 NAME
HTTP::Proxy::BodyFilter::htmlparser - Filter using HTML::Parser
=head1 SYNOPSIS
use HTTP::Proxy::BodyFilter::htmlparser;
# $parser is a HTML::Parser object
$proxy->push_filter(
mime => 'text/html',
response => HTTP::Proxy::BodyFilter::htmlparser->new( $parser )
);
=head1 DESCRIPTION
The L<HTTP::Proxy::BodyFilter::htmlparser> class lets you create a
filter based on the L<HTML::Parser> object of your choice.
This filter takes a L<HTML::Parser> object as an argument to its constructor.
The filter is either read-only or read-write. A read-only filter will
not allow you to change the data on the fly. If you request a read-write
filter, you'll have to rewrite the response-body completely.
With a read-write filter, you B<must> recreate the whole body data. This
is mainly due to the fact that the L<HTML::Parser> has its own buffering
system, and that there is no easy way to correlate the data that triggered
the L<HTML::Parser> event and its original position in the chunk sent by the
origin server. See below for details.
Note that a simple filter that modifies the HTML text (not the tags) can
be created more easily with L<HTTP::Proxy::BodyFilter::htmltext>.
=head2 Creating a HTML::Parser that rewrites pages
A read-write filter is declared by passing C<rw =E<gt> 1> to the constructor:
HTTP::Proxy::BodyFilter::htmlparser->new( $parser, rw => 1 );
To be able to modify the body of a message, a filter created with
L<HTTP::Proxy::BodyFilter::htmlparser> must rewrite it completely. The
L<HTML::Parser> object can update a special attribute named C<output>.
To do so, the L<HTML::Parser> handler will have to request the C<self>
attribute (that is to say, require access to the parser itself) and
update its C<output> key.
The following attributes are added to the L<HTML::Parser> object by this filter:
=over 4
=item output
A string that will hold the data sent back by the proxy.
This string will be used as a replacement for the body data only
if the filter is read-write, that is to say, if it was initialised with
C<rw =E<gt> 1>.
Data should always be B<appended> to C<$parser-E<gt>{output}>.
=item message
A reference to the L<HTTP::Message> that triggered the filter.
=item protocol
A reference to the L<LWP::Protocol> object.
=back
=head1 METHODS
This filter defines three methods, called automatically:
=over 4
=item filter()
The C<filter()> method handles all the interactions with the L<HTML::Parser>
object.
=item init()
Initialise the filter with the HTML::Parser object passed to the constructor.
=item will_modify()
This method returns a boolean value that indicates to the system
if it will modify the data passing through. The value is actually
the value of the C<rw> parameter passed to the constructor.
=back
=head1 SEE ALSO
L<HTTP::Proxy>, L<HTTP::Proxy::BodyFilter>,
L<HTTP::Proxy::BodyFilter::htmltext>.
=head1 AUTHOR
Philippe "BooK" Bruhat, E<lt>book@cpan.orgE<gt>.
=head1 COPYRIGHT
Copyright 2003-2015, Philippe Bruhat.
=head1 LICENSE
This module is free software; you can redistribute it or modify it under
the same terms as Perl itself.
=cut
|