File: Body.pm

package info (click to toggle)
libmail-mboxparser-perl 0.55-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 404 kB
  • sloc: perl: 1,011; makefile: 2
file content (341 lines) | stat: -rw-r--r-- 8,594 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# Mail::MboxParser - object-oriented access to UNIX-mailboxes
# Body.pm		   - the (textual) body of an email
#
# Copyright (C) 2001  Tassilo v. Parseval
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.

# Version: $Id: Body.pm,v 1.14 2002/02/21 09:06:14 parkerpine Exp $

package Mail::MboxParser::Mail::Body;

require 5.004;

use Carp;

use strict;
use base qw(Exporter);
use vars qw($VERSION @EXPORT @ISA $AUTOLOAD $_HAVE_NOT_URI_FIND);
$VERSION 	= "0.15";
@EXPORT  	= qw();
@ISA	 	= qw(Mail::MboxParser::Base Mail::MboxParser::Mail);

use overload '""' => sub { shift->as_string }, fallback => 1;

BEGIN { 
    eval { require URI::Find; };
    if ($@) { 
	$_HAVE_NOT_URI_FIND = 1;
    }
}

sub init(@) {
    my ($self, $ent, $bound, $conf) = @_;
    $self->{CONTENT}	= $ent->body; 
    $self->{BOUNDARY}	= $bound;	     # the one in Content-type
    $self->{ARGS}	= $conf;

    $self->{ARGS}->{decode} ||= 'NEVER';

    $self->_make_decoder($ent->head->mime_encoding) 
	if $self->{ARGS}->{decode} =~ /BODY|ALL/;;
    $self;
}

sub _make_decoder {
    my ($self, $enc) = @_;
    if ($enc eq 'base64') {
	require MIME::Base64;
	return $self->{DECODER} = sub { MIME::Base64::decode_base64(shift) };
    } 
    if ($enc eq 'quoted-printable') {
	require MIME::QuotedPrint;
	return $self->{DECODER} = sub { MIME::QuotedPrint::decode_qp(shift) };
    }
    $self->{DECODER} = sub { $_[0] };
} 

sub as_string {	
    my ($self, %args) = @_;
    $self->reset_last;
    return join "", $self->as_lines(strip_sig => 1) if $args{strip_sig};
    my $decode = $self->{ARGS}->{decode};
    if ($decode eq 'BODY' || $decode eq 'ALL') {
	return join "", map { $self->{DECODER}->($_) } @{$self->{CONTENT}};
    }
    return join "", @{$self->{CONTENT}};
}
	

sub as_lines() { 
    my ($self, %args) = @_;
    $self->reset_last;
    my $decode = $self->{ARGS}->{decode};
    if ($decode eq 'BODY' || $decode eq 'ALL') {
	return map { $self->{DECODER}->($_) } @{$self->{CONTENT}};
    }

    return @{$self->{CONTENT}} if ! $args{strip_sig};

    my @lines;
    for (@{ $self->{CONTENT} }) {
	last if /^--\040?[\r\n]?$/;
	push @lines, $_;
    }
    return @lines;
}
					   
	
sub signature() {
    my $self = shift;
    $self->reset_last;
    my $decode = $self->{ARGS}->{decode};
    my $bound = $self->{BOUNDARY};

    my @signature;
    my $seperator = 0;
    for (@{$self->{CONTENT}}) {

	# we are still outside the signature
	if (! /^--\040?[\r\n]?$/ && not $seperator) {
	    next;
	}

	# we hit the signature delimiter (--)
	elsif (not $seperator) { $seperator = 1; next }

	chomp;	

	# we are inside signature: is line perhaps MIME-boundary?
	last if $bound && /^--\Q$bound\E/ && $seperator;

	# none of the above => signature line
	push @signature, $_; 
    }
	
    $self->{LAST_ERR} = "No signature found" if !@signature;
    if ($decode eq 'BODY' || $decode eq 'ALL') {
	$_ = $self->{DECODER}->($_) for @signature;
    }
    return @signature if $seperator;
    return ();
}

sub extract_urls(@) {
    my ($self, %args) = @_;
    $self->reset_last;

    $args{unique} = 0 if not exists $args{unique};

    if ($_HAVE_NOT_URI_FIND) {
	carp <<EOW;
You need the URI::Find module in order to use extract_urls.
EOW
	return;
    }
    else { 
	my @uris; my %seen;

	for my $line (@{$self->{CONTENT}}) {
	    chomp $line;
	    URI::Find::find_uris($line, sub {
		    my (undef, $url) = @_;
		    $line =~ s/^\s+|\s+$//;
		    if (not $seen{$url}) {
			push @uris, { url => $url, context => $line };
		    }
		    $seen{$url}++ if $args{unique};
		}
	    );
	}
	$self->{LAST_ERR} = "No URLs found" if @uris == 0;

	return @uris;
    }
}

sub quotes() {
    my $self = shift;
    my $decode = $self->{ARGS}->{decode};
    $self->reset_last;

    my %ret;
    my $q 		= 0; # num of '>'
    my $in 		= 0; # being inside a quote
    my $last 	= 0; # num of quotes in last line

    for (@{$self->{CONTENT}}) {

	if ($decode eq 'ALL' || $decode eq 'BODY') {
	    $_ = $self->{DECODER}->($_);
	}
        
	# count quotation signs
	$q = 0;
	my $t = "a" x length;
	for my $c (unpack $t, $_) {
	    if ($c eq '>') 		{ $q++ }
	    if ($c ne '>' && $c ne ' ') { last }
	}

	# first: create a hash-element for level $q
	if (! exists $ret{$q}) {
	    $ret{$q} = [];
	}

	# if last line had the same level as current one:
	# attach the line to the last one
	if ($last == $q) {
	    if (@{$ret{$q}} == 0)   { $ret{$q}->[$q] .= $_ }
	    else		    { $ret{$q}->[-1] .= $_ }
	}
	# if not:
	# create a new array-element in the appropriate hash-element
	else { 
	    push @{$ret{$q}}, $_;
	}
	$last = $q;
    }
    return \%ret;
}


1;

__END__

=head1 NAME

Mail::MboxParser::Mail::Body - rudimentary mail-body object

=head1 SYNOPSIS

    use Mail::MboxParser;

    [...]

    # $msg is a Mail::MboxParser::Mail
    my $body = $msg->body(0);

    # or preferably

    my $body = $msg->body($msg->find_body);

    for my $line ($body->signature) { print $line, "\n" }
    for my $url ($body->extract_urls(unique => 1)) {
        print $url->{url}, "\n";
        print $url->{context}, "\n";
    }
        
=head1 DESCRIPTION

This class represents the body of an email-message.  Since emails can have
multiple MIME-parts and each of these parts has a body it is not always easy to
say which part actually holds the text of the message (if there is any at all).
Mail::MboxParser::Mail::find_body will help and suggest a part.

=head1 METHODS

=over 4

=item B<as_string ([strip_sig =E<gt> 1])>

Returns the textual representation of the body as one string. Decoding takes
place when the mailbox has been opened using the decode => 'BODY' | 'ALL'
option.

If 'strip_sig' is set to a true value, the signature is stripped from the
string.

=item B<as_lines ([strip_sig =E<gt> 1])>

Sames as as_string() just that you get an array of lines with newlines attached
to each line.

B<NOTE:> When the body is actually some encoded binary data (most commonly such
a body is base64-encoded), you can still use this method. Then you wont really
get proper lines. Instead you get chunks of binary data that you should
concatenate as in

    my $binary = join "", $body->as_lines;

If 'strip_sig' is set to a true value, the signature is stripped from the
string.

=item B<signature>

Returns the signature of a message as an array of lines. Trailing newlines are
already removed.

$body->error returns a string if no signature has been found.

=item B<extract_urls>

=item B<extract_urls (unique =E<gt> 1)>

Returns an array of hash-refs. Each hash-ref has two fields: 'url' and
'context' where context is the line in which the 'url' appeared.

When calling it like $mail->extract_urls(unique => 1), duplicate URLs will be
filtered out regardless of the 'context'. That's useful if you just want a list
of all URLs that can be found in your mails.

$body->error() will return a string if no URLs could be found within the body.

=item B<quotes>

Returns a hash-ref of array-refs where the hash-keys are the several levels of
quotation. Each array-element contains the paragraphs of this quotation-level
as one string. Example:

	my $quotes = $msg->body($msg->find_body)->quotes;
	print $quotes->{1}->[0], "\n";
	print $quotes->{0}->[0], "\n";

This should print the first paragraph of the mail-body that has been quoted
once and below that the paragraph that supposedly is the reply to this
paragraph. Perhaps thus:

	> I had been trying to work with the CGI module 
	> but I didn't yet fully understand it.

	Ah, it is tricky. Have you read the CGI-FAQ that 
	comes with the module?

Mark that empty lines will not be ignored and are part of the lines contained
in the array of $quotes->{0}.

So below is a little code-snippet that should, in most cases, restore the first
5 paragraphs (containing quote-level 0 and 1) of an email:

	for (0 .. 4) {
		print $quotes->{0}->[$_];
		print $quotes->{1}->[$_];
	}

Since quotes() considers an empty line between two quotes paragraphs as a
paragraph in $quotes->{0}, the paragraphs with one quote and those with zero
are balanced. That means: 

scalar @{$quotes->{0}} - DIFF == scalar @{$quotes->{1}} where DIFF is element
of {-1, 0, 1}.

Unfortunately, quotes() can up to now only deal with '>' as quotation-marks.

=back

=head1 VERSION

This is version 0.55.

=head1 AUTHOR AND COPYRIGHT

Tassilo von Parseval <tassilo.von.parseval@rwth-aachen.de>

Copyright (c)  2001-2005 Tassilo von Parseval.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

=cut