File: html_text.pm

package info (click to toggle)
libcatmandu-html-perl 0.02%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 176 kB
  • sloc: perl: 305; makefile: 2
file content (85 lines) | stat: -rw-r--r-- 1,463 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
package Catmandu::Fix::html_text;

our $VERSION = '0.02';

use Catmandu::Sane;
use Moo;
use Catmandu::Util;
use Catmandu::Fix::Has;

# Compose the Catmandu::Fix::Inlineable role into this fix.
with 'Catmandu::Fix::Inlineable';

# Named fix options (the SYNOPSIS shows the call form, e.g. html_text(split:1)).
# join  - separator placed between the token texts when the result is a
#         single string; fix() falls back to the empty string when unset.
# split - when true, fix() stores the token texts as an array ref instead of
#         joining them into one string.
has join  => (fix_opt => 1);
has split => (fix_opt => 1);

# Replace the token list in $data->{html} by the source text of its tokens.
#
# Parameters:
#   $self - the fix object; its join and split options are read.
#   $data - the record (hash ref) being fixed.
#
# Every token is an array ref whose first element is a type marker. The
# text to keep sits at a different position for each marker; tokens with
# any other marker are dropped.
#
# Returns $data. A record whose html field is not an array ref is returned
# untouched. Otherwise html becomes an array ref of texts when split is
# true, or a single string glued together with the join option (default '').
sub fix {
    my ($self, $data) = @_;

    my $tokens = $data->{html};
    return $data unless Catmandu::Util::is_array_ref($tokens);

    my $separator  = $self->join // '';
    my $want_array = $self->split;

    # Position of the text inside a token, keyed by the token's type marker.
    my %text_slot_for = (
        S  => 4,
        E  => 2,
        T  => 1,
        C  => 1,
        D  => 1,
        PI => 2,
    );

    my @texts;
    for my $token (@{$tokens}) {
        my $slot = $text_slot_for{ $token->[0] };
        next unless defined $slot;
        push @texts, $token->[$slot];
    }

    $data->{html} = $want_array ? \@texts : join($separator, @texts);

    return $data;
}

1;

__END__

=pod

=head1 NAME

Catmandu::Fix::html_text - replace the parsed HTML tokens in the html field by their source text

=head1 SYNOPSIS

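   # replace the token list in html by the tokens' source text, joined into one string
   html_text()
   # returns:
   #  html: "<html>...</html>"

   # same, but put a separator between the token texts
   html_text(join:"\n")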

   # keep only the text but return an array_ref
   html_text(split:1)
   # returns:
   #  html:
   #    - <html>
   #    - ...
   #    - </html>

=head1 SEE ALSO

L<Catmandu::Fix>

=cut