File: Table.pm

package info (click to toggle)
libhtml-tableparser-perl 0.43-1
links: PTS, VCS
area: main
in suites: bookworm, bullseye, buster, forky, sid, trixie
size: 604 kB
sloc: perl: 927; makefile: 2
file content (826 lines) | stat: -rw-r--r-- 18,484 bytes
package HTML::TableParser::Table;

# ABSTRACT: support class for HTML::TableParser

use strict;
use warnings;

use HTML::Entities qw();

our $VERSION = '0.43';

## no critic ( ProhibitAccessOfPrivateData )

#pod =method new
#pod
#pod =cut

sub new
{
  my $this = shift;

  my $class = ref($this) || $this;

  my $self = {
              data      => [[]],        # row data (for overlapping rows)
              row       => undef,       # row info
              col       => undef,       # column info
              hdr       => undef,       # accumulated header info
              hdr_row   => 0,           # index of header row
              hdr_line  => undef,       # line in file of header row
              in_hdr    => 0,           # are we in a header row?
              prev_hdr  => 0,           # was the previous row a header row?
              line      => undef,       # line in file of current row
              start_line => undef,      # line in file of table start
              req       => undef,       # the matching table request
              exclreqs  => {},          # the requests which exlude this table
             };

  bless $self, $class;

  my ( $parser, $ids, $reqs, $line ) = @_;

  $self->{parser} = $parser;
  $self->{start_line} = $line;

  # if called with no args, create an empty, placeholder object
  unless ( defined $ids )
  {
    $self->{ids} = [ 0 ];
    $self->{process} = 0;
    $self->{id} =  'sentinel';
  }

  else
  {
    $ids->[-1]++;
    $self->{oids} = [ @$ids ];
    $self->{ids} = [ @$ids, 0 ];
    $self->{id} =  join( '.', grep { $_ != 0 } @{$ids} );

    $self->{reqs} = $reqs;

    # are we interested in this table?
    $self->match_id();

    # inform user of table start.  note that if we're looking for
    # for column name matches, we don't want to do the callback;
    # in that case $self->{req} isn't set and callback() won't
    # actually make the call.
    $self->callback( 'start', $self->{start_line} )
                     if $self->{process};
  }

  $self;
}

#pod =method match_id
#pod
#pod =cut

sub match_id
{
  my $self = shift;

  $self->{process} = 0;
  $self->{req} = undef;

  # 1. look for explicit id matches
  # 2. if no explicit id match, use header matches
  # 3. if no header matches, use DEFAULT
  # 4. if no DEFAULT, no match

  # 1. explicit id.

  my ( $skip, $req );

  ( $skip, $req ) =
    req_match_id( $self->{reqs}, $self->{id}, $self->{oids},
                  $self->{exclreqs} );

  # did we match a skip table request?
  return if $skip;

  if ( $req )
  {
    $self->match_req( $req );
    return;
  }


  # 2. header match.
  # don't set {req}, as that'll trigger callbacks and we're not sure
  # this is a match yet

  if ( grep { @{$_->{cols}} } @{$self->{reqs}})
  {
    $self->{process} = 1;
    $self->{req} = undef;
    return;
  }

  # 3. DEFAULT match
  ( $skip, $req ) =
    req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );

  # did we match a skip table request? Does this make sense for DEFAULT?
  return if $skip;

  if ( $req )
  {
    $self->match_req( $req );
    return;
  }

  # 4. out of luck. no match.
}

#pod =method req_match_id
#pod
#pod =cut

# determine if a request matches an id.  requests should
# be real objects, but until then...
sub req_match_id
{
  my ( $reqs, $id, $oids, $excluded ) = @_;

  for my $req ( @$reqs )
  {
    # if we've already excluded this request, don't bother again.
    # this is needed for id = DEFAULT passes where we've previously
    # excluded based on actual table id and should again.
    next if exists $excluded->{$req};

    # bail if this request has already matched and we're not
    # multi-matching
    next if $req->{match} && ! $req->{MultiMatch};

    for my $cmp ( @{$req->{id}} )
    {
      # is this a subroutine to call?
      if ( 'CODE' eq ref $cmp->{match} )
      {
        next unless $cmp->{match}->($id, $oids );
      }

      # regular expression
      elsif( 'Regexp' eq ref $cmp->{match} )
      {
        next unless $id =~ /$cmp->{match}/;
      }

      # a direct match?
      else
      {
        next unless $id eq $cmp->{match};
      }

      # we get here only if there was a match.

      # move on to next request if this was an explicit exclude
      # request.
      if ( $cmp->{exclude} )
      {
        $excluded->{$req}++;
        next;
      }

      # return match, plus whether this is a global skip request
      return ( $cmp->{skip}, $req );
    }
  }

  ( 0, undef );
}

#pod =method req_match_cols
#pod
#pod =cut

# determine if a request matches a column.  requests should
# be real objects, but until then...
sub req_match_cols
{
  my ( $reqs, $cols, $id, $oids ) = @_;

  for my $req ( @$reqs )
  {
    # bail if this request has already matched and we're not
    # multi-matching
    next if $req->{match} && ! $req->{MultiMatch};

    my @fix_cols = @$cols;
    fix_texts($req, \@fix_cols);

    for my $cmp ( @{$req->{cols}} )
    {
      # is this a subroutine to call?
      if ( 'CODE' eq ref $cmp->{match} )
      {
        next unless $cmp->{match}->( $id, $oids, \@fix_cols );
      }

      # regular expression
      elsif( 'Regexp' eq ref $cmp->{match} )
      {
        next unless grep { /$cmp->{match}/ } @fix_cols;
      }

      # a direct match?
      else
      {
        next unless grep { $_ eq $cmp->{match} } @fix_cols;
      }

      # we get here only if there was a match

      # move on to next request if this was an explicit exclude
      # request.
      next if $cmp->{exclude};

      # return match, plus whether this is a global skip request
      return ( $cmp->{skip}, $req );
    }

  }

  (0, undef);
}

#pod =method match_hdr
#pod
#pod =cut

# we've pulled in a header; does it match against one of the requests?
sub match_hdr
{
  my ( $self, @cols ) = @_;


  # 1. check header matches
  # 2. if no header matches, use DEFAULT id
  # 3. if no DEFAULT, no match

  # 1. check header matches
  my ( $skip, $req ) = req_match_cols( $self->{reqs}, \@cols, $self->{id},
                                       $self->{oids} );
  # did we match a skip table request?
  return 0 if $skip;

  if ( $req )
  {
    $self->match_req( $req );
    return 1;
  }


  # 2. DEFAULT match
  ( $skip, $req ) =
    req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );

  # did we match a skip table request? Does this make sense for DEFAULT?
  return 0 if $skip;

  if ( $req )
  {
    $self->match_req( $req );
    return 1;
  }

  # 3. if no DEFAULT, no match

  0;
}

#pod =method match_req
#pod
#pod =cut

sub match_req
{
  my ( $self, $req ) = @_;

  if ( $req->{class} )
  {
#    no strict 'refs';
    my $new = $req->{new};
    $self->{obj} = $req->{class}->$new( $req->{id}, $req->{udata} );
  }
  elsif ( $req->{obj} )
  {
    $self->{obj} = $req->{obj};
  }

  $self->{process} = 1;
  $self->{req} = $req;
  $self->{req}{match}++;
}


#pod =method callback
#pod
#pod =cut

# generic call back interface.  handle method calls as well as
# subroutine calls.
sub callback
{
  my $self = shift;
  my $method = shift;

  return unless
    defined $self->{req} && exists $self->{req}->{$method};

  my $req = $self->{req};
  my $call = $req->{$method};

  if ( 'CODE' eq ref $call )
  {
    $call->( $self->{id}, @_, $req->{udata} );
  }
  else
  {
    # if the object was destroyed before we get here (if it
    # was created by us and thus was destroyed before us if
    # there was an error), we can't call a method
    $self->{obj}->$call( $self->{id}, @_, $req->{udata} )
      if defined $self->{obj};
  }
}


# handle <th>

#pod =method start_header
#pod
#pod =cut

sub start_header
{
  my $self = shift;
  my ( undef, $line ) = @_;

  $self->{in_hdr}++;
  $self->{prev_hdr}++;
  $self->{hdr_line} = $line;
  $self->start_column( @_ );
}


# handle </th>

#pod =method end_header
#pod
#pod =cut

sub end_header
{
  my $self = shift;
  $self->end_column();
}

# handle <td>
#pod =method start_column
#pod
#pod =cut

sub start_column
{
  my $self = shift;
  my ( $attr, $line ) =  @_;

  # end last column if not explicitly ended. perform check here
  # to avoid extra method call
  $self->end_column() if defined $self->{col};

  # we really shouldn't be here if a row hasn't been started
  unless ( defined $self->{row} )
  {
    $self->callback( 'warn', $self->{id}, $line,
                     "<td> or <th> without <tr> at line $line\n" );
    $self->start_row( {}, $line );
  }

  # even weirder.  if the last row was a header we have to process it now,
  # rather than waiting until the end of this row, as there might be
  # a table in one of the cells in this row and if the enclosing table
  # was using a column match/re, we won't match it's header until after
  # the enclosed table is completely parsed.  this is bad, as it may
  # grab a match (if there's no multimatch) meant for the enclosing table.

  # if we're one row past the header, we're done with the header
  $self->finish_header()
    if ! $self->{in_hdr} && $self->{prev_hdr};

  $self->{col} = { attr =>  { %$attr}  };
  $self->{col}{attr}{colspan} ||= 1;
  $self->{col}{attr}{rowspan} ||= 1;
}

# handle </td>

#pod =method end_column
#pod
#pod =cut

sub end_column
{
  my $self = shift;

  return unless defined $self->{col};

  $self->{col}{text} = defined $self->{text} ? $self->{text} : '' ;

  push @{$self->{row}}, $self->{col};

  $self->{col} = undef;
  $self->{text} = undef;
}

#pod =method start_row
#pod
#pod =cut

sub start_row
{
  my $self = shift;
  my ( $attr, $line ) = @_;

  # end last row if not explicitly ended
  $self->end_row();

  $self->{row} = [];
  $self->{line} = $line;
}


#pod =method end_row
#pod
#pod =cut

sub end_row
{
  my $self = shift;

  return unless defined $self->{row};

  # perhaps an unfinished row. first finish column
  $self->end_column();

  # if we're in a header, deal with overlapping cells differently
  # then if we're in the data section
  if ( $self->{in_hdr} )
  {

    my $cn = 0;
    foreach my $col ( @{$self->{row}} )
    {
      # do this just in case there are newlines and we're concatenating
      # column names later on.  causes strange problems.  besides,
      # column names should be regular
      $col->{text} =~ s/^\s+//;
      $col->{text} =~ s/\s+$//;

      # need to find the first undefined column
      $cn++ while defined $self->{hdr}[$cn][$self->{hdr_row}];

      # note that header is stored as one array per column, not row!
      for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
      {
        $self->{hdr}[$cn] ||= [];
        $self->{hdr}[$cn][$self->{hdr_row}] = $col->{text};

        # put empty placeholders in the rest of the rows
        for ( my $rnn = 1 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
        {
          $self->{hdr}[$cn][$rnn + $self->{hdr_row}] = '';
        }
      }
    }

    $self->{hdr_row}++;
  }
  else
  {
    my $cn = 0;
    foreach my $col ( @{$self->{row}} )
    {
      # need to find the first undefined column
      $cn++ while defined $self->{data}[0][$cn];

      for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
      {
        for ( my $rnn = 0 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
        {
          $self->{data}[$rnn] ||= [];
          $self->{data}[$rnn][$cn] = $col->{text};
        }
      }
    }
  }

  # if we're one row past the header, we're done with the header
  $self->finish_header()
    if ! $self->{in_hdr} && $self->{prev_hdr};

  # output the data if we're not in a header
  $self->callback( 'row', $self->{line},
                   fix_texts( $self->{req}, shift @{$self->{data}} ) )
      unless $self->{in_hdr};

  $self->{in_hdr} = 0;
  $self->{row} = undef;
}

# collect the possible multiple header rows into one array and
# send it off

#pod =method finish_header
#pod
#pod =cut

sub finish_header
{
  my $self = shift;

  return unless $self->{hdr};

  my @header = map { join( ' ', grep { defined $_ && $_ ne '' } @{$_}) }
                        @{ $self->{hdr} };

  # if we're trying to match header columns, check that here.
  if ( defined $self->{req} )
  {
    fix_texts( $self->{req}, \@header );
    $self->callback( 'hdr',  $self->{hdr_line}, \@header );
  }

  else
  {
    if ( $self->match_hdr( @header ) )
    {
      # haven't done this callback yet...
      $self->callback( 'start', $self->{start_line} );

      fix_texts( $self->{req}, \@header );
      $self->callback( 'hdr',  $self->{hdr_line}, \@header );
    }

    # no match.  reach up to the controlling parser and turn off
    # processing of this table. this is kind of kludgy!
    else
    {
      $self->{parser}->process(0);
    }
  }


  $self->{hdr} = undef;
  $self->{prev_hdr} = undef;
  $self->{hdr_row} = 0;
}

DESTROY
{
  my $self = shift;

  # if we're actually parsing this table, do something.
  if ( $self->{process} )
  {
    # just in case
    $self->end_row();

    # just in case there's no table body
    $self->finish_header();

    $self->callback( 'end', $self->{line} );
  }
}

#pod =method fix_texts
#pod
#pod =cut

sub fix_texts
{
  my ( $req, $texts  ) = @_;

  for ( @$texts )
  {
    local $HTML::Entities::entity2char{nbsp} =
      $HTML::Entities::entity2char{nbsp};

    $HTML::Entities::entity2char{nbsp} = ' '
      if $req->{DecodeNBSP};

    chomp $_
      if $req->{Chomp};

    HTML::Entities::decode_entities( $_ )
      if $req->{Decode};


    if ( $req->{Trim} )
    {
      s/^\s+//;
      s/\s+$//;
    }
  }

  $texts;
}

#pod =method text
#pod
#pod =cut

sub text
{
  my $self = shift;

  $self->{text} = shift;
}

#pod =method id
#pod
#pod =cut

sub id  { $_[0]->{id} }

#pod =method ids
#pod
#pod =cut

sub ids { $_[0]->{ids} }


#pod =method process
#pod
#pod =cut

sub process { $_[0]->{process} }

1;

#
# This file is part of HTML-TableParser
#
# This software is Copyright (c) 2018 by Smithsonian Astrophysical Observatory.
#
# This is free software, licensed under:
#
#   The GNU General Public License, Version 3, June 2007
#

=pod

=head1 NAME

HTML::TableParser::Table - support class for HTML::TableParser

=head1 VERSION

version 0.43

=head1 DESCRIPTION

This class is used to keep track of information related to a table and
to create the information passed back to the user callbacks.  It is in
charge of marshalling the massaged header and row data to the user
callbacks.

An instance is created when the controlling TableParser class finds a
C<<table> tag.  The object is given an id based upon which table it is
to work on.  Its methods are invoked from the TableParser callbacks
when they run across an appropriate tag (C<tr>, C<th>, C<td>).  The
object is destroyed when the matching C</table> tag is found.

Since tables may be nested, multiple B<HTML::TableParser::Table>
objects may exist simultaneously.  B<HTML::TableParser> uses two
pieces of information held by this class -- ids and process.  The
first is an array of table ids, one element per level of table
nesting.  The second is a flag indicating whether this table is being
processed (i.e. it matches a requested table) or being ignored.  Since
B<HTML::TableParser> uses the ids information from an existing table
to initialize a new table, it first creates an empty sentinel (place
holder) table (by calling the B<HTML::TableParser::Table> constructor
with no arguments).

The class handles missing C</tr>, C</td>, and C</th> tags.  As such
(especially when handling multi-row headers) user callbacks may
be slightly delayed (and data cached).  It also handles rows
with overlapping columns

=head1 METHODS

=head2 new

=head2 match_id

=head2 req_match_id

=head2 req_match_cols

=head2 match_hdr

=head2 match_req

=head2 callback

=head2 start_header

=head2 end_header

=head2 start_column

=head2 end_column

=head2 start_row

=head2 end_row

=head2 finish_header

=head2 fix_texts

=head2 text

=head2 id

=head2 ids

=head2 process

=head1 BUGS

Please report any bugs or feature requests on the bugtracker website
L<https://rt.cpan.org/Public/Dist/Display.html?Name=HTML-TableParser> or by
email to
L<bug-HTML-TableParser@rt.cpan.org|mailto:bug-HTML-TableParser@rt.cpan.org>.

When submitting a bug or request, please include a test-file or a
patch to an existing test-file that illustrates the bug or desired
feature.

=head1 SOURCE

The development version is on github at L<https://github.com/djerius/html-tableparser>
and may be cloned from L<git://github.com/djerius/html-tableparser.git>

=head1 SEE ALSO

Please see those modules/websites for more information related to this module.

=over 4

=item *

L<HTML::TableParser|HTML::TableParser>

=back

=head1 AUTHOR

Diab Jerius <djerius@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018 by Smithsonian Astrophysical Observatory.

This is free software, licensed under:

  The GNU General Public License, Version 3, June 2007

=cut

__END__

#pod =head1 DESCRIPTION
#pod
#pod This class is used to keep track of information related to a table and
#pod to create the information passed back to the user callbacks.  It is in
#pod charge of marshalling the massaged header and row data to the user
#pod callbacks.
#pod
#pod An instance is created when the controlling TableParser class finds a
#pod C<<table> tag.  The object is given an id based upon which table it is
#pod to work on.  Its methods are invoked from the TableParser callbacks
#pod when they run across an appropriate tag (C<tr>, C<th>, C<td>).  The
#pod object is destroyed when the matching C</table> tag is found.
#pod
#pod Since tables may be nested, multiple B<HTML::TableParser::Table>
#pod objects may exist simultaneously.  B<HTML::TableParser> uses two
#pod pieces of information held by this class -- ids and process.  The
#pod first is an array of table ids, one element per level of table
#pod nesting.  The second is a flag indicating whether this table is being
#pod processed (i.e. it matches a requested table) or being ignored.  Since
#pod B<HTML::TableParser> uses the ids information from an existing table
#pod to initialize a new table, it first creates an empty sentinel (place
#pod holder) table (by calling the B<HTML::TableParser::Table> constructor
#pod with no arguments).
#pod
#pod The class handles missing C</tr>, C</td>, and C</th> tags.  As such
#pod (especially when handling multi-row headers) user callbacks may
#pod be slightly delayed (and data cached).  It also handles rows
#pod with overlapping columns