File: extract-table.pl

package info (click to toggle)
emacspeak 29.0-9
  • links: PTS
  • area: main
  • in suites: wheezy
  • size: 12,904 kB
  • sloc: xml: 55,354; lisp: 48,335; cpp: 2,321; tcl: 1,500; makefile: 936; python: 836; sh: 785; perl: 459; ansic: 241
file content (74 lines) | stat: -rwxr-xr-x 1,648 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/perl -w
#$Id: extract-table.pl 5842 2008-08-25 18:54:14Z tv.raman.tv $
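# Accepts a URI (--url) or a local HTML file (--file) together with a table
# spec: either --depth and --count, or --headers as a comma-separated list.
# Writes the rows of the matching tables as CSV on STDOUT.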
use strict;
use FileHandle;
use LWP::UserAgent;
use HTML::TableExtract;
use IO::File;
use File::Temp qw(tempfile);
use Getopt::Long;
use vars qw (%options);
# Destinations for the command-line options; each one stays undef unless the
# corresponding switch is supplied.
my ($url, $file, $depth, $count, $cols);

# Option name => scalar that receives its value. Because this hash is handed
# to GetOptions as its first argument, a parsed option is stored through the
# reference found under its name.
my %options = (
    url     => \$url,
    file    => \$file,
    depth   => \$depth,
    count   => \$count,
    headers => \$cols,
);

# GetOptions returns false after it has reported an unknown or malformed
# option; stop here instead of carrying on with a half-parsed command line.
GetOptions(
    \%options,
    'file=s',
    'url=s',
    'depth=i',
    'count=i',
    'headers=s',
) or die "Usage: $0 (--url URL | --file FILE) [--depth N] [--count N] [--headers H1,H2,...]\n";

# $input is what gets parsed later: the --file path as given, or the open
# handle of a temporary file that receives the page fetched from --url.
# $inputname is the temporary file's path and is set only in the --url case.
my ($input, $inputname);
if (defined $file) {
    $input = $file;
} elsif (defined $url) {
    # File::Temp documents its options in upper case (SUFFIX). UNLINK asks
    # File::Temp to remove the file at program exit, so the download does not
    # linger when a later step dies before the explicit cleanup runs.
    ($input, $inputname) = tempfile(SUFFIX => '.html', UNLINK => 1);
    RetrieveURLToFile($url, $inputname);
} else {
    # Without either option there is nothing to read.
    die "$0: one of --url or --file is required\n";
}

# Build the extractor: select tables by their column headers when --headers
# was given, otherwise by the depth/count pair (either of which may be undef).
my $te;
if (defined $cols) {
    my @headers = split(',', $cols);
    $te = HTML::TableExtract->new(headers => \@headers);
} else {
    $te = HTML::TableExtract->new(depth => $depth, count => $count);
}

# HTML::Parser's parse_file returns undef, with $! set, when it is given a
# file name it cannot open; report that instead of printing nothing.
# $input is a handle rather than a name in the --url case, so the message
# uses the file names.
defined $te->parse_file($input)
    or die "Cannot parse " . (defined $file ? $file : $inputname) . ": $!\n";

# Write every row of every matched table to STDOUT, one line per row with
# the cells separated by commas.
# NOTE: cells are joined verbatim; a cell that itself contains a comma is
# not quoted or escaped.
my $output = \*STDOUT;
for my $table ($te->table_states) {
    for my $cells ($table->rows) {
        # An undefined cell is written as an empty field so that join does
        # not emit "uninitialized value" warnings under -w. (The
        # HTML::TableExtract documentation mentions undef cells for tables
        # that use rowspan/colspan.)
        $output->print(join(',', map { defined $_ ? $_ : '' } @$cells), "\n");
    }
}

# Buffered write errors only surface when the handle is closed.
$output->close() or die "Error writing output: $!\n";

# Remove the temporary download. Test $inputname rather than $url: the name
# is only set when a temporary file was actually created, and --file takes
# precedence when both options are supplied.
if (defined $inputname) {
    unlink($inputname);
}
# {{{  retrieve URL to file

# Fetch a URL with an HTTP GET request and store the response in a file.
#   $url      - address to retrieve
#   $filename - path handed to LWP::UserAgent::request as the content file
# Dies, with the HTTP status line in the message, unless the final response
# is a success.
sub RetrieveURLToFile {
    my ($url, $filename) = @_;
    my $ua  = LWP::UserAgent->new;
    my $req = HTTP::Request->new(GET => $url);

    # The second argument tells the user agent to save the content under
    # that file name instead of keeping it in the response object.
    my $res = $ua->request($req, $filename);

    # Treat anything other than success as a failure, not just responses
    # that is_error() recognises.
    unless ($res->is_success()) {
        die("Retrieval failed  for $url: " . $res->status_line);
    }
    return;
}

# }}}
1;