File: extract-table.pl

package info (click to toggle)
emacspeak 24-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 9,092 kB
  • ctags: 4,597
  • sloc: xml: 54,699; lisp: 42,103; tcl: 1,799; makefile: 810; cpp: 603; sh: 566; ansic: 153; perl: 124
file content (74 lines) | stat: -rwxr-xr-x 1,716 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/perl -w
#$Id: extract-table.pl,v 24.0 2006/05/03 02:54:03 raman Exp $
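# Accepts a URI (--url) or a local HTML file (--file) plus a table spec
# (--headers, or --depth and --count) and writes the matching tables
# to /tmp/<task>.csv as comma-separated rows.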
use strict;
use FileHandle;
use LWP::UserAgent;
use HTML::TableExtract;
use IO::File;
use Getopt::Long;
use vars qw (%options);
# Command-line state.  Each scalar is filled in by GetOptions below.
#   $url    page to download when no local file is given
#   $file   local HTML file to parse instead of downloading
#   $task   base name used for the /tmp work files
#   $depth  table nesting depth handed to HTML::TableExtract
#   $count  table index at that depth handed to HTML::TableExtract
#   $cols   comma-separated header names used to pick the table
my ($url, $file, $task, $depth, $count, $cols);

# The hash values are references: Getopt::Long uses a reference stored
# under an option's name as the destination for that option, so parsed
# values land in the scalars above while the hash keeps holding refs.
my %options = (task    => \$task,
               url     => \$url,
               file    => \$file,
               depth   => \$depth,
               count   => \$count,
               headers => \$cols);
GetOptions (\%options,
            'file=s',
            'url=s',
            'task=s',
            'depth=i',
            'count=i',
            'headers=s');
$task ||= "extract-table";

# Without either source there is nothing to parse; stop with a clear
# message instead of attempting to fetch an undefined URL.
unless (defined ($file) || defined ($url)) {
  die "extract-table: one of --url or --file is required\n";
}

my $input;
if (defined ($file)) {
  $input = $file;
} else {
  # Use $task here, not $options{task}: the hash entry is a scalar
  # reference and would interpolate as "SCALAR(0x...)".
  $input = "/tmp/$task.html";
  RetrieveURLToFile($url, $input);
}

# Build the extractor.  When header names were supplied they select the
# table; otherwise the depth/count pair is passed through as given.
my $te;
if ( defined ($cols)) {
  my @headers = split(',', $cols);
  $te = HTML::TableExtract->new(headers => \@headers);
} else {
  $te = HTML::TableExtract->new(depth => $depth, count => $count);
}

# parse_file returns undef when the file cannot be opened; report that
# rather than silently producing an empty CSV.
defined ($te->parse_file($input))
  or die "extract-table: cannot read $input: $!\n";

# Write every row of every matched table as one comma-joined line.
my $csv = "/tmp/$task.csv";
my $output = FileHandle->new($csv, "w")
  or die "extract-table: cannot write $csv: $!\n";
foreach my $ts ($te->table_states) {
  foreach my $row ($ts->rows) {
    # Empty cells come back undefined; print them as empty fields so
    # join does not emit "uninitialized value" warnings under -w.
    my @cells = map { defined ($_) ? $_ : '' } @$row;
    $output->print ( join(',', @cells), "\n");
  }
}

# Buffered write errors only surface at close time.
$output->close()
  or die "extract-table: error closing $csv: $!\n";

# Remove the HTML only when this script downloaded it.  If --file was
# given, $input is the caller's own file and must be left in place even
# when --url was passed as well.
if (defined ($url) && !defined ($file)) {
  unlink ($input);
}
# {{{  retrieve URL to file

# Download $url with LWP and store the response body in $filename.
#
# Arguments:
#   $url       address to fetch with an HTTP GET
#   $filename  path handed to LWP as the content file for the response
#
# Returns a true value after logging the successful retrieval.
# Dies with the URL and the HTTP status line when the final response is
# not a success, so the caller never goes on to parse a file that was
# not written.
sub RetrieveURLToFile {
  my ($url, $filename) = @_;
  my $ua = LWP::UserAgent->new;
  # Create a request
  my $req = HTTP::Request->new('GET' => $url);
  # Pass request to the user agent and get a response back; the second
  # argument asks LWP to save the content into that file.
  my $res = $ua->request($req, $filename);
  unless ($res->is_success()) {
    # exit() takes a numeric status, so the old exit("...") printed no
    # message and reported status 0; die prints the message and exits
    # non-zero.
    die "Retrieval for $url failed: " . $res->status_line() . "\n";
  }
  warn "table: Retrieved $url to $filename\n";
  return 1;
}

# }}}
1;