1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
|
#!/usr/bin/perl
use strict;
use warnings;
use QtCore4;
use QtGui4;
# Analyse one HTML file with Qt::XmlStreamReader and print a short report:
# the text of its <title> element, the number of <p> start elements, the
# number of <a> start elements, and the href values of the first few links.
#
# Parameters:
#   $fileName - name of the file to open and analyse.
#
# Output goes to the currently selected print handle. When the file cannot be
# opened, or when the reader reports an error after parsing, a message is
# printed instead of the report and the sub returns early.
#
# Callers should not rely on the return value.
sub parseHtmlFile {
    my ($fileName) = @_;

    # Upper bound on how many link targets are listed in the report.
    my $maxLinksShown = 5;

    my $file = Qt::File($fileName);
    print 'Analysis of HTML file: ' . $fileName . "\n";
    if (!$file->open(Qt::IODevice::ReadOnly())) {
        print ' Couldn\'t open the file.' . "\n" . "\n" . "\n";
        return;
    }

# [0]
    my $reader = Qt::XmlStreamReader($file);
# [0]

# [1]
    my $paragraphCount = 0;
    my @links;

    # Start from an empty string: a document without a <title> element would
    # otherwise leave this undef and make the report below emit an
    # "uninitialized value" warning.
    my $title = '';

    while (!$reader->atEnd()) {
        $reader->readNext();
        next if !$reader->isStartElement();

        # Fetch the element name once instead of once per comparison.
        my $elementName = $reader->name()->toString();

        if ($elementName eq 'title') {
            $title = $reader->readElementText();
        }
        elsif ($elementName eq 'a') {
            push @links, $reader->attributes()->value('href')->toString();
        }
        elsif ($elementName eq 'p') {
            ++$paragraphCount;
        }
    }
# [1]

    # Nothing more is read from the file past this point, so release it
    # explicitly rather than waiting for the object to be destroyed.
    $file->close();

# [2]
    if ($reader->hasError()) {
        print ' The HTML file isn\'t well-formed: ' . $reader->errorString()
            . "\n" . "\n" . "\n";
        return;
    }
# [2]

    print ' Title: \'' . $title . '\'' . "\n"
        . ' Number of paragraphs: ' . $paragraphCount . "\n"
        . ' Number of links: ' . scalar(@links) . "\n"
        . ' Showing first few links:' . "\n";

    # List at most $maxLinksShown links. When @links is empty, $#links is -1
    # and the range below is empty, so nothing is printed.
    my $lastShown = $#links < $maxLinksShown - 1 ? $#links : $maxLinksShown - 1;
    foreach my $link (@links[0 .. $lastShown]) {
        print ' ' . $link . "\n";
    }

    print "\n" . "\n";
}
# Script entry point: analyse every *.htm / *.html file found in the current
# directory by passing each one to parseHtmlFile().
#
# Returns 1 when the directory listing is an empty array, 0 otherwise; the
# value is used as the process exit status.
sub main
{
    # Initialize the QtCore application. The object is kept in a lexical for
    # the lifetime of this sub even though it is not referenced again here.
    my $app = Qt::CoreApplication(\@ARGV);

    # Name patterns selecting the HTML files in the current directory.
    my @patterns = ('*.htm', '*.html');
    my $htmlFiles = Qt::Dir::current()->entryList(\@patterns, Qt::Dir::Files());

    # Only an array reference with zero entries counts as "no files".
    my $isEmptyList = ref($htmlFiles) eq 'ARRAY' && @{$htmlFiles} == 0;
    if ($isEmptyList) {
        print 'No html files available.';
        return 1;
    }

    # Analyse each file in turn; the report is printed by parseHtmlFile().
    foreach my $htmlFile (@{$htmlFiles}) {
        parseHtmlFile($htmlFile);
    }

    return 0;
}

# Run the script and hand main()'s result to the shell as the exit status.
exit main();
|