#!/usr/bin/perl -w
use strict;
#
# Version 1.0.1 12-Feb-2002
# Written by David Adams <d.j.adams@soton.ac.uk>
#
# Uses pdftotext & pdfinfo utilities from the xpdf package
# to read an Adobe Acrobat file and produce HTML output.
#
# Can be called directly from htdig as an external converter,
# or may be called by doc2html.pl converter script.
#
####--- Configuration ---####
# Full paths of the pdftotext and pdfinfo programs
# (get them from the xpdf package at http://www.foolabs.com/xpdf/):
#### YOU MUST SET THESE ####
# pdftotext produces the document text; the script dies at start-up if this
# path is not executable.
my $PDFTOTEXT = "/... .../pdftotext";
# pdfinfo supplies the title, subject and keywords; if it cannot be run the
# script only warns and carries on.
my $PDFINFO = "/... .../pdfinfo";
#
# De-hyphenation option (only affects end-of-line hyphens).
# Any true value joins a word split by a hyphen at the end of a line with
# its continuation on the following line:
my $Dehyphenate = 1;
#
# Set title to be used when none is found
# (neither in the pdfinfo output nor from a document name argument):
my $Default_title = "Adobe Acrobat Document";
#
# Make portable to win32 platform or unix.
# $null is the target to which the error output of pdfinfo is redirected:
my $null = "/dev/null";
if ($^O eq "MSWin32") {$null = "nul";}
####--- End of configuration ---###
# Main program: validate the environment and the command line, then write
# the HTML head and body to STDOUT.
#
# Arguments:
#   $ARGV[0]  file to convert (required)
#   $ARGV[1]  MIME type (optional; if given it must start "application/pdf")
#   $ARGV[2]  URL or name of the document (optional; used for the title
#             when the PDF itself has none)

# Refuse to run at all if the text extractor is not executable.
if (! -x $PDFTOTEXT) { die "Unable to execute pdftotext" }

# Test with defined/length rather than plain truth, so that a file
# literally named "0" is accepted instead of producing the usage message.
my $Input = $ARGV[0];
if (!defined $Input or !length $Input) {
    die "Usage: pdf2html.pl filename [mime-type] [URL]";
}

my $MIME_type = $ARGV[1] || '';
if ($MIME_type and ($MIME_type !~ m#^application/pdf#i)) {
    die "MIME/type $MIME_type wrong";
}

my $Name = defined $ARGV[2] ? $ARGV[2] : '';

# Keep only the last path component.  The substitution succeeds only when
# there was a leading path to strip, i.e. the third argument looked like a
# URL; only then are %XX escapes decoded.  Testing the result of the
# substitution directly avoids depending on whatever $1 happens to hold.
if ($Name =~ s#^.*/##) {
    $Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
}

# Called with explicit parentheses so that @_ is not silently forwarded.
pdf_head();
pdf_body();
exit;
#------------------------------------------------------------------------------
sub pdf_head {
#
# Contributed by Greg Holmes and Michael Fuller
# (any errors by David Adams)
#
# Runs pdfinfo on $Input and prints the opening <HTML> tag and a complete
# <HEAD> section to STDOUT: a <TITLE>, plus DESCRIPTION and KEYWORDS <META>
# tags when pdfinfo reports a subject or keywords.  When pdfinfo gives no
# title, the document name ($Name) in square brackets is used, or
# $Default_title if there is no name.  Failure to start pdfinfo produces a
# warning only.  Takes no arguments.
#
    my %meta = (title => '', subject => '', keywords => '');

    # The file name is placed inside single quotes in a shell command, so
    # an apostrophe in the name has to be escaped or it would end the
    # quoting early.  Left untouched on Win32, where the command line is
    # not interpreted by a POSIX shell.
    my $quoted_input = $Input;
    $quoted_input =~ s/'/'\\''/g if $^O ne 'MSWin32';

    if (open(my $info, "$PDFINFO '$quoted_input' 2>$null |")) {
        while (my $line = <$info>) {
            # Strip the field label and keep the cleaned-up value;
            # all other pdfinfo lines are ignored.
            if ($line =~ s/^(title|subject|keywords):\s*//i) {
                $meta{lc $1} = clean_pdf($line);
            }
        }
        close $info;
    } else { warn "cannot execute pdfinfo" }

    my $title = $meta{title};
    if (not length $title) {
        # length, not truth: a document named "0" is still a name.
        if (length $Name) {
            $title = '[' . $Name . ']';
        } else {
            $title = $Default_title;
        }
    }

    # The values come from the PDF or from the URL, so markup characters
    # must be escaped before they are written into the HTML.
    my $escape = sub {
        my $text = shift;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    print "<HTML>\n<HEAD>\n";
    print "<TITLE>" . $escape->($title) . "</TITLE>\n";
    if (length $meta{subject}) {
        print '<META NAME="DESCRIPTION" CONTENT="'
            . $escape->($meta{subject}) . "\">\n";
    }
    if (length $meta{keywords}) {
        print '<META NAME="KEYWORDS" CONTENT="'
            . $escape->($meta{keywords}) . "\">\n";
    }
    print "</HEAD>\n";
###print STDERR "\n$Name:\n";
###print STDERR "\tTitle:\t$title\n";
###print STDERR "\tDescription:\t$meta{subject}\n";
###print STDERR "\tKeywords:\t$meta{keywords}\n";
    return;
}
#------------------------------------------------------------------------------
sub pdf_body {
#
# Runs "pdftotext -raw" on $Input and prints the text to STDOUT as the
# <BODY> of the HTML document, followed by the closing </HTML> tag.
# Consecutive non-empty lines are separated by <br>; one or more empty
# lines become a single <p>.  Dies if the pipe to pdftotext cannot be
# opened.  Takes no arguments.
#
    my $bline = '';     # separator to print before the next non-empty line

    # The file name is placed inside single quotes in a shell command, so
    # an apostrophe in the name has to be escaped or it would end the
    # quoting early.  Left untouched on Win32, where the command line is
    # not interpreted by a POSIX shell.
    my $quoted_input = $Input;
    $quoted_input =~ s/'/'\\''/g if $^O ne 'MSWin32';

    open(my $cat, "$PDFTOTEXT -raw '$quoted_input' - |") ||
        die "$PDFTOTEXT doesn't want to be opened using pipe\n";
    print "<BODY>\n";
    while (my $line = <$cat>) {
        # De-hyphenation: while the text ends in letter + hyphen, pull in
        # the next line and join the two halves of the word.  End of file
        # is tested BEFORE reading, so that undef is never appended and a
        # word continued on the very last line is still joined.
        while ($Dehyphenate
               && $line =~ m/[A-Za-z\300-\377]-\s*$/
               && !eof($cat)) {
            $line .= <$cat>;
            $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
        }
        $line =~ s/\255/-/g;            # replace dashes with hyphens
        # replace bell, backspace, tab. etc. with single space:
        $line =~ s/[\000-\040]+/ /g;
        $line = HTML($line);
        if (length $line) {
            print $bline, $line, "\n";
            $bline = "<br>\n";
        } else {
            # Empty line: start a new paragraph before the next text.
            $bline = "<p>\n";
        }
    }
    close $cat;
    print "</BODY>\n</HTML>\n";
    return;
}
#------------------------------------------------------------------------------
sub HTML {
#
# Prepares a piece of plain text for inclusion in HTML.
#   Argument: the text (undef is treated as an empty string).
#   Returns:  the text with every run of white space (including form
#             feeds and newlines) reduced to one space, trailing space
#             removed, and the characters & < > replaced by the entities
#             &amp; &lt; &gt;.
#
    my $text = shift;
    return '' unless defined $text;
    $text =~ s/\s+/ /g;     # form feeds, newlines, multiple spaces -> one space
    $text =~ s/ \z//;       # remove trailing space
    # The ampersand must be escaped first, otherwise the ampersands of the
    # entities produced for < and > would be escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
#------------------------------------------------------------------------------
sub clean_pdf {
#
# Tidies one value taken from the pdfinfo output.
#   Argument: the raw value.
#   Returns:  the value with one trailing newline removed, every
#             occurrence of the byte pair \376\377 deleted, and each
#             double quote turned into a single quote.
#
    my ($value) = @_;
    chomp $value;
    $value =~ s/\376\377//g;    # odd pair of characters pdfinfo may emit
    $value =~ tr/"/'/;          # double quotes -> single quotes
    return $value;
}
#