1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
|
#!/usr/local/bin/perl
# 1998/12/10
# Added: push @allwords, $fields[$x]; <carl@dpiwe.tas.gov.au>
# Replaced: matching patterns. they match words starting or ending with ()[]'`;:?.,! now, not when in between!
# Gone: the variable $line is gone (using $_ now)
#
# 1998/12/11
# Added: catdoc test (is catdoc runnable?) <carl@dpiwe.tas.gov.au>
# Changed: push line semi-colomn wrong. <carl@dpiwe.tas.gov.au>
# Changed: matching works for end of lines now <carl@dpiwe.tas.gov.au>
# Added: option to rigorously delete all punctuation <carl@dpiwe.tas.gov.au>
#
# 1999/02/09
# Added: option to delete all hyphens <grdetil@scrc.umanitoba.ca>
# Added: uses ps2ascii to handle PS files <grdetil@scrc.umanitoba.ca>
# 1999/02/15
# Added: check for some file formats <Frank.Richter@hrz.tu-chemnitz.de>
# 1999/02/25
# Added: uses pdftotext to handle PDF files <grdetil@scrc.umanitoba.ca>
# Changed: generates a head record with punct. <grdetil@scrc.umanitoba.ca>
# 1999/03/01
# Added: extra checks for file "wrappers" <grdetil@scrc.umanitoba.ca>
# & check for MS Word signature (no longer defaults to catdoc)
# 1999/03/05
# Changed: rejoin hyphenated words across lines <grdetil@scrc.umanitoba.ca>
# (in PDFs) & remove multiple punct. chars. between words (all)
# 1999/03/10
# Changed: fix handling of minimum word length <grdetil@scrc.umanitoba.ca>
# 1999/08/12
# Changed: adapted for xpdf 0.90 release <grdetil@scrc.umanitoba.ca>
# Added: uses pdfinfo to handle PDF titles <grdetil@scrc.umanitoba.ca>
# Changed: keep hyphens by default, as htdig <grdetil@scrc.umanitoba.ca>
# does, but change dashes to hyphens
# 1999/09/09
# Changed: fix to handle empty PDF title right <grdetil@scrc.umanitoba.ca>
# 2000/01/12
# Changed: "break" to "last" (no break in Perl) <wjones@tc.fluke.com>
# Changed: code for parsing a line into a list of
# words, to use "split", other streamlining.
# 2001/07/12
# Changed: fix "last" handling in dehyphenation <grdetil@scrc.umanitoba.ca>
# Added: handle %xx codes in title from URL <grdetil@scrc.umanitoba.ca>
# 2003/06/07
# Changed: allow file names with spaces <lha@users.sourceforge.net>
#########################################
#
# set this to your MS Word to text converter
# get it from: http://www.fe.msk.ru/~vitus/catdoc/
#
$CATDOC = "/usr/local/bin/catdoc";
#
# set this to your WordPerfect to text converter, or /bin/true if none available
# this nabs WP documents with .doc suffix, so catdoc doesn't see them
#
$CATWP = "/bin/true";
#
# set this to your RTF to text converter, or /bin/true if none available
# this nabs RTF documents with .doc suffix, so catdoc doesn't see them
#
$CATRTF = "/bin/true";
#
# set this to your PostScript to text converter
# get it from the ghostscript 3.33 (or later) package
#
$CATPS = "/usr/bin/ps2ascii";
#
# set this to your PDF to text converter, and pdfinfo tool
# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/
#
$CATPDF = "/usr/bin/pdftotext";
$PDFINFO = "/usr/bin/pdfinfo";
#$CATPDF = "/usr/local/bin/pdftotext";
#$PDFINFO = "/usr/local/bin/pdfinfo";
# need some var's
$minimum_word_length = 3;
$head = "";
@allwords = ();
@temp = ();
$x = 0;
#@fields = ();
$calc = 0;
$dehyphenate = 0;
$title = "";
#
# okay. my programming style isn't that nice, but it works...
#for ($x=0; $x<@ARGV; $x++) { # print out the args
# print STDERR "$ARGV[$x]\n";
#}
# Read first bytes of file to check for file type (like file(1) does)
open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
read FILE,$magic,8;
close FILE;
if ($magic =~ /^\0\n/) { # possible MacBinary header
open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
read FILE,$magic,136; # let's hope parsers can handle them!
close FILE;
}
if ($magic =~ /%!|^\033%-12345/) { # it's PostScript (or HP print job)
$parser = $CATPS; # gs 3.33 leaves _temp_.??? files in .
$parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" |";
# keep quiet even if PS gives errors...
# $parsecmd = "(cd /tmp; $parser; rm -f _temp_.???) < \"$ARGV[0]\" 2>/dev/null |";
$type = "PostScript";
$dehyphenate = 0; # ps2ascii already does this
if ($magic =~ /^\033%-12345/) { # HP print job
open(FILE, "< $ARGV[0]") || die "Oops. Can't open file $ARGV[0]: $!\n";
read FILE,$magic,256;
close FILE;
exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/
}
} elsif ($magic =~ /%PDF-/) { # it's PDF (Acrobat)
$parser = $CATPDF;
$parsecmd = "$parser -raw \"$ARGV[0]\" - |";
# to handle single-column, strangely laid out PDFs, use coalescing feature...
# $parsecmd = "$parser \"$ARGV[0]\" - |";
$type = "PDF";
$dehyphenate = 1; # PDFs often have hyphenated lines
if (open(INFO, "$PDFINFO \"$ARGV[0]\" 2>/dev/null |")) {
while (<INFO>) {
if (/^Title:/) {
$title = $_;
$title =~ s/^Title:\s+//;
$title =~ s/\s+$//;
$title =~ s/\s+/ /g;
$title =~ s/&/\&\;/g;
$title =~ s/</\<\;/g;
$title =~ s/>/\>\;/g;
last;
}
}
close INFO;
}
} elsif ($magic =~ /WPC/) { # it's WordPerfect
$parser = $CATWP;
$parsecmd = "$parser \"$ARGV[0]\" |";
$type = "WordPerfect";
$dehyphenate = 0; # WP documents not likely hyphenated
} elsif ($magic =~ /^{\\rtf/) { # it's Richtext
$parser = $CATRTF;
$parsecmd = "$parser \"$ARGV[0]\" |";
$type = "RTF";
$dehyphenate = 0; # RTF documents not likely hyphenated
} elsif ($magic =~ /\320\317\021\340/) { # it's MS Word
$parser = $CATDOC;
$parsecmd = "$parser -a -w \"$ARGV[0]\" |";
$type = "Word";
$dehyphenate = 0; # Word documents not likely hyphenated
} else {
die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n";
}
# print STDERR "$ARGV[0]: $type $parsecmd\n";
die "Hmm. $parser is absent or unwilling to execute.\n" unless -x $parser;
# open it
open(CAT, "$parsecmd") || die "Hmmm. $parser doesn't want to be opened using pipe.\n";
while (<CAT>) {
while (/[A-Za-z\300-\377]-\s*$/ && $dehyphenate) {
$_ .= <CAT>;
last if eof;
s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s
}
$head .= " " . $_;
# s/\s+[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+\s+|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]+$/ /g; # replace reading-chars with space (only at end or begin of word, but allow multiple characters)
## s/\s[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]\s|^[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]|[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]$/ /g; # replace reading-chars with space (only at end or begin of word)
## s/[\(\)\[\]\\\/\^\;\:\"\'\`\.\,\?!\*]/ /g; # rigorously replace all by <carl@dpiwe.tas.gov.au>
## s/[\-\255]/ /g; # replace hyphens with space
# s/[\255]/-/g; # replace dashes with hyphens
# @fields = split; # split up line
# next if (@fields == 0); # skip if no fields (does it speed up?)
# for ($x=0; $x<@fields; $x++) { # check each field if string length >= 3
# if (length($fields[$x]) >= $minimum_word_length) {
# push @allwords, $fields[$x]; # add to list
# }
# }
# Delete valid punctuation. These are the default values
# for valid_punctuation, and should be changed other values
# are specified in the config file.
tr{-\255._/!#$%^&'}{}d;
push @allwords, grep { length >= $minimum_word_length } split /\W+/;
}
close CAT;
exit unless @allwords > 0; # nothing to output
#############################################
# print out the title, if it's set, and not just a file name
if ($title !~ /^$/ && $title !~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) {
print "t\t$title\n";
} else { # otherwise generate a title
@temp = split(/\//, $ARGV[2]); # get the filename, get rid of basename
$temp[-1] =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
print "t\t$type Document $temp[-1]\n"; # print it
}
#############################################
# print out the head
$head =~ s/^\s+//; # remove leading and trailing space
$head =~ s/\s+$//;
$head =~ s/\s+/ /g;
$head =~ s/&/\&\;/g;
$head =~ s/</\<\;/g;
$head =~ s/>/\>\;/g;
print "h\t$head\n";
#$calc = @allwords;
#print "h\t";
##if ($calc >100) { # but not more than 100 words
## $calc = 100;
##}
#for ($x=0; $x<$calc; $x++) { # print out the words for the exerpt
# print "$allwords[$x] ";
#}
#print "\n";
#############################################
# now the words
#for ($x=0; $x<@allwords; $x++) {
# $calc=int(1000*$x/@allwords); # calculate rel. position (0-1000)
# print "w\t$allwords[$x]\t$calc\t0\n"; # print out word, rel. pos. and text type (0)
#}
$x = 0;
for ( @allwords ) {
# print out word, rel. pos. and text type (0)
printf "w\t%s\t%d\t0\n", $_, 1000*$x++/@allwords;
}
$calc=@allwords;
# print STDERR "# of words indexed: $calc\n";
|