1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
|
#!/usr/bin/perl -w
use strict;
# This is a short example that basically does the same
# thing as the default file system access method by
# recursing directories, but also shows how to process different
# file types -- in this example pdf is converted to xml for indexing.
# in this example, only .pdf and .config files are indexed.
# the pdf2xml module is in the prog-bin directory of the swish-e distribution
use lib '../prog-bin';
use File::Find; # for recursing a directory tree
use pdf2xml; # example module for pdf to xml conversion
# Note that you need "IndexContents XML .pdf" in the
# swish-e config file
# See perldoc File::Find for information on following symbolic links
use constant DEBUG => 0;
# See if a directory was passed in via the SwishProgParameters swish
# directive
# Root of the tree to walk: the first command-line argument (where the
# SwishProgParameters value arrives), or the current directory when no
# argument is given.
my $dir = shift(@ARGV) || '.';

# With no_chdir set, File::Find does not chdir into each directory, so
# inside wanted() $_ holds the same path as $File::Find::name.
my %find_options = (
    no_chdir => 1,
    wanted   => \&wanted,
);

find( \%find_options, $dir );
# File::Find callback, invoked once per entry found under the start
# directory.  Prints an indexable document on STDOUT for *.pdf files
# (via pdf2xml) and *.config files (via get_content); every other entry
# is skipped.  Progress messages go to STDERR only when DEBUG is true.
sub wanted {
    # Directories produce no output of their own.
    return if -d $_;

    my $file = $File::Find::name;

    if ( $_ =~ /\.pdf$/ ) {
        print STDERR "Indexing pdf $file\n" if DEBUG;
        my $doc_ref = pdf2xml($file);
        print ${$doc_ref};
        return;
    }

    if ( $_ =~ /\.config$/ ) {
        print STDERR "Indexing $file\n" if DEBUG;
        my $doc_ref = get_content($file);
        print ${$doc_ref};
        return;
    }

    print STDERR "Skipping $file\n" if DEBUG;
    return;
}
# Build one document for swish-e's "prog" input method: a header block
# (Content-Length, Last-Mtime, Path-Name), a blank separator line, then
# the raw bytes of the file.
#
#   $path   - path of the file to read
#
# Returns a reference to a scalar holding the complete document.
# Dies with "$path: <OS error>" if the file cannot be opened, stat'ed
# or closed.
sub get_content {
    my $path = shift;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and no global handle is left open.
    open my $fh, '<', $path or die "$path: $!";

    # Read raw bytes so the length reported below is the length emitted.
    binmode $fh;

    # stat the open handle so the mtime describes the file being read.
    my $mtime = ( stat $fh )[9];
    die "$path: $!" unless defined $mtime;

    # Slurp the whole file; $/ is only changed inside this do-block.
    my $body = do { local $/; <$fh> };
    $body = '' unless defined $body;

    close $fh or die "$path: $!";

    # Content-Length is measured from the bytes actually read, so it
    # cannot disagree with the body if the file changed after the stat.
    my $size = length $body;

    # The empty line before the terminator separates headers from body.
    my $content = <<"EOF";
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $path

EOF
    $content .= $body;
    return \$content;
}
|