1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
|
#!/usr/bin/perl -T
use lib '.'; use lib 't';
use SATest; sa_t_init("extracttext");
use Mail::SpamAssassin::Util;
use Test::More;
# Turn a tool name into the command text used on an extracttext_external
# line.  The name is looked up with
# Mail::SpamAssassin::Util::find_executable_in_env_path(); a result that
# contains whitespace is wrapped in double quotes, anything else is passed
# through untouched.  The eval swallows any exception from the lookup, so
# a failure here never aborts the script.
# NOTE(review): presumably the lookup yields a false value when the tool is
# not on PATH -- confirm against Mail::SpamAssassin::Util.
sub _extracttext_tool_cmd {
  my ($tool) = @_;
  return eval {
    my $path = Mail::SpamAssassin::Util::find_executable_in_env_path($tool);
    ($path =~ /\s/) ? '"'.$path.'"' : $path;
  };
}

# Each constant is used twice further down: in boolean context to decide
# whether a section runs, and as text spliced into the plugin preferences.
use constant PDFTOTEXT => _extracttext_tool_cmd('pdftotext');
use constant TESSERACT => _extracttext_tool_cmd('tesseract');
use constant CAT       => _extracttext_tool_cmd('cat');
# Planned test count: the pdftotext section contributes 2 (it performs two
# sarun/ok_all_patterns rounds), the tesseract and cat sections 1 each.
my $tests = ((PDFTOTEXT) ? 2 : 0)
          + ((TESSERACT) ? 1 : 0)
          + ((CAT)       ? 1 : 0);

# 4 is the total when every tool is available; a non-zero count below that
# means at least one section will be skipped, so mention it in the output.
diag("some binaries missing, not running all tests\n")
  if $tests && $tests < 4;

if ($tests) {
  plan tests => $tests;
}
else {
  # Nothing to run at all: skip the whole file.
  plan skip_all => "no needed binaries found, pdftotext, tesseract, or cat";
}
# Pattern table shared by every section below; it is copied into %patterns
# when a GTUBE hit is expected and into %anti_patterns when it must not
# appear.  Kept as a package variable, exactly as before.
# NOTE(review): presumably "output text => label" as consumed by SATest's
# patterns_run_cb/ok_all_patterns -- confirm against SATest.
%patterns_gtube = (' 1000 GTUBE ' => 'gtube');
if (PDFTOTEXT) {
  # Both rounds scan the same message with the same extractor command;
  # only the extracttext_use selector differs.
  my $sa_args = "-L -t < data/spam/extracttext/gtube_pdf.eml";

  # Each case is [ selector, expect GTUBE hit? ]:
  #   .pdf -> GTUBE must be reported
  #   .FOO -> GTUBE must NOT be reported (the "should fail" case;
  #           presumably because the attachment no longer matches the
  #           selector -- confirm against the ExtractText plugin)
  for my $case ([ '.pdf', 1 ], [ '.FOO', 0 ]) {
    my ($selector, $expect_hit) = @$case;

    # Leading and trailing '' reproduce the newline that opens and closes
    # the preference text.
    tstprefs(join("\n",
      '',
      'extracttext_external pdftotext '.PDFTOTEXT.' -nopgbrk -layout -enc UTF-8 {} -',
      "extracttext_use pdftotext $selector",
      'extracttext_timeout 30 40',
      '',
    ));

    %patterns      = $expect_hit ? %patterns_gtube : ();
    %anti_patterns = $expect_hit ? () : %patterns_gtube;

    sarun($sa_args, \&patterns_run_cb);
    ok_all_patterns();
    clear_pattern_counters();
  }
}
if (TESSERACT) {
  # Preference lines for running tesseract on image attachments, selected
  # either by file extension or by the MIME-type pattern.
  # NOTE(review): the timeout pair is "30 1" here but "30 40" in the
  # pdftotext section -- confirm the second value is intended to be 1.
  my @pref_lines = (
    'extracttext_external tesseract {OMP_THREAD_LIMIT=1} '.TESSERACT.' -c page_separator= {} -',
    'extracttext_use tesseract .jpg .png .bmp .tif .tiff image/(?:jpeg|png|x-ms-bmp|tiff)',
    'extracttext_timeout 30 1',
  );
  # Leading and trailing '' reproduce the newline that opens and closes
  # the preference text.
  tstprefs(join("\n", '', @pref_lines, ''));

  # GTUBE is expected in the output for the PNG sample message.
  %patterns      = %patterns_gtube;
  %anti_patterns = ();

  sarun("-L -t < data/spam/extracttext/gtube_png.eml", \&patterns_run_cb);
  ok_all_patterns();
  clear_pattern_counters();
}
if (CAT) {
  # Preference lines that use plain cat as the "extractor" for .txt
  # attachments.
  my @pref_lines = (
    'extracttext_external cat '.CAT.' {}',
    'extracttext_use cat .txt',
    'extracttext_timeout 30 1',
  );
  # Leading and trailing '' reproduce the newline that opens and closes
  # the preference text.
  tstprefs(join("\n", '', @pref_lines, ''));

  # GTUBE is expected in the output for the gtube_b64_oct sample message.
  %patterns      = %patterns_gtube;
  %anti_patterns = ();

  sarun("-L -t < data/spam/extracttext/gtube_b64_oct.eml", \&patterns_run_cb);
  ok_all_patterns();
  clear_pattern_counters();
}
|