File: FuzzyOcr.cf

package info (click to toggle)
fuzzyocr 3.6.0-9
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 684 kB
  • ctags: 154
  • sloc: perl: 3,150; sh: 45; makefile: 42
file content (382 lines) | stat: -rw-r--r-- 11,541 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
# Syntax:
# loadplugin <Plugin_Name> <Location>
#  <Location> path where Plugin resides.

# Load the FuzzyOcr plugin. No <Location> argument is given here.
# NOTE(review): presumably the module is then resolved through Perl's
# module search path -- confirm against the SpamAssassin documentation.
loadplugin FuzzyOcr

# Conditional section guarded by the FuzzyOcr plugin; it is closed by the
# "endif" at the very end of this file.
ifplugin FuzzyOcr

# Rule definitions. FUZZY_OCR is evaluated through the plugin's
# fuzzyocr_check() eval function; the remaining rules all point at
# dummy_check().
# NOTE(review): presumably the dummy_check() rules only exist so the plugin
# can report these hits itself -- confirm in the plugin source.
body     FUZZY_OCR                   eval:fuzzyocr_check()
body     FUZZY_OCR_WRONG_CTYPE       eval:dummy_check()
body     FUZZY_OCR_CORRUPT_IMG       eval:dummy_check()
body     FUZZY_OCR_WRONG_EXTENSION   eval:dummy_check()
body     FUZZY_OCR_KNOWN_HASH        eval:dummy_check()

# Human-readable descriptions, one per rule defined above.
describe FUZZY_OCR                   Mail contains an image with common spam text inside
describe FUZZY_OCR_WRONG_CTYPE       Mail contains an image with wrong content-type set
describe FUZZY_OCR_WRONG_EXTENSION   Mail contains an image with wrong file extension
describe FUZZY_OCR_CORRUPT_IMG       Mail contains a corrupted image
describe FUZZY_OCR_KNOWN_HASH        Mail contains an image with known hash

# Assign priority 900 to the main OCR rule.
# NOTE(review): presumably this makes FUZZY_OCR run after most other rules,
# which the focr_autodisable_* options further below would rely on -- confirm.
priority FUZZY_OCR 900

###
### Plugin Configuration 
###

###
### Logging options
###

# Verbosity level (see manual)
# Level 0 - Errors only
# Level 1 - Errors and Warnings
# Level 2 - Errors, Warnings and Info Messages
# Level 3 - Full debug output
# Default value: 1
#focr_verbose 3

# Log Message-Id, From, To
# Default: 1
#focr_log_pmsinfo 0

# Send logging output to stderr.
# Default value: 1
#focr_log_stderr 0

# Logfile (make sure it is writable by the plugin) 
# Default value: none
#focr_logfile /tmp/FuzzyOcr.log

###
### Wordlists 
###

# Here we define the words to scan for
# Default value: /etc/spamassassin/FuzzyOcr.words
#focr_global_wordlist /etc/spamassassin/FuzzyOcr.words
#
# This is the path RELATIVE to the respective home directory
# for the personalized list. This list is merged with the global 
# word list on execution.
# Default value: ~/.spamassassin/fuzzyocr.words 
# If value begins with '/', it is treated as fixed path.
#focr_personal_wordlist fuzzyocr.words
#
# This option allows you to disable the whole personalization stuff,
# i.e. FuzzyOcr will not call functions in SA that require home
# directories for your users. This is only required if you are running
# an environment where the users don't have home directories at all.
# Default value: 0
#
#focr_no_homedirs 1
#
## Optionally, disable this option if you want to scan for numbers
## Setting this to 0 will cause FuzzyOcr not to strip numbers from
## both the wordlist and the OCR results
#
#focr_strip_numbers 1


###
### Helper Applications
###

# These parameters can be used to change other detection settings
# If you leave these commented out, the defaults will be used.
# Do not use " " around any parameters!

###
### Step 1:
### Inform the plugin which helper apps are required.
###

# The following are already included by default:
#
#focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm
#focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist
#focr_bin_helper gocr, ocrad

# Include additional scanner/preprocessor commands here:
#
# Register additional image preprocessing helpers on top of the defaults.
focr_bin_helper pnmnorm, pnminvert,  ppmtopgm
# Not available in Debian: pamthreshold, pamtopnm
# Register the tesseract helper as well.
focr_bin_helper tesseract

# These helpers must be defined before enabling PDF scanning
#focr_bin_helper pdfinfo, pdftops, pstopnm

###
### Step 2:
### Inform the plugin of the search path to find all helper apps.
### Only the first match will be considered, so the order is important.
###

# Search path for locating helper applications
#focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin

###
### Step 3:
### You can optionally define a helper application location, bypassing
### the search path algorithm. Please note that if the helper app is not
### previously defined, it will generate an error:

#focr_bin_gifsicle /usr/bin/gifsicle
#focr_bin_giffix /usr/bin/giffix
#focr_bin_giftext /usr/bin/giftext
#focr_bin_gifinter /usr/bin/gifinter
#focr_bin_giftopnm /usr/bin/giftopnm
#focr_bin_jpegtopnm /usr/bin/jpegtopnm
#focr_bin_pngtopnm /usr/bin/pngtopnm
#focr_bin_bmptopnm /usr/bin/bmptopnm
#focr_bin_tifftopnm /usr/bin/tifftopnm
#focr_bin_ppmhist /usr/bin/ppmhist
#focr_bin_gocr /usr/bin/gocr
#focr_bin_ocrad /usr/bin/ocrad

#focr_bin_pnmnorm /usr/bin/pnmnorm
#focr_bin_pnminvert /usr/bin/pnminvert

#focr_bin_pdfinfo /usr/bin/pdfinfo
#focr_bin_pdftops /usr/bin/pdftops
#focr_bin_pstopnm /usr/bin/pstopnm

###
### Scansets 
###

# Paths to the files containing Scansets and Preprocessors definitions
#
#focr_preprocessor_file /etc/spamassassin/FuzzyOcr.preps
#focr_scanset_file /etc/spamassassin/FuzzyOcr.scansets

# Setting this to 1 will cause FuzzyOcr to skip all other scansets,
# if a scanset has reached the amount of hits specified in 
# focr_counts_required. (i.e. if the image is detected as spam).
# This saves resources, but lowers the scores, because the first scanset
# that reaches the required hits is taken as the result, not the best one.
# Default value: 1
#focr_minimal_scanset 0

# This option is only used when focr_minimal_scanset is enabled. 
# Basically, this counts the effectiveness of a scanset on the current 
# mail traffic and re-sorts the scansets with the most effective first.
# This saves unnecessary scanner passes and saves resources. 
# Default value: 1.
#focr_autosort_scanset 0

# This is a parameter for the focr_autosort_scanset function, and specifies
# the maximum value of the effectiveness counter used in each scanset. If you
# increase this, it will take longer until the autosort function adapts to new
# types of spam, setting it too low will lower the effectiveness of the 
# function. 
# Default value: 10
#focr_autosort_buffer 10

###
### Scan Settings
###

# Timeout for the plugin, in seconds. (Maximum runtime of the plugin)
# Default value: 10
#focr_timeout 15

# Use a global timeout value instead of per helper application.
# Default value: 0
#focr_global_timeout 1

# Minimum image size to scan. Images with dimensions smaller than the
# ones specified here will be skipped:
# (This parameter does not apply to PDF files)
# Default: Height:4 Width:4
#
#focr_min_height 4
#focr_min_width 4

# Maximum image size to scan. Images with dimensions bigger than the
# ones specified here will be skipped:
# (This parameter does not apply to PDF files)
# Default: Height:800 Width:800
#
#focr_max_height 800
#focr_max_width 800


# Maximum file size for different formats in bytes; bigger pictures
# will not be scanned.
# Default values: Unlimited
#focr_max_size_gif 80000
#focr_max_size_jpeg 100000
#focr_max_size_png 80000
#focr_max_size_bmp 500000
#focr_max_size_tiff 500000

# Skip checking the following image types 
# Default value: 0 (check image type)
#focr_skip_gif 1
#focr_skip_jpeg 1
#focr_skip_png 1
#focr_skip_bmp 1
#focr_skip_tiff 1
#

# PDF specific options
# WARNING: Enable this at your own risk, this might lead to false positives and classify
#          important documents as spam. YOU HAVE BEEN WARNED.
#focr_scan_pdfs 0
# PDFs having more pages than this value will be skipped
#focr_pdf_maxpages 1

# Default detection threshold (see manual)
# Default value: 0.25 (Can be changed on a per word basis in the wordlist).
#focr_threshold 0.20

# Number of minimum matches before the rule scores (Default value: 2)
#focr_counts_required 3

# Setting this will cause every word to be matched only once per image (Default value: 0)
#focr_unique_matches 1

# This is the score for a hit after focr_counts_required matches
# Default value: 5
#focr_base_score 5

# This is the additional score for every additional match after 
# focr_counts_required matches
# Default value: 1
#focr_add_score 0.375

# This option defines the factor, which is multiplied with the number
# of matches, that were made without stripping spaces. FuzzyOcr does two
# matching attempts on OCR results, one without space stripping and one with.
# To weight the first match type more, this factor is applied.
# Default value: 1.5
#focr_twopass_scoring_factor 1.5

# This is the score to give for a wrong content-type.
# e.g. JPEG image but content type says GIF
# Default value: 1.5
#focr_wrongctype_score 1.5

# This is the score to give for a wrong file extension.
# e.g. JPEG image but file extension says GIF
# Default value: 1.5
#focr_wrongext_score 1.5

# This is the score to give for a corrupted image.
# This currently affects only GIF images
# Default value: 2.5
#focr_corrupt_score 2.5

# This is the score to give for a corrupted unfixable image.
# This currently affects only GIF images.
# Default value: 5
#focr_corrupt_unfixable_score 5

# This is used to disable the OCR engine if the message already has
# more points than this value
# Default value: 10
#focr_autodisable_score 30

# This is used to disable the OCR engine if the message already has
# fewer points than this value
# Default value: -5
#focr_autodisable_negative_score -5


###
### Hashing Options (Optional)
###

# Select which type of image hashing to use:
# Default value: 0 (disabled)
# Allowed values:
#  1 ... use digest_hash only (deprecated)
#  2 ... use digest_db w/digest_hash import (see requirements, recommended)
#  3 ... use mysql database (see requirements, experimental)
#--
# The score is saved with the hash in the database, allowing the plugin to
# skip the scans when the image is found in the database, using the score
# from the previous scans.
#--
#focr_enable_image_hashing 3

# Set this to skip updating the hashing database at startup
# Default value: 0 (update at startup)
#focr_skip_updates 1

# Automatically add hashes of spam images recognized by OCR to the Image 
# Hash database, to disable, set to 0
# Default value: 1 (learn)
#focr_hashing_learn_scanned 1

# Score images whose global word count is below focr_counts_required using
# the following formula: (focr_add_score * word count) as score.
# Default value: 0 (ignore images)
#focr_score_ham 1

# If the image hash database feature is enabled (Type 1 Hashing),
# specify the file to use as database
# Default value: /etc/spamassassin/FuzzyOcr.hashdb
#focr_digest_db /etc/spamassassin/FuzzyOcr.hashdb

# If the image hash db feature is enabled (Type 2 Hashing),
# specify the file to use as the SPAM database
# Default value: /etc/spamassassin/FuzzyOcr.db
#focr_db_hash /etc/spamassassin/FuzzyOcr.db

# If the image hash db feature is enabled (Type 2 Hashing), 
# specify the file to use as the HAM database
# Default value: /etc/spamassassin/FuzzyOcr.safe.db
#focr_db_safe /etc/spamassassin/FuzzyOcr.safe.db

# Auto-prune: Expire records from hashing databases after this many days
# Default value: 35
#focr_db_max_days 15

###
### MySQL options (Type 3 Hashing)
###

#focr_mysql_db FuzzyOcr
#focr_mysql_hash Hash
#focr_mysql_safe Safe
#focr_mysql_user fuzzyocr
#focr_mysql_pass fuzzyocr
#focr_mysql_host localhost
#focr_mysql_port 3306
#focr_mysql_socket /tmp/mysql.sock

# If set, the database table is updated with different data from one of
# the following:
#  + filename, 
#  + image-params,
#  + content-type, 
#  + file-type, 
#  + score, 
#  + word-info
# Default value: 0
#focr_mysql_update_hash 1

###
### Miscellaneous Options
###

# The plugin uses a temporary directory to store intermediate information.
# In order to keep these files for debugging purposes use any of these
# values:
#  0 = always cleanup (default value)
#  1 = keep only if error
#  2 = always keep
#--
# Keeping these intermediate files could fill your HDD _very_ fast!
# Make sure you periodically empty your temp dir (usually: /tmp) or
# suffer the consequences.  You've been warned!!
#--
#focr_keep_bad_images 1

#################################################################
# DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES
focr_end_config

# Closes the "ifplugin FuzzyOcr" conditional opened near the top of this file.
endif