File: libdspam_objects.h

package info (click to toggle)
dspam 3.10.1+dfsg-11
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 6,656 kB
  • sloc: ansic: 26,034; sh: 12,546; perl: 5,469; makefile: 690; sql: 379
file content (398 lines) | stat: -rw-r--r-- 12,013 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
/* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */

/*
 DSPAM
 COPYRIGHT (C) 2002-2011 DSPAM PROJECT

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef _LIBDSPAM_OBJECTS_H
#  define _LIBDSPAM_OBJECTS_H

#ifdef HAVE_CONFIG_H
#include <auto-config.h>
#endif

#include <time.h>
#include "config.h"
#include "config_shared.h"
#include "decode.h"

#if ((defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SUNPRO_C))) && !defined(u_int32_t) && !defined(__BIT_TYPES_DEFINED__)
#define __BIT_TYPES_DEFINED__
typedef unsigned long long u_int64_t;
typedef unsigned int u_int32_t;
typedef unsigned short u_int16_t;
typedef unsigned char u_int8_t;
#endif

#ifdef _WIN32
typedef unsigned int u_int32_t;
typedef u_int32_t uid_t;
#endif

extern void *_drv_handle; /* Handle to storage driver library */

/*
 *  struct dspam_factor - A single determining factor
 *
 *  An element containing a determining factor in the dominant calculation of
 *  a message.  An array of these are returned to the calling  application to
 *  explain libdspam's final classification decision.
 */

struct dspam_factor {
  char *token_name;
  float value;
};

/*
 *  struct _ds_spam_totals - User spam totals
 *
 *  Spam totals loaded into the user's filter context upon a call to 
 *  dspam_init().  This structure represents the user's cumulative statistics.
 *
 *  spam_learned, innocent_learned
 *    The total number of messages trained on.
 *
 *  spam_misclassified, innocent_misclassified
 *    The total number of messages that were misclassified by DSPAM, and
 *    submitted for retraining.
 *
 *  spam_classified, innocent_classified
 *    The total number of messages that were classified by DSPAM, but not
 *    learned.  Used exclusively with Train-on-Error mode.
 *
 *  spam_corpusfed, innocent_corpusfed
 *    The total number of messages supplied by the end-user for training.
 *
 *  NOTE: The ordering  of the variables  in the  structure must remain
 *        consistent to ensure backward-compatibility with some storage
 *        drivers (such as the Berkeley DB drivers)
 */

struct _ds_spam_totals
{
  long spam_learned;
  long innocent_learned;
  long spam_misclassified;
  long innocent_misclassified;
  long spam_corpusfed;
  long innocent_corpusfed;
  long spam_classified;
  long innocent_classified;
};

/*
 *  struct _ds_spam_stat - Statistics for a single token:
 *
 *  probability
 *    The calculated probability of the token based on the active pvalue 
 *    algorithm (selected at configure-time).
 *
 *  spam_hits, innocent_hits
 *    The total  number of times the token has appeared in each class  of 
 *    message. If Train-on-Error or Train-until-Mature training modes are
 *    employed,  these values will not  necessarily be updated for  every 
 *    message.
 *
 *  status
 *    TST_DISK	Value was loaded from the storage interface
 *    TST_DIRTY	Statistic is dirty (not written to disk since last modified)
 */

typedef struct _ds_spam_stat
{
  double probability;
  long spam_hits;
  long innocent_hits;
  char status;
  unsigned long offset;
} *ds_spam_stat_t;

/*
 *  struct _ds_spam_signature - A historical classification signature
 *
 *  A binary representation of the original training instance.  The spam
 *  signature  contains all the  metadata used  in the original decision
 *  about the  message, so  that a 1:1 retraining  can take place if the 
 *  message  is submitted for  retraining (e.g. was  misclassified). The
 *  signature contains a series of _ds_signature_token structures, which
 *  house the  original set of tokens used and their frequency counts in 
 *  the message.  A spam signature is a temporary  piece of data that is 
 *  usually purged from disk after a short period of time.
 */

struct _ds_spam_signature
{
  void *data;
  unsigned long length;
};

/*
 *  struct _ds_signature_token - An entry in the classification signature
 *
 *  A signature token is a single entry in the binary _ds_spam_signature 
 *  data  blob,  representing  a single  data point  from  the  original 
 *  training instance.
 *
 *  token
 *    The checksum of the original token in the message
 *
 *  frequency
 *    The token's frequency in the original message
 */

struct _ds_signature_token
{
  unsigned long long token;
  unsigned char frequency;
};

/* 
 *  struct _ds_config - libdspam attributes configuration
 *
 *  Each  classification context may have an attributes  configuration
 *  which  is read by various  components of libdspam.  This structure
 *  contains an array of attributes and the size of the array.
 */

struct _ds_config
{
  config_t attributes;
  long size;
};

/*
 *  DSPAM_CTX - The DSPAM Classification Context
 *
 *  A classification context is attached directly to a filter instance 
 *  and supplies the entire context for the filter instance to operate
 *  under.  This  includes  the  user  and group,  operational  flags, 
 *  training  mode, and  the message  being  operated  on. The  filter
 *  instance also  sets specific output variables  within the  context
 *  such  as the  result of a  classification,  confidence  level, and
 *  etcetera.
 *
 *  username, group (input)
 *    The current username and group that is being operated on. 
 *
 *  totals (output)
 *    The set of statistics loaded when dspam_init() is called.
 *
 *  signature (input, output)
 *    The signature represents a DSPAM signature, and can be  supplied
 *    as  an input  variable for  retraining  (e.g. in the  event of a 
 *    misclassification)  or  used as  an output  variable  to store a 
 *    signature  generated   by  the  filter  instance  during  normal 
 *    classification.
 *
 *  message (input)
 *    The  message being operated on, post-actualization. This can  be 
 *    left NULL, and libdspam will automatically actualize the message
 *
 *  probability (output)
 *    The probability of the resulting operation.  This is generally a
 *    floating  point number  between 0 and  1, 1  being  the  highest 
 *    probability of high order classification.
 *
 *  result (output)
 *    The  final result of the requested operation.  This is generally 
 *    either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED.
 *
 *  confidence (output)
 *    The  confidence  that the  filter has  in  its  returned  result. 
 *    NOTE: Confidence is not always supported, and may be zero.
 *
 *  operating_mode (input)
 *    Sets the operating mode of the filter instance.  This can be one
 *    of the following:
 *
 *	DSM_PROCESS	Classify and learn the  supplied message using 
 *			whatever training mode is specified 
 *
 *	DSM_CLASSIFY	Classify the  supplied  message  only; do  not 
 *                      learn or update any counters.
 *
 *	DSM_TOOLS	Identifies that  the calling function is  from
 *			a utility, and no operation will be requested.
 *
 *  training_mode (input)
 *    The training mode sets the type of training the filter  instance 
 *    should apply to the process. This can be one of:
 *
 *	DST_TEFT		Train-on-Everything
 *				Trains every single message  processed
 *
 *	DST_TOE			Train-on-Error
 *				Trains only on a misclassification  or
 *                              corpus-fed message.
 *
 *	DST_TUM			Train-until-Mature
 *				Trains individual tokens based on  the
 *				maturity of the user's dictionary
 *
 *      DST_NOTRAIN		No Training
 *                              Process the message but do not perform 
 *                              any training.
 *  training_buffer (input)
 *    Sets the amount  of training-loop buffering.  This  number is  a 
 *    range from 0-10  and changes  the amount of  token sedation used 
 *    during the training loop.  The higher the number, the more token 
 *    statistics are watered down  during initial  training to prevent 
 *    false  positives.  Setting  this  value to  zero results  in  no 
 *    sedation being performed.
 *
 *  flags (input)
 *    Applies different fine-tuning behavior to the context:
 *
 *	DSF_NOISE		Apply Bayesian Noise Reduction logic
 *	DSF_SIGNATURE		Signature is provided/requested
 *      DSF_WHITELIST		Use automatic whitelisting logic
 *      DSF_MERGED		Merge user/group data in memory
 *      DSF_UNLEARN		Unlearn the message
 *      DSF_BIAS		Assign processor bias to unknown tokens
 *
 *  tokenizer (input)
 *    Specifies which tokenizer to use
 *
 *      DSZ_WORD		Use WORD (uniGram) tokenizer
 *      DSZ_CHAIN		Use CHAIN (biGram) tokenizer
 *      DSZ_SBPH		Use SBPH (Sparse Binary Polynomial Hashing) tokenizer
 *      DSZ_OSB			Use OSB (Orthogonal Sparse biGram) tokenizer
 *
 *  algorithms (input)
 *    Optional API to override the default algorithms. This value is set
 *    with the default compiled values whenever dspam_create() is called.
 *
 *	DSA_GRAHAM		Graham-Bayesian
 *	DSA_BURTON		Burton-Bayesian
 *	DSA_ROBINSON		Robinson's Geometric Mean Test
 *	DSA_CHI_SQUARE		Fisher-Robinson's Chi-Square
 *      DSA_NAIVE		Naive-Bayesian
 *
 *    P-Value Computations:
 *
 *      DSP_ROBINSON		Robinson's Technique
 *      DSP_GRAHAM		Graham's Technique
 *      DSP_MARKOV		Markov Weighted Technique
 *
 *  locked (output)
 *    Identifies that the user's storage is presently locked
 */

typedef struct
{
  struct _ds_spam_totals	totals;
  struct _ds_spam_signature *	signature;
  struct _ds_message *		message;
  struct _ds_config *		config;

  char		*username;
  char		*group;
  char		*home;		 /* DSPAM Home */
  int		operating_mode;  /* DSM_ */
  int		training_mode;   /* DST_ */
  int		training_buffer; /* 0-10 */
  int		wh_threshold;    /* Whitelisting Threshold (default 10) */
  int		classification;  /* DSR_ */
  int		source;		 /* DSS_ */
  int		learned;	 /* Did we actually learn something? */
  int           tokenizer;       /* DSZ_ */
  u_int32_t	flags;
  u_int32_t	algorithms;

  int		result;
  char		class[32];
  float		probability;
  float		confidence;

  int		locked;
  void *	storage;
  time_t	_process_start;
  int		_sig_provided;

  struct nt *	factors;

} DSPAM_CTX;

/* Processing Flags */

#define DSF_SIGNATURE		0x02
#define DSF_BIAS		0x04
#define DSF_NOISE		0x08
#define DSF_WHITELIST		0x10
#define DSF_MERGED		0x20
#define DSF_UNLEARN		0x80

/* Tokenizers */

#define DSZ_WORD		0x01
#define DSZ_CHAIN		0x02
#define DSZ_SBPH		0x03
#define DSZ_OSB			0x04

/* Algorithms */

#define DSA_GRAHAM		0x01
#define DSA_BURTON		0x02
#define DSA_ROBINSON		0x04
#define DSA_CHI_SQUARE		0x08
#define DSP_ROBINSON		0x10
#define DSP_GRAHAM		0x20
#define DSP_MARKOV		0x40
#define DSA_NAIVE		0x80

/* Operating Modes */

#define DSM_PROCESS             0x00
#define DSM_TOOLS		0x01
#define DSM_CLASSIFY		0x02
#define DSM_NONE		0xFF

/* Training Modes */

#define DST_TEFT		0x00
#define DST_TOE			0x01
#define DST_TUM			0x02
#define DST_NOTRAIN		0xFE

/* Classification Results */

#define	DSR_ISSPAM		0x01
#define DSR_ISINNOCENT		0x02
#define DSR_NONE		0xFF

/* Classification Sources */

#define DSS_ERROR       0x00 /* Retraining an error */
#define DSS_CORPUS      0x01 /* Training a message from corpus */
#define DSS_INOCULATION 0x02 /* Message is an inoculation */
#define DSS_NONE	0xFF /* Standard inbound processing */

/* Statuses for token-status bit */
#define TST_DISK	0x01
#define TST_DIRTY	0x02

/* Token Types */
#define DTT_DEFAULT	0x00
#define DTT_BNR		0x01

#define DSP_UNCALCULATED	-1

#define BURTON_WINDOW_SIZE	27

#endif /* _LIBDSPAM_OBJECTS */