File: tokenizer.h

package info (click to toggle)
dspam 3.10.1+dfsg-11
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 6,656 kB
  • sloc: ansic: 26,034; sh: 12,546; perl: 5,469; makefile: 690; sql: 379
file content (102 lines) | stat: -rw-r--r-- 2,248 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/* $Id: tokenizer.h,v 1.10 2011/06/28 00:13:48 sbajic Exp $ */

/*
 DSPAM
 COPYRIGHT (C) 2002-2011 DSPAM PROJECT

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
 License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef _TOKENIZER_H
#  define _TOKENIZER_H

#include "diction.h"
#include "nodetree.h"
#include "error.h"
#include "storage_driver.h"
#include "decode.h"

#define SPARSE_WINDOW_SIZE       5

int _ds_tokenize(
  DSPAM_CTX * CTX,
   char *headers,
   char *body,
   ds_diction_t diction);

int _ds_tokenize_sparse(
  DSPAM_CTX * CTX, 
  char *headers, 
  char *body, 
  ds_diction_t diction);

int _ds_tokenize_ngram(
  DSPAM_CTX * CTX,
  char *headers,
  char *body,
  ds_diction_t diction);

/* _ds_process: ngram token generation routines */

int _ds_process_header_token(
  DSPAM_CTX * CTX,
  char *joined_token,
  const char *previous_token,
  ds_diction_t diction,
  const char *heading);

int _ds_process_body_token(
  DSPAM_CTX * CTX,
  char *joined_token,
  const char *previous_token,
  ds_diction_t diction); 

/* _ds_map: sparse token generation routines */

int _ds_map_header_token(
  DSPAM_CTX * CTX,
  char *token,
  char **previous_tokens,
  ds_diction_t diction,
  const char *heading,
  const char *bitpattern);

int _ds_map_body_token(
  DSPAM_CTX * CTX,
  char *token,
  char **previous_tokens,
  ds_diction_t diction,
  const char *bitpattern);

int _ds_degenerate_message(
  DSPAM_CTX *CTX,
  buffer *header,
  buffer *body);

int _ds_url_tokenize(
  ds_diction_t diction,
  char *body,
  const char *key);

void _ds_sparse_clear
  (char **previous_tokens);

char * _ds_truncate_token
  (const char *token);

char *_ds_generate_bitpattern
  (int breadth);

#endif