File: bayesian-filter.scm.in

package info (click to toggle)
scmail 1.3-4.1
  • links: PTS
  • area: main
  • in suites: bullseye, forky, sid, trixie
  • size: 592 kB
  • sloc: lisp: 1,535; sh: 178; makefile: 145
file content (145 lines) | stat: -rw-r--r-- 4,670 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
;;; -*- scheme -*-
(use gauche.test)
(use srfi-1)
(use srfi-13)
(test-start "scmail.bayesian-filter")
(use scmail.bayesian-filter)
(use scmail.config-core)
(use scmail.config)
(use scmail.mail)
(use file.util)

(test-module 'scmail.bayesian-filter)

(test* "token-table-index-of-spam" #t
       (and (= (token-table-index-of-spam #t) 1)
            (= (token-table-index-of-spam 'ja) 3)))

(test* "token-table-index-of-nonspam" #t
       (and (= (token-table-index-of-nonspam #t) 0)
            (= (token-table-index-of-nonspam 'ja) 2)))

(with-module
 scmail.config
 ;; disable choosing old files.
 (define (choose path) path))

(if (file-is-directory? "test.bayesian")
    (remove-directory* "test.bayesian"))
(scmail-config-set-directory! (build-path (current-directory)
                                          "test.bayesian"))
(scmail-config-make-directory)

(define (get-token-count)
  (with-token-table
   (scmail-config-get-path 'token-table) :read
   (lambda ()
     (vector->list (token-table-token-count)))))

(define (get-message-count)
  (with-token-table
   (scmail-config-get-path 'token-table) :read
   (lambda ()
     (vector->list (token-table-message-count)))))

(define (learn process-words)
  (let ((mail-table (list 
                     (list token-table-index-of-nonspam 
                           (token-table-index-of-nonspam #t)  "1")
                     (list token-table-index-of-nonspam
                           (token-table-index-of-nonspam #t)  "2")
                     (list token-table-index-of-nonspam 
                           (token-table-index-of-nonspam 'ja) "3")
                     (list token-table-index-of-spam 
                           (token-table-index-of-spam 'ja) "4")
                     (list token-table-index-of-spam
                           (token-table-index-of-spam #t)  "5"))))
    (for-each
     (lambda (item)
       (let* ((table (first item))
              (index (second item))
              (file  (third item)))
         (with-token-table
          (scmail-config-get-path 'token-table) :write
          (lambda ()
            (process-words (make <mail> :file file) table)
            (token-table-cache-flush)
          ))))
     mail-table)))

(test "token-table-collect-words" #t
      (lambda ()
        (learn token-table-collect-words)
        (and (every (lambda (x) (> x 0)) (get-token-count))
             (= (apply + (get-message-count)) 5))))

;; run twice
(dotimes (i 2)
         (test "token-table-discard-words" #t
               (lambda ()
                 (learn token-table-discard-words)
                 (and (every (lambda (x) (= x 0)) (get-token-count))
                      (= (apply + (get-message-count)) 0)))))

(test "with-token-table/token-table-collect-words" #t
      (lambda ()
        (learn token-table-collect-words)
        (let1 first-time (get-token-count)
              ;; repeat four times for fast learning
              (dotimes (i 4)
                       (learn token-table-collect-words)
                       (let1 v (get-token-count)
                             (unless (equal? v first-time)
                                     (errorf "total-token-count differs")))))
        #t))

(test "token-table-for-each" #t
      (lambda ()
         (with-token-table
          (scmail-config-get-path 'token-table) :read
          (lambda ()
            (token-table-for-each 
             (lambda (key value)
               (unless (string-prefix? (token-table-special-key-prefix) key)
                       (format #t "~a\t~a\n" key value))))))
         #t))

(test* "load-prob-tables" #t (load-prob-tables "foo"))

(test* "spamness-of-word" #t
       (and (> (spamness-of-word "viagra" #t) 0.9)
            (> (spamness-of-word "" 'ja) 0.9)))

(test* "spamness-of-word" #t
       (and (< (spamness-of-word "test" #t) 0.1)
            (< (spamness-of-word "ƥ" 'ja) 0.1)))

(test* "spamness-of-mail" #t
       (and
        (every (lambda (file)
                 (let1 spamness (spamness-of-mail (make <mail> :file file))
                       (< spamness 0.1)))
               (list "1" "2" "3"))))

(test* "spamness-of-mail" #t
       (and
        (every (lambda (file)
                 (let1 spamness (spamness-of-mail (make <mail> :file file))
                       (> spamness 0.9)))
               (list "4" "5"))))

(test* "mail-is-spam?" #t
       (and 
        (not (mail-is-spam? "1"))
        (not (mail-is-spam? "2"))
        (not (mail-is-spam? "3"))))

(test* "mail-is-spam?" #t
       (and 
        (mail-is-spam? (make <mail> :file "4"))
        (mail-is-spam? (make <mail> :file "5"))))

;; convert-database

(test-end)