File: cmp.py

package info (click to toggle)
spambayes 1.0.3-1
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 2,764 kB
  • ctags: 3,166
  • sloc: python: 29,036; ansic: 195; sh: 110; lisp: 83; makefile: 76
file content (176 lines) | stat: -rwxr-xr-x 4,872 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python

"""
cmp.py sbase1 sbase2

Combines output from sbase1.txt and sbase2.txt, which are created by
rates.py from timtest.py output, and displays comparison statistics to
stdout.
"""

import sys
# The names of the two summary files to compare are the first two
# command-line arguments.  Extra arguments are ignored by the slice; with
# fewer than two, the tuple unpacking raises ValueError.
f1n, f2n = sys.argv[1:3]

# Return
#  (list of all f-p rates,
#   list of all f-n rates,
#   total f-p,
#   total f-n,
#   average f-p rate,
#   average f-n rate,
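#   list of (mean, sdev) pairs, one per ham 'all in this' distribution line,
#   list of (mean, sdev) pairs, one per spam 'all in this' distribution line,
#   (mean, sdev) pair of the ham scores for all runs,
#   (mean, sdev) pair of the spam scores for all runs,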
# )
# from summary file f.
def suck(f):
    """Parse a rates.py summary from the open file f.

    Lines are read with f.readline() up to and including the four
    trailing "total ..." / "average ..." lines.  Every line starting with
    '-> <stat> tested' is echoed to stdout as a side effect.

    Returns a 10-tuple:
        (list of all f-p rates,
         list of all f-n rates,
         total unique f-p count,
         total unique f-n count,
         average f-p rate,
         average f-n rate,
         list of (mean, sdev) ham pairs from 'all in this' lines,
         list of (mean, sdev) spam pairs from 'all in this' lines,
         (mean, sdev) of the ham scores 'for all runs',
         (mean, sdev) of the spam scores 'for all runs')
    The two 'for all runs' entries stay at (0.0, 0.0) when the file has
    no such line.

    Raises ValueError when the file ends before the summary is complete,
    or when a rate line does not hold exactly two numbers.
    """
    fps = []
    fns = []
    hamdev = []
    spamdev = []
    hamdevall = spamdevall = (0.0, 0.0)

    get = f.readline
    while 1:
        line = get()
        if not line:
            # readline() returns '' only at end of file, so the "total"
            # line that ends the per-run section never showed up.  Say so
            # instead of dying on an obscure tuple-unpacking error below.
            raise ValueError("summary file ended before the 'total' lines")
        if line.startswith('-> <stat> tested'):
            # Echo verbatim; the line still carries its own newline.
            sys.stdout.write(line)
        if ' items; mean ' in line:
            # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68
            # and later "sample " went away
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            # Third word of the leading part: 'Ham', otherwise treated
            # as spam.
            typ = vals[0].split()[2]
            if 'for all runs' in line:
                if typ == 'Ham':
                    hamdevall = val
                else:
                    spamdevall = val
            elif 'all in this' in line:
                if typ == 'Ham':
                    hamdev.append(val)
                else:
                    spamdev.append(val)
            continue
        if line.startswith('-> '):
            continue
        if line.startswith('total'):
            break
        # A line with an f-p rate and an f-n rate.
        p, n = map(float, line.split())
        fps.append(p)
        fns.append(n)

    # "total unique false pos 0"
    # "total unique false neg 0"
    # "average fp % 0.0"
    # "average fn % 0.0"
    fptot = int(_last_field(line))
    fntot = int(_last_field(get()))
    fpmean = float(_last_field(get()))
    fnmean = float(_last_field(get()))
    return (fps, fns, fptot, fntot, fpmean, fnmean,
            hamdev, spamdev, hamdevall, spamdevall)


def _last_field(text):
    """Return the last whitespace-separated field of a summary line."""
    fields = text.split()
    if not fields:
        # Empty text means readline() hit end of file (or a blank line).
        raise ValueError(
            "summary file is missing one of its total/average lines")
    return fields[-1]

def tag(p1, p2):
    """Return a fixed-width verdict string comparing p1 (old) with p2 (new).

    Equal values give "tied"; otherwise the verdict is "lost" when p2 is
    larger than p1 and "won" when it is not, followed by the signed
    percentage change relative to p1, or "+(was 0)" when p1 is zero and
    no percentage can be computed.
    """
    if p1 == p2:
        return "tied          "

    if p1 < p2:
        verdict = "lost "
    else:
        verdict = "won  "

    if not p1:
        # Cannot express a change relative to zero as a percentage.
        return verdict + " +(was 0)"

    change = (p2 - p1) * 100.0 / p1
    return verdict + " %+7.2f%%" % change

def mtag(m1, m2):
    """Format a comparison of two (mean, sdev) pairs as one text line.

    The line shows both means and their percentage change, then both
    sdevs and their percentage change.  A change relative to a zero
    first value is shown as "+(was 0)".
    """
    mean_a, dev_a = m1
    mean_b, dev_b = m2

    def change(old, new):
        # Signed percentage change from old to new; undefined when old is 0.
        if not old:
            return "+(was 0)"
        return "%+7.2f%%" % ((new - old) * 100.0 / old)

    pieces = [
        "%7.2f %7.2f " % (mean_a, mean_b),
        change(mean_a, mean_b),
        "     %7.2f %7.2f " % (dev_a, dev_b),
        change(dev_a, dev_b),
    ]
    return "".join(pieces)

def dump(p1s, p2s):
    """Print paired rates side by side, then a won/tied/lost tally.

    p1s and p2s are walked in lockstep with zip(), so entries beyond the
    shorter sequence are ignored.  Each row shows both values and the
    tag() verdict for that pair.
    """
    # Each print below takes one parenthesised string, which the Python 2
    # print statement outputs exactly like the bare form.
    verdicts = []
    for before, after in zip(p1s, p2s):
        verdict = tag(before, after)
        verdicts.append(verdict)
        print("    %5.3f  %5.3f  %s" % (before, after, verdict))
    print("")

    # Every tag() result contains exactly one of the outcome words, so
    # counting substrings in the joined text counts the outcomes.
    joined = " ".join(verdicts)
    for outcome in ("won", "tied", "lost"):
        print("%-4s %2d times" % (outcome, joined.count(outcome)))
    print("")

def dumpdev(meandev1, meandev2):
    """Print one mtag() comparison line per pair of (mean, sdev) entries.

    The two sequences are paired with zip(), so extra entries in the
    longer one are ignored.
    """
    for pair in zip(meandev1, meandev2):
        print(mtag(*pair))

def windowsfy(fn):
    """Return fn + '.txt' when that path exists, otherwise fn unchanged."""
    import os
    with_suffix = fn + '.txt'
    if not os.path.exists(with_suffix):
        return fn
    return with_suffix

def _suck_path(path):
    """Parse the summary file at path with suck(), always closing it."""
    # open() replaces the legacy file() builtin, and the explicit close
    # keeps the handle from lingering until the interpreter exits.
    stream = open(path)
    try:
        return suck(stream)
    finally:
        stream.close()


def _say(*items):
    """Print items separated by single spaces, as a print statement would."""
    # A single parenthesised string is output identically by the Python 2
    # print statement and the Python 3 print function.
    print(" ".join([str(item) for item in items]))


_say(f1n, '->', f2n)

# Prefer the ".txt" spelling of each name when such a path exists.
f1n = windowsfy(f1n)
f2n = windowsfy(f2n)

(fp1, fn1, fptot1, fntot1, fpmean1, fnmean1,
 hamdev1, spamdev1, hamdevall1, spamdevall1) = _suck_path(f1n)

(fp2, fn2, fptot2, fntot2, fpmean2, fnmean2,
 hamdev2, spamdev2, hamdevall2, spamdevall2) = _suck_path(f2n)

print("")
print("false positive percentages")
dump(fp1, fp2)
_say("total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2))
_say("mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2))

print("")
print("false negative percentages")
dump(fn1, fn2)
_say("total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2))
_say("mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2))

print("")
# The score-distribution comparison pairs entries up positionally, so it
# is only shown when both files report the same number of them.
if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2):
    print("ham mean                     ham sdev")
    dumpdev(hamdev1, hamdev2)
    print("")
    print("ham mean and sdev for all runs")
    dumpdev([hamdevall1], [hamdevall2])

    print("")
    print("spam mean                    spam sdev")
    dumpdev(spamdev1, spamdev2)
    print("")
    print("spam mean and sdev for all runs")
    dumpdev([spamdevall1], [spamdevall2])

    print("")
    # Gap between the all-runs spam mean and ham mean in each file, and
    # how much that gap changed from the first file to the second.
    diff1 = spamdevall1[0] - hamdevall1[0]
    diff2 = spamdevall2[0] - hamdevall2[0]
    print("ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1,
                                                            diff2,
                                                            diff2 - diff1))
else:
    print("[info about ham & spam means & sdevs not available in both files]")