File: incremental.py

package info (click to toggle)
spambayes 1.0.3-1
links: PTS
area: main
in suites: sarge
size: 2,764 kB
ctags: 3,166
sloc: python: 29,036; ansic: 195; sh: 110; lisp: 83; makefile: 76
file content (426 lines) | stat: -rwxr-xr-x 13,670 bytes
parent folder | download | duplicates (3)
"""incremental.py

This is a test harness for doing testing of incremental
training regimes.  The individual regimes used should
be specified in regime.py.

Options:
  -h  --help         Display this message.
  -r [regime]        Use this regime (default: perfect).
  -s [number]        Run only this set.
"""

###
### This is a test harness for testing incremental
### training regimes.  The individual regimes used should
### be specified in regimes.py (imported below as `regimes`);
### see the perfect and corrected classes for examples.
###

import getopt
import glob
import os
import sys

# Make the current directory and its parent importable so the project
# modules imported below can be found when running from a checkout.
# Note that index -1 inserts each path just *before* the last existing
# sys.path entry rather than appending it at the very end.
sys.path.insert(-1, os.getcwd())
sys.path.insert(-1, os.path.dirname(os.getcwd()))

from spambayes.Options import options
from spambayes import classifier
from spambayes import msgs
import email
from email import Message
import regimes

# Probe for the boolean built-ins; on interpreters that lack them the
# lookup raises NameError and integer stand-ins are defined instead.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

class Test:
    """Drive a classifier through train/untrain/predict cycles and keep score.

    The classifier passed in must provide ``learn(example, is_spam)``,
    ``unlearn(example, is_spam)`` and ``spamprob(example)``.  All counters
    are plain instance attributes (``nham_tested``, ``nspam_wrong``, ...)
    and are zeroed by reset_test_results().
    """

    # Pass a classifier instance (an instance of Bayes).
    # Loop:
    #     # Train the classifier with new ham and spam.
    #     train(ham, spam) # this implies reset_test_results
    #     Loop:
    #         Optional:
    #             # Possibly fiddle the classifier.
    #             set_classifier()
    #             # Forget messages the classifier was trained on.
    #             untrain(ham, spam) # this implies reset_test_results
    #         Optional:
    #             reset_test_results()
    #         # Predict against (presumably new) examples.
    #         predict(ham, spam)
    #         Optional:
    #             suck out the results, via instance vrbls and
    #             false_negative_rate(), false_positive_rate(),
    #             false_negatives(), and false_positives()
    #
    # NOTE(review): the "implies reset_test_results" remarks above do not
    # match train()/untrain() below, which never reset the counters.

    def __init__(self, classifier):
        """Bind the tester to *classifier* and zero every counter."""
        self.set_classifier(classifier)
        self.reset_test_results()

    def set_classifier(self, classifier):
        """Tell the tester which classifier to use."""
        self.classifier = classifier

    def reset_test_results(self):
        """Zero all counters and forget every recorded misprediction."""
        # The number of ham and spam instances tested.
        self.nham_tested = self.nspam_tested = 0

        # The number of ham and spam instances trained.
        self.nham_trained = self.nspam_trained = 0

        # The number of test instances correctly and incorrectly classified.
        self.nham_right = 0
        self.nham_wrong = 0
        self.nham_unsure = 0
        self.nspam_right = 0
        self.nspam_wrong = 0
        self.nspam_unsure = 0

        # Lists of bad predictions.
        self.ham_wrong_examples = []    # False positives:  ham called spam.
        self.spam_wrong_examples = []   # False negatives:  spam called ham.
        self.unsure_examples = []       # ham and spam in middle ground

    def train(self, hamstream=None, spamstream=None):
        """Train the classifier on streams of ham and spam.

        Either stream may be None, in which case it is skipped.  The
        matching ``n*_trained`` counter grows by one per example.
        """
        learn = self.classifier.learn
        if hamstream is not None:
            for example in hamstream:
                learn(example, False)
                self.nham_trained += 1
        if spamstream is not None:
            for example in spamstream:
                learn(example, True)
                self.nspam_trained += 1

    def untrain(self, hamstream=None, spamstream=None):
        """Untrain the classifier on streams of ham and spam.

        Mirror image of train(): each example is unlearned and the
        matching ``n*_trained`` counter shrinks by one.
        """
        unlearn = self.classifier.unlearn
        if hamstream is not None:
            for example in hamstream:
                unlearn(example, False)
                self.nham_trained -= 1
        if spamstream is not None:
            for example in spamstream:
                unlearn(example, True)
                self.nspam_trained -= 1

    # Run prediction on each sample in stream.  You're swearing that stream
    # is entirely composed of spam (is_spam True), or of ham (is_spam False).
    # Note that mispredictions are saved, and can be retrieved later via
    # false_negatives (spam mistakenly called ham) and false_positives (ham
    # mistakenly called spam).  For this reason, you may wish to wrap examples
    # in a little class that identifies the example in a useful way, and whose
    # __iter__ produces a token stream for the classifier.
    #
    def predict(self, stream, is_spam):
        """Score every example in *stream* and update the result counters.

        An example is guessed ham when its probability is below the
        ``ham_cutoff`` option, guessed spam when it is at or above the
        ``spam_cutoff`` option, and counted as unsure otherwise.

        Returns a ``(num, prob)`` pair describing the LAST example seen:
        ``num`` is 1 for a ham guess, -1 for a spam guess and 0 for unsure;
        ``prob`` is the raw probability.  For an empty stream nothing is
        counted and ``(0, None)`` is returned.
        """
        guess = self.classifier.spamprob

        # Defaults for an empty stream; without them the code after the
        # loop would reference names that were never bound.
        prob = None
        is_ham_guessed = is_spam_guessed = False

        for example in stream:
            prob = guess(example)
            is_ham_guessed  = prob <  options["Categorization", "ham_cutoff"]
            is_spam_guessed = prob >= options["Categorization", "spam_cutoff"]
            if is_spam:
                self.nspam_tested += 1
                if is_spam_guessed:
                    self.nspam_right += 1
                elif is_ham_guessed:
                    self.nspam_wrong += 1
                    self.spam_wrong_examples.append(example)
                else:
                    self.nspam_unsure += 1
                    self.unsure_examples.append(example)
            else:
                self.nham_tested += 1
                if is_ham_guessed:
                    self.nham_right += 1
                elif is_spam_guessed:
                    self.nham_wrong += 1
                    self.ham_wrong_examples.append(example)
                else:
                    self.nham_unsure += 1
                    self.unsure_examples.append(example)

        # Internal bookkeeping invariant: every tested example landed in
        # exactly one of the right/wrong/unsure counters.
        assert (self.nham_right + self.nham_wrong + self.nham_unsure ==
                self.nham_tested)
        assert (self.nspam_right + self.nspam_wrong + self.nspam_unsure ==
                self.nspam_tested)

        # Two independent checks, spam last: should both flags ever be set
        # (cutoffs configured to overlap), the spam verdict wins.
        num = 0
        if is_ham_guessed:
            num = 1
        if is_spam_guessed:
            num = -1
        return (num, prob)

    def false_positive_rate(self):
        """Percentage of ham mistakenly identified as spam, in 0.0..100.0."""
        return self.nham_wrong * 1e2 / (self.nham_tested or 1)

    def false_negative_rate(self):
        """Percentage of spam mistakenly identified as ham, in 0.0..100.0."""
        return self.nspam_wrong * 1e2 / (self.nspam_tested or 1)

    def unsure_rate(self):
        """Percentage of all tested examples left unsure, in 0.0..100.0."""
        return ((self.nham_unsure + self.nspam_unsure) * 1e2 /
                ((self.nham_tested + self.nspam_tested) or 1))

    def false_positives(self):
        """Return the list of ham examples that were called spam."""
        return self.ham_wrong_examples

    def false_negatives(self):
        """Return the list of spam examples that were called ham."""
        return self.spam_wrong_examples

    def unsures(self):
        """Return the list of examples (ham or spam) left unsure."""
        return self.unsure_examples

class _Example:
    """Named wrapper around a token sequence, as used by the doctest text."""

    def __init__(self, name, words):
        # Keep both values as given; nothing is copied or validated.
        self.name, self.words = name, words

    def __iter__(self):
        """Return a fresh iterator over the wrapped tokens."""
        tokens = self.words
        return iter(tokens)

# Doctest text exercising Test with three tiny hand-made examples; it is
# published to the doctest machinery through the __test__ mapping below.
#
# NOTE(review): Test.predict() in this file returns a (num, prob) tuple,
# so the bare ``t.predict(...)`` examples would echo a value that the
# expected output does not list.  _test() runs the doctests of the Tester
# module rather than this one -- confirm whether this text is still meant
# to pass against this file.
_easy_test = """
    >>> from spambayes.classifier import Bayes
    >>> from spambayes.Options import options
    >>> options["Categorization", "ham_cutoff"] = options["Categorization", "spam_cutoff"] = 0.5

    >>> good1 = _Example('', ['a', 'b', 'c'])
    >>> good2 = _Example('', ['a', 'b'])
    >>> bad1 = _Example('', ['c', 'd'])

    >>> t = Test(Bayes())
    >>> t.train([good1, good2], [bad1])
    >>> t.predict([_Example('goodham', ['a', 'b']),
    ...            _Example('badham', ['d'])    # FP
    ...           ], False)
    >>> t.predict([_Example('goodspam', ['d']),
    ...            _Example('badspam1', ['a']), # FN
    ...            _Example('badspam2', ['a', 'b']),    # FN
    ...            _Example('badspam3', ['d', 'a', 'b'])    # FN
    ...           ], True)

    >>> t.nham_tested
    2
    >>> t.nham_right, t.nham_wrong
    (1, 1)
    >>> t.false_positive_rate()
    50.0
    >>> [e.name for e in t.false_positives()]
    ['badham']

    >>> t.nspam_tested
    4
    >>> t.nspam_right, t.nspam_wrong
    (1, 3)
    >>> t.false_negative_rate()
    75.0
    >>> [e.name for e in t.false_negatives()]
    ['badspam1', 'badspam2', 'badspam3']

    >>> [e.name for e in t.unsures()]
    []
    >>> t.unsure_rate()
    0.0
"""

# doctest looks up a module-level __test__ dict for extra named test strings.
__test__ = {'easy': _easy_test}

def _test():
    """Run the doctests of the ``Tester`` module.

    NOTE(review): this checks Tester, not the module it lives in --
    confirm that is intended.
    """
    import doctest
    import Tester
    doctest.testmod(Tester)


def group_perfect(which, test):
    """Group hook that does nothing and returns None.

    Both arguments are accepted and ignored.
    """
    return None

def guess_perfect(which, test, guess, actual, msg):
    """Per-message hook that ignores the guess and hands back *actual*.

    main() trains the message as spam when a hook returns -1 and as ham
    when it returns 1, so returning the true label means every message is
    trained under its real class.
    """
    true_label = actual
    return true_label


# Pending corrections recorded by guess_corrected() and applied by
# group_corrected().  Each table is a list indexed by set number whose
# entries are lists of messages; entries are created on demand by _bucket().
spam_to_ham = []        # guessed spam, actually ham
ham_to_spam = []        # guessed ham, actually spam
unsure_to_ham = []      # guessed unsure, actually ham
unsure_to_spam = []     # guessed unsure, actually spam


def _bucket(table, which):
    """Return the message list for set *which* in *table*, creating it if needed.

    *which* is expected to be a non-negative set index.  The table is
    padded with empty lists so that ``table[which]`` always exists.
    """
    while len(table) <= which:
        table.append([])
    return table[which]


def group_corrected(which, test):
    """Apply, then forget, the corrections recorded for set *which*.

    Messages that were trained under the wrong label are untrained and
    retrained under the right one; messages that were left unsure (and so
    were never trained) are trained under their actual label.  A set with
    no recorded corrections results in calls with empty lists.
    """
    wrong_ham = _bucket(ham_to_spam, which)       # trained as ham, is spam
    wrong_spam = _bucket(spam_to_ham, which)      # trained as spam, is ham
    missed_ham = _bucket(unsure_to_ham, which)
    missed_spam = _bucket(unsure_to_spam, which)

    test.untrain(wrong_ham, wrong_spam)
    test.train(wrong_spam, wrong_ham)
    test.train(missed_ham, missed_spam)

    # Empty the buckets in place so the same corrections are not untrained
    # and retrained again at the next group boundary.
    for bucket in (wrong_ham, wrong_spam, missed_ham, missed_spam):
        del bucket[:]


def guess_corrected(which, test, guess, actual, msg):
    """Record *msg* for later correction if it was misjudged; return the guess.

    *guess* is the ``(num, prob)`` pair produced by Test.predict(), where
    num is 1 for ham, -1 for spam and 0 for unsure.  *actual* is -1 for
    spam and 1 for ham.  The returned value is ``guess[0]``, i.e. the
    caller is told to train on what the classifier guessed.
    """
    verdict = guess[0]
    if verdict != actual:
        if actual < 0:
            # Really spam: either left unsure or wrongly called ham.
            if verdict == 0:
                table = unsure_to_spam
            else:
                table = ham_to_spam
        else:
            # Really ham: either left unsure or wrongly called spam.
            if verdict == 0:
                table = unsure_to_ham
            else:
                table = spam_to_ham
        _bucket(table, which).append(msg)
    return verdict




def main():
    group_action = None
    guess_action = None

    regime = "perfect"
    which = None

    opts, args = getopt.getopt(sys.argv[1:], 'hs:r:', ['help', 'examples'])
    for opt, arg in opts:
        if opt == '-s':
            which = int(arg) - 1
        elif opt == '-r':
            regime = arg
        elif opt == '-h' or opt == '--help':
            print __doc__
            sys.exit()

    nsets = len(glob.glob("Data/Ham/Set*"))

    files = glob.glob("Data/*/Set*/*")
    files.sort(lambda a,b: cmp(os.path.basename(a), os.path.basename(b)))

    tests = []
    rules = []
    nham_tested = []
    nham_trained = []
    nham_right = []
    nham_wrong = []
    nham_unsure = []
    nspam_tested = []
    nspam_trained = []
    nspam_right = []
    nspam_wrong = []
    nspam_unsure = []
    for j in range(0, nsets):
        # if which is not None and j != which:
        #     continue
        tests.append(Test(classifier.Bayes()))
        exec """rules.append(regimes.%s())""" % (regime) in globals(), locals()
        nham_tested.append([])
        nham_trained.append([])
        nham_right.append([])
        nham_wrong.append([])
        nham_unsure.append([])
        nspam_tested.append([])
        nspam_trained.append([])
        nspam_right.append([])
        nspam_wrong.append([])
        nspam_unsure.append([])

    oldgroup = 0
    for f in files:
        base = os.path.basename(f)
        group = int(base.split('-')[0]);
        dir = os.path.dirname(f)
        set = os.path.basename(dir)
        set = int(set[3:]) - 1
        isspam = (dir.find('Spam') >= 0)

        msg = msgs.Msg(dir, base)

        for j in range(0, nsets):
            if which is not None and j != which:
                continue
            if group != oldgroup:
                sys.stderr.write("%-78s\r" % ("%s  : %d" % (base, set)))
                sys.stderr.flush()

                nham_tested[j].append(tests[j].nham_tested)
                nham_trained[j].append(tests[j].nham_trained)
                nham_right[j].append(tests[j].nham_right)
                nham_wrong[j].append(tests[j].nham_wrong)
                nham_unsure[j].append(tests[j].nham_unsure)
                nspam_tested[j].append(tests[j].nspam_tested)
                nspam_trained[j].append(tests[j].nspam_trained)
                nspam_right[j].append(tests[j].nspam_right)
                nspam_wrong[j].append(tests[j].nspam_wrong)
                nspam_unsure[j].append(tests[j].nspam_unsure)
                # tests[j].reset_test_results()
                rules[j].group_action(j, tests[j])

            if j != set:
                guess = tests[j].predict([msg], isspam)
                if isspam:
                    actual = -1
                else:
                    actual = 1
                todo = rules[j].guess_action(j, tests[j], guess, actual, msg)
                if todo == -1:
                    tests[j].train(None, [msg])
                elif todo == 1:
                    tests[j].train([msg], None)

        oldgroup = group

    sys.stderr.write("\n")
    sys.stderr.flush()

    for j in range(0, nsets):
        if which is not None and j != which:
            continue
        nham_tested[j].append(tests[j].nham_tested)
        nham_trained[j].append(tests[j].nham_trained)
        nham_right[j].append(tests[j].nham_right)
        nham_wrong[j].append(tests[j].nham_wrong)
        nham_unsure[j].append(tests[j].nham_unsure)
        nspam_tested[j].append(tests[j].nspam_tested)
        nspam_trained[j].append(tests[j].nspam_trained)
        nspam_right[j].append(tests[j].nspam_right)
        nspam_wrong[j].append(tests[j].nspam_wrong)
        nspam_unsure[j].append(tests[j].nspam_unsure)

    for j in range(0, nsets):
        if which is not None and j != which:
            continue
        print 'Set %d' % (j + 1)
        for k in range(0, len(nham_tested[j])):
            print '%d %d %d %d %d %d %d %d %d %d' % (
                nham_tested[j][k],
                nham_trained[j][k],
                nham_right[j][k],
                nham_wrong[j][k],
                nham_unsure[j][k],
                nspam_tested[j][k],
                nspam_trained[j][k],
                nspam_right[j][k],
                nspam_wrong[j][k],
                nspam_unsure[j][k]
            )
        print

    print '$ end'

# Run the harness only when executed as a script, not when imported.
if __name__ == '__main__':
    main()