File: api.py

package info (click to toggle)
trac-spamfilter 0.2.1%2Bsvn6871-1
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 256 kB
  • ctags: 259
  • sloc: python: 1,683; cs: 341; makefile: 40
file content (251 lines) | stat: -rw-r--r-- 10,043 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# -*- coding: utf-8 -*-
#
# Copyright (C) 2005-2006 Edgewall Software
# Copyright (C) 2005-2006 Matthew Good <trac@matt-good.net>
# Copyright (C) 2006 Christopher Lenz <cmlenz@gmx.de>
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://trac.edgewall.com/license.html.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://projects.edgewall.com/trac/.
#
# Author: Matthew Good <trac@matt-good.net>
#         Christopher Lenz <cmlenz@gmx.de>

from difflib import SequenceMatcher
import inspect
from StringIO import StringIO
import textwrap
import time

from trac.config import BoolOption, IntOption
from trac.core import *
from trac.db import DatabaseManager
from trac.env import IEnvironmentSetupParticipant
from trac.perm import IPermissionRequestor
from trac.util.html import escape, html, Markup
from trac.util.text import shorten_line
from trac.web import Request
from tracspamfilter.model import LogEntry, schema, schema_version

__all__ = ['RejectContent', 'IFilterStrategy', 'FilterSystem']


class RejectContent(TracError):
    """Exception raised when content is rejected by a filter.

    `FilterSystem.test` raises it when the karma score accumulated for a
    submission is lower than the configured `min_karma` threshold.
    """


class IFilterStrategy(Interface):
    """Extension point interface for pluggable spam filter strategies.

    `FilterSystem` collects the implementations through its `strategies`
    extension point and calls them to score and to train on submissions.
    """

    def test(req, author, content):
        """Test the given content submission.

        @param req: the request object of the submission
        @param author: the name of the submitter (`FilterSystem.test` passes
            'anonymous' when no name was given)
        @param content: the submitted text (`FilterSystem.test` passes the
            combined new content of all modified fields)

        Should return a `(points, reason)` tuple to affect the score of the
        submission, where `points` is an integer, and `reason` is a brief
        description of why the score is being affected.

        If the filter strategy does not want (or is not able) to effectively
        test the submission, it should return `None`.
        """

    def train(req, author, content, spam=True):
        """Train the filter by reporting a false negative or positive.

        @param req: a request object (`FilterSystem.train` passes one that
            was rebuilt from the logged submission)
        @param author: the name of the submitter of the content
        @param content: the text that was originally submitted

        The spam keyword argument is `True` if the content should be considered
        spam (a false negative), and `False` if the content was legitimate (a
        false positive).
        """


class FilterSystem(Component):
    strategies = ExtensionPoint(IFilterStrategy)

    implements(IEnvironmentSetupParticipant, IPermissionRequestor)

    min_karma = IntOption('spam-filter', 'min_karma', '0',
        """The minimum score required for a submission to be allowed.""")

    logging_enabled = BoolOption('spam-filter', 'logging_enabled', 'true',
        """Whether all content submissions and spam filtering activity should
        be logged to the database.""")

    purge_age = IntOption('spam-filter', 'purge_age', '7',
        """The number of days after which log entries should be purged.""")

    trust_authenticated = BoolOption('spam-filter', 'trust_authenticated',
                                     'true',
        """Whether content submissions by authenticated users should be trusted
        without checking for potential spam or other abuse.""")

    # Public methods

    def test(self, req, author, changes):
        """Test a submission against the registered filter strategies.
        
        @param req: the request object
        @param author: the name of the logged in user, or 'anonymous' if the
            user is not logged in
        @param changes: a list of `(old_content, new_content)` tuples for every
            modified "field", where `old_content` may contain the previous
            version of that field (if available), and `new_content` contains
            the newly submitted content
        """
        if self.trust_authenticated:
            # Authenticated users are trusted
            if req.authname and req.authname != 'anonymous':
                return

        if not author:
            author = 'anonymous'
        content = self._combine_changes(changes)
        abbrev = shorten_line(content)
        self.log.debug('Testing content %r submitted by "%s"', abbrev, author)

        score = 0
        reasons = []
        for strategy in self.strategies:
            try:
                retval = strategy.test(req, author, content)
            except Exception, e:
                self.log.exception('Filter strategy raised exception: %s', e)
            else:
                if retval:
                    points, reason = retval
                    self.log.debug('Filter strategy %r gave submission %d '
                                   'karma points (reason: %r)', strategy,
                                   points, reason)
                    score += points
                    if reason:
                        reasons.append((strategy.__class__.__name__, points,
                                        reason))

        if self.logging_enabled:
            headers = '\n'.join(['%s: %s' % (k[5:].replace('_', '-').title(), v)
                                 for k, v in req.environ.items()
                                 if k.startswith('HTTP_')])
            LogEntry(self.env, time.time(), req.path_info, author,
                     req.authname and req.authname != 'anonymous',
                     req.remote_addr, headers, content, score < self.min_karma,
                     score, ['%s (%d): %s' % r for r in reasons]).insert()
            LogEntry.purge(self.env, self.purge_age)

        if score < self.min_karma:
            self.log.warn('Rejecting submission %r by "%s" (%r) because it '
                          'earned only %d karma points (%d are required) for '
                          'the following reason(s): %r', abbrev, author,
                          req.remote_addr, score, self.min_karma,
                          ['%s: (%s) %s' % r for r in reasons])
            msg = ', '.join([r[2] for r in reasons if r[1] < 0])
            if msg:
                msg = ' (%s)' % msg
            raise RejectContent('Submission rejected as potential spam%s' % msg)

    def train(self, req, log_id, spam=True):
        environ = {}
        for name, value in req.environ.items():
            if not name.startswith('HTTP_'):
                environ[name] = value

        entry = LogEntry.fetch(self.env, log_id)
        if entry:
            self.log.debug('Marking as %s: %r submitted by "%s"',
                           spam and 'spam' or 'ham',
                           shorten_line(entry.content),
                           entry.author)
            fakeenv = environ.copy()
            for header in entry.headers.splitlines():
                name, value = header.split(':', 1)
                if name == 'Cookie': # breaks SimpleCookie somehow
                    continue
                cgi_name = 'HTTP_%s' % name.strip().replace('-', '_').upper()
                fakeenv[cgi_name] = value.strip()
            fakeenv['REQUEST_METHOD'] = 'POST'
            fakeenv['PATH_INFO'] = entry.path
            fakeenv['wsgi.input'] = StringIO('')
            fakeenv['REMOTE_ADDR'] = entry.ipnr
            if entry.authenticated:
                fakeenv['REMOTE_USER'] = entry.author

            for strategy in self.strategies:
                strategy.train(Request(fakeenv, None),
                               entry.author or 'anonymous',
                               entry.content, spam=spam)

            entry.update(rejected=spam)

    # IEnvironmentSetupParticipant

    def environment_created(self):
        self.upgrade_environment(self.env.get_db_cnx())

    def environment_needs_upgrade(self, db):
        cursor = db.cursor()
        cursor.execute("SELECT value FROM system "
                       "WHERE name='spamfilter_version'")
        try:
            row = cursor.fetchone()
            if not row or int(row[0]) < schema_version:
                return True
        except:
            db.rollback()
            return True

    def upgrade_environment(self, db):
        cursor = db.cursor()
        try:
            cursor.execute("SELECT value FROM system "
                           "WHERE name='spamfilter_version'")
            row = cursor.fetchone()
            current_version = row and int(row[0]) or 0
        except:
            db.rollback()
            current_version = 0

        from tracspamfilter import upgrades
        for version in range(current_version + 1, schema_version + 1):
            for function in upgrades.version_map.get(version):
                self.log.info(textwrap.fill(inspect.getdoc(function)))
                function(self.env, db)
                self.log.info('Done.')

        if current_version == 0:
            cursor.execute("INSERT INTO system VALUES "
                           "('spamfilter_version',%s)", (schema_version,))
            self.log.info('Created SpamFilter tables')
        else:
            cursor.execute("UPDATE system SET value=%s WHERE "
                           "name='spamfilter_version'", (schema_version,))
            self.log.info('Upgraded SpamFilter tables from version %d to %d',
                          current_version, schema_version)

    # IPermissionRequestor

    def get_permission_actions(self):
        return ['SPAM_CONFIG', 'SPAM_MONITOR', 'SPAM_TRAIN',
                ('SPAM_ADMIN', ['SPAM_CONFIG', 'SPAM_MONITOR', 'SPAM_TRAIN'])]

    # Internal methods

    def _combine_changes(self, changes, sep='\n\n'):
        fields = []
        for old_content, new_content in changes:
            if old_content:
                new_content = self._get_added_lines(old_content, new_content)
            fields.append(new_content)
        return sep.join(fields)

    def _get_added_lines(self, old_content, new_content):
        buf = []
        old_lines = old_content.splitlines()
        new_lines = new_content.splitlines()
        matcher = SequenceMatcher(None, old_lines, new_lines)
        for group in matcher.get_grouped_opcodes(0):
            for tag, i1, i2, j1, j2 in group:
                if tag in ('insert', 'replace'):
                    buf.append('\n'.join(new_lines[j1:j2]))

        return '\n'.join(buf)