File: WordFilter.h

package info (click to toggle)
bzflag 2.4.30-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 26,488 kB
  • sloc: cpp: 150,376; ansic: 3,463; sh: 2,535; makefile: 2,194; perl: 486; python: 260; objc: 246; php: 206
file content (337 lines) | stat: -rw-r--r-- 10,131 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
/* bzflag
 * Copyright (c) 1993-2025 Tim Riker
 *
 * This package is free software;  you can redistribute it and/or
 * modify it under the terms of the license found in the file
 * named COPYING that should have accompanied this file.
 *
 * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#ifndef __WORDFILTER_H__
#define __WORDFILTER_H__

#include "common.h"

/* system interface headers */
#include <string>
#include <set>
#include <algorithm>
#include <string.h>

#include <fstream>
#include <iostream>

#include <ctype.h>
#include <sys/types.h>

/* common interface headers */
#include "TextUtils.h"


/* words are stored in sets indexed by the first character (byte) of
 * the word, so one set per possible byte value is needed.  it would
 * be nice to eventually support full indexing of multi-byte UTF-8
 * characters.
 */
#define MAX_FILTER_SETS 256


/** WordFilter will load a list of words and phrases from a file or one at
 * a time or manually.
 *
 * Loading from file assumes that words/phrases are listed one per line;
 * comments are possible using the shell-style "#" delimiter.    Words are
 * matched case-insensitive; punctuation and non-newline white-space are
 * always ignored with the default filter.
 *
 * By default (aggressive filtering), the filter will match many
 * additional combinations to avoid the need to list all potential spelling
 * variations and hacks in the filter file.
 *
 * e.g. "dumb ass" should match "dumbass!", "DUMB a s s", "d_u_m_b! a_S_S", etc
 *
 * Strings should also match repetitive identical letter expansions.
 *
 * e.g. "dumb ass" should match "dumb asssss", "dumbaaaass", "dumb as s", etc
 *
 * Strings should also match l33t-speak. (l=1, o=0, e=3, a=@, s=z, i=l, f=ph)
 *
 * e.g. "ass whipe" should match "@sz wh1p3", etc
 *
 * Strings should also match common word suffixes (at least for English)
 *   (dom|ity|memt|sion|tion|ness|ance|ence|er|or|ist) for nouns
 *   (ive|en|ic|al|able|y|ous|ful|less) for adjectives
 *   (en|ize|ate|ify|fy|ed) for verbs
 *   (ly) for adverbs
 *   (a|z|r|ah|io|rs|rz|in|n|ster|meister) for slang
 *   (s|es) for plurality
 *   (ing|let) for imperfect verb, and diminutive
 *
 * e.g. "dumb ass" should match "dumb assness", "dumb asses", "dumb assly", etc
 *
 * Strings should also match common word prefixes (at least for English)
 *   (bz|beze) for bzflag-centric words
 *
 * e.g. "bitch" should also match "bzbitch", "beezzeebitch", etc
 *
 * Since all of the above matchings are done for free with aggressive matching,
 * only root words need to be provided.  For foreign languages, it may be
 * necessary to list all tenses of certain verbs, unless the rules can be
 * strictly and simply quantified.
 *
 * There is also a simple filter mode which is not as resource intensive and
 * performs a literal match with the filter words (so you have to specify
 * absolutely everything you want to filter and all variations).    It is still
 * case-insensitive and ignores punctuation.
 */
class WordFilter
{
public:

    /** structure for a single filter word, and a compiled regular expression.
     *
     * `word` is the plain filter word; `compiled` points at the compiled
     * regular expression for it.  `compiled` may be NULL (addPrefix and
     * addSuffix test for that before releasing it).
     */
    typedef struct filterStruct
    {
        std::string word;
        regex_t *compiled;
    } filter_t;


private:

    /** used by the simple filter */
    std::string alphabet;

    /** set of characters used to replace filtered content */
    std::string filterChars;

    /** ordering used for the sets of word expressions to be filtered
     * (which include the compiled regexp versions).
     *
     * Entries are ordered by a case-insensitive comparison of at most the
     * first 1024 characters of `word`; the compiled expression takes no
     * part in the comparison.  Two entries whose words differ only in
     * case are therefore treated as the same set element.
     */
    struct expressionCompare
    {
        bool operator() (const filter_t& word1, const filter_t& word2) const
        {
            return (strncasecmp(word1.word.c_str(), word2.word.c_str(), 1024) < 0);
        }
    };

    /** a set of filter entries ordered by expressionCompare */
    typedef std::set<filter_t, expressionCompare> ExpCompareSet;

    /** main collection of what to filter.  items are stored into
     * the array indexed by the first character of the filter word.
     * this means a sparse array, but it's a small price for
     * minimal hashing and rather fast lookups.
     */
    // XXX consider making a numeric hash to avoid array overflows
    ExpCompareSet filters[MAX_FILTER_SETS];


    /** used by the aggressive filter */
    ExpCompareSet suffixes;

    /** used by the aggressive filter */
    ExpCompareSet prefixes;


    /** utility method performs an actual replacement of
     * characters in an input character array within a specified
     * range.  returns the number of characters replaced, or a
     * negative value for invalid arguments.  the default value of
     * filterSpaces (false) is supplied on the inline definition
     * further down in this header.
     */
    inline int filterCharacters(char *input, unsigned int start, size_t length, bool filterSpaces) const;

    /** utility method adds a letter to a string if it is not
     * already in the string
     */
    inline void appendUniqueChar(std::string& string, char c) const;

    /** utility method to add words to the prefix set
     */
    inline void addPrefix(const char *word);

    /** utility method to add words to the suffix set
     */
    inline void addSuffix(const char *word);


protected:

    /** This filter does a simple case-insensitive
     * word comparison that compares all filter
     * words with all alphabetic string sets in the
     * input string. If test is a filter word, then
     * input strings "test", "testy", and "test;"
     * will get filtered to "****", "testy", and
     * "****;" respectively.
     */
    bool simpleFilter(char *input) const;


    /** This filter will take a filter word and
     * create a rather complex regular expression
     * that catches a variety of variations on a
     * word.  Variations include automatic internal
     * expansion, optional punctuation and
     * whitespace, suffix matching (including
     * plurality), leet-speak conversion and more.
     * See the header above for more details.  This
     * filter should be the default.
     */
    bool aggressiveFilter(char *input) const;

    /** provides a pointer to a fresh compiled
     * expression for some given expression.
     * callers in this header release the result with
     * regfree() followed by free() when it is not kept.
     */
    regex_t *getCompiledExpression(const std::string &expression) const;

    /** returns a set of characters that represent the
     * given character in "l33t-speak"
     */
    std::string l33tspeakSetFromCharacter(const char c) const;

    /** returns what alphabetic character a given char
     * corresponds to (e.g. 3 => e, | => il)
     */
    std::string alphabeticSetFromCharacter(const char c) const;

    /** expands a word into an uncompiled regular
     *  expression.
     */
    std::string expressionFromString(const std::string &word) const;


public:

    // NOTE(review): a copy constructor and destructor are declared but no
    // copy assignment operator is visible in this header -- confirm that
    // assigning one WordFilter to another is either unused or safe.
    WordFilter(void);
    WordFilter(const WordFilter& filter);
    ~WordFilter(void);

    /** loads a set of bad words from a specified file */
    unsigned int loadFromFile(const std::string &fileName, bool verbose=false);

    /** adds a new filter to the existing filter list */
    bool addToFilter(const std::string &word, const std::string &expression);

    /** given an input string, filter the input
     * using either the simple or aggressive filter.
     * the aggressive filter is used unless simple is true.
     */
    bool filter(char *input, const bool simple=false) const;
    bool filter(std::string &input, const bool simple=false) const;

    /** dump a list of words in the filter to stdout */
    void outputWords(void) const;
    /** dump the filter to stdout (including expressions) */
    void outputFilter(void) const;
    /** returns a count of how many words are in the filter */
    unsigned long int wordCount(void) const;

    /** Clears the Filter */
    void clear(void);
};



/** utility method performs an actual replacement of
 * characters in an input character array within a specified
 * range.
 *
 * @param input         NUL-terminated buffer that is modified in place
 * @param start         index of the first character to consider
 * @param length        number of characters to consider; it is clamped
 *                      so that the terminating NUL is never reached
 * @param filterSpaces  when true every character in the range is
 *                      replaced; otherwise only alphanumeric ones are
 * @return the number of characters replaced, 0 when start lies beyond
 *         the end of input (or there are no replacement characters),
 *         and -1 when input is NULL or length is zero
 */
inline int WordFilter::filterCharacters(char *input, unsigned int start, size_t length, bool filterSpaces=false) const
{
    if (input == NULL)
        return -1;
    if (length == 0)
        return -1;

    const size_t inputLength = strlen(input);
    if (inputLength < start)
        return 0;

    /* never read or overwrite the terminating NUL (or anything past it),
     * even if the caller asks for more characters than remain.
     */
    const size_t available = inputLength - start;
    if (length > available)
        length = available;

    const int maxFilterChar = (int)filterChars.size();
    if (maxFilterChar == 0)
        return 0; /* nothing to replace with */

    int previousCharPos = -1;
    int count = 0;
    for (size_t j = 0; j < length; j++)
    {
        const char c = input[start + j];

        /* don't repeat random chars.  that is only possible with at least
         * two candidates; with a single one the search would never end.
         */
        int randomCharPos;
        do
        {
            randomCharPos = (int)((float)maxFilterChar * (float)bzfrand());
            /* bzfrand() is presumably in [0,1) (TODO confirm), but the
             * conversion to float may round up to 1.0; keep the index
             * inside filterChars so a NUL is never written to input.
             */
            if (randomCharPos >= maxFilterChar)
                randomCharPos = maxFilterChar - 1;
            else if (randomCharPos < 0)
                randomCharPos = 0;
        }
        while (maxFilterChar > 1 && randomCharPos == previousCharPos);
        previousCharPos = randomCharPos;

        /* when filterSpaces is true, we filter everything.
         * otherwise only the ascii character code ranges for a-z, A-Z,
         * and 0-9 are filtered; anything else is left untouched.
         */
        if (filterSpaces || TextUtils::isAlphanumeric(c))
        {
            input[start + j] = filterChars[randomCharPos];
            count++;
        }
    }
    return count;
}


/** utility method adds a letter to a string if it is not
 * already in the string.
 *
 * @param string  the string to extend; left untouched when it already
 *                contains c
 * @param c       the character to append
 */
inline void WordFilter::appendUniqueChar(std::string& string, char c) const
{
    /* std::string::find gives the same answer as counting occurrences
     * and does not depend on which flavour of std::count the platform
     * provides (ISO three-argument vs. old HP-style four-argument).
     */
    if (string.find(c) == std::string::npos)
        string += c;
}


inline void WordFilter::addPrefix(const char *word)
{
    filter_t fix;
    std::pair<ExpCompareSet::iterator, bool> result;
    fix.word = std::string(word);
    fix.compiled = getCompiledExpression(expressionFromString(fix.word));
    result = prefixes.insert(fix);
    if (!result.second && fix.compiled)
    {
        regfree(fix.compiled);
        free(fix.compiled);
    }
}

inline void WordFilter::addSuffix(const char *word)
{
    filter_t fix;
    std::pair<ExpCompareSet::iterator, bool> result;
    fix.word = std::string(word);
    fix.compiled = getCompiledExpression(expressionFromString(fix.word));
    result = suffixes.insert(fix);
    if (!result.second && fix.compiled)
    {
        regfree(fix.compiled);
        free(fix.compiled);
    }
}


#else
/* __WORDFILTER_H__ is already defined, so the full definition was seen
 * earlier; only a forward declaration is emitted on repeated inclusion.
 */
class WordFilter;
#endif /* __WORDFILTER_H__ */

// Local Variables: ***
// mode: C++ ***
// tab-width: 4 ***
// c-basic-offset: 4 ***
// indent-tabs-mode: nil ***
// End: ***
// ex: shiftwidth=4 tabstop=4