1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
|
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef COMM_MAILNEWS_EXTENSIONS_BAYESIAN_SPAM_FILTER_NSBAYESIANFILTER_H_
#define COMM_MAILNEWS_EXTENSIONS_BAYESIAN_SPAM_FILTER_NSBAYESIANFILTER_H_
#include <stdio.h>
#include "nsCOMPtr.h"
#include "nsIMsgFilterPlugin.h"
#include "PLDHashTable.h"
#include "nsITimer.h"
#include "nsTArray.h"
#include "nsString.h"
#include "nsWeakReference.h"
#include "nsIObserver.h"
#include "nsHashPropertyBag.h"
#include "mozilla/intl/WordBreaker.h"
#include "mozilla/ArenaAllocator.h"
// Default minimum interval between writes. The value is 15 * 60 * 1000 =
// 900000; presumably milliseconds, i.e. 15 minutes -- confirm against the
// code that consumes it.
//
// The replacement list is parenthesized so the macro always expands to a
// single operand. Without the parentheses, an expression such as
// `x / DEFAULT_MIN_INTERVAL_BETWEEN_WRITES` would expand to
// `x / 15 * 60 * 1000` and silently compute the wrong value.
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES (15 * 60 * 1000)
// Forward declarations for types that the declarations below mention.
// TokenEnumeration is fully defined later in this header; the others are
// not defined in this header and are used here only through pointers or
// references (nsIUTF8StringEnumerator is not referenced by any declaration
// in this header).
struct Token;
class TokenEnumeration;
class TokenAnalyzer;
class nsIMsgWindow;
class nsIUTF8StringEnumerator;
struct BaseToken;
struct CorpusToken;
/**
* Helper class to enumerate Token objects in a PLDHashTable
* safely and without copying (see bugzilla #174859). The
* enumeration is safe to use until an Add()
* or Remove() is performed on the table.
*
* Going by the method names, the intended use is a loop that calls
* nextToken() for as long as hasMoreTokens() reports true; the method
* bodies are not part of this header.
*/
class TokenEnumeration {
public:
// Builds an enumeration for |table|. Declared explicit, so a bare
// PLDHashTable* never converts to a TokenEnumeration implicitly.
explicit TokenEnumeration(PLDHashTable* table);
// Reports whether nextToken() has anything further to return
// (NOTE(review): inferred from the name -- confirm in the implementation).
bool hasMoreTokens();
// Returns the next entry, typed as BaseToken*.
// NOTE(review): the result once the enumeration is exhausted is not visible
// from this header -- confirm before relying on a null return.
BaseToken* nextToken();
private:
// PLDHashTable iterator holding the enumeration position (presumably over
// the table handed to the constructor).
PLDHashTable::Iterator mIterator;
};
// A trait is some aspect of a message, like being junk or tagged as
// Personal, that the statistical classifier should track. The
// TraitPerToken structure is a per-token representation of information
// pertaining to a message trait.
//
// Traits per token are maintained as a linked list. The links are indices
// into mTraitStore rather than pointers, and index 0 is reserved to mean
// "no further element".
//
struct TraitPerToken {
uint32_t mId; // identifying number for a trait
uint32_t mCount; // count of messages with this token and trait
uint32_t mNextLink; // index in mTraitStore for the next trait, or 0
// for none
// Initializer taking the trait id and its message count. How mNextLink is
// initialized is decided by the out-of-line definition, which is not
// visible in this header.
TraitPerToken(uint32_t aId, uint32_t aCount); // initializer
};
// An Analysis is the statistical results for a particular message, a
// particular token, and for a particular pair of trait/antitrait, that
// is then used in subsequent analysis to score the message.
//
// Analyses per token are maintained as a linked list. The links are
// indices into mAnalysisStore rather than pointers, and index 0 is
// reserved to mean "no further element".
//
struct AnalysisPerToken {
uint32_t mTraitIndex; // index representing a protrait/antitrait pair.
// So if we are analyzing 3 different traits, then
// the first trait is 0, the second 1, etc.
double mDistance; // absolute value of mProbability - 0.5
double mProbability; // relative indicator of match of trait to token
uint32_t mNextLink; // index in mAnalysisStore for the Analysis object
// for the next trait index, or 0 for none.
// Initializer taking the trait index, distance and probability. How
// mNextLink is initialized is decided by the out-of-line definition, which
// is not visible in this header.
AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);
};
// Common base class of Tokenizer and CorpusStore. It bundles a PLDHashTable
// of tokens (mTokenTable) with an arena allocator (mWordPool). The
// constructor is protected, so only derived classes can create instances.
class TokenHash {
public:
// Virtual so that derived objects can be destroyed through a TokenHash*.
virtual ~TokenHash();
/**
* Clears out the previous message tokens.
*/
nsresult clearTokens();
// Number of tokens (presumably the entry count of mTokenTable -- confirm
// in the implementation).
uint32_t countTokens();
// Returns, by value, an enumeration of the tokens. See TokenEnumeration for
// how long the enumeration stays safe to use.
TokenEnumeration getTokens();
// Adds |word| and returns its entry as a BaseToken*.
// NOTE(review): whether an existing entry is reused and what is returned
// on failure are not visible from this header.
BaseToken* add(const char* word);
protected:
// |entrySize| is presumably the per-entry byte size handed to the hash
// table and kept in mEntrySize -- confirm in the implementation.
explicit TokenHash(uint32_t entrySize);
// Arena allocator; presumably the backing storage for copyWord().
mozilla::ArenaAllocator<16384, 2> mWordPool;
uint32_t mEntrySize;
PLDHashTable mTokenTable;
// Copies |len| characters of |word| and returns the copy (presumably
// allocated from mWordPool -- confirm in the implementation).
char* copyWord(const char* word, uint32_t len);
// Looks up |word| and returns its entry as a BaseToken*. Non-virtual; the
// derived classes declare their own get() with a more specific return
// type.
BaseToken* get(const char* word);
};
// TokenHash specialization that turns message text, headers and attachment
// descriptions into tokens, and exposes the preference-like settings that
// steer that tokenization as public members.
class Tokenizer : public TokenHash {
public:
Tokenizer();
~Tokenizer();
// Hides the non-virtual TokenHash::get() and returns the entry typed as
// Token*.
Token* get(const char* word);
// The training set keeps an occurrence count on each word. This count
// is supposed to count the # of messages it occurs in.
// When add/remove is called while tokenizing a message and NOT the training
// set,
//
// NOTE(review): the sentence above is cut off in the original comment; the
// intended behaviour for the non-training case must be confirmed against
// the implementation.
//
// Hides TokenHash::add(); |count| defaults to 1.
Token* add(const char* word, uint32_t count = 1);
// Returns a Token* to a copy of the tokens.
// NOTE(review): ownership of the returned memory and how it must be
// released are not visible from this header.
Token* copyTokens();
// Tokenizes |text| (presumably a message body, using mBodyDelimiters --
// confirm in the implementation).
void tokenize(const char* text);
/**
* Creates specific tokens based on the mime headers for the message being
* tokenized
*
* @param aHeaderNames header names
* @param aHeaderValues header values (presumably parallel to
* aHeaderNames -- confirm against the caller)
*/
void tokenizeHeaders(nsTArray<nsCString>& aHeaderNames,
nsTArray<nsCString>& aHeaderValues);
// Creates tokens from the attachments, each described by an
// nsIPropertyBag2.
void tokenizeAttachments(nsTArray<RefPtr<nsIPropertyBag2>>& attachments);
nsCString mBodyDelimiters; // delimiters for body tokenization
nsCString mHeaderDelimiters; // delimiters for header tokenization
// arrays of extra headers to tokenize / to not tokenize
nsTArray<nsCString> mEnabledHeaders;
nsTArray<nsCString> mDisabledHeaders;
// Delimiters used in tokenizing a particular header.
// Parallel array to mEnabledHeaders
nsTArray<nsCString> mEnabledHeadersDelimiters;
bool mCustomHeaderTokenization; // Are there any preference-set tokenization
// customizations?
uint32_t mMaxLengthForToken; // maximum length of a token
// should we convert iframe to div during tokenization?
bool mIframeToDiv;
private:
// Per-word helpers. Both take a mutable char*, so the buffer may be
// modified in place (NOTE(review): confirm in the implementation).
void tokenize_ascii_word(char* word);
void tokenize_japanese_word(char* chunk);
// Adds token(s) for one header value under |aTokenPrefix|. The defaults
// (aTokenizeValue = false, aDelimiters = nullptr) select the non-splitting
// form; presumably the value is split on |aDelimiters| only when
// |aTokenizeValue| is true -- confirm in the implementation.
inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
bool aTokenizeValue = false,
const char* aDelimiters = nullptr);
// Strips HTML from |inString| and writes the result to |outString|.
nsresult stripHTML(const nsAString& inString, nsAString& outString);
// helper function to escape \n, \t, etc from a CString
// NOTE(review): the function name says "unescape" while the comment above
// says "escape" -- confirm which direction the implementation converts.
void UnescapeCString(nsCString& aCString);
// Scans |text| (|length| UTF-16 code units) from |pos| and reports a range
// through |begin| / |end|, with |_retval| as a boolean out-parameter.
// NOTE(review): the exact meaning of the range and of |_retval| is not
// visible from this header.
nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos,
bool isLastBuffer, int32_t* begin, int32_t* end,
bool* _retval);
};
/**
* Implements storage of a collection of message tokens and counts for
* a corpus of classified messages
*
* Per-token trait counts are kept as linked lists of TraitPerToken elements
* stored in mTraitStore; per-trait message counts are kept in the parallel
* arrays mMessageCounts / mMessageCountsId.
*/
class CorpusStore : public TokenHash {
public:
CorpusStore();
~CorpusStore();
/**
* retrieve the token structure for a particular string
*
* @param word the character representation of the token
*
* @return token structure containing counts, null if not found
*/
CorpusToken* get(const char* word);
/**
* add tokens to the storage, or increment counts if already exists.
*
* @param aTokenizer tokenizer for the list of tokens to remember
* @param aTraitId id for the trait whose counts will be remembered
* @param aCount number of new messages represented by the token list
*/
void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
uint32_t aCount);
/**
* decrement counts for tokens in the storage, removing if all counts
* are zero
*
* @param aTokenizer tokenizer for the list of tokens to forget
* @param aTraitId id for the trait whose counts will be removed
* @param aCount number of messages represented by the token list
*/
void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
/**
* write the corpus information to file storage
*
* @param aMaximumTokenCount prune tokens if number of tokens exceeds
* this value. == 0 for no pruning
*/
void writeTrainingData(uint32_t aMaximumTokenCount);
/**
* read the corpus information from file storage
*/
void readTrainingData();
/**
* delete the local corpus storage file and data
*/
nsresult resetTrainingData();
/**
* get the count of messages whose tokens are stored that are associated
* with a trait
*
* @param aTraitId identifier for the trait
* @return number of messages for that trait
*/
uint32_t getMessageCount(uint32_t aTraitId);
/**
* set the count of messages whose tokens are stored that are associated
* with a trait
*
* @param aTraitId identifier for the trait
* @param aCount number of messages for that trait
*/
void setMessageCount(uint32_t aTraitId, uint32_t aCount);
/**
* get the count of messages associated with a particular token and trait
*
* @param token the token string and associated counts
* @param aTraitId identifier for the trait
*
* @return the count for that token and trait
*/
uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId);
/**
* Add (or remove) data from a particular file to the corpus data.
*
* @param aFile the file with the data, in the format:
*
* Format of the trait file for version 1:
* [0xFCA93601] (the 01 is the version)
* for each trait to write:
* [id of trait to write] (0 means end of list)
* [number of messages per trait]
* for each token with non-zero count
* [count]
* [length of word]word
*
* @param aIsAdd should the data be added, or removed? true if adding,
* else removing.
*
* @param aFromTraits array of trait ids used in aFile. If aFile contains
* trait ids that are not in this array, they are not
* remapped, but assumed to be local trait ids.
*
* @param aToTraits array of trait ids, corresponding to elements of
* aFromTraits, that represent the local trait ids to be
* used in storing data from aFile into the local corpus.
*
*/
nsresult UpdateData(nsIFile* aFile, bool aIsAdd,
const nsTArray<uint32_t>& aFromTraits,
const nsTArray<uint32_t>& aToTraits);
/**
* remove all counts (message and tokens) for a trait id
*
* @param aTrait trait id for the trait to remove
*/
nsresult ClearTrait(uint32_t aTrait);
protected:
/**
* return the local corpus storage file for junk traits
*
* @param aFile out-parameter receiving the file
*/
nsresult getTrainingFile(nsIFile** aFile);
/**
* return the local corpus storage file for non-junk traits
*
* @param aFile out-parameter receiving the file
*/
nsresult getTraitFile(nsIFile** aFile);
/**
* read token strings from the data file
*
* @param stream file stream with token data
* @param fileSize file size
* @param aTraitId id for the trait whose counts will be read
* @param aIsAdd true to add the counts, false to remove them
*
* @return true if successful, false if error
*/
bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
bool aIsAdd);
/**
* write token strings to the data file
*
* @param stream file stream to write to
* @param shrink NOTE(review): meaning not documented in this header;
* presumably related to the pruning described for
* writeTrainingData() -- confirm in the implementation
* @param aTraitId id for the trait whose counts will be written
*
* @return NOTE(review): presumably true on success, mirroring
* readTokens() -- confirm in the implementation
*/
bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);
/**
* remove counts for a token string
*/
void remove(const char* word, uint32_t aTraitId, uint32_t aCount);
/**
* add counts for a token string, adding the token string if new
*/
CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);
/**
* change counts in a trait in the traits array, adding the trait if needed
*
* aCountChange is signed, so the same entry point serves both increments
* and decrements.
*/
nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
int32_t aCountChange);
nsCOMPtr<nsIFile> mTrainingFile; // file used to store junk training data
nsCOMPtr<nsIFile> mTraitFile; // file used to store non-junk
// training data
nsTArray<TraitPerToken> mTraitStore; // memory for linked-list of counts
uint32_t mNextTraitIndex; // index in mTraitStore to first empty
// TraitPerToken
nsTArray<uint32_t> mMessageCounts; // count of messages per trait
// represented in the store
nsTArray<uint32_t> mMessageCountsId; // Parallel array to mMessageCounts,
// with the corresponding trait ID
};
// Filter component that implements nsIJunkMailPlugin, nsIMsgCorpus and
// nsIObserver on top of a CorpusStore (mCorpus).
//
// NOTE(review): only nsIJunkMailPlugin carries an explicit `public`. The
// remaining bases have no access specifier, and for a `class` that makes
// them private bases -- confirm this is intentional.
class nsBayesianFilter : public nsIJunkMailPlugin,
nsIMsgCorpus,
nsIObserver,
nsSupportsWeakReference {
public:
NS_DECL_ISUPPORTS
NS_DECL_NSIMSGFILTERPLUGIN
NS_DECL_NSIJUNKMAILPLUGIN
NS_DECL_NSIMSGCORPUS
NS_DECL_NSIOBSERVER
nsBayesianFilter();
// Separate initialization step that reports failure through nsresult.
nsresult Init();
// Tokenizes the message identified by |messageURI| on behalf of |analyzer|.
// NOTE(review): whether the work completes synchronously is not visible
// from this header.
nsresult tokenizeMessage(const nsACString& messageURI,
nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer);
// Classifies an already tokenized message and reports to |listener|. This
// overload takes no explicit trait lists (presumably the junk/good pair --
// confirm in the implementation).
void classifyMessage(Tokenizer& tokens, const nsACString& messageURI,
nsIJunkMailClassificationListener* listener);
// Classifies an already tokenized message against the given pro/anti trait
// lists, with separate listeners for junk, trait and detail results.
// NOTE(review): aProTraits and aAntiTraits are presumably parallel arrays
// forming the trait/antitrait pairs -- confirm against the caller.
void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI,
nsTArray<uint32_t>& aProTraits,
nsTArray<uint32_t>& aAntiTraits,
nsIJunkMailClassificationListener* listener,
nsIMsgTraitClassificationListener* aTraitListener,
nsIMsgTraitDetailListener* aDetailListener);
// Handles a change of a message's classifications from
// |oldClassifications| to |newClassifications| (presumably by updating
// mCorpus -- confirm in the implementation).
void observeMessage(Tokenizer& tokens, const nsACString& messageURI,
nsTArray<uint32_t>& oldClassifications,
nsTArray<uint32_t>& newClassifications,
nsIJunkMailClassificationListener* listener,
nsIMsgTraitClassificationListener* aTraitListener);
protected:
// Protected, so instances cannot be deleted directly by outside code.
virtual ~nsBayesianFilter();
// Static timer callback in the (nsITimer*, void*) form. |aClosure| is
// presumably the nsBayesianFilter instance -- confirm in the
// implementation.
static void TimerCallback(nsITimer* aTimer, void* aClosure);
CorpusStore mCorpus;
double mJunkProbabilityThreshold;
// NOTE(review): signed here, while CorpusStore::writeTrainingData() takes
// a uint32_t maximum token count -- confirm negative values cannot reach
// it.
int32_t mMaximumTokenCount;
bool mTrainingDataDirty;
int32_t mMinFlushInterval; // in milliseconds, must be positive
// and not too close to 0
nsCOMPtr<nsITimer> mTimer;
// index in mAnalysisStore for first empty AnalysisPerToken
uint32_t mNextAnalysisIndex;
// memory for linked list of AnalysisPerToken objects
nsTArray<AnalysisPerToken> mAnalysisStore;
/**
* Determine the location in mAnalysisStore where the AnalysisPerToken
* object for a particular token and trait is stored
*
* @param token the token whose analysis is looked up
* @param aTraitIndex index of the protrait/antitrait pair
*
* @return index into mAnalysisStore (NOTE(review): presumably 0 when no
* analysis exists, matching the "0 for none" link convention --
* confirm in the implementation)
*/
uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
/**
* Set the value of the AnalysisPerToken object for a particular
* token and trait
*
* @param token the token whose analysis is set
* @param aTraitIndex index of the protrait/antitrait pair
* @param aDistance value for AnalysisPerToken::mDistance
* @param aProbability value for AnalysisPerToken::mProbability
*/
nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
double aProbability);
};
#endif // COMM_MAILNEWS_EXTENSIONS_BAYESIAN_SPAM_FILTER_NSBAYESIANFILTER_H_
|