summaryrefslogtreecommitdiffstats
path: root/mailnews/extensions/bayesian-spam-filter
diff options
context:
space:
mode:
authorMatt A. Tobin <email@mattatobin.com>2019-11-03 00:17:46 -0400
committerMatt A. Tobin <email@mattatobin.com>2019-11-03 00:17:46 -0400
commit302bf1b523012e11b60425d6eee1221ebc2724eb (patch)
treeb191a895f8716efcbe42f454f37597a545a6f421 /mailnews/extensions/bayesian-spam-filter
parent21b3f6247403c06f85e1f45d219f87549862198f (diff)
downloadUXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.gz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.lz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.tar.xz
UXP-302bf1b523012e11b60425d6eee1221ebc2724eb.zip
Issue #1258 - Part 1: Import mailnews, ldap, and mork from comm-esr52.9.1
Diffstat (limited to 'mailnews/extensions/bayesian-spam-filter')
-rw-r--r--mailnews/extensions/bayesian-spam-filter/moz.build6
-rw-r--r--mailnews/extensions/bayesian-spam-filter/src/moz.build11
-rw-r--r--mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp2758
-rw-r--r--mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h404
-rw-r--r--mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilterCID.h22
-rw-r--r--mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h259
6 files changed, 3460 insertions, 0 deletions
diff --git a/mailnews/extensions/bayesian-spam-filter/moz.build b/mailnews/extensions/bayesian-spam-filter/moz.build
new file mode 100644
index 000000000..8755860cb
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/moz.build
@@ -0,0 +1,6 @@
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+DIRS += ['src']
diff --git a/mailnews/extensions/bayesian-spam-filter/src/moz.build b/mailnews/extensions/bayesian-spam-filter/src/moz.build
new file mode 100644
index 000000000..5819766a3
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/src/moz.build
@@ -0,0 +1,11 @@
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+SOURCES += [
+ 'nsBayesianFilter.cpp',
+]
+
+FINAL_LIBRARY = 'mail'
+
diff --git a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
new file mode 100644
index 000000000..0fa5aa1e2
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.cpp
@@ -0,0 +1,2758 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsBayesianFilter.h"
+#include "nsIInputStream.h"
+#include "nsIStreamListener.h"
+#include "nsNetUtil.h"
+#include "nsQuickSort.h"
+#include "nsIMsgMessageService.h"
+#include "nsMsgUtils.h" // for GetMessageServiceFromURI
+#include "prnetdb.h"
+#include "nsIMsgWindow.h"
+#include "mozilla/Logging.h"
+#include "nsAppDirectoryServiceDefs.h"
+#include "nsUnicharUtils.h"
+#include "nsDirectoryServiceUtils.h"
+#include "nsIMIMEHeaderParam.h"
+#include "nsNetCID.h"
+#include "nsIMimeHeaders.h"
+#include "nsMsgMimeCID.h"
+#include "nsIMsgMailNewsUrl.h"
+#include "nsIMimeMiscStatus.h"
+#include "nsIPrefService.h"
+#include "nsIPrefBranch.h"
+#include "nsIStringEnumerator.h"
+#include "nsIObserverService.h"
+#include "nsIChannel.h"
+
+using namespace mozilla;
+
+// needed to mark attachment flag on the db hdr
+#include "nsIMsgHdr.h"
+
+// needed to strip html out of the body
+#include "nsIContentSerializer.h"
+#include "nsLayoutCID.h"
+#include "nsIParserUtils.h"
+#include "nsIDocumentEncoder.h"
+
+#include "nsIncompleteGamma.h"
+#include <math.h>
+#include <prmem.h>
+#include "nsIMsgTraitService.h"
+#include "mozilla/Services.h"
+#include "mozilla/Attributes.h"
+#include <cstdlib> // for std::abs(int/long)
+#include <cmath> // for std::abs(float/double)
+
+static PRLogModuleInfo *BayesianFilterLogModule = nullptr;
+
+#define kDefaultJunkThreshold .99 // we override this value via a pref
+static const char* kBayesianFilterTokenDelimiters = " \t\n\r\f.";
+static unsigned int kMinLengthForToken = 3; // lower bound on the number of characters in a word before we treat it as a token
+static unsigned int kMaxLengthForToken = 12; // upper bound on the number of characters in a word to be declared as a token
+
+#define FORGED_RECEIVED_HEADER_HINT NS_LITERAL_CSTRING("may be forged")
+
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942
+#endif
+
+#ifndef M_E
+#define M_E 2.7182818284590452354
+#endif
+
+// provide base implementation of hash lookup of a string
+struct BaseToken : public PLDHashEntryHdr
+{
+ const char* mWord;
+};
+
+// token for a particular message
+// mCount, mAnalysisLink are initialized to zero by the hash code
+struct Token : public BaseToken {
+ uint32_t mCount;
+ uint32_t mAnalysisLink; // index in mAnalysisStore of the AnalysisPerToken
+ // object for the first trait for this token
+};
+
+// token stored in a training file for a group of messages
+// mTraitLink is initialized to 0 by the hash code
+struct CorpusToken : public BaseToken
+{
+ uint32_t mTraitLink; // index in mTraitStore of the TraitPerToken
+ // object for the first trait for this token
+};
+
+// set the value of a TraitPerToken object
+TraitPerToken::TraitPerToken(uint32_t aTraitId, uint32_t aCount)
+ : mId(aTraitId), mCount(aCount), mNextLink(0)
+{
+}
+
+// shorthand representations of trait ids for junk and good
+static const uint32_t kJunkTrait = nsIJunkMailPlugin::JUNK_TRAIT;
+static const uint32_t kGoodTrait = nsIJunkMailPlugin::GOOD_TRAIT;
+
+// set the value of an AnalysisPerToken object
+AnalysisPerToken::AnalysisPerToken(
+ uint32_t aTraitIndex, double aDistance, double aProbability) :
+ mTraitIndex(aTraitIndex),
+ mDistance(aDistance),
+ mProbability(aProbability),
+ mNextLink(0)
+{
+}
+
+// the initial size of the AnalysisPerToken linked list storage
+const uint32_t kAnalysisStoreCapacity = 2048;
+
+// the initial size of the TraitPerToken linked list storage
+const uint32_t kTraitStoreCapacity = 16384;
+
+// Size of Auto arrays representing per trait information
+const uint32_t kTraitAutoCapacity = 10;
+
+TokenEnumeration::TokenEnumeration(PLDHashTable* table)
+ : mIterator(table->Iter())
+{
+}
+
+inline bool TokenEnumeration::hasMoreTokens()
+{
+ return !mIterator.Done();
+}
+
+inline BaseToken* TokenEnumeration::nextToken()
+{
+ auto token = static_cast<BaseToken*>(mIterator.Get());
+ mIterator.Next();
+ return token;
+}
+
+// member variables
+static const PLDHashTableOps gTokenTableOps = {
+ PLDHashTable::HashStringKey,
+ PLDHashTable::MatchStringKey,
+ PLDHashTable::MoveEntryStub,
+ PLDHashTable::ClearEntryStub,
+ nullptr
+};
+
+TokenHash::TokenHash(uint32_t aEntrySize)
+ : mTokenTable(&gTokenTableOps, aEntrySize, 128)
+{
+ mEntrySize = aEntrySize;
+ PL_INIT_ARENA_POOL(&mWordPool, "Words Arena", 16384);
+}
+
+TokenHash::~TokenHash()
+{
+ PL_FinishArenaPool(&mWordPool);
+}
+
+nsresult TokenHash::clearTokens()
+{
+ // we re-use the tokenizer when classifying multiple messages,
+ // so this gets called after every message classification.
+ mTokenTable.ClearAndPrepareForLength(128);
+ PL_FreeArenaPool(&mWordPool);
+ return NS_OK;
+}
+
+char* TokenHash::copyWord(const char* word, uint32_t len)
+{
+ void* result;
+ uint32_t size = 1 + len;
+ PL_ARENA_ALLOCATE(result, &mWordPool, size);
+ if (result)
+ memcpy(result, word, size);
+ return reinterpret_cast<char*>(result);
+}
+
+inline BaseToken* TokenHash::get(const char* word)
+{
+ PLDHashEntryHdr* entry = mTokenTable.Search(word);
+ if (entry)
+ return static_cast<BaseToken*>(entry);
+ return NULL;
+}
+
+BaseToken* TokenHash::add(const char* word)
+{
+ if (!word || !*word)
+ {
+ NS_ERROR("Trying to add a null word");
+ return nullptr;
+ }
+
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("add word: %s", word));
+
+ PLDHashEntryHdr* entry = mTokenTable.Add(word, mozilla::fallible);
+ BaseToken* token = static_cast<BaseToken*>(entry);
+ if (token) {
+ if (token->mWord == NULL) {
+ uint32_t len = strlen(word);
+ NS_ASSERTION(len != 0, "adding zero length word to tokenizer");
+ if (!len)
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("adding zero length word to tokenizer"));
+ token->mWord = copyWord(word, len);
+ NS_ASSERTION(token->mWord, "copyWord failed");
+ if (!token->mWord) {
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("copyWord failed: %s (%d)", word, len));
+ mTokenTable.RawRemove(entry);
+ return NULL;
+ }
+ }
+ }
+ return token;
+}
+
+inline uint32_t TokenHash::countTokens()
+{
+ return mTokenTable.EntryCount();
+}
+
+inline TokenEnumeration TokenHash::getTokens()
+{
+ return TokenEnumeration(&mTokenTable);
+}
+
+Tokenizer::Tokenizer() :
+ TokenHash(sizeof(Token)),
+ mBodyDelimiters(kBayesianFilterTokenDelimiters),
+ mHeaderDelimiters(kBayesianFilterTokenDelimiters),
+ mCustomHeaderTokenization(false),
+ mMaxLengthForToken(kMaxLengthForToken),
+ mIframeToDiv(false)
+{
+ nsresult rv;
+ nsCOMPtr<nsIPrefService> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
+ NS_ENSURE_SUCCESS_VOID(rv);
+
+ nsCOMPtr<nsIPrefBranch> prefBranch;
+ rv = prefs->GetBranch("mailnews.bayesian_spam_filter.", getter_AddRefs(prefBranch));
+ NS_ENSURE_SUCCESS_VOID(rv); // no branch defined, just use defaults
+
+ /*
+ * RSS feeds store their summary as alternate content of an iframe. But due
+ * to bug 365953, this is not seen by the serializer. As a workaround, allow
+ * the tokenizer to replace the iframe with div for tokenization.
+ */
+ rv = prefBranch->GetBoolPref("iframe_to_div", &mIframeToDiv);
+ if (NS_FAILED(rv))
+ mIframeToDiv = false;
+
+ /*
+ * the list of delimiters used to tokenize the message and body
+ * defaults to the value in kBayesianFilterTokenDelimiters, but may be
+ * set with the following preferences for the body and header
+ * separately.
+ *
+ * \t, \n, \v, \f, \r, and \\ will be escaped to their normal
+ * C-library values, all other two-letter combinations beginning with \
+ * will be ignored.
+ */
+
+ prefBranch->GetCharPref("body_delimiters", getter_Copies(mBodyDelimiters));
+ if (!mBodyDelimiters.IsEmpty())
+ UnescapeCString(mBodyDelimiters);
+ else // prefBranch empties the result when it fails :(
+ mBodyDelimiters.Assign(kBayesianFilterTokenDelimiters);
+
+ prefBranch->GetCharPref("header_delimiters", getter_Copies(mHeaderDelimiters));
+ if (!mHeaderDelimiters.IsEmpty())
+ UnescapeCString(mHeaderDelimiters);
+ else
+ mHeaderDelimiters.Assign(kBayesianFilterTokenDelimiters);
+
+ /*
+ * Extensions may wish to enable or disable tokenization of certain headers.
+ * Define any headers to enable/disable in a string preference like this:
+ * "mailnews.bayesian_spam_filter.tokenizeheader.headername"
+ *
+ * where "headername" is the header to tokenize. For example, to tokenize the
+ * header "x-spam-status" use the preference:
+ *
+ * "mailnews.bayesian_spam_filter.tokenizeheader.x-spam-status"
+ *
+ * The value of the string preference will be interpreted in one of
+ * four ways, depending on the value:
+ *
+ * If "false" then do not tokenize that header
+ * If "full" then add the entire header value as a token,
+ * without breaking up into subtokens using delimiters
+ * If "standard" then tokenize the header using as delimiters the current
+ * value of the generic header delimiters
+ * Any other string is interpreted as a list of delimiters to use to parse
+ * the header. \t, \n, \v, \f, \r, and \\ will be escaped to their normal
+ * C-library values, all other two-letter combinations beginning with \
+ * will be ignored.
+ *
+ * Header names in the preference should be all lower case
+ *
+ * Extensions may also set the maximum length of a token (default is
+ * kMaxLengthForToken) by setting the int preference:
+ * "mailnews.bayesian_spam_filter.maxlengthfortoken"
+ */
+
+ char** headers;
+ uint32_t count;
+
+ // get customized maximum token length
+ int32_t maxLengthForToken;
+ rv = prefBranch->GetIntPref("maxlengthfortoken", &maxLengthForToken);
+ mMaxLengthForToken = NS_SUCCEEDED(rv) ? uint32_t(maxLengthForToken) : kMaxLengthForToken;
+
+ rv = prefs->GetBranch("mailnews.bayesian_spam_filter.tokenizeheader.", getter_AddRefs(prefBranch));
+ if (NS_SUCCEEDED(rv))
+ rv = prefBranch->GetChildList("", &count, &headers);
+
+ if (NS_SUCCEEDED(rv))
+ {
+ mCustomHeaderTokenization = true;
+ for (uint32_t i = 0; i < count; i++)
+ {
+ nsCString value;
+ prefBranch->GetCharPref(headers[i], getter_Copies(value));
+ if (value.EqualsLiteral("false"))
+ {
+ mDisabledHeaders.AppendElement(headers[i]);
+ continue;
+ }
+ mEnabledHeaders.AppendElement(headers[i]);
+ if (value.EqualsLiteral("standard"))
+ value.SetIsVoid(true); // Void means use default delimiter
+ else if (value.EqualsLiteral("full"))
+ value.Truncate(); // Empty means add full header
+ else
+ UnescapeCString(value);
+ mEnabledHeadersDelimiters.AppendElement(value);
+ }
+ NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(count, headers);
+ }
+}
+
+Tokenizer::~Tokenizer()
+{
+}
+
+inline Token* Tokenizer::get(const char* word)
+{
+ return static_cast<Token*>(TokenHash::get(word));
+}
+
+Token* Tokenizer::add(const char* word, uint32_t count)
+{
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("add word: %s (count=%d)",
+ word, count));
+
+ Token* token = static_cast<Token*>(TokenHash::add(word));
+ if (token)
+ {
+ token->mCount += count; // hash code initializes this to zero
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+ ("adding word to tokenizer: %s (count=%d) (mCount=%d)",
+ word, count, token->mCount));
+ }
+ return token;
+}
+
+static bool isDecimalNumber(const char* word)
+{
+ const char* p = word;
+ if (*p == '-') ++p;
+ char c;
+ while ((c = *p++)) {
+ if (!isdigit((unsigned char) c))
+ return false;
+ }
+ return true;
+}
+
+static bool isASCII(const char* word)
+{
+ const unsigned char* p = (const unsigned char*)word;
+ unsigned char c;
+ while ((c = *p++)) {
+ if (c > 127)
+ return false;
+ }
+ return true;
+}
+
+inline bool isUpperCase(char c) { return ('A' <= c) && (c <= 'Z'); }
+
+static char* toLowerCase(char* str)
+{
+ char c, *p = str;
+ while ((c = *p++)) {
+ if (isUpperCase(c))
+ p[-1] = c + ('a' - 'A');
+ }
+ return str;
+}
+
+void Tokenizer::addTokenForHeader(const char * aTokenPrefix, nsACString& aValue,
+ bool aTokenizeValue, const char* aDelimiters)
+{
+ if (aValue.Length())
+ {
+ ToLowerCase(aValue);
+ if (!aTokenizeValue)
+ {
+ nsCString tmpStr;
+ tmpStr.Assign(aTokenPrefix);
+ tmpStr.Append(':');
+ tmpStr.Append(aValue);
+
+ add(tmpStr.get());
+ }
+ else
+ {
+ char* word;
+ nsCString str(aValue);
+ char *next = str.BeginWriting();
+ const char* delimiters = !aDelimiters ?
+ mHeaderDelimiters.get() : aDelimiters;
+ while ((word = NS_strtok(delimiters, &next)) != NULL)
+ {
+ if (strlen(word) < kMinLengthForToken)
+ continue;
+ if (isDecimalNumber(word))
+ continue;
+ if (isASCII(word))
+ {
+ nsCString tmpStr;
+ tmpStr.Assign(aTokenPrefix);
+ tmpStr.Append(':');
+ tmpStr.Append(word);
+ add(tmpStr.get());
+ }
+ }
+ }
+ }
+}
+
+void Tokenizer::tokenizeAttachment(const char * aContentType, const char * aFileName)
+{
+ nsAutoCString contentType;
+ nsAutoCString fileName;
+ fileName.Assign(aFileName);
+ contentType.Assign(aContentType);
+
+ // normalize the content type and the file name
+ ToLowerCase(fileName);
+ ToLowerCase(contentType);
+ addTokenForHeader("attachment/filename", fileName);
+
+ addTokenForHeader("attachment/content-type", contentType);
+}
+
+void Tokenizer::tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues)
+{
+ nsCString headerValue;
+ nsAutoCString headerName; // we'll be normalizing all header names to lower case
+ bool hasMore;
+
+ while (aHeaderNames->HasMore(&hasMore), hasMore)
+ {
+ aHeaderNames->GetNext(headerName);
+ ToLowerCase(headerName);
+ aHeaderValues->GetNext(headerValue);
+
+ bool headerProcessed = false;
+ if (mCustomHeaderTokenization)
+ {
+ // Process any exceptions set from preferences
+ for (uint32_t i = 0; i < mEnabledHeaders.Length(); i++)
+ if (headerName.Equals(mEnabledHeaders[i]))
+ {
+ if (mEnabledHeadersDelimiters[i].IsVoid())
+ // tokenize with standard delimiters for all headers
+ addTokenForHeader(headerName.get(), headerValue, true);
+ else if (mEnabledHeadersDelimiters[i].IsEmpty())
+ // do not break the header into tokens
+ addTokenForHeader(headerName.get(), headerValue);
+ else
+ // use the delimiter in mEnabledHeadersDelimiters
+ addTokenForHeader(headerName.get(), headerValue, true,
+ mEnabledHeadersDelimiters[i].get());
+ headerProcessed = true;
+ break; // we found the header, no need to look for more custom values
+ }
+
+ for (uint32_t i = 0; i < mDisabledHeaders.Length(); i++)
+ {
+ if (headerName.Equals(mDisabledHeaders[i]))
+ {
+ headerProcessed = true;
+ break;
+ }
+ }
+
+ if (headerProcessed)
+ continue;
+ }
+
+ switch (headerName.First())
+ {
+ case 'c':
+ if (headerName.Equals("content-type"))
+ {
+ nsresult rv;
+ nsCOMPtr<nsIMIMEHeaderParam> mimehdrpar = do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
+ if (NS_FAILED(rv))
+ break;
+
+ // extract the charset parameter
+ nsCString parameterValue;
+ mimehdrpar->GetParameterInternal(headerValue.get(), "charset", nullptr, nullptr, getter_Copies(parameterValue));
+ addTokenForHeader("charset", parameterValue);
+
+ // create a token containing just the content type
+ mimehdrpar->GetParameterInternal(headerValue.get(), "type", nullptr, nullptr, getter_Copies(parameterValue));
+ if (!parameterValue.Length())
+ mimehdrpar->GetParameterInternal(headerValue.get(), nullptr /* use first unnamed param */, nullptr, nullptr, getter_Copies(parameterValue));
+ addTokenForHeader("content-type/type", parameterValue);
+
+ // XXX: should we add a token for the entire content-type header as well or just these parts we have extracted?
+ }
+ break;
+ case 'r':
+ if (headerName.Equals("received"))
+ {
+ // look for the string "may be forged" in the received headers. sendmail sometimes adds this hint
+ // This does not compile on linux yet. Need to figure out why. Commenting out for now
+ // if (FindInReadable(FORGED_RECEIVED_HEADER_HINT, headerValue))
+ // addTokenForHeader(headerName.get(), FORGED_RECEIVED_HEADER_HINT);
+ }
+
+ // leave out reply-to
+ break;
+ case 's':
+ if (headerName.Equals("subject"))
+ {
+ // we want to tokenize the subject
+ addTokenForHeader(headerName.get(), headerValue, true);
+ }
+
+ // important: leave out sender field. Too strong of an indicator
+ break;
+ case 'x': // (2) X-Mailer / user-agent works best if it is untokenized, just fold the case and any leading/trailing white space
+ // all headers beginning with x-mozilla are being changed by us, so ignore
+ if (Substring(headerName, 0, 9).Equals("x-mozilla"))
+ break;
+ // fall through
+ MOZ_FALLTHROUGH;
+ case 'u':
+ addTokenForHeader(headerName.get(), headerValue);
+ break;
+ default:
+ addTokenForHeader(headerName.get(), headerValue);
+ break;
+ } // end switch
+
+ }
+}
+
+void Tokenizer::tokenize_ascii_word(char * aWord)
+{
+ // always deal with normalized lower case strings
+ toLowerCase(aWord);
+ uint32_t wordLength = strlen(aWord);
+
+ // if the wordLength is within our accepted token limit, then add it
+ if (wordLength >= kMinLengthForToken && wordLength <= mMaxLengthForToken)
+ add(aWord);
+ else if (wordLength > mMaxLengthForToken)
+ {
+ // don't skip over the word if it looks like an email address,
+ // there is value in adding tokens for addresses
+ nsDependentCString word (aWord, wordLength); // CHEAP, no allocation occurs here...
+
+ // XXX: i think the 40 byte check is just for perf reasons...if the email address is longer than that then forget about it.
+ const char *atSign = strchr(aWord, '@');
+ if (wordLength < 40 && strchr(aWord, '.') && atSign && !strchr(atSign + 1, '@'))
+ {
+ uint32_t numBytesToSep = atSign - aWord;
+ if (numBytesToSep < wordLength - 1) // if the @ sign is the last character, it must not be an email address
+ {
+ // split the john@foo.com into john and foo.com, treat them as separate tokens
+ nsCString emailNameToken;
+ emailNameToken.AssignLiteral("email name:");
+ emailNameToken.Append(Substring(word, 0, numBytesToSep++));
+ add(emailNameToken.get());
+ nsCString emailAddrToken;
+ emailAddrToken.AssignLiteral("email addr:");
+ emailAddrToken.Append(Substring(word, numBytesToSep, wordLength - numBytesToSep));
+ add(emailAddrToken.get());
+ return;
+ }
+ }
+
+ // there is value in generating a token indicating the number
+ // of characters we are skipping. We'll round to the nearest 10
+ nsCString skipToken;
+ skipToken.AssignLiteral("skip:");
+ skipToken.Append(word[0]);
+ skipToken.Append(' ');
+ skipToken.AppendInt((wordLength/10) * 10);
+ add(skipToken.get());
+ }
+}
+
+// one substract and one conditional jump should be faster than two conditional jump on most recent system.
+#define IN_RANGE(x, low, high) ((uint16_t)((x)-(low)) <= (high)-(low))
+
+#define IS_JA_HIRAGANA(x) IN_RANGE(x, 0x3040, 0x309F)
+// swapping the range using xor operation to reduce conditional jump.
+#define IS_JA_KATAKANA(x) (IN_RANGE(x^0x0004, 0x30A0, 0x30FE)||(IN_RANGE(x, 0xFF66, 0xFF9F)))
+#define IS_JA_KANJI(x) (IN_RANGE(x, 0x2E80, 0x2FDF)||IN_RANGE(x, 0x4E00, 0x9FAF))
+#define IS_JA_KUTEN(x) (((x)==0x3001)||((x)==0xFF64)||((x)==0xFF0E))
+#define IS_JA_TOUTEN(x) (((x)==0x3002)||((x)==0xFF61)||((x)==0xFF0C))
+#define IS_JA_SPACE(x) ((x)==0x3000)
+#define IS_JA_FWLATAIN(x) IN_RANGE(x, 0xFF01, 0xFF5E)
+#define IS_JA_FWNUMERAL(x) IN_RANGE(x, 0xFF10, 0xFF19)
+
+#define IS_JAPANESE_SPECIFIC(x) (IN_RANGE(x, 0x3040, 0x30FF)||IN_RANGE(x, 0xFF01, 0xFF9F))
+
+enum char_class{
+ others = 0,
+ space,
+ hiragana,
+ katakana,
+ kanji,
+ kuten,
+ touten,
+ kigou,
+ fwlatain,
+ ascii
+};
+
+static char_class getCharClass(char16_t c)
+{
+ char_class charClass = others;
+
+ if(IS_JA_HIRAGANA(c))
+ charClass = hiragana;
+ else if(IS_JA_KATAKANA(c))
+ charClass = katakana;
+ else if(IS_JA_KANJI(c))
+ charClass = kanji;
+ else if(IS_JA_KUTEN(c))
+ charClass = kuten;
+ else if(IS_JA_TOUTEN(c))
+ charClass = touten;
+ else if(IS_JA_FWLATAIN(c))
+ charClass = fwlatain;
+
+ return charClass;
+}
+
+static bool isJapanese(const char* word)
+{
+ nsString text = NS_ConvertUTF8toUTF16(word);
+ char16_t* p = (char16_t*)text.get();
+ char16_t c;
+
+ // it is japanese chunk if it contains any hiragana or katakana.
+ while((c = *p++))
+ if( IS_JAPANESE_SPECIFIC(c))
+ return true;
+
+ return false;
+}
+
+static bool isFWNumeral(const char16_t* p1, const char16_t* p2)
+{
+ for(;p1<p2;p1++)
+ if(!IS_JA_FWNUMERAL(*p1))
+ return false;
+
+ return true;
+}
+
+// The japanese tokenizer was added as part of Bug #277354
+void Tokenizer::tokenize_japanese_word(char* chunk)
+{
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("entering tokenize_japanese_word(%s)", chunk));
+
+ nsString srcStr = NS_ConvertUTF8toUTF16(chunk);
+ const char16_t* p1 = srcStr.get();
+ const char16_t* p2 = p1;
+ if(!*p2) return;
+
+ char_class cc = getCharClass(*p2);
+ while(*(++p2))
+ {
+ if(cc == getCharClass(*p2))
+ continue;
+
+ nsCString token = NS_ConvertUTF16toUTF8(p1, p2-p1);
+ if( (!isDecimalNumber(token.get())) && (!isFWNumeral(p1, p2)))
+ {
+ nsCString tmpStr;
+ tmpStr.AppendLiteral("JA:");
+ tmpStr.Append(token);
+ add(tmpStr.get());
+ }
+
+ cc = getCharClass(*p2);
+ p1 = p2;
+ }
+}
+
+nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
+{
+ uint32_t flags = nsIDocumentEncoder::OutputLFLineBreak
+ | nsIDocumentEncoder::OutputNoScriptContent
+ | nsIDocumentEncoder::OutputNoFramesContent
+ | nsIDocumentEncoder::OutputBodyOnly;
+ nsCOMPtr<nsIParserUtils> utils =
+ do_GetService(NS_PARSERUTILS_CONTRACTID);
+ return utils->ConvertToPlainText(inString, flags, 80, outString);
+}
+
+void Tokenizer::tokenize(const char* aText)
+{
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize: %s", aText));
+
+ // strip out HTML tags before we begin processing
+ // uggh but first we have to blow up our string into UCS2
+ // since that's what the document encoder wants. UTF8/UCS2, I wish we all
+ // spoke the same language here..
+ nsString text = NS_ConvertUTF8toUTF16(aText);
+ nsString strippedUCS2;
+
+ // RSS feeds store their summary information as an iframe. But due to
+ // bug 365953, we can't see those in the plaintext serializer. As a
+ // workaround, allow an option to replace iframe with div in the message
+ // text. We disable by default, since most people won't be applying bayes
+ // to RSS
+
+ if (mIframeToDiv)
+ {
+ MsgReplaceSubstring(text, NS_LITERAL_STRING("<iframe"),
+ NS_LITERAL_STRING("<div"));
+ MsgReplaceSubstring(text, NS_LITERAL_STRING("/iframe>"),
+ NS_LITERAL_STRING("/div>"));
+ }
+
+ stripHTML(text, strippedUCS2);
+
+ // convert 0x3000(full width space) into 0x0020
+ char16_t * substr_start = strippedUCS2.BeginWriting();
+ char16_t * substr_end = strippedUCS2.EndWriting();
+ while (substr_start != substr_end) {
+ if (*substr_start == 0x3000)
+ *substr_start = 0x0020;
+ ++substr_start;
+ }
+
+ nsCString strippedStr = NS_ConvertUTF16toUTF8(strippedUCS2);
+ char * strippedText = strippedStr.BeginWriting();
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize stripped html: %s", strippedText));
+
+ char* word;
+ char* next = strippedText;
+ while ((word = NS_strtok(mBodyDelimiters.get(), &next)) != NULL) {
+ if (!*word) continue;
+ if (isDecimalNumber(word)) continue;
+ if (isASCII(word))
+ tokenize_ascii_word(word);
+ else if (isJapanese(word))
+ tokenize_japanese_word(word);
+ else {
+ nsresult rv;
+ // use I18N scanner to break this word into meaningful semantic units.
+ if (!mScanner) {
+ mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
+ NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
+ if (NS_FAILED(rv)) {
+ return;
+ }
+ }
+ if (mScanner) {
+ mScanner->Start("UTF-8");
+ // convert this word from UTF-8 into UCS2.
+ NS_ConvertUTF8toUTF16 uword(word);
+ ToLowerCase(uword);
+ const char16_t* utext = uword.get();
+ int32_t len = uword.Length(), pos = 0, begin, end;
+ bool gotUnit;
+ while (pos < len) {
+ rv = mScanner->Next(utext, len, pos, true, &begin, &end, &gotUnit);
+ if (NS_SUCCEEDED(rv) && gotUnit) {
+ NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
+ add(utfUnit.get());
+ // advance to end of current unit.
+ pos = end;
+ } else {
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+// helper function to escape \n, \t, etc from a CString
+void Tokenizer::UnescapeCString(nsCString& aCString)
+{
+ nsAutoCString result;
+
+ const char* readEnd = aCString.EndReading();
+ char* writeStart = result.BeginWriting();
+ char* writeIter = writeStart;
+
+ bool inEscape = false;
+ for (const char* readIter = aCString.BeginReading(); readIter != readEnd; readIter++)
+ {
+ if (!inEscape)
+ {
+ if (*readIter == '\\')
+ inEscape = true;
+ else
+ *(writeIter++) = *readIter;
+ }
+ else
+ {
+ inEscape = false;
+ switch (*readIter)
+ {
+ case '\\':
+ *(writeIter++) = '\\';
+ break;
+ case 't':
+ *(writeIter++) = '\t';
+ break;
+ case 'n':
+ *(writeIter++) = '\n';
+ break;
+ case 'v':
+ *(writeIter++) = '\v';
+ break;
+ case 'f':
+ *(writeIter++) = '\f';
+ break;
+ case 'r':
+ *(writeIter++) = '\r';
+ break;
+ default:
+ // all other escapes are ignored
+ break;
+ }
+ }
+ }
+ result.SetLength(writeIter - writeStart);
+ aCString.Assign(result);
+}
+
+Token* Tokenizer::copyTokens()
+{
+ uint32_t count = countTokens();
+ if (count > 0) {
+ Token* tokens = new Token[count];
+ if (tokens) {
+ Token* tp = tokens;
+ TokenEnumeration e(&mTokenTable);
+ while (e.hasMoreTokens())
+ *tp++ = *(static_cast<Token*>(e.nextToken()));
+ }
+ return tokens;
+ }
+ return NULL;
+}
+
+class TokenAnalyzer {
+public:
+ virtual ~TokenAnalyzer() {}
+
+ virtual void analyzeTokens(Tokenizer& tokenizer) = 0;
+ void setTokenListener(nsIStreamListener *aTokenListener)
+ {
+ mTokenListener = aTokenListener;
+ }
+
+ void setSource(const char *sourceURI) {mTokenSource = sourceURI;}
+
+ nsCOMPtr<nsIStreamListener> mTokenListener;
+ nsCString mTokenSource;
+
+};
+
+/**
+ * This class downloads the raw content of an email message, buffering until
+ * complete segments are seen, that is until a linefeed is seen, although
+ * any of the valid token separators would do. This could be a further
+ * refinement.
+ */
+class TokenStreamListener : public nsIStreamListener, nsIMsgHeaderSink {
+public:
+ NS_DECL_ISUPPORTS
+ NS_DECL_NSIREQUESTOBSERVER
+ NS_DECL_NSISTREAMLISTENER
+ NS_DECL_NSIMSGHEADERSINK
+
+ TokenStreamListener(TokenAnalyzer* analyzer);
+protected:
+ virtual ~TokenStreamListener();
+ TokenAnalyzer* mAnalyzer;
+ char* mBuffer;
+ uint32_t mBufferSize;
+ uint32_t mLeftOverCount;
+ Tokenizer mTokenizer;
+ bool mSetAttachmentFlag;
+};
+
+const uint32_t kBufferSize = 16384;
+
+TokenStreamListener::TokenStreamListener(TokenAnalyzer* analyzer)
+ : mAnalyzer(analyzer),
+ mBuffer(NULL), mBufferSize(kBufferSize), mLeftOverCount(0),
+ mSetAttachmentFlag(false)
+{
+}
+
+TokenStreamListener::~TokenStreamListener()
+{
+ delete[] mBuffer;
+ delete mAnalyzer;
+}
+
+NS_IMPL_ISUPPORTS(TokenStreamListener, nsIRequestObserver, nsIStreamListener, nsIMsgHeaderSink)
+
+NS_IMETHODIMP TokenStreamListener::ProcessHeaders(nsIUTF8StringEnumerator *aHeaderNames, nsIUTF8StringEnumerator *aHeaderValues, bool dontCollectAddress)
+{
+ mTokenizer.tokenizeHeaders(aHeaderNames, aHeaderValues);
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::HandleAttachment(const char *contentType, const char *url, const char16_t *displayName, const char *uri, bool aIsExternalAttachment)
+{
+ mTokenizer.tokenizeAttachment(contentType, NS_ConvertUTF16toUTF8(displayName).get());
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::AddAttachmentField(const char *field, const char *value)
+{
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::OnEndAllAttachments()
+{
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::OnEndMsgDownload(nsIMsgMailNewsUrl *url)
+{
+ return NS_OK;
+}
+
+
+NS_IMETHODIMP TokenStreamListener::OnMsgHasRemoteContent(nsIMsgDBHdr *aMsgHdr,
+ nsIURI *aContentURI,
+ bool aCanOverride)
+{
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::OnEndMsgHeaders(nsIMsgMailNewsUrl *url)
+{
+ return NS_OK;
+}
+
+
+NS_IMETHODIMP TokenStreamListener::GetSecurityInfo(nsISupports * *aSecurityInfo)
+{
+ return NS_OK;
+}
+NS_IMETHODIMP TokenStreamListener::SetSecurityInfo(nsISupports * aSecurityInfo)
+{
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::GetDummyMsgHeader(nsIMsgDBHdr **aMsgDBHdr)
+{
+ return NS_ERROR_NOT_IMPLEMENTED;
+}
+
+NS_IMETHODIMP TokenStreamListener::ResetProperties()
+{
+ return NS_OK;
+}
+
+NS_IMETHODIMP TokenStreamListener::GetProperties(nsIWritablePropertyBag2 * *aProperties)
+{
+ return NS_ERROR_NOT_IMPLEMENTED;
+}
+
+/* void onStartRequest (in nsIRequest aRequest, in nsISupports aContext); */
+NS_IMETHODIMP TokenStreamListener::OnStartRequest(nsIRequest *aRequest, nsISupports *aContext)
+{
+ mLeftOverCount = 0;
+ if (!mBuffer)
+ {
+ mBuffer = new char[mBufferSize];
+ NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
+ }
+
+ // get the url for the channel and set our nsIMsgHeaderSink on it so we get notified
+ // about the headers and attachments
+
+ nsCOMPtr<nsIChannel> channel (do_QueryInterface(aRequest));
+ if (channel)
+ {
+ nsCOMPtr<nsIURI> uri;
+ channel->GetURI(getter_AddRefs(uri));
+ nsCOMPtr<nsIMsgMailNewsUrl> mailUrl = do_QueryInterface(uri);
+ if (mailUrl)
+ mailUrl->SetMsgHeaderSink(static_cast<nsIMsgHeaderSink*>(this));
+ }
+
+ return NS_OK;
+}
+
+/* void onDataAvailable (in nsIRequest aRequest, in nsISupports aContext, in nsIInputStream aInputStream, in unsigned long long aOffset, in unsigned long aCount); */
+NS_IMETHODIMP TokenStreamListener::OnDataAvailable(nsIRequest *aRequest, nsISupports *aContext, nsIInputStream *aInputStream, uint64_t aOffset, uint32_t aCount)
+{
+ nsresult rv = NS_OK;
+
+ while (aCount > 0) {
+ uint32_t readCount, totalCount = (aCount + mLeftOverCount);
+ if (totalCount >= mBufferSize) {
+ readCount = mBufferSize - mLeftOverCount - 1;
+ } else {
+ readCount = aCount;
+ }
+
+ // mBuffer is supposed to be allocated in onStartRequest. But something
+ // is causing that to not happen, so as a last-ditch attempt we'll
+ // do it here.
+ if (!mBuffer)
+ {
+ mBuffer = new char[mBufferSize];
+ NS_ENSURE_TRUE(mBuffer, NS_ERROR_OUT_OF_MEMORY);
+ }
+
+ char* buffer = mBuffer;
+ rv = aInputStream->Read(buffer + mLeftOverCount, readCount, &readCount);
+ if (NS_FAILED(rv))
+ break;
+
+ if (readCount == 0) {
+ rv = NS_ERROR_UNEXPECTED;
+ NS_WARNING("failed to tokenize");
+ break;
+ }
+
+ aCount -= readCount;
+
+ /* consume the tokens up to the last legal token delimiter in the buffer. */
+ totalCount = (readCount + mLeftOverCount);
+ buffer[totalCount] = '\0';
+ char* lastDelimiter = NULL;
+ char* scan = buffer + totalCount;
+ while (scan > buffer) {
+ if (strchr(mTokenizer.mBodyDelimiters.get(), *--scan)) {
+ lastDelimiter = scan;
+ break;
+ }
+ }
+
+ if (lastDelimiter) {
+ *lastDelimiter = '\0';
+ mTokenizer.tokenize(buffer);
+
+ uint32_t consumedCount = 1 + (lastDelimiter - buffer);
+ mLeftOverCount = totalCount - consumedCount;
+ if (mLeftOverCount)
+ memmove(buffer, buffer + consumedCount, mLeftOverCount);
+ } else {
+ /* didn't find a delimiter, keep the whole buffer around. */
+ mLeftOverCount = totalCount;
+ if (totalCount >= (mBufferSize / 2)) {
+ uint32_t newBufferSize = mBufferSize * 2;
+ char* newBuffer = new char[newBufferSize];
+ NS_ENSURE_TRUE(newBuffer, NS_ERROR_OUT_OF_MEMORY);
+ memcpy(newBuffer, mBuffer, mLeftOverCount);
+ delete[] mBuffer;
+ mBuffer = newBuffer;
+ mBufferSize = newBufferSize;
+ }
+ }
+ }
+
+ return rv;
+}
+
+/* void onStopRequest (in nsIRequest aRequest, in nsISupports aContext, in nsresult aStatusCode); */
+NS_IMETHODIMP TokenStreamListener::OnStopRequest(nsIRequest *aRequest, nsISupports *aContext, nsresult aStatusCode)
+{
+ if (mLeftOverCount) {
+ /* assume final buffer is complete. */
+ mBuffer[mLeftOverCount] = '\0';
+ mTokenizer.tokenize(mBuffer);
+ }
+
+ /* finally, analyze the tokenized message. */
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("analyze the tokenized message"));
+ if (mAnalyzer)
+ mAnalyzer->analyzeTokens(mTokenizer);
+
+ return NS_OK;
+}
+
+/* Implementation file */
+
+NS_IMPL_ISUPPORTS(nsBayesianFilter, nsIMsgFilterPlugin,
+ nsIJunkMailPlugin, nsIMsgCorpus, nsISupportsWeakReference,
+ nsIObserver)
+
+nsBayesianFilter::nsBayesianFilter()
+ : mTrainingDataDirty(false)
+{
+ if (!BayesianFilterLogModule)
+ BayesianFilterLogModule = PR_NewLogModule("BayesianFilter");
+
+ int32_t junkThreshold = 0;
+ nsresult rv;
+ nsCOMPtr<nsIPrefBranch> pPrefBranch(do_GetService(NS_PREFSERVICE_CONTRACTID, &rv));
+ if (pPrefBranch)
+ pPrefBranch->GetIntPref("mail.adaptivefilters.junk_threshold", &junkThreshold);
+
+ mJunkProbabilityThreshold = (static_cast<double>(junkThreshold)) / 100.0;
+ if (mJunkProbabilityThreshold == 0 || mJunkProbabilityThreshold >= 1)
+ mJunkProbabilityThreshold = kDefaultJunkThreshold;
+
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning, ("junk probability threshold: %f", mJunkProbabilityThreshold));
+
+ mCorpus.readTrainingData();
+
+ // get parameters for training data flushing, from the prefs
+
+ nsCOMPtr<nsIPrefBranch> prefBranch;
+
+ nsCOMPtr<nsIPrefService> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID, &rv);
+ NS_ASSERTION(NS_SUCCEEDED(rv),"failed accessing preferences service");
+ rv = prefs->GetBranch(nullptr, getter_AddRefs(prefBranch));
+ NS_ASSERTION(NS_SUCCEEDED(rv),"failed getting preferences branch");
+
+ rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.flush.minimum_interval",&mMinFlushInterval);
+ // it is not a good idea to allow a minimum interval of under 1 second
+ if (NS_FAILED(rv) || (mMinFlushInterval <= 1000) )
+ mMinFlushInterval = DEFAULT_MIN_INTERVAL_BETWEEN_WRITES;
+
+ rv = prefBranch->GetIntPref("mailnews.bayesian_spam_filter.junk_maxtokens", &mMaximumTokenCount);
+ if (NS_FAILED(rv))
+ mMaximumTokenCount = 0; // which means do not limit token counts
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning, ("maximum junk tokens: %d", mMaximumTokenCount));
+
+ mTimer = do_CreateInstance(NS_TIMER_CONTRACTID, &rv);
+ NS_ASSERTION(NS_SUCCEEDED(rv), "unable to create a timer; training data will only be written on exit");
+
+ // the timer is not used on object construction, since for
+ // the time being there are no dirying messages
+
+ // give a default capacity to the memory structure used to store
+ // per-message/per-trait token data
+ mAnalysisStore.SetCapacity(kAnalysisStoreCapacity);
+
+ // dummy 0th element. Index 0 means "end of list" so we need to
+ // start from 1
+ AnalysisPerToken analysisPT(0, 0.0, 0.0);
+ mAnalysisStore.AppendElement(analysisPT);
+ mNextAnalysisIndex = 1;
+}
+
+nsresult nsBayesianFilter::Init()
+{
+ nsCOMPtr<nsIObserverService> observerService =
+ mozilla::services::GetObserverService();
+ if (observerService)
+ observerService->AddObserver(this, "profile-before-change", true);
+ return NS_OK;
+}
+
+void
+nsBayesianFilter::TimerCallback(nsITimer* aTimer, void* aClosure)
+{
+ // we will flush the training data to disk after enough time has passed
+ // since the first time a message has been classified after the last flush
+
+ nsBayesianFilter *filter = static_cast<nsBayesianFilter *>(aClosure);
+ filter->mCorpus.writeTrainingData(filter->mMaximumTokenCount);
+ filter->mTrainingDataDirty = false;
+}
+
+nsBayesianFilter::~nsBayesianFilter()
+{
+ if (mTimer)
+ {
+ mTimer->Cancel();
+ mTimer = nullptr;
+ }
+ // call shutdown when we are going away in case we need
+ // to flush the training set to disk
+ Shutdown();
+}
+
+// this object is used for one call to classifyMessage or classifyMessages().
+// So if we're classifying multiple messages, this object will be used for each message.
+// It's going to hold a reference to itself, basically, to stay in memory.
+class MessageClassifier : public TokenAnalyzer {
+public:
+ // full classifier with arbitrary traits
+ MessageClassifier(nsBayesianFilter* aFilter,
+ nsIJunkMailClassificationListener* aJunkListener,
+ nsIMsgTraitClassificationListener* aTraitListener,
+ nsIMsgTraitDetailListener* aDetailListener,
+ nsTArray<uint32_t>& aProTraits,
+ nsTArray<uint32_t>& aAntiTraits,
+ nsIMsgWindow *aMsgWindow,
+ uint32_t aNumMessagesToClassify,
+ const char **aMessageURIs)
+ : mFilter(aFilter),
+ mJunkMailPlugin(aFilter),
+ mJunkListener(aJunkListener),
+ mTraitListener(aTraitListener),
+ mDetailListener(aDetailListener),
+ mProTraits(aProTraits),
+ mAntiTraits(aAntiTraits),
+ mMsgWindow(aMsgWindow)
+ {
+ mCurMessageToClassify = 0;
+ mNumMessagesToClassify = aNumMessagesToClassify;
+ mMessageURIs = (char **) moz_xmalloc(sizeof(char *) * aNumMessagesToClassify);
+ for (uint32_t i = 0; i < aNumMessagesToClassify; i++)
+ mMessageURIs[i] = PL_strdup(aMessageURIs[i]);
+
+ }
+
+ // junk-only classifier
+ MessageClassifier(nsBayesianFilter* aFilter,
+ nsIJunkMailClassificationListener* aJunkListener,
+ nsIMsgWindow *aMsgWindow,
+ uint32_t aNumMessagesToClassify,
+ const char **aMessageURIs)
+ : mFilter(aFilter),
+ mJunkMailPlugin(aFilter),
+ mJunkListener(aJunkListener),
+ mTraitListener(nullptr),
+ mDetailListener(nullptr),
+ mMsgWindow(aMsgWindow)
+ {
+ mCurMessageToClassify = 0;
+ mNumMessagesToClassify = aNumMessagesToClassify;
+ mMessageURIs = (char **) moz_xmalloc(sizeof(char *) * aNumMessagesToClassify);
+ for (uint32_t i = 0; i < aNumMessagesToClassify; i++)
+ mMessageURIs[i] = PL_strdup(aMessageURIs[i]);
+ mProTraits.AppendElement(kJunkTrait);
+ mAntiTraits.AppendElement(kGoodTrait);
+
+ }
+
+ virtual ~MessageClassifier()
+ {
+ if (mMessageURIs)
+ {
+ NS_FREE_XPCOM_ALLOCATED_POINTER_ARRAY(mNumMessagesToClassify, mMessageURIs);
+ }
+ }
+ virtual void analyzeTokens(Tokenizer& tokenizer)
+ {
+ mFilter->classifyMessage(tokenizer,
+ mTokenSource.get(),
+ mProTraits,
+ mAntiTraits,
+ mJunkListener,
+ mTraitListener,
+ mDetailListener);
+ tokenizer.clearTokens();
+ classifyNextMessage();
+ }
+
+ virtual void classifyNextMessage()
+ {
+
+ if (++mCurMessageToClassify < mNumMessagesToClassify && mMessageURIs[mCurMessageToClassify]) {
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning, ("classifyNextMessage(%s)", mMessageURIs[mCurMessageToClassify]));
+ mFilter->tokenizeMessage(mMessageURIs[mCurMessageToClassify], mMsgWindow, this);
+ }
+ else
+ {
+ // call all listeners with null parameters to signify end of batch
+ if (mJunkListener)
+ mJunkListener->OnMessageClassified(nullptr, nsIJunkMailPlugin::UNCLASSIFIED, 0);
+ if (mTraitListener)
+ mTraitListener->OnMessageTraitsClassified(nullptr, 0, nullptr, nullptr);
+ mTokenListener = nullptr; // this breaks the circular ref that keeps this object alive
+ // so we will be destroyed as a result.
+ }
+ }
+
+private:
+ nsBayesianFilter* mFilter;
+ nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
+ nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
+ nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
+ nsCOMPtr<nsIMsgTraitDetailListener> mDetailListener;
+ nsTArray<uint32_t> mProTraits;
+ nsTArray<uint32_t> mAntiTraits;
+ nsCOMPtr<nsIMsgWindow> mMsgWindow;
+ int32_t mNumMessagesToClassify;
+ int32_t mCurMessageToClassify; // 0-based index
+ char **mMessageURIs;
+};
+
+nsresult nsBayesianFilter::tokenizeMessage(const char* aMessageURI, nsIMsgWindow *aMsgWindow, TokenAnalyzer* aAnalyzer)
+{
+ NS_ENSURE_ARG_POINTER(aMessageURI);
+
+ nsCOMPtr <nsIMsgMessageService> msgService;
+ nsresult rv = GetMessageServiceFromURI(nsDependentCString(aMessageURI), getter_AddRefs(msgService));
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ aAnalyzer->setSource(aMessageURI);
+ nsCOMPtr<nsIURI> dummyNull;
+ return msgService->StreamMessage(aMessageURI, aAnalyzer->mTokenListener,
+ aMsgWindow, nullptr, true /* convert data */,
+ NS_LITERAL_CSTRING("filter"), false, getter_AddRefs(dummyNull));
+}
+
+// a TraitAnalysis is the per-token representation of the statistical
+// calculations, basically created to group information that is then
+// sorted by mDistance
+struct TraitAnalysis
+{
+ uint32_t mTokenIndex;
+ double mDistance;
+ double mProbability;
+};
+
+// comparator required to sort an nsTArray
+class compareTraitAnalysis
+{
+public:
+ bool Equals(const TraitAnalysis& a, const TraitAnalysis& b) const
+ {
+ return a.mDistance == b.mDistance;
+ }
+ bool LessThan(const TraitAnalysis& a, const TraitAnalysis& b) const
+ {
+ return a.mDistance < b.mDistance;
+ }
+};
+
+inline double dmax(double x, double y) { return (x > y ? x : y); }
+inline double dmin(double x, double y) { return (x < y ? x : y); }
+
+// Chi square functions are implemented by an incomplete gamma function.
+// Note that chi2P's callers multiply the arguments by 2 but chi2P
+// divides them by 2 again. Inlining chi2P gives the compiler a
+// chance to notice this.
+
+// Both chi2P and nsIncompleteGammaP set *error negative on domain
+// errors and nsIncompleteGammaP sets it posivive on internal errors.
+// This may be useful but the chi2P callers treat any error as fatal.
+
+// Note that converting unsigned ints to floating point can be slow on
+// some platforms (like Intel) so use signed quantities for the numeric
+// routines.
+static inline double chi2P (double chi2, double nu, int32_t *error)
+{
+ // domain checks; set error and return a dummy value
+ if (chi2 < 0.0 || nu <= 0.0)
+ {
+ *error = -1;
+ return 0.0;
+ }
+ // reversing the arguments is intentional
+ return nsIncompleteGammaP (nu/2.0, chi2/2.0, error);
+}
+
+void nsBayesianFilter::classifyMessage(
+ Tokenizer& tokenizer,
+ const char* messageURI,
+ nsTArray<uint32_t>& aProTraits,
+ nsTArray<uint32_t>& aAntiTraits,
+ nsIJunkMailClassificationListener* listener,
+ nsIMsgTraitClassificationListener* aTraitListener,
+ nsIMsgTraitDetailListener* aDetailListener)
+{
+ Token* tokens = tokenizer.copyTokens();
+ uint32_t tokenCount;
+ if (!tokens)
+ {
+ // This can happen with problems with UTF conversion
+ NS_ERROR("Trying to classify a null or invalid message");
+ tokenCount = 0;
+ // don't return so that we still call the listeners
+ }
+ else
+ {
+ tokenCount = tokenizer.countTokens();
+ }
+
+ if (aProTraits.Length() != aAntiTraits.Length())
+ {
+ NS_ERROR("Each Pro trait needs a matching Anti trait");
+ return;
+ }
+
+ /* this part is similar to the Graham algorithm with some adjustments. */
+ uint32_t traitCount = aProTraits.Length();
+
+ // pro message counts per trait index
+ AutoTArray<uint32_t, kTraitAutoCapacity> numProMessages;
+ // anti message counts per trait index
+ AutoTArray<uint32_t, kTraitAutoCapacity> numAntiMessages;
+ // array of pro aliases per trait index
+ AutoTArray<uint32_t*, kTraitAutoCapacity > proAliasArrays;
+ // number of pro aliases per trait index
+ AutoTArray<uint32_t, kTraitAutoCapacity > proAliasesLengths;
+ // array of anti aliases per trait index
+ AutoTArray<uint32_t*, kTraitAutoCapacity> antiAliasArrays;
+ // number of anti aliases per trait index
+ AutoTArray<uint32_t, kTraitAutoCapacity > antiAliasesLengths;
+ // construct the outgoing listener arrays
+ AutoTArray<uint32_t, kTraitAutoCapacity> traits;
+ AutoTArray<uint32_t, kTraitAutoCapacity> percents;
+ if (traitCount > kTraitAutoCapacity)
+ {
+ traits.SetCapacity(traitCount);
+ percents.SetCapacity(traitCount);
+ numProMessages.SetCapacity(traitCount);
+ numAntiMessages.SetCapacity(traitCount);
+ proAliasesLengths.SetCapacity(traitCount);
+ antiAliasesLengths.SetCapacity(traitCount);
+ proAliasArrays.SetCapacity(traitCount);
+ antiAliasArrays.SetCapacity(traitCount);
+ }
+
+ nsresult rv;
+ nsCOMPtr<nsIMsgTraitService> traitService(do_GetService("@mozilla.org/msg-trait-service;1", &rv));
+ if (NS_FAILED(rv))
+ {
+ NS_ERROR("Failed to get trait service");
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("Failed to get trait service"));
+ }
+
+ // get aliases and message counts for the pro and anti traits
+ for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++)
+ {
+ nsresult rv;
+
+ // pro trait
+ uint32_t proAliasesLength = 0;
+ uint32_t* proAliases = nullptr;
+ uint32_t proTrait = aProTraits[traitIndex];
+ if (traitService)
+ {
+ rv = traitService->GetAliases(proTrait, &proAliasesLength, &proAliases);
+ if (NS_FAILED(rv))
+ {
+ NS_ERROR("trait service failed to get aliases");
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("trait service failed to get aliases"));
+ }
+ }
+ proAliasesLengths.AppendElement(proAliasesLength);
+ proAliasArrays.AppendElement(proAliases);
+ uint32_t proMessageCount = mCorpus.getMessageCount(proTrait);
+ for (uint32_t aliasIndex = 0; aliasIndex < proAliasesLength; aliasIndex++)
+ proMessageCount += mCorpus.getMessageCount(proAliases[aliasIndex]);
+ numProMessages.AppendElement(proMessageCount);
+
+ // anti trait
+ uint32_t antiAliasesLength = 0;
+ uint32_t* antiAliases = nullptr;
+ uint32_t antiTrait = aAntiTraits[traitIndex];
+ if (traitService)
+ {
+ rv = traitService->GetAliases(antiTrait, &antiAliasesLength, &antiAliases);
+ if (NS_FAILED(rv))
+ {
+ NS_ERROR("trait service failed to get aliases");
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("trait service failed to get aliases"));
+ }
+ }
+ antiAliasesLengths.AppendElement(antiAliasesLength);
+ antiAliasArrays.AppendElement(antiAliases);
+ uint32_t antiMessageCount = mCorpus.getMessageCount(antiTrait);
+ for (uint32_t aliasIndex = 0; aliasIndex < antiAliasesLength; aliasIndex++)
+ antiMessageCount += mCorpus.getMessageCount(antiAliases[aliasIndex]);
+ numAntiMessages.AppendElement(antiMessageCount);
+ }
+
+ for (uint32_t i = 0; i < tokenCount; ++i)
+ {
+ Token& token = tokens[i];
+ CorpusToken* t = mCorpus.get(token.mWord);
+ if (!t)
+ continue;
+ for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++)
+ {
+ uint32_t iProCount = mCorpus.getTraitCount(t, aProTraits[traitIndex]);
+ // add in any counts for aliases to proTrait
+ for (uint32_t aliasIndex = 0; aliasIndex < proAliasesLengths[traitIndex]; aliasIndex++)
+ iProCount += mCorpus.getTraitCount(t, proAliasArrays[traitIndex][aliasIndex]);
+ double proCount = static_cast<double>(iProCount);
+
+ uint32_t iAntiCount = mCorpus.getTraitCount(t, aAntiTraits[traitIndex]);
+ // add in any counts for aliases to antiTrait
+ for (uint32_t aliasIndex = 0; aliasIndex < antiAliasesLengths[traitIndex]; aliasIndex++)
+ iAntiCount += mCorpus.getTraitCount(t, antiAliasArrays[traitIndex][aliasIndex]);
+ double antiCount = static_cast<double>(iAntiCount);
+
+ double prob, denom;
+ // Prevent a divide by zero error by setting defaults for prob
+
+ // If there are no matching tokens at all, ignore.
+ if (antiCount == 0.0 && proCount == 0.0)
+ continue;
+ // if only anti match, set probability to 0%
+ if (proCount == 0.0)
+ prob = 0.0;
+ // if only pro match, set probability to 100%
+ else if (antiCount == 0.0)
+ prob = 1.0;
+ // not really needed, but just to be sure check the denom as well
+ else if ((denom = proCount * numAntiMessages[traitIndex] +
+ antiCount * numProMessages[traitIndex]) == 0.0)
+ continue;
+ else
+ prob = (proCount * numAntiMessages[traitIndex]) / denom;
+
+ double n = proCount + antiCount;
+ prob = (0.225 + n * prob) / (.45 + n);
+ double distance = std::abs(prob - 0.5);
+ if (distance >= .1)
+ {
+ mozilla::DebugOnly<nsresult> rv = setAnalysis(token, traitIndex, distance, prob);
+ NS_ASSERTION(NS_SUCCEEDED(rv), "Problem in setAnalysis");
+ }
+ }
+ }
+
+ for (uint32_t traitIndex = 0; traitIndex < traitCount; traitIndex++)
+ {
+ AutoTArray<TraitAnalysis, 1024> traitAnalyses;
+ // copy valid tokens into an array to sort
+ for (uint32_t tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++)
+ {
+ uint32_t storeIndex = getAnalysisIndex(tokens[tokenIndex], traitIndex);
+ if (storeIndex)
+ {
+ TraitAnalysis ta =
+ {tokenIndex,
+ mAnalysisStore[storeIndex].mDistance,
+ mAnalysisStore[storeIndex].mProbability};
+ traitAnalyses.AppendElement(ta);
+ }
+ }
+
+ // sort the array by the distances
+ traitAnalyses.Sort(compareTraitAnalysis());
+ uint32_t count = traitAnalyses.Length();
+ uint32_t first, last = count;
+ const uint32_t kMaxTokens = 150;
+ first = ( count > kMaxTokens) ? count - kMaxTokens : 0;
+
+ // Setup the arrays to save details if needed
+ nsTArray<double> sArray;
+ nsTArray<double> hArray;
+ uint32_t usedTokenCount = ( count > kMaxTokens) ? kMaxTokens : count;
+ if (aDetailListener)
+ {
+ sArray.SetCapacity(usedTokenCount);
+ hArray.SetCapacity(usedTokenCount);
+ }
+
+ double H = 1.0, S = 1.0;
+ int32_t Hexp = 0, Sexp = 0;
+ uint32_t goodclues=0;
+ int e;
+
+ // index from end to analyze most significant first
+ for (uint32_t ip1 = last; ip1 != first; --ip1)
+ {
+ TraitAnalysis& ta = traitAnalyses[ip1 - 1];
+ if (ta.mDistance > 0.0)
+ {
+ goodclues++;
+ double value = ta.mProbability;
+ S *= (1.0 - value);
+ H *= value;
+ if ( S < 1e-200 )
+ {
+ S = frexp(S, &e);
+ Sexp += e;
+ }
+ if ( H < 1e-200 )
+ {
+ H = frexp(H, &e);
+ Hexp += e;
+ }
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning,
+ ("token probability (%s) is %f",
+ tokens[ta.mTokenIndex].mWord, ta.mProbability));
+ }
+ if (aDetailListener)
+ {
+ sArray.AppendElement(log(S) + Sexp * M_LN2);
+ hArray.AppendElement(log(H) + Hexp * M_LN2);
+ }
+ }
+
+ S = log(S) + Sexp * M_LN2;
+ H = log(H) + Hexp * M_LN2;
+
+ double prob;
+ if (goodclues > 0)
+ {
+ int32_t chi_error;
+ S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
+ if (!chi_error)
+ H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
+ // if any error toss the entire calculation
+ if (!chi_error)
+ prob = (S-H +1.0) / 2.0;
+ else
+ prob = 0.5;
+ }
+ else
+ prob = 0.5;
+
+ if (aDetailListener)
+ {
+ // Prepare output arrays
+ nsTArray<uint32_t> tokenPercents(usedTokenCount);
+ nsTArray<uint32_t> runningPercents(usedTokenCount);
+ nsTArray<char16_t*> tokenStrings(usedTokenCount);
+
+ double clueCount = 1.0;
+ for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount; tokenIndex++)
+ {
+ TraitAnalysis& ta = traitAnalyses[last - 1 - tokenIndex];
+ int32_t chi_error;
+ S = chi2P(-2.0 * sArray[tokenIndex], 2.0 * clueCount, &chi_error);
+ if (!chi_error)
+ H = chi2P(-2.0 * hArray[tokenIndex], 2.0 * clueCount, &chi_error);
+ clueCount += 1.0;
+ double runningProb;
+ if (!chi_error)
+ runningProb = (S - H + 1.0) / 2.0;
+ else
+ runningProb = 0.5;
+ runningPercents.AppendElement(static_cast<uint32_t>(runningProb *
+ 100. + .5));
+ tokenPercents.AppendElement(static_cast<uint32_t>(ta.mProbability *
+ 100. + .5));
+ tokenStrings.AppendElement(ToNewUnicode(NS_ConvertUTF8toUTF16(
+ tokens[ta.mTokenIndex].mWord)));
+ }
+
+ aDetailListener->OnMessageTraitDetails(messageURI, aProTraits[traitIndex],
+ usedTokenCount, (const char16_t**)tokenStrings.Elements(),
+ tokenPercents.Elements(), runningPercents.Elements());
+ for (uint32_t tokenIndex = 0; tokenIndex < usedTokenCount; tokenIndex++)
+ NS_Free(tokenStrings[tokenIndex]);
+ }
+
+ uint32_t proPercent = static_cast<uint32_t>(prob*100. + .5);
+
+ // directly classify junk to maintain backwards compatibility
+ if (aProTraits[traitIndex] == kJunkTrait)
+ {
+ bool isJunk = (prob >= mJunkProbabilityThreshold);
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Info,
+ ("%s is junk probability = (%f) HAM SCORE:%f SPAM SCORE:%f",
+ messageURI, prob,H,S));
+
+ // the algorithm in "A Plan For Spam" assumes that you have a large good
+ // corpus and a large junk corpus.
+ // that won't be the case with users who first use the junk mail trait
+ // so, we do certain things to encourage them to train.
+ //
+ // if there are no good tokens, assume the message is junk
+ // this will "encourage" the user to train
+ // and if there are no bad tokens, assume the message is not junk
+ // this will also "encourage" the user to train
+ // see bug #194238
+
+ if (listener && !mCorpus.getMessageCount(kGoodTrait))
+ isJunk = true;
+ else if (listener && !mCorpus.getMessageCount(kJunkTrait))
+ isJunk = false;
+
+ if (listener)
+ listener->OnMessageClassified(messageURI, isJunk ?
+ nsMsgJunkStatus(nsIJunkMailPlugin::JUNK) :
+ nsMsgJunkStatus(nsIJunkMailPlugin::GOOD), proPercent);
+ }
+
+ if (aTraitListener)
+ {
+ traits.AppendElement(aProTraits[traitIndex]);
+ percents.AppendElement(proPercent);
+ }
+
+ // free aliases arrays returned from XPCOM
+ if (proAliasesLengths[traitIndex])
+ NS_Free(proAliasArrays[traitIndex]);
+ if (antiAliasesLengths[traitIndex])
+ NS_Free(antiAliasArrays[traitIndex]);
+ }
+
+ if (aTraitListener)
+ aTraitListener->OnMessageTraitsClassified(messageURI,
+ traits.Length(), traits.Elements(), percents.Elements());
+
+ delete[] tokens;
+ // reuse mAnalysisStore without clearing memory
+ mNextAnalysisIndex = 1;
+ // but shrink it back to the default size
+ if (mAnalysisStore.Length() > kAnalysisStoreCapacity)
+ mAnalysisStore.RemoveElementsAt(kAnalysisStoreCapacity,
+ mAnalysisStore.Length() - kAnalysisStoreCapacity);
+ mAnalysisStore.Compact();
+}
+
+void nsBayesianFilter::classifyMessage(
+ Tokenizer& tokens,
+ const char* messageURI,
+ nsIJunkMailClassificationListener* aJunkListener)
+{
+ AutoTArray<uint32_t, 1> proTraits;
+ AutoTArray<uint32_t, 1> antiTraits;
+ proTraits.AppendElement(kJunkTrait);
+ antiTraits.AppendElement(kGoodTrait);
+ classifyMessage(tokens, messageURI, proTraits, antiTraits,
+ aJunkListener, nullptr, nullptr);
+}
+
+NS_IMETHODIMP
+nsBayesianFilter::Observe(nsISupports *aSubject, const char *aTopic,
+ const char16_t *someData)
+{
+ if (!strcmp(aTopic, "profile-before-change"))
+ Shutdown();
+ return NS_OK;
+}
+
+/* void shutdown (); */
+NS_IMETHODIMP nsBayesianFilter::Shutdown()
+{
+ if (mTrainingDataDirty)
+ mCorpus.writeTrainingData(mMaximumTokenCount);
+ mTrainingDataDirty = false;
+
+ return NS_OK;
+}
+
+/* readonly attribute boolean shouldDownloadAllHeaders; */
+NS_IMETHODIMP nsBayesianFilter::GetShouldDownloadAllHeaders(bool *aShouldDownloadAllHeaders)
+{
+ // bayesian filters work on the whole msg body currently.
+ *aShouldDownloadAllHeaders = false;
+ return NS_OK;
+}
+
+/* void classifyMessage (in string aMsgURL, in nsIJunkMailClassificationListener aListener); */
+NS_IMETHODIMP nsBayesianFilter::ClassifyMessage(const char *aMessageURL, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
+{
+ MessageClassifier* analyzer = new MessageClassifier(this, aListener, aMsgWindow, 1, &aMessageURL);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMessageURL, aMsgWindow, analyzer);
+}
+
+/* void classifyMessages (in unsigned long aCount, [array, size_is (aCount)] in string aMsgURLs, in nsIJunkMailClassificationListener aListener); */
+NS_IMETHODIMP nsBayesianFilter::ClassifyMessages(uint32_t aCount, const char **aMsgURLs, nsIMsgWindow *aMsgWindow, nsIJunkMailClassificationListener *aListener)
+{
+ NS_ENSURE_ARG_POINTER(aMsgURLs);
+
+ TokenAnalyzer* analyzer = new MessageClassifier(this, aListener, aMsgWindow, aCount, aMsgURLs);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMsgURLs[0], aMsgWindow, analyzer);
+}
+
+nsresult nsBayesianFilter::setAnalysis(Token& token, uint32_t aTraitIndex,
+ double aDistance, double aProbability)
+{
+ uint32_t nextLink = token.mAnalysisLink;
+ uint32_t lastLink = 0;
+ uint32_t linkCount = 0, maxLinks = 100;
+
+ // try to find an existing element. Limit the search to maxLinks
+ // as a precaution
+ for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++)
+ {
+ AnalysisPerToken &rAnalysis = mAnalysisStore[nextLink];
+ if (rAnalysis.mTraitIndex == aTraitIndex)
+ {
+ rAnalysis.mDistance = aDistance;
+ rAnalysis.mProbability = aProbability;
+ return NS_OK;
+ }
+ lastLink = nextLink;
+ nextLink = rAnalysis.mNextLink;
+ }
+ if (linkCount >= maxLinks)
+ return NS_ERROR_FAILURE;
+
+ // trait does not exist, so add it
+
+ AnalysisPerToken analysis(aTraitIndex, aDistance, aProbability);
+ if (mAnalysisStore.Length() == mNextAnalysisIndex)
+ mAnalysisStore.InsertElementAt(mNextAnalysisIndex, analysis);
+ else if (mAnalysisStore.Length() > mNextAnalysisIndex)
+ mAnalysisStore.ReplaceElementsAt(mNextAnalysisIndex, 1, analysis);
+ else // we can only insert at the end of the array
+ return NS_ERROR_FAILURE;
+
+ if (lastLink)
+ // the token had at least one link, so update the last link to point to
+ // the new item
+ mAnalysisStore[lastLink].mNextLink = mNextAnalysisIndex;
+ else
+ // need to update the token's first link
+ token.mAnalysisLink = mNextAnalysisIndex;
+ mNextAnalysisIndex++;
+ return NS_OK;
+}
+
+uint32_t nsBayesianFilter::getAnalysisIndex(Token& token, uint32_t aTraitIndex)
+{
+ uint32_t nextLink;
+ uint32_t linkCount = 0, maxLinks = 100;
+ for (nextLink = token.mAnalysisLink; nextLink && linkCount < maxLinks; linkCount++)
+ {
+ AnalysisPerToken &rAnalysis = mAnalysisStore[nextLink];
+ if (rAnalysis.mTraitIndex == aTraitIndex)
+ return nextLink;
+ nextLink = rAnalysis.mNextLink;
+ }
+ NS_ASSERTION(linkCount < maxLinks, "corrupt analysis store");
+
+ // Trait not found, indicate by zero
+ return 0;
+}
+
+NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessage(
+ const char *aMsgURI,
+ uint32_t aTraitCount,
+ uint32_t *aProTraits,
+ uint32_t *aAntiTraits,
+ nsIMsgTraitClassificationListener *aTraitListener,
+ nsIMsgWindow *aMsgWindow,
+ nsIJunkMailClassificationListener *aJunkListener)
+{
+ return ClassifyTraitsInMessages(1, &aMsgURI, aTraitCount, aProTraits,
+ aAntiTraits, aTraitListener, aMsgWindow, aJunkListener);
+}
+
+NS_IMETHODIMP nsBayesianFilter::ClassifyTraitsInMessages(
+ uint32_t aCount,
+ const char **aMsgURIs,
+ uint32_t aTraitCount,
+ uint32_t *aProTraits,
+ uint32_t *aAntiTraits,
+ nsIMsgTraitClassificationListener *aTraitListener,
+ nsIMsgWindow *aMsgWindow,
+ nsIJunkMailClassificationListener *aJunkListener)
+{
+ AutoTArray<uint32_t, kTraitAutoCapacity> proTraits;
+ AutoTArray<uint32_t, kTraitAutoCapacity> antiTraits;
+ if (aTraitCount > kTraitAutoCapacity)
+ {
+ proTraits.SetCapacity(aTraitCount);
+ antiTraits.SetCapacity(aTraitCount);
+ }
+ proTraits.AppendElements(aProTraits, aTraitCount);
+ antiTraits.AppendElements(aAntiTraits, aTraitCount);
+
+ MessageClassifier* analyzer = new MessageClassifier(this, aJunkListener,
+ aTraitListener, nullptr, proTraits, antiTraits, aMsgWindow, aCount, aMsgURIs);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMsgURIs[0], aMsgWindow, analyzer);
+}
+
+class MessageObserver : public TokenAnalyzer {
+public:
+ MessageObserver(nsBayesianFilter* filter,
+ nsTArray<uint32_t>& aOldClassifications,
+ nsTArray<uint32_t>& aNewClassifications,
+ nsIJunkMailClassificationListener* aJunkListener,
+ nsIMsgTraitClassificationListener* aTraitListener)
+ : mFilter(filter), mJunkMailPlugin(filter), mJunkListener(aJunkListener),
+ mTraitListener(aTraitListener),
+ mOldClassifications(aOldClassifications),
+ mNewClassifications(aNewClassifications)
+ {
+ }
+
+ virtual void analyzeTokens(Tokenizer& tokenizer)
+ {
+ mFilter->observeMessage(tokenizer, mTokenSource.get(), mOldClassifications,
+ mNewClassifications, mJunkListener, mTraitListener);
+ // release reference to listener, which will allow us to go away as well.
+ mTokenListener = nullptr;
+ }
+
+private:
+ nsBayesianFilter* mFilter;
+ nsCOMPtr<nsIJunkMailPlugin> mJunkMailPlugin;
+ nsCOMPtr<nsIJunkMailClassificationListener> mJunkListener;
+ nsCOMPtr<nsIMsgTraitClassificationListener> mTraitListener;
+ nsTArray<uint32_t> mOldClassifications;
+ nsTArray<uint32_t> mNewClassifications;
+};
+
+NS_IMETHODIMP nsBayesianFilter::SetMsgTraitClassification(
+ const char *aMsgURI,
+ uint32_t aOldCount,
+ uint32_t *aOldTraits,
+ uint32_t aNewCount,
+ uint32_t *aNewTraits,
+ nsIMsgTraitClassificationListener *aTraitListener,
+ nsIMsgWindow *aMsgWindow,
+ nsIJunkMailClassificationListener *aJunkListener)
+{
+ AutoTArray<uint32_t, kTraitAutoCapacity> oldTraits;
+ AutoTArray<uint32_t, kTraitAutoCapacity> newTraits;
+ if (aOldCount > kTraitAutoCapacity)
+ oldTraits.SetCapacity(aOldCount);
+ if (aNewCount > kTraitAutoCapacity)
+ newTraits.SetCapacity(aNewCount);
+ oldTraits.AppendElements(aOldTraits, aOldCount);
+ newTraits.AppendElements(aNewTraits, aNewCount);
+
+ MessageObserver* analyzer = new MessageObserver(this, oldTraits,
+ newTraits, aJunkListener, aTraitListener);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
+}
+
+// set new message classifications for a message
+void nsBayesianFilter::observeMessage(
+ Tokenizer& tokenizer,
+ const char* messageURL,
+ nsTArray<uint32_t>& oldClassifications,
+ nsTArray<uint32_t>& newClassifications,
+ nsIJunkMailClassificationListener* aJunkListener,
+ nsIMsgTraitClassificationListener* aTraitListener)
+{
+
+ bool trainingDataWasDirty = mTrainingDataDirty;
+
+ // Uhoh...if the user is re-training then the message may already be classified and we are classifying it again with the same classification.
+ // the old code would have removed the tokens for this message then added them back. But this really hurts the message occurrence
+ // count for tokens if you just removed training.dat and are re-training. See Bug #237095 for more details.
+ // What can we do here? Well we can skip the token removal step if the classifications are the same and assume the user is
+ // just re-training. But this then allows users to re-classify the same message on the same training set over and over again
+ // leading to data skew. But that's all I can think to do right now to address this.....
+ uint32_t oldLength = oldClassifications.Length();
+ for (uint32_t index = 0; index < oldLength; index++)
+ {
+ uint32_t trait = oldClassifications.ElementAt(index);
+ // skip removing if trait is also in the new set
+ if (newClassifications.Contains(trait))
+ continue;
+ // remove the tokens from the token set it is currently in
+ uint32_t messageCount;
+ messageCount = mCorpus.getMessageCount(trait);
+ if (messageCount > 0)
+ {
+ mCorpus.setMessageCount(trait, messageCount - 1);
+ mCorpus.forgetTokens(tokenizer, trait, 1);
+ mTrainingDataDirty = true;
+ }
+ }
+
+ nsMsgJunkStatus newClassification = nsIJunkMailPlugin::UNCLASSIFIED;
+ uint32_t junkPercent = 0; // 0 here is no possibility of meeting the classification
+ uint32_t newLength = newClassifications.Length();
+ for (uint32_t index = 0; index < newLength; index++)
+ {
+ uint32_t trait = newClassifications.ElementAt(index);
+ mCorpus.setMessageCount(trait, mCorpus.getMessageCount(trait) + 1);
+ mCorpus.rememberTokens(tokenizer, trait, 1);
+ mTrainingDataDirty = true;
+
+ if (aJunkListener)
+ {
+ if (trait == kJunkTrait)
+ {
+ junkPercent = nsIJunkMailPlugin::IS_SPAM_SCORE;
+ newClassification = nsIJunkMailPlugin::JUNK;
+ }
+ else if (trait == kGoodTrait)
+ {
+ junkPercent = nsIJunkMailPlugin::IS_HAM_SCORE;
+ newClassification = nsIJunkMailPlugin::GOOD;
+ }
+ }
+ }
+
+ if (aJunkListener)
+ aJunkListener->OnMessageClassified(messageURL, newClassification, junkPercent);
+
+ if (aTraitListener)
+ {
+ // construct the outgoing listener arrays
+ AutoTArray<uint32_t, kTraitAutoCapacity> traits;
+ AutoTArray<uint32_t, kTraitAutoCapacity> percents;
+ uint32_t newLength = newClassifications.Length();
+ if (newLength > kTraitAutoCapacity)
+ {
+ traits.SetCapacity(newLength);
+ percents.SetCapacity(newLength);
+ }
+ traits.AppendElements(newClassifications);
+ for (uint32_t index = 0; index < newLength; index++)
+ percents.AppendElement(100); // This is 100 percent, or certainty
+ aTraitListener->OnMessageTraitsClassified(messageURL,
+ traits.Length(), traits.Elements(), percents.Elements());
+ }
+
+ if (mTrainingDataDirty && !trainingDataWasDirty && ( mTimer != nullptr ))
+ {
+ // if training data became dirty just now, schedule flush
+ // mMinFlushInterval msec from now
+ MOZ_LOG(
+ BayesianFilterLogModule, LogLevel::Debug,
+ ("starting training data flush timer %i msec", mMinFlushInterval));
+ mTimer->InitWithFuncCallback(nsBayesianFilter::TimerCallback, this, mMinFlushInterval, nsITimer::TYPE_ONE_SHOT);
+ }
+}
+
+NS_IMETHODIMP nsBayesianFilter::GetUserHasClassified(bool *aResult)
+{
+ *aResult = ( (mCorpus.getMessageCount(kGoodTrait) +
+ mCorpus.getMessageCount(kJunkTrait)) &&
+ mCorpus.countTokens());
+ return NS_OK;
+}
+
+// Set message classification (only allows junk and good)
+NS_IMETHODIMP nsBayesianFilter::SetMessageClassification(
+ const char *aMsgURL,
+ nsMsgJunkStatus aOldClassification,
+ nsMsgJunkStatus aNewClassification,
+ nsIMsgWindow *aMsgWindow,
+ nsIJunkMailClassificationListener *aListener)
+{
+ AutoTArray<uint32_t, 1> oldClassifications;
+ AutoTArray<uint32_t, 1> newClassifications;
+
+ // convert between classifications and trait
+ if (aOldClassification == nsIJunkMailPlugin::JUNK)
+ oldClassifications.AppendElement(kJunkTrait);
+ else if (aOldClassification == nsIJunkMailPlugin::GOOD)
+ oldClassifications.AppendElement(kGoodTrait);
+ if (aNewClassification == nsIJunkMailPlugin::JUNK)
+ newClassifications.AppendElement(kJunkTrait);
+ else if (aNewClassification == nsIJunkMailPlugin::GOOD)
+ newClassifications.AppendElement(kGoodTrait);
+
+ MessageObserver* analyzer = new MessageObserver(this, oldClassifications,
+ newClassifications, aListener, nullptr);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMsgURL, aMsgWindow, analyzer);
+}
+
+NS_IMETHODIMP nsBayesianFilter::ResetTrainingData()
+{
+ return mCorpus.resetTrainingData();
+}
+
+NS_IMETHODIMP nsBayesianFilter::DetailMessage(const char *aMsgURI,
+ uint32_t aProTrait, uint32_t aAntiTrait,
+ nsIMsgTraitDetailListener *aDetailListener, nsIMsgWindow *aMsgWindow)
+{
+ AutoTArray<uint32_t, 1> proTraits;
+ AutoTArray<uint32_t, 1> antiTraits;
+ proTraits.AppendElement(aProTrait);
+ antiTraits.AppendElement(aAntiTrait);
+
+ MessageClassifier* analyzer = new MessageClassifier(this, nullptr,
+ nullptr, aDetailListener, proTraits, antiTraits, aMsgWindow, 1, &aMsgURI);
+ NS_ENSURE_TRUE(analyzer, NS_ERROR_OUT_OF_MEMORY);
+
+ TokenStreamListener *tokenListener = new TokenStreamListener(analyzer);
+ NS_ENSURE_TRUE(tokenListener, NS_ERROR_OUT_OF_MEMORY);
+
+ analyzer->setTokenListener(tokenListener);
+ return tokenizeMessage(aMsgURI, aMsgWindow, analyzer);
+}
+
+// nsIMsgCorpus implementation
+
+NS_IMETHODIMP nsBayesianFilter::CorpusCounts(uint32_t aTrait,
+ uint32_t *aMessageCount,
+ uint32_t *aTokenCount)
+{
+ NS_ENSURE_ARG_POINTER(aTokenCount);
+ *aTokenCount = mCorpus.countTokens();
+ if (aTrait && aMessageCount)
+ *aMessageCount = mCorpus.getMessageCount(aTrait);
+ return NS_OK;
+}
+
+NS_IMETHODIMP nsBayesianFilter::ClearTrait(uint32_t aTrait)
+{
+ return mCorpus.ClearTrait(aTrait);
+}
+
+NS_IMETHODIMP
+nsBayesianFilter::UpdateData(nsIFile *aFile,
+ bool aIsAdd,
+ uint32_t aRemapCount,
+ uint32_t *aFromTraits,
+ uint32_t *aToTraits)
+{
+ return mCorpus.UpdateData(aFile, aIsAdd, aRemapCount, aFromTraits, aToTraits);
+}
+
+NS_IMETHODIMP
+nsBayesianFilter::GetTokenCount(const nsACString &aWord,
+ uint32_t aTrait,
+ uint32_t *aCount)
+{
+ NS_ENSURE_ARG_POINTER(aCount);
+ CorpusToken* t = mCorpus.get(PromiseFlatCString(aWord).get());
+ uint32_t count = mCorpus.getTraitCount(t, aTrait);
+ *aCount = count;
+ return NS_OK;
+}
+
+/* Corpus Store */
+
+/*
+ Format of the training file for version 1:
+ [0xFEEDFACE]
+ [number good messages][number bad messages]
+ [number good tokens]
+ [count][length of word]word
+ ...
+ [number bad tokens]
+ [count][length of word]word
+ ...
+
+ Format of the trait file for version 1:
+ [0xFCA93601] (the 01 is the version)
+ for each trait to write
+ [id of trait to write] (0 means end of list)
+ [number of messages per trait]
+ for each token with non-zero count
+ [count]
+ [length of word]word
+*/
+
+CorpusStore::CorpusStore() :
+ TokenHash(sizeof(CorpusToken)),
+ mNextTraitIndex(1) // skip 0 since index=0 will mean end of linked list
+{
+ getTrainingFile(getter_AddRefs(mTrainingFile));
+ mTraitStore.SetCapacity(kTraitStoreCapacity);
+ TraitPerToken traitPT(0, 0);
+ mTraitStore.AppendElement(traitPT); // dummy 0th element
+}
+
+CorpusStore::~CorpusStore()
+{
+}
+
+inline int writeUInt32(FILE* stream, uint32_t value)
+{
+ value = PR_htonl(value);
+ return fwrite(&value, sizeof(uint32_t), 1, stream);
+}
+
+inline int readUInt32(FILE* stream, uint32_t* value)
+{
+ int n = fread(value, sizeof(uint32_t), 1, stream);
+ if (n == 1) {
+ *value = PR_ntohl(*value);
+ }
+ return n;
+}
+
+void CorpusStore::forgetTokens(Tokenizer& aTokenizer,
+ uint32_t aTraitId, uint32_t aCount)
+{
+ // if we are forgetting the tokens for a message, should only
+ // subtract 1 from the occurrence count for that token in the training set
+ // because we assume we only bumped the training set count once per messages
+ // containing the token.
+ TokenEnumeration tokens = aTokenizer.getTokens();
+ while (tokens.hasMoreTokens())
+ {
+ CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+ remove(token->mWord, aTraitId, aCount);
+ }
+}
+
+void CorpusStore::rememberTokens(Tokenizer& aTokenizer,
+ uint32_t aTraitId, uint32_t aCount)
+{
+ TokenEnumeration tokens = aTokenizer.getTokens();
+ while (tokens.hasMoreTokens())
+ {
+ CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+ if (!token)
+ {
+ NS_ERROR("null token");
+ continue;
+ }
+ add(token->mWord, aTraitId, aCount);
+ }
+}
+
+bool CorpusStore::writeTokens(FILE* stream, bool shrink, uint32_t aTraitId)
+{
+ uint32_t tokenCount = countTokens();
+ uint32_t newTokenCount = 0;
+
+ // calculate the tokens for this trait to write
+
+ TokenEnumeration tokens = getTokens();
+ for (uint32_t i = 0; i < tokenCount; ++i)
+ {
+ CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+ uint32_t count = getTraitCount(token, aTraitId);
+ // Shrinking the token database is accomplished by dividing all token counts by 2.
+ // If shrinking, we'll ignore counts < 2, otherwise only ignore counts of < 1
+ if ((shrink && count > 1) || (!shrink && count))
+ newTokenCount++;
+ }
+
+ if (writeUInt32(stream, newTokenCount) != 1)
+ return false;
+
+ if (newTokenCount > 0)
+ {
+ TokenEnumeration tokens = getTokens();
+ for (uint32_t i = 0; i < tokenCount; ++i)
+ {
+ CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+ uint32_t wordCount = getTraitCount(token, aTraitId);
+ if (shrink)
+ wordCount /= 2;
+ if (!wordCount)
+ continue; // Don't output zero count words
+ if (writeUInt32(stream, wordCount) != 1)
+ return false;
+ uint32_t tokenLength = strlen(token->mWord);
+ if (writeUInt32(stream, tokenLength) != 1)
+ return false;
+ if (fwrite(token->mWord, tokenLength, 1, stream) != 1)
+ return false;
+ }
+ }
+ return true;
+}
+
+bool CorpusStore::readTokens(FILE* stream, int64_t fileSize,
+ uint32_t aTraitId, bool aIsAdd)
+{
+ uint32_t tokenCount;
+ if (readUInt32(stream, &tokenCount) != 1)
+ return false;
+
+ int64_t fpos = ftell(stream);
+ if (fpos < 0)
+ return false;
+
+ uint32_t bufferSize = 4096;
+ char* buffer = new char[bufferSize];
+ if (!buffer) return false;
+
+ for (uint32_t i = 0; i < tokenCount; ++i) {
+ uint32_t count;
+ if (readUInt32(stream, &count) != 1)
+ break;
+ uint32_t size;
+ if (readUInt32(stream, &size) != 1)
+ break;
+ fpos += 8;
+ if (fpos + size > fileSize) {
+ delete[] buffer;
+ return false;
+ }
+ if (size >= bufferSize) {
+ delete[] buffer;
+ while (size >= bufferSize) {
+ bufferSize *= 2;
+ if (bufferSize == 0)
+ return false;
+ }
+ buffer = new char[bufferSize];
+ if (!buffer) return false;
+ }
+ if (fread(buffer, size, 1, stream) != 1)
+ break;
+ fpos += size;
+ buffer[size] = '\0';
+ if (aIsAdd)
+ add(buffer, aTraitId, count);
+ else
+ remove(buffer, aTraitId, count);
+ }
+
+ delete[] buffer;
+
+ return true;
+}
+
+nsresult CorpusStore::getTrainingFile(nsIFile ** aTrainingFile)
+{
+ // should we cache the profile manager's directory?
+ nsCOMPtr<nsIFile> profileDir;
+
+ nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir));
+ NS_ENSURE_SUCCESS(rv, rv);
+ rv = profileDir->Append(NS_LITERAL_STRING("training.dat"));
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void **) aTrainingFile);
+}
+
+nsresult CorpusStore::getTraitFile(nsIFile ** aTraitFile)
+{
+ // should we cache the profile manager's directory?
+ nsCOMPtr<nsIFile> profileDir;
+
+ nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_50_DIR, getter_AddRefs(profileDir));
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ rv = profileDir->Append(NS_LITERAL_STRING("traits.dat"));
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ return profileDir->QueryInterface(NS_GET_IID(nsIFile), (void **) aTraitFile);
+}
+
+static const char kMagicCookie[] = { '\xFE', '\xED', '\xFA', '\xCE' };
+
+// random string used to identify trait file and version (last byte is version)
+static const char kTraitCookie[] = { '\xFC', '\xA9', '\x36', '\x01' };
+
+void CorpusStore::writeTrainingData(uint32_t aMaximumTokenCount)
+{
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("writeTrainingData() entered"));
+ if (!mTrainingFile)
+ return;
+
+ /*
+ * For backwards compatibility, write the good and junk tokens to
+ * training.dat; additional traits are added to a different file
+ */
+
+ // open the file, and write out training data
+ FILE* stream;
+ nsresult rv = mTrainingFile->OpenANSIFileDesc("wb", &stream);
+ if (NS_FAILED(rv))
+ return;
+
+ // If the number of tokens exceeds our limit, set the shrink flag
+ bool shrink = false;
+ if ((aMaximumTokenCount > 0) && // if 0, do not limit tokens
+ (countTokens() > aMaximumTokenCount))
+ {
+ shrink = true;
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Warning, ("shrinking token data file"));
+ }
+
+ // We implement shrink by dividing counts by two
+ uint32_t shrinkFactor = shrink ? 2 : 1;
+
+ if (!((fwrite(kMagicCookie, sizeof(kMagicCookie), 1, stream) == 1) &&
+ (writeUInt32(stream, getMessageCount(kGoodTrait) / shrinkFactor)) &&
+ (writeUInt32(stream, getMessageCount(kJunkTrait) / shrinkFactor)) &&
+ writeTokens(stream, shrink, kGoodTrait) &&
+ writeTokens(stream, shrink, kJunkTrait)))
+ {
+ NS_WARNING("failed to write training data.");
+ fclose(stream);
+ // delete the training data file, since it is potentially corrupt.
+ mTrainingFile->Remove(false);
+ }
+ else
+ {
+ fclose(stream);
+ }
+
+ /*
+ * Write the remaining data to a second file traits.dat
+ */
+
+ if (!mTraitFile)
+ {
+ getTraitFile(getter_AddRefs(mTraitFile));
+ if (!mTraitFile)
+ return;
+ }
+
+ // open the file, and write out training data
+ rv = mTraitFile->OpenANSIFileDesc("wb", &stream);
+ if (NS_FAILED(rv))
+ return;
+
+ uint32_t numberOfTraits = mMessageCounts.Length();
+ bool error;
+ while (1) // break on error or done
+ {
+ if ((error = (fwrite(kTraitCookie, sizeof(kTraitCookie), 1, stream) != 1)))
+ break;
+
+ for (uint32_t index = 0; index < numberOfTraits; index++)
+ {
+ uint32_t trait = mMessageCountsId[index];
+ if (trait == 1 || trait == 2)
+ continue; // junk traits are stored in training.dat
+ if ((error = (writeUInt32(stream, trait) != 1)))
+ break;
+ if ((error = (writeUInt32(stream, mMessageCounts[index] / shrinkFactor) != 1)))
+ break;
+ if ((error = !writeTokens(stream, shrink, trait)))
+ break;
+ }
+ break;
+ }
+ // we add a 0 at the end to represent end of trait list
+ error = writeUInt32(stream, 0) != 1;
+
+ fclose(stream);
+ if (error)
+ {
+ NS_WARNING("failed to write trait data.");
+ // delete the trait data file, since it is probably corrupt.
+ mTraitFile->Remove(false);
+ }
+
+ if (shrink)
+ {
+ // We'll clear the tokens, and read them back in from the file.
+ // Yes this is slower than in place, but this is a rare event.
+
+ if (countTokens())
+ {
+ clearTokens();
+ for (uint32_t index = 0; index < numberOfTraits; index++)
+ mMessageCounts[index] = 0;
+ }
+
+ readTrainingData();
+ }
+}
+
+void CorpusStore::readTrainingData()
+{
+
+ /*
+ * To maintain backwards compatibility, good and junk traits
+ * are stored in a file "training.dat"
+ */
+ if (!mTrainingFile)
+ return;
+
+ bool exists;
+ nsresult rv = mTrainingFile->Exists(&exists);
+ if (NS_FAILED(rv) || !exists)
+ return;
+
+ FILE* stream;
+ rv = mTrainingFile->OpenANSIFileDesc("rb", &stream);
+ if (NS_FAILED(rv))
+ return;
+
+ int64_t fileSize;
+ rv = mTrainingFile->GetFileSize(&fileSize);
+ if (NS_FAILED(rv))
+ return;
+
+ // FIXME: should make sure that the tokenizers are empty.
+ char cookie[4];
+ uint32_t goodMessageCount = 0, junkMessageCount = 0;
+ if (!((fread(cookie, sizeof(cookie), 1, stream) == 1) &&
+ (memcmp(cookie, kMagicCookie, sizeof(cookie)) == 0) &&
+ (readUInt32(stream, &goodMessageCount) == 1) &&
+ (readUInt32(stream, &junkMessageCount) == 1) &&
+ readTokens(stream, fileSize, kGoodTrait, true) &&
+ readTokens(stream, fileSize, kJunkTrait, true))) {
+ NS_WARNING("failed to read training data.");
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("failed to read training data."));
+ }
+ setMessageCount(kGoodTrait, goodMessageCount);
+ setMessageCount(kJunkTrait, junkMessageCount);
+
+ fclose(stream);
+
+ /*
+ * Additional traits are stored in traits.dat
+ */
+
+ if (!mTraitFile)
+ {
+ getTraitFile(getter_AddRefs(mTraitFile));
+ if (!mTraitFile)
+ return;
+ }
+
+ rv = mTraitFile->Exists(&exists);
+ if (NS_FAILED(rv) || !exists)
+ return;
+
+ rv = UpdateData(mTraitFile, true, 0, nullptr, nullptr);
+
+ if (NS_FAILED(rv))
+ {
+ NS_WARNING("failed to read training data.");
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Error, ("failed to read training data."));
+ }
+ return;
+}
+
+nsresult CorpusStore::resetTrainingData()
+{
+ // clear out our in memory training tokens...
+ if (countTokens())
+ clearTokens();
+
+ uint32_t length = mMessageCounts.Length();
+ for (uint32_t index = 0 ; index < length; index++)
+ mMessageCounts[index] = 0;
+
+ if (mTrainingFile)
+ mTrainingFile->Remove(false);
+ if (mTraitFile)
+ mTraitFile->Remove(false);
+ return NS_OK;
+}
+
+inline CorpusToken* CorpusStore::get(const char* word)
+{
+ return static_cast<CorpusToken*>(TokenHash::get(word));
+}
+
+nsresult CorpusStore::updateTrait(CorpusToken* token, uint32_t aTraitId,
+ int32_t aCountChange)
+{
+ NS_ENSURE_ARG_POINTER(token);
+ uint32_t nextLink = token->mTraitLink;
+ uint32_t lastLink = 0;
+
+ uint32_t linkCount, maxLinks = 100; //sanity check
+ for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++)
+ {
+ TraitPerToken& traitPT = mTraitStore[nextLink];
+ if (traitPT.mId == aTraitId)
+ {
+ // be careful with signed versus unsigned issues here
+ if (static_cast<int32_t>(traitPT.mCount) + aCountChange > 0)
+ traitPT.mCount += aCountChange;
+ else
+ traitPT.mCount = 0;
+ // we could delete zero count traits here, but let's not. It's rare anyway.
+ return NS_OK;
+ }
+ lastLink = nextLink;
+ nextLink = traitPT.mNextLink;
+ }
+ if (linkCount >= maxLinks)
+ return NS_ERROR_FAILURE;
+
+ // trait does not exist, so add it
+
+ if (aCountChange > 0) // don't set a negative count
+ {
+ TraitPerToken traitPT(aTraitId, aCountChange);
+ if (mTraitStore.Length() == mNextTraitIndex)
+ mTraitStore.InsertElementAt(mNextTraitIndex, traitPT);
+ else if (mTraitStore.Length() > mNextTraitIndex)
+ mTraitStore.ReplaceElementsAt(mNextTraitIndex, 1, traitPT);
+ else
+ return NS_ERROR_FAILURE;
+ if (lastLink)
+ // the token had a parent, so update it
+ mTraitStore[lastLink].mNextLink = mNextTraitIndex;
+ else
+ // need to update the token's root link
+ token->mTraitLink = mNextTraitIndex;
+ mNextTraitIndex++;
+ }
+ return NS_OK;
+}
+
+uint32_t CorpusStore::getTraitCount(CorpusToken* token, uint32_t aTraitId)
+{
+ uint32_t nextLink;
+ if (!token || !(nextLink = token->mTraitLink))
+ return 0;
+
+ uint32_t linkCount, maxLinks = 100; //sanity check
+ for (linkCount = 0; nextLink && linkCount < maxLinks; linkCount++)
+ {
+ TraitPerToken& traitPT = mTraitStore[nextLink];
+ if (traitPT.mId == aTraitId)
+ return traitPT.mCount;
+ nextLink = traitPT.mNextLink;
+ }
+ NS_ASSERTION(linkCount < maxLinks, "Corrupt trait count store");
+
+ // trait not found (or error), so count is zero
+ return 0;
+}
+
+CorpusToken* CorpusStore::add(const char* word, uint32_t aTraitId, uint32_t aCount)
+{
+ CorpusToken* token = static_cast<CorpusToken*>(TokenHash::add(word));
+ if (token) {
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+ ("adding word to corpus store: %s (Trait=%d) (deltaCount=%d)",
+ word, aTraitId, aCount));
+ updateTrait(token, aTraitId, aCount);
+ }
+ return token;
+ }
+
+void CorpusStore::remove(const char* word, uint32_t aTraitId, uint32_t aCount)
+{
+ MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug,
+ ("remove word: %s (TraitId=%d) (Count=%d)",
+ word, aTraitId, aCount));
+ CorpusToken* token = get(word);
+ if (token)
+ updateTrait(token, aTraitId, -static_cast<int32_t>(aCount));
+}
+
+uint32_t CorpusStore::getMessageCount(uint32_t aTraitId)
+{
+ size_t index = mMessageCountsId.IndexOf(aTraitId);
+ if (index == mMessageCountsId.NoIndex)
+ return 0;
+ return mMessageCounts.ElementAt(index);
+}
+
+void CorpusStore::setMessageCount(uint32_t aTraitId, uint32_t aCount)
+{
+ size_t index = mMessageCountsId.IndexOf(aTraitId);
+ if (index == mMessageCountsId.NoIndex)
+ {
+ mMessageCounts.AppendElement(aCount);
+ mMessageCountsId.AppendElement(aTraitId);
+ }
+ else
+ {
+ mMessageCounts[index] = aCount;
+ }
+}
+
+nsresult
+CorpusStore::UpdateData(nsIFile *aFile,
+ bool aIsAdd,
+ uint32_t aRemapCount,
+ uint32_t *aFromTraits,
+ uint32_t *aToTraits)
+{
+ NS_ENSURE_ARG_POINTER(aFile);
+ if (aRemapCount)
+ {
+ NS_ENSURE_ARG_POINTER(aFromTraits);
+ NS_ENSURE_ARG_POINTER(aToTraits);
+ }
+
+ int64_t fileSize;
+ nsresult rv = aFile->GetFileSize(&fileSize);
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ FILE* stream;
+ rv = aFile->OpenANSIFileDesc("rb", &stream);
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ bool error;
+ do // break on error or done
+ {
+ char cookie[4];
+ if ((error = (fread(cookie, sizeof(cookie), 1, stream) != 1)))
+ break;
+
+ if ((error = memcmp(cookie, kTraitCookie, sizeof(cookie))))
+ break;
+
+ uint32_t fileTrait;
+ while ( !(error = (readUInt32(stream, &fileTrait) != 1)) && fileTrait)
+ {
+ uint32_t count;
+ if ((error = (readUInt32(stream, &count) != 1)))
+ break;
+
+ uint32_t localTrait = fileTrait;
+ // remap the trait
+ for (uint32_t i = 0; i < aRemapCount; i++)
+ {
+ if (aFromTraits[i] == fileTrait)
+ localTrait = aToTraits[i];
+ }
+
+ uint32_t messageCount = getMessageCount(localTrait);
+ if (aIsAdd)
+ messageCount += count;
+ else if (count > messageCount)
+ messageCount = 0;
+ else
+ messageCount -= count;
+ setMessageCount(localTrait, messageCount);
+
+ if ((error = !readTokens(stream, fileSize, localTrait, aIsAdd)))
+ break;
+ }
+ break;
+ } while (0);
+
+ fclose(stream);
+
+ if (error)
+ return NS_ERROR_FAILURE;
+ return NS_OK;
+}
+
+nsresult CorpusStore::ClearTrait(uint32_t aTrait)
+{
+ // clear message counts
+ setMessageCount(aTrait, 0);
+
+ TokenEnumeration tokens = getTokens();
+ while (tokens.hasMoreTokens())
+ {
+ CorpusToken* token = static_cast<CorpusToken*>(tokens.nextToken());
+ int32_t wordCount = static_cast<int32_t>(getTraitCount(token, aTrait));
+ updateTrait(token, aTrait, -wordCount);
+ }
+ return NS_OK;
+}
diff --git a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
new file mode 100644
index 000000000..32a9d26d0
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilter.h
@@ -0,0 +1,404 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsBayesianFilter_h__
+#define nsBayesianFilter_h__
+
+#include <stdio.h>
+#include "nsCOMPtr.h"
+#include "nsIMsgFilterPlugin.h"
+#include "nsISemanticUnitScanner.h"
+#include "PLDHashTable.h"
+#include "nsITimer.h"
+#include "nsTArray.h"
+#include "nsStringGlue.h"
+#include "nsWeakReference.h"
+#include "nsIObserver.h"
+
+// XXX can't simply byte align arenas, must at least 2-byte align.
+#define PL_ARENA_CONST_ALIGN_MASK 1
+#include "plarena.h"
+
+#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES 15*60*1000
+
+struct Token;
+class TokenEnumeration;
+class TokenAnalyzer;
+class nsIMsgWindow;
+class nsIMimeHeaders;
+class nsIUTF8StringEnumerator;
+struct BaseToken;
+struct CorpusToken;
+
+/**
+ * Helper class to enumerate Token objects in a PLDHashTable
+ * safely and without copying (see bugzilla #174859). The
+ * enumeration is safe to use until an Add()
+ * or Remove() is performed on the table.
+ */
+class TokenEnumeration {
+public:
+ TokenEnumeration(PLDHashTable* table);
+ bool hasMoreTokens();
+ BaseToken* nextToken();
+
+private:
+ PLDHashTable::Iterator mIterator;
+};
+
+// A trait is some aspect of a message, like being junk or tagged as
+// Personal, that the statistical classifier should track. The Trait
+// structure is a per-token representation of information pertaining to
+// a message trait.
+//
+// Traits per token are maintained as a linked list.
+//
+struct TraitPerToken
+{
+ uint32_t mId; // identifying number for a trait
+ uint32_t mCount; // count of messages with this token and trait
+ uint32_t mNextLink; // index in mTraitStore for the next trait, or 0
+ // for none
+ TraitPerToken(uint32_t aId, uint32_t aCount); // inititializer
+};
+
+// An Analysis is the statistical results for a particular message, a
+// particular token, and for a particular pair of trait/antitrait, that
+// is then used in subsequent analysis to score the message.
+//
+// Analyses per token are maintained as a linked list.
+//
+struct AnalysisPerToken
+{
+ uint32_t mTraitIndex; // index representing a protrait/antitrait pair.
+ // So if we are analyzing 3 different traits, then
+ // the first trait is 0, the second 1, etc.
+ double mDistance; // absolute value of mProbability - 0.5
+ double mProbability; // relative indicator of match of trait to token
+ uint32_t mNextLink; // index in mAnalysisStore for the Analysis object
+ // for the next trait index, or 0 for none.
+ // initializer
+ AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);
+};
+
+class TokenHash {
+public:
+
+ virtual ~TokenHash();
+ /**
+ * Clears out the previous message tokens.
+ */
+ nsresult clearTokens();
+ uint32_t countTokens();
+ TokenEnumeration getTokens();
+ BaseToken* add(const char* word);
+
+protected:
+ TokenHash(uint32_t entrySize);
+ PLArenaPool mWordPool;
+ uint32_t mEntrySize;
+ PLDHashTable mTokenTable;
+ char* copyWord(const char* word, uint32_t len);
+ BaseToken* get(const char* word);
+};
+
+class Tokenizer: public TokenHash {
+public:
+ Tokenizer();
+ ~Tokenizer();
+
+ Token* get(const char* word);
+
+ // The training set keeps an occurrence count on each word. This count
+ // is supposed to count the # of messsages it occurs in.
+ // When add/remove is called while tokenizing a message and NOT the training set,
+ //
+ Token* add(const char* word, uint32_t count = 1);
+
+ Token* copyTokens();
+
+ void tokenize(const char* text);
+
+ /**
+ * Creates specific tokens based on the mime headers for the message being tokenized
+ */
+ void tokenizeHeaders(nsIUTF8StringEnumerator * aHeaderNames, nsIUTF8StringEnumerator * aHeaderValues);
+
+ void tokenizeAttachment(const char * aContentType, const char * aFileName);
+
+ nsCString mBodyDelimiters; // delimiters for body tokenization
+ nsCString mHeaderDelimiters; // delimiters for header tokenization
+
+ // arrays of extra headers to tokenize / to not tokenize
+ nsTArray<nsCString> mEnabledHeaders;
+ nsTArray<nsCString> mDisabledHeaders;
+ // Delimiters used in tokenizing a particular header.
+ // Parallel array to mEnabledHeaders
+ nsTArray<nsCString> mEnabledHeadersDelimiters;
+ bool mCustomHeaderTokenization; // Are there any preference-set tokenization customizations?
+ uint32_t mMaxLengthForToken; // maximum length of a token
+ // should we convert iframe to div during tokenization?
+ bool mIframeToDiv;
+
+private:
+
+ void tokenize_ascii_word(char * word);
+ void tokenize_japanese_word(char* chunk);
+ inline void addTokenForHeader(const char * aTokenPrefix, nsACString& aValue,
+ bool aTokenizeValue = false, const char* aDelimiters = nullptr);
+ nsresult stripHTML(const nsAString& inString, nsAString& outString);
+ // helper function to escape \n, \t, etc from a CString
+ void UnescapeCString(nsCString& aCString);
+
+private:
+ nsCOMPtr<nsISemanticUnitScanner> mScanner;
+};
+
+/**
+ * Implements storage of a collection of message tokens and counts for
+ * a corpus of classified messages
+ */
+
+class CorpusStore: public TokenHash {
+public:
+ CorpusStore();
+ ~CorpusStore();
+
+ /**
+ * retrieve the token structure for a particular string
+ *
+ * @param word the character representation of the token
+ *
+ * @return token structure containing counts, null if not found
+ */
+ CorpusToken* get(const char* word);
+
+ /**
+ * add tokens to the storage, or increment counts if already exists.
+ *
+ * @param aTokenizer tokenizer for the list of tokens to remember
+ * @param aTraitId id for the trait whose counts will be remembered
+ * @param aCount number of new messages represented by the token list
+ */
+ void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * decrement counts for tokens in the storage, removing if all counts
+ * are zero
+ *
+ * @param aTokenizer tokenizer for the list of tokens to forget
+ * @param aTraitId id for the trait whose counts will be removed
+ * @param aCount number of messages represented by the token list
+ */
+ void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * write the corpus information to file storage
+ *
+ * @param aMaximumTokenCount prune tokens if number of tokens exceeds
+ * this value. == 0 for no pruning
+ */
+ void writeTrainingData(uint32_t aMaximumTokenCount);
+
+ /**
+ * read the corpus information from file storage
+ */
+ void readTrainingData();
+
+ /**
+ * delete the local corpus storage file and data
+ */
+ nsresult resetTrainingData();
+
+ /**
+ * get the count of messages whose tokens are stored that are associated
+ * with a trait
+ *
+ * @param aTraitId identifier for the trait
+ * @return number of messages for that trait
+ */
+ uint32_t getMessageCount(uint32_t aTraitId);
+
+ /**
+ * set the count of messages whose tokens are stored that are associated
+ * with a trait
+ *
+ * @param aTraitId identifier for the trait
+ * @param aCount number of messages for that trait
+ */
+ void setMessageCount(uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * get the count of messages associated with a particular token and trait
+ *
+ * @param token the token string and associated counts
+ * @param aTraitId identifier for the trait
+ */
+ uint32_t getTraitCount(CorpusToken *token, uint32_t aTraitId);
+
+ /**
+ * Add (or remove) data from a particular file to the corpus data.
+ *
+ * @param aFile the file with the data, in the format:
+ *
+ * Format of the trait file for version 1:
+ * [0xFCA93601] (the 01 is the version)
+ * for each trait to write:
+ * [id of trait to write] (0 means end of list)
+ * [number of messages per trait]
+ * for each token with non-zero count
+ * [count]
+ * [length of word]word
+ *
+ * @param aIsAdd should the data be added, or removed? true if adding,
+ * else removing.
+ *
+ * @param aRemapCount number of items in the parallel arrays aFromTraits,
+ * aToTraits. These arrays allow conversion of the
+ * trait id stored in the file (which may be originated
+ * externally) to the trait id used in the local corpus
+ * (which is defined locally using nsIMsgTraitService).
+ *
+ * @param aFromTraits array of trait ids used in aFile. If aFile contains
+ * trait ids that are not in this array, they are not
+ * remapped, but assummed to be local trait ids.
+ *
+ * @param aToTraits array of trait ids, corresponding to elements of
+ * aFromTraits, that represent the local trait ids to be
+ * used in storing data from aFile into the local corpus.
+ *
+ */
+ nsresult UpdateData(nsIFile *aFile, bool aIsAdd,
+ uint32_t aRemapCount, uint32_t *aFromTraits,
+ uint32_t *aToTraits);
+
+ /**
+ * remove all counts (message and tokens) for a trait id
+ *
+ * @param aTrait trait id for the trait to remove
+ */
+ nsresult ClearTrait(uint32_t aTrait);
+
+protected:
+
+ /**
+ * return the local corpus storage file for junk traits
+ */
+ nsresult getTrainingFile(nsIFile ** aFile);
+
+ /**
+ * return the local corpus storage file for non-junk traits
+ */
+ nsresult getTraitFile(nsIFile ** aFile);
+
+ /**
+ * read token strings from the data file
+ *
+ * @param stream file stream with token data
+ * @param fileSize file size
+ * @param aTraitId id for the trait whose counts will be read
+ * @param aIsAdd true to add the counts, false to remove them
+ *
+ * @return true if successful, false if error
+ */
+ bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
+ bool aIsAdd);
+
+ /**
+ * write token strings to the data file
+ */
+ bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);
+
+ /**
+ * remove counts for a token string
+ */
+ void remove(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * add counts for a token string, adding the token string if new
+ */
+ CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);
+
+ /**
+ * change counts in a trait in the traits array, adding the trait if needed
+ */
+ nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
+ int32_t aCountChange);
+ nsCOMPtr<nsIFile> mTrainingFile; // file used to store junk training data
+ nsCOMPtr<nsIFile> mTraitFile; // file used to store non-junk
+ // training data
+ nsTArray<TraitPerToken> mTraitStore; // memory for linked-list of counts
+ uint32_t mNextTraitIndex; // index in mTraitStore to first empty
+ // TraitPerToken
+ nsTArray<uint32_t> mMessageCounts; // count of messages per trait
+ // represented in the store
+ nsTArray<uint32_t> mMessageCountsId; // Parallel array to mMessageCounts, with
+ // the corresponding trait ID
+};
+
+class nsBayesianFilter : public nsIJunkMailPlugin, nsIMsgCorpus,
+ nsIObserver, nsSupportsWeakReference {
+public:
+ NS_DECL_ISUPPORTS
+ NS_DECL_NSIMSGFILTERPLUGIN
+ NS_DECL_NSIJUNKMAILPLUGIN
+ NS_DECL_NSIMSGCORPUS
+ NS_DECL_NSIOBSERVER
+
+ nsBayesianFilter();
+
+ nsresult Init();
+
+ nsresult tokenizeMessage(const char* messageURI, nsIMsgWindow *aMsgWindow, TokenAnalyzer* analyzer);
+ void classifyMessage(Tokenizer& tokens, const char* messageURI,
+ nsIJunkMailClassificationListener* listener);
+
+ void classifyMessage(
+ Tokenizer& tokenizer,
+ const char* messageURI,
+ nsTArray<uint32_t>& aProTraits,
+ nsTArray<uint32_t>& aAntiTraits,
+ nsIJunkMailClassificationListener* listener,
+ nsIMsgTraitClassificationListener* aTraitListener,
+ nsIMsgTraitDetailListener* aDetailListener);
+
+ void observeMessage(Tokenizer& tokens, const char* messageURI,
+ nsTArray<uint32_t>& oldClassifications,
+ nsTArray<uint32_t>& newClassifications,
+ nsIJunkMailClassificationListener* listener,
+ nsIMsgTraitClassificationListener* aTraitListener);
+
+
+protected:
+ virtual ~nsBayesianFilter();
+
+ static void TimerCallback(nsITimer* aTimer, void* aClosure);
+
+ CorpusStore mCorpus;
+ double mJunkProbabilityThreshold;
+ int32_t mMaximumTokenCount;
+ bool mTrainingDataDirty;
+ int32_t mMinFlushInterval; // in milliseconds, must be positive
+ //and not too close to 0
+ nsCOMPtr<nsITimer> mTimer;
+
+ // index in mAnalysisStore for first empty AnalysisPerToken
+ uint32_t mNextAnalysisIndex;
+ // memory for linked list of AnalysisPerToken objects
+ nsTArray<AnalysisPerToken> mAnalysisStore;
+ /**
+ * Determine the location in mAnalysisStore where the AnalysisPerToken
+ * object for a particular token and trait is stored
+ */
+ uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);
+ /**
+ * Set the value of the AnalysisPerToken object for a particular
+ * token and trait
+ */
+ nsresult setAnalysis(Token& token, uint32_t aTraitIndex,
+ double aDistance, double aProbability);
+};
+
+#endif // _nsBayesianFilter_h__
diff --git a/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilterCID.h b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilterCID.h
new file mode 100644
index 000000000..b1a54a1fc
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsBayesianFilterCID.h
@@ -0,0 +1,22 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsBayesianFilterCID_h__
+#define nsBayesianFilterCID_h__
+
+#include "nsISupports.h"
+#include "nsIFactory.h"
+#include "nsIComponentManager.h"
+
+#include "nsIMsgMdnGenerator.h"
+
+#define NS_BAYESIANFILTER_CONTRACTID \
+ "@mozilla.org/messenger/filter-plugin;1?name=bayesianfilter"
+#define NS_BAYESIANFILTER_CID \
+{ /* F1070BFA-D539-11D6-90CA-00039310A47A */ \
+ 0xF1070BFA, 0xD539, 0x11D6, \
+ { 0x90, 0xCA, 0x00, 0x03, 0x93, 0x10, 0xA4, 0x7A }}
+
+#endif /* nsBayesianFilterCID_h__ */
diff --git a/mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h b/mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h
new file mode 100644
index 000000000..1f7388b16
--- /dev/null
+++ b/mailnews/extensions/bayesian-spam-filter/src/nsIncompleteGamma.h
@@ -0,0 +1,259 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef nsIncompleteGamma_h__
+#define nsIncompleteGamma_h__
+
+/* An implementation of the incomplete gamma functions for real
+ arguments. P is defined as
+
+ x
+ /
+ 1 [ a - 1 - t
+ P(a, x) = -------- I t e dt
+ Gamma(a) ]
+ /
+ 0
+
+ and
+
+ infinity
+ /
+ 1 [ a - 1 - t
+ Q(a, x) = -------- I t e dt
+ Gamma(a) ]
+ /
+ x
+
+ so that P(a,x) + Q(a,x) = 1.
+
+ Both a series expansion and a continued fraction exist. This
+ implementation uses the more efficient method based on the arguments.
+
+ Either case involves calculating a multiplicative term:
+ e^(-x)*x^a/Gamma(a).
+ Here we calculate the log of this term. Most math libraries have a
+ "lgamma" function but it is not re-entrant. Some libraries have a
+ "lgamma_r" which is re-entrant. Use it if possible. I have included a
+ simple replacement but it is certainly not as accurate.
+
+ Relative errors are almost always < 1e-10 and usually < 1e-14. Very
+ small and very large arguments cause trouble.
+
+ The region where a < 0.5 and x < 0.5 has poor error properties and is
+ not too stable. Get a better routine if you need results in this
+ region.
+
+ The error argument will be set negative if there is a domain error or
+ positive for an internal calculation error, currently lack of
+ convergence. A value is always returned, though.
+
+ */
+
+#include <math.h>
+#include <float.h>
+
+// the main routine
+static double nsIncompleteGammaP (double a, double x, int *error);
+
+// nsLnGamma(z): either a wrapper around lgamma_r or the internal function.
+// C_m = B[2*m]/(2*m*(2*m-1)) where B is a Bernoulli number
+static const double C_1 = 1.0 / 12.0;
+static const double C_2 = -1.0 / 360.0;
+static const double C_3 = 1.0 / 1260.0;
+static const double C_4 = -1.0 / 1680.0;
+static const double C_5 = 1.0 / 1188.0;
+static const double C_6 = -691.0 / 360360.0;
+static const double C_7 = 1.0 / 156.0;
+static const double C_8 = -3617.0 / 122400.0;
+static const double C_9 = 43867.0 / 244188.0;
+static const double C_10 = -174611.0 / 125400.0;
+static const double C_11 = 77683.0 / 5796.0;
+
+// truncated asymptotic series in 1/z
+static inline double lngamma_asymp (double z)
+{
+ double w, w2, sum;
+ w = 1.0 / z;
+ w2 = w * w;
+ sum = w * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2
+ * (C_11 * w2 + C_10) + C_9) + C_8) + C_7) + C_6)
+ + C_5) + C_4) + C_3) + C_2) + C_1);
+
+ return sum;
+}
+
+struct fact_table_s
+{
+ double fact;
+ double lnfact;
+};
+
+// for speed and accuracy
+static const struct fact_table_s FactTable[] = {
+ {1.000000000000000, 0.0000000000000000000000e+00},
+ {1.000000000000000, 0.0000000000000000000000e+00},
+ {2.000000000000000, 6.9314718055994530942869e-01},
+ {6.000000000000000, 1.7917594692280550007892e+00},
+ {24.00000000000000, 3.1780538303479456197550e+00},
+ {120.0000000000000, 4.7874917427820459941458e+00},
+ {720.0000000000000, 6.5792512120101009952602e+00},
+ {5040.000000000000, 8.5251613610654142999881e+00},
+ {40320.00000000000, 1.0604602902745250228925e+01},
+ {362880.0000000000, 1.2801827480081469610995e+01},
+ {3628800.000000000, 1.5104412573075515295248e+01},
+ {39916800.00000000, 1.7502307845873885839769e+01},
+ {479001600.0000000, 1.9987214495661886149228e+01},
+ {6227020800.000000, 2.2552163853123422886104e+01},
+ {87178291200.00000, 2.5191221182738681499610e+01},
+ {1307674368000.000, 2.7899271383840891566988e+01},
+ {20922789888000.00, 3.0671860106080672803835e+01},
+ {355687428096000.0, 3.3505073450136888885825e+01},
+ {6402373705728000., 3.6395445208033053576674e+01}
+};
+#define FactTableLength (int)(sizeof(FactTable)/sizeof(FactTable[0]))
+
+// for speed
+static const double ln_2pi_2 = 0.918938533204672741803; // log(2*PI)/2
+
+/* A simple lgamma function, not very robust.
+
+ Valid for z_in > 0 ONLY.
+
+ For z_in > 8 precision is quite good, relative errors < 1e-14 and
+ usually better. For z_in < 8 relative errors increase but are usually
+ < 1e-10. In two small regions, 1 +/- .001 and 2 +/- .001 errors
+ increase quickly.
+*/
+static double nsLnGamma (double z_in, int *gsign)
+{
+ double scale, z, sum, result;
+ *gsign = 1;
+
+ int zi = (int) z_in;
+ if (z_in == (double) zi)
+ {
+ if (0 < zi && zi <= FactTableLength)
+ return FactTable[zi - 1].lnfact; // gamma(z) = (z-1)!
+ }
+
+ for (scale = 1.0, z = z_in; z < 8.0; ++z)
+ scale *= z;
+
+ sum = lngamma_asymp (z);
+ result = (z - 0.5) * log (z) - z + ln_2pi_2 - log (scale);
+ result += sum;
+ return result;
+}
+
+// log( e^(-x)*x^a/Gamma(a) )
+static inline double lnPQfactor (double a, double x)
+{
+ int gsign; // ignored because a > 0
+ return a * log (x) - x - nsLnGamma (a, &gsign);
+}
+
+static double Pseries (double a, double x, int *error)
+{
+ double sum, term;
+ const double eps = 2.0 * DBL_EPSILON;
+ const int imax = 5000;
+ int i;
+
+ sum = term = 1.0 / a;
+ for (i = 1; i < imax; ++i)
+ {
+ term *= x / (a + i);
+ sum += term;
+ if (fabs (term) < eps * fabs (sum))
+ break;
+ }
+
+ if (i >= imax)
+ *error = 1;
+
+ return sum;
+}
+
+static double Qcontfrac (double a, double x, int *error)
+{
+ double result, D, C, e, f, term;
+ const double eps = 2.0 * DBL_EPSILON;
+ const double small =
+ DBL_EPSILON * DBL_EPSILON * DBL_EPSILON * DBL_EPSILON;
+ const int imax = 5000;
+ int i;
+
+ // modified Lentz method
+ f = x - a + 1.0;
+ if (fabs (f) < small)
+ f = small;
+ C = f + 1.0 / small;
+ D = 1.0 / f;
+ result = D;
+ for (i = 1; i < imax; ++i)
+ {
+ e = i * (a - i);
+ f += 2.0;
+ D = f + e * D;
+ if (fabs (D) < small)
+ D = small;
+ D = 1.0 / D;
+ C = f + e / C;
+ if (fabs (C) < small)
+ C = small;
+ term = C * D;
+ result *= term;
+ if (fabs (term - 1.0) < eps)
+ break;
+ }
+
+ if (i >= imax)
+ *error = 1;
+ return result;
+}
+
+static double nsIncompleteGammaP (double a, double x, int *error)
+{
+ double result, dom, ldom;
+ // domain errors. the return values are meaningless but have
+ // to return something.
+ *error = -1;
+ if (a <= 0.0)
+ return 1.0;
+ if (x < 0.0)
+ return 0.0;
+ *error = 0;
+ if (x == 0.0)
+ return 0.0;
+
+ ldom = lnPQfactor (a, x);
+ dom = exp (ldom);
+ // might need to adjust the crossover point
+ if (a <= 0.5)
+ {
+ if (x < a + 1.0)
+ result = dom * Pseries (a, x, error);
+ else
+ result = 1.0 - dom * Qcontfrac (a, x, error);
+ }
+ else
+ {
+ if (x < a)
+ result = dom * Pseries (a, x, error);
+ else
+ result = 1.0 - dom * Qcontfrac (a, x, error);
+ }
+
+ // not clear if this can ever happen
+ if (result > 1.0)
+ result = 1.0;
+ if (result < 0.0)
+ result = 0.0;
+ return result;
+}
+
+#endif
+