path: root/mailnews/db/gloda/modules/msg_search.js
author     Matt A. Tobin <email@mattatobin.com>   2019-11-03 00:17:46 -0400
committer  Matt A. Tobin <email@mattatobin.com>   2019-11-03 00:17:46 -0400
commit     302bf1b523012e11b60425d6eee1221ebc2724eb (patch)
tree       b191a895f8716efcbe42f454f37597a545a6f421 /mailnews/db/gloda/modules/msg_search.js
parent     21b3f6247403c06f85e1f45d219f87549862198f (diff)
Issue #1258 - Part 1: Import mailnews, ldap, and mork from comm-esr52.9.1
Diffstat (limited to 'mailnews/db/gloda/modules/msg_search.js')
-rw-r--r--   mailnews/db/gloda/modules/msg_search.js   346
1 file changed, 346 insertions(+), 0 deletions(-)
diff --git a/mailnews/db/gloda/modules/msg_search.js b/mailnews/db/gloda/modules/msg_search.js
new file mode 100644
index 000000000..8ba854406
--- /dev/null
+++ b/mailnews/db/gloda/modules/msg_search.js
@@ -0,0 +1,346 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+this.EXPORTED_SYMBOLS = ["GlodaMsgSearcher"];
+
+var Cc = Components.classes;
+var Ci = Components.interfaces;
+var Cr = Components.results;
+var Cu = Components.utils;
+
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource:///modules/gloda/public.js");
+
+/**
+ * How much time boost should a 'score point' amount to? The authoritative,
+ * incontrovertible answer, across all time and space, is a week.
+ * Note that gloda stores timestamps as PRTimes for no exceedingly good
+ * reason.
+ */
+var FUZZSCORE_TIMESTAMP_FACTOR = 1000 * 1000 * 60 * 60 * 24 * 7;
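+// Unit check: (1000 * 1000 microseconds/second) * (60 * 60 * 24 * 7 seconds
+// per week) = 604,800,000,000 microseconds, i.e. one week in PRTime units.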
+
+var RANK_USAGE =
+ "glodaRank(matchinfo(messagesText), 1.0, 2.0, 2.0, 1.5, 1.5)";
+
+var DASCORE =
+ "(((" + RANK_USAGE + " + messages.notability) * " +
+ FUZZSCORE_TIMESTAMP_FACTOR +
+ ") + messages.date)";
+
+/**
+ * A new optimization decision we are making is that we do not want to carry
+ * around any data in our ephemeral tables that is not used for whittling the
+ * result set. The idea is that the btree page cache or OS cache is going to
+ * save us from the disk seeks and carrying around the extra data is just going
+ * to be CPU/memory churn that slows us down.
+ *
+ * Additionally, we try to avoid row lookups that would have their results
+ * discarded by the LIMIT. Because of limitations in FTS3 (which might
+ * be addressed in FTS4 by a feature request), we can't avoid the 'messages'
+ * lookup since that has the message's date and static notability but we can
+ * defer the 'messagesText' lookup.
+ *
+ * This is the access pattern we are after here:
+ * 1) Order the matches with minimized lookup and result storage costs.
+ * - The innermost MATCH does the doclist magic and provides us with
+ * matchinfo() support which does not require content row retrieval
+ * from messagesText. Unfortunately, this is not enough to whittle anything
+ * because we still need static interestingness, so...
+ * - Based on the match, we retrieve the date and notability for that row
+ *   from 'messages', using them in conjunction with matchinfo() to compute
+ *   a score that we can then use to LIMIT our results.
+ * 2) We reissue the MATCH query so that we will be able to use offsets(), but
+ * we intersect the results of this MATCH against our LIMITed results from
+ * step 1.
+ * - We use 'docid IN (phase 1 query)' to accomplish this because it results
+ *   in an efficient lookup. If we just use a join, we get O(mn) performance:
+ *   a cartesian join ends up being performed, where either we run the
+ *   fulltext query M times and table-scan intersect with the results from
+ *   phase 1, or we run the fulltext query once but traverse the entire
+ *   result set from phase 1 N times.
+ * - We believe that the re-execution of the MATCH query should have no disk
+ * costs because it should still be cached by SQLite or the OS. In the case
+ *   where memory is so constrained that this is not true, our behavior is
+ *   still probably preferable to the old way, because the old way would have
+ *   caused lots of swapping.
+ * - This part of the query otherwise resembles the basic gloda query but with
+ * the inclusion of the offsets() invocation. The messages table lookup
+ * should not involve any disk traffic because the pages should still be
+ * cached (SQLite or OS) from phase 1. The messagesText lookup is new, and
+ * this is the major disk-seek reduction optimization we are making. (Since
+ * we avoid this lookup for all of the documents that were excluded by the
+ * LIMIT.) Since offsets() also needs to retrieve the row from messagesText
+ * there is a nice synergy there.
+ */
+var NUEVO_FULLTEXT_SQL =
+ "SELECT messages.*, messagesText.*, offsets(messagesText) AS osets " +
+ "FROM messagesText, messages " +
+ "WHERE" +
+ " messagesText MATCH ?1 " +
+ " AND messagesText.docid IN (" +
+ "SELECT docid " +
+ "FROM messagesText JOIN messages ON messagesText.docid = messages.id " +
+ "WHERE messagesText MATCH ?1 " +
+ "ORDER BY " + DASCORE + " DESC " +
+ "LIMIT ?2" +
+ " )" +
+ " AND messages.id = messagesText.docid " +
+ " AND +messages.deleted = 0" +
+ " AND +messages.folderID IS NOT NULL" +
+ " AND +messages.messageKey IS NOT NULL";
+
+function identityFunc(x) {
+ return x;
+}
+
+function oneLessMaxZero(x) {
+ if (x <= 1)
+ return 0;
+ else
+ return x - 1;
+}
+
+function reduceSum(accum, curValue) {
+ return accum + curValue;
+}
+
+/*
+ * Columns are: body, subject, attachment names, author, recipients
+ */
+
+/**
+ * Scores if all search terms match in a column. We bias against author
+ * slightly and recipient a bit more in this case because a search that
+ * entirely matches just on a person should give a mention of that person
+ * in the subject or attachment a fighting chance.
+ * Keep in mind that because of our indexing in the face of address book
+ * contacts (namely, we index the name used in the e-mail as well as the
+ * display name on the address book card associated with the e-mail address)
+ * a contact is going to bias towards matching multiple times.
+ */
+var COLUMN_ALL_MATCH_SCORES = [4, 20, 20, 16, 12];
+/**
+ * Score for each distinct term that matches in the column. This is capped
+ * by the corresponding value in COLUMN_ALL_MATCH_SCORES.
+ */
+var COLUMN_PARTIAL_PER_MATCH_SCORES = [1, 4, 4, 4, 3];
+/**
+ * If a term matches multiple times, what is the marginal score for each
+ * additional match? We count the total number of matches beyond the
+ * first match for each term. In other words, if we have 3 terms which
+ * matched 5, 3, and 0 times, then the total from our perspective is
+ * (5 - 1) + (3 - 1) + 0 = 4 + 2 + 0 = 6. We take the minimum of that value
+ * and the value in COLUMN_MULTIPLE_MATCH_LIMIT and multiply by the value in
+ * COLUMN_MULTIPLE_MATCH_SCORES.
+ */
+var COLUMN_MULTIPLE_MATCH_SCORES = [1, 0, 0, 0, 0];
+var COLUMN_MULTIPLE_MATCH_LIMIT = [10, 0, 0, 0, 0];
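+// Worked example for the subject column (index 1): if every search term
+// matches in the subject, it contributes COLUMN_ALL_MATCH_SCORES[1] = 20;
+// if only 3 of 5 terms match, it contributes min(20, 3 * 4) = 12. Repeat
+// matches of a term earn credit only in the body column (index 0), capped
+// at 10 * 1 points.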
+
+/**
+ * Score the message on its offsets (from stashedColumns).
+ */
+function scoreOffsets(aMessage, aContext) {
+ let score = 0;
+
+ let termTemplate = aContext.terms.map(_ => 0);
+ // for each column, a list of the incidence of each term
+ let columnTermIncidence = [termTemplate.concat(),
+ termTemplate.concat(),
+ termTemplate.concat(),
+ termTemplate.concat(),
+ termTemplate.concat()];
+
+  // We wrap parseInt in an arrow function because otherwise map would pass
+  // its extra arguments (index, array) and parseInt would treat the index
+  // as a radix. Curse you, map!
+ let offsetNums =
+ aContext.stashedColumns[aMessage.id][0].split(" ").map(x => parseInt(x));
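+  // offsets() reports each match as four space-delimited integers:
+  //   (column number, term number, byte offset, match size in bytes).
+  // Only the first two matter for tallying per-column term incidence.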
+  for (let i = 0; i < offsetNums.length; i += 4) {
+ let columnIndex = offsetNums[i];
+ let termIndex = offsetNums[i+1];
+ columnTermIncidence[columnIndex][termIndex]++;
+ }
+
+ for (let iColumn = 0; iColumn < COLUMN_ALL_MATCH_SCORES.length; iColumn++) {
+ let termIncidence = columnTermIncidence[iColumn];
+ // bestow all match credit
+ if (termIncidence.every(identityFunc))
+ score += COLUMN_ALL_MATCH_SCORES[iColumn];
+ // bestow partial match credit
+ else if (termIncidence.some(identityFunc))
+ score += Math.min(COLUMN_ALL_MATCH_SCORES[iColumn],
+ COLUMN_PARTIAL_PER_MATCH_SCORES[iColumn] *
+ termIncidence.filter(identityFunc).length);
+ // bestow multiple match credit
+ score += Math.min(termIncidence.map(oneLessMaxZero).reduce(reduceSum, 0),
+ COLUMN_MULTIPLE_MATCH_LIMIT[iColumn]) *
+ COLUMN_MULTIPLE_MATCH_SCORES[iColumn];
+ }
+
+ return score;
+}
+
+/**
+ * The searcher basically looks like a query, but is specialized for fulltext
+ * search against messages. Most of the explicit specialization involves
+ * crafting a SQL query that attempts to order the matches by likelihood that
+ * the user was looking for it. This is based on full-text matches combined
+ * with an explicit (generic) interest score value placed on the message at
+ * indexing time. This is followed by using the more generic gloda scoring
+ * mechanism to explicitly score the messages given the search context in
+ * addition to the more generic score adjusting rules.
+ */
+function GlodaMsgSearcher(aListener, aSearchString, aAndTerms) {
+ this.listener = aListener;
+
+ this.searchString = aSearchString;
+ this.fulltextTerms = this.parseSearchString(aSearchString);
+ this.andTerms = (aAndTerms != null) ? aAndTerms : true;
+
+ this.query = null;
+ this.collection = null;
+
+ this.scores = null;
+}
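+
+/*
+ * Minimal usage sketch. The listener shape below is a hypothetical example
+ * (any object implementing these four collection-listener callbacks works;
+ * it is not part of this file):
+ *
+ *   var searcher = new GlodaMsgSearcher({
+ *     onItemsAdded: function(aItems, aCollection) {},
+ *     onItemsModified: function(aItems, aCollection) {},
+ *     onItemsRemoved: function(aItems, aCollection) {},
+ *     onQueryCompleted: function(aCollection) {},
+ *   }, "quarterly report");
+ *   var collection = searcher.getCollection();
+ */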
+GlodaMsgSearcher.prototype = {
+ /**
+ * Number of messages to retrieve initially.
+ */
+ get retrievalLimit() {
+ return Services.prefs.getIntPref(
+ "mailnews.database.global.search.msg.limit"
+ );
+ },
+
+ /**
+ * Parse the string into terms/phrases by finding matching double-quotes.
+ */
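+  // Example: parseSearchString(' foo "bar baz" qux ') returns
+  // ["foo", "bar baz", "qux"]; an unmatched opening quote is dropped and
+  // parsing continues with the rest of the string.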
+ parseSearchString: function GlodaMsgSearcher_parseSearchString(aSearchString) {
+ aSearchString = aSearchString.trim();
+ let terms = [];
+
+ /*
+ * Add the term as long as the trim on the way in didn't obliterate it.
+ *
+ * In the future this might have other helper logic; it did once before.
+ */
+ function addTerm(aTerm) {
+ if (aTerm)
+ terms.push(aTerm);
+ }
+
+ while (aSearchString) {
+ if (aSearchString.startsWith('"')) {
+ let endIndex = aSearchString.indexOf(aSearchString[0], 1);
+ // eat the quote if it has no friend
+ if (endIndex == -1) {
+ aSearchString = aSearchString.substring(1);
+ continue;
+ }
+
+ addTerm(aSearchString.substring(1, endIndex).trim());
+ aSearchString = aSearchString.substring(endIndex + 1);
+ continue;
+ }
+
+ let spaceIndex = aSearchString.indexOf(" ");
+ if (spaceIndex == -1) {
+ addTerm(aSearchString);
+ break;
+ }
+
+ addTerm(aSearchString.substring(0, spaceIndex));
+      aSearchString = aSearchString.substring(spaceIndex + 1);
+ }
+
+ return terms;
+ },
+
+ buildFulltextQuery: function GlodaMsgSearcher_buildFulltextQuery() {
+ let query = Gloda.newQuery(Gloda.NOUN_MESSAGE, {
+ noMagic: true,
+ explicitSQL: NUEVO_FULLTEXT_SQL,
+ limitClauseAlreadyIncluded: true,
+ // osets is 0-based column number 14 (volatile to column changes)
+ // save the offset column for extra analysis
+ stashColumns: [14]
+ });
+
+ let fulltextQueryString = "";
+
+ for (let [iTerm, term] of this.fulltextTerms.entries()) {
+ if (iTerm)
+ fulltextQueryString += this.andTerms ? " " : " OR ";
+
+ // Put our term in quotes. This is needed for the tokenizer to be able
+ // to do useful things. The exception is people clever enough to use
+ // NEAR.
+ if (/^NEAR(\/\d+)?$/.test(term))
+ fulltextQueryString += term;
+ // Check if this is a single-character CJK search query. If so, we want
+ // to add a wildcard.
+ // Our tokenizer treats anything at/above 0x2000 as CJK for now.
+ else if (term.length == 1 && term.charCodeAt(0) >= 0x2000)
+ fulltextQueryString += term + "*";
+      else if ((term.length == 2 &&
+                term.charCodeAt(0) >= 0x2000 &&
+                term.charCodeAt(1) >= 0x2000) ||
+               term.length >= 3)
+        fulltextQueryString += '"' + term + '"';
+ }
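+    // E.g. terms ["hello", "world"] produce '"hello" "world"' when andTerms
+    // is true and '"hello" OR "world"' otherwise. Note that 1-2 character
+    // non-CJK terms fall through every branch above and are dropped.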
+
+ query.fulltextMatches(fulltextQueryString);
+ query.limit(this.retrievalLimit);
+
+ return query;
+ },
+
+ getCollection: function GlodaMsgSearcher_getCollection(
+ aListenerOverride, aData) {
+ if (aListenerOverride)
+ this.listener = aListenerOverride;
+
+ this.query = this.buildFulltextQuery();
+ this.collection = this.query.getCollection(this, aData);
+ this.completed = false;
+
+ return this.collection;
+ },
+
+ sortBy: '-dascore',
+
+ onItemsAdded: function GlodaMsgSearcher_onItemsAdded(aItems, aCollection) {
+ let newScores = Gloda.scoreNounItems(
+ aItems,
+ {
+ terms: this.fulltextTerms,
+ stashedColumns: aCollection.stashedColumns
+ },
+ [scoreOffsets]);
+ if (this.scores)
+ this.scores = this.scores.concat(newScores);
+ else
+ this.scores = newScores;
+
+ if (this.listener)
+ this.listener.onItemsAdded(aItems, aCollection);
+ },
+ onItemsModified: function GlodaMsgSearcher_onItemsModified(aItems,
+ aCollection) {
+ if (this.listener)
+ this.listener.onItemsModified(aItems, aCollection);
+ },
+ onItemsRemoved: function GlodaMsgSearcher_onItemsRemoved(aItems,
+ aCollection) {
+ if (this.listener)
+ this.listener.onItemsRemoved(aItems, aCollection);
+ },
+ onQueryCompleted: function GlodaMsgSearcher_onQueryCompleted(aCollection) {
+ this.completed = true;
+ if (this.listener)
+ this.listener.onQueryCompleted(aCollection);
+ },
+};