summaryrefslogtreecommitdiffstats
path: root/mailnews/db/gloda/modules/suffixtree.js
diff options
context:
space:
mode:
Diffstat (limited to 'mailnews/db/gloda/modules/suffixtree.js')
-rw-r--r--mailnews/db/gloda/modules/suffixtree.js340
1 files changed, 340 insertions, 0 deletions
diff --git a/mailnews/db/gloda/modules/suffixtree.js b/mailnews/db/gloda/modules/suffixtree.js
new file mode 100644
index 000000000..009ad5f9d
--- /dev/null
+++ b/mailnews/db/gloda/modules/suffixtree.js
@@ -0,0 +1,340 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+this.EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"];
+
+/**
+ * Given a list of strings and a corresponding map of items that those strings
+ * correspond to, build a suffix tree.
+ */
+function MultiSuffixTree(aStrings, aItems) {
+ if (aStrings.length != aItems.length)
+ throw new Error("Array lengths need to be the same.");
+
+ let s = '';
+ let offsetsToItems = [];
+ let lastLength = 0;
+ for (let i = 0; i < aStrings.length; i++) {
+ s += aStrings[i];
+ offsetsToItems.push(lastLength, s.length, aItems[i]);
+ lastLength = s.length;
+ }
+
+ this._construct(s);
+ this._offsetsToItems = offsetsToItems;
+ this._numItems = aItems.length;
+}
+
+/**
+ * @constructor
+ */
+function State(aStartIndex, aEndIndex, aSuffix) {
+ this.start = aStartIndex;
+ this.end = aEndIndex;
+ this.suffix = aSuffix;
+}
+
+var dump;
+if (dump === undefined) {
+ dump = function(a) {
+ print(a.slice(0, -1));
+ };
+}
+
+/**
+ * Since objects are basically hash-tables anyways, we simply create an
+ * attribute whose name is the first letter of the edge string. (So, the
+ * edge string can conceptually be a multi-letter string, but since we would
+ * split it were there any ambiguity, it's okay to just use the single letter.)
+ * This avoids having to update the attribute name or worry about tripping our
+ * implementation up.
+ */
+State.prototype = {
+ get isExplicit() {
+ // our end is not inclusive...
+ return (this.end <= this.start);
+ },
+ get isImplicit() {
+ // our end is not inclusive...
+ return (this.end > this.start);
+ },
+
+ get length() {
+ return this.end - this.start;
+ },
+
+ toString: function State_toString() {
+ return "[Start: " + this.start + " End: " + this.end +
+ (this.suffix ? " non-null suffix]" : " null suffix]");
+ }
+};
+
+/**
+ * Suffix tree implemented using Ukkonen's algorithm.
+ * @constructor
+ */
+function SuffixTree(aStr) {
+ this._construct(aStr);
+}
+
+/**
+ * States are
+ */
+SuffixTree.prototype = {
+ /**
+ * Find all items matching the provided substring.
+ */
+ findMatches: function findMatches(aSubstring) {
+ let results = [];
+ let state = this._root;
+ let index=0;
+ let end = aSubstring.length;
+ while(index < end) {
+ state = state[aSubstring[index]];
+ // bail if there was no edge
+ if (state === undefined)
+ return results;
+ // bail if the portion of the edge we traversed is not equal to that
+ // portion of our pattern
+ let actualTraverseLength = Math.min(state.length,
+ end - index);
+ if (this._str.substring(state.start,
+ state.start + actualTraverseLength) !=
+ aSubstring.substring(index, index + actualTraverseLength))
+ return results;
+ index += state.length;
+ }
+
+ // state should now be the node which itself and all its children match...
+ // The delta is to adjust us to the offset of the last letter of our match;
+ // the edge we traversed to get here may have found us traversing more
+ // than we wanted.
+ // index - end captures the over-shoot of the edge traversal,
+ // index - end + 1 captures the fact that we want to find the last letter
+ // that matched, not just the first letter beyond it
+ // However, if this state is a leaf node (end == 'infinity'), then 'end'
+ // isn't describing an edge at all and we want to avoid accounting for it.
+ let delta;
+ /*
+ if (state.end != this._infinity)
+ //delta = index - end + 1;
+ delta = end - (index - state.length);
+ else */
+ delta = index - state.length - end + 1;
+
+ this._resultGather(state, results, {}, end, delta, true);
+ return results;
+ },
+
+ _resultGather: function resultGather(aState, aResults, aPresence,
+ aPatLength, aDelta, alreadyAdjusted) {
+ // find the item that this state originated from based on the state's
+ // start character. offsetToItem holds [string start index, string end
+ // index (exclusive), item reference]. So we want to binary search to
+ // find the string whose start/end index contains the state's start index.
+ let low = 0;
+ let high = this._numItems-1;
+ let mid, stringStart, stringEnd;
+
+ let patternLast = aState.start - aDelta;
+ while (low <= high) {
+ mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
+ stringStart = this._offsetsToItems[mid*3];
+ let startDelta = stringStart - patternLast;
+ stringEnd = this._offsetsToItems[mid*3+1];
+ let endDelta = stringEnd - patternLast;
+ if (startDelta > 0)
+ high = mid - 1;
+ else if (endDelta <= 0)
+ low = mid + 1;
+ else {
+ break;
+ }
+ }
+
+ // - The match occurred completely inside a source string. Success.
+ // - The match spans more than one source strings, and is therefore not
+ // a match.
+
+ // at this point, we have located the origin string that corresponds to the
+ // start index of this state.
+ // - The match terminated with the end of the preceding string, and does
+ // not match us at all. We, and potentially our children, are merely
+ // serving as a unique terminal.
+ // - The
+
+ let patternFirst = patternLast - (aPatLength - 1);
+
+ if (patternFirst >= stringStart) {
+ if (!(stringStart in aPresence)) {
+ aPresence[stringStart] = true;
+ aResults.push(this._offsetsToItems[mid*3+2]);
+ }
+ }
+
+ // bail if we had it coming OR
+ // if the result terminates at/part-way through this state, meaning any
+ // of its children are not going to be actual results, just hangers
+ // on.
+/*
+ if (bail || (end <= aState.end)) {
+dump(" bailing! (bail was: " + bail + ")\n");
+ return;
+ }
+*/
+ // process our children...
+ for (let key in aState) {
+ // edges have attributes of length 1...
+ if (key.length == 1) {
+ let statePrime = aState[key];
+ this._resultGather(statePrime, aResults, aPresence, aPatLength,
+ aDelta + aState.length, //(alreadyAdjusted ? 0 : aState.length),
+ false);
+ }
+ }
+ },
+
+ /**
+ * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
+ * which means no work to do and we return immediately) follow that state
+ * (and then the successive states)'s transitions until we run out of
+ * transitions. This happens either when we find an explicit state, or
+ * find ourselves partially along an edge (conceptually speaking). In
+ * the partial case, we return the state prior to the edge traversal.
+ * (The information about the 'edge' is contained on its target State;
+ * we can do this because a state is only referenced by one other state.)
+ */
+ _canonize: function canonize(aState, aStart, aEnd) {
+ if (aEnd <= aStart) {
+ return [aState, aStart];
+ }
+
+ let statePrime;
+ // we treat an aState of null as 'bottom', which has transitions for every
+ // letter in the alphabet to 'root'. rather than create all those
+ // transitions, we special-case here.
+ if (aState === null)
+ statePrime = this._root;
+ else
+ statePrime = aState[this._str[aStart]];
+ while (statePrime.length <= aEnd - aStart) { // (no 1 adjustment required)
+ aStart += statePrime.length;
+ aState = statePrime;
+ if (aStart < aEnd) {
+ statePrime = aState[this._str[aStart]];
+ }
+ }
+ return [aState, aStart];
+ },
+
+ /**
+ * Given a reference 'pair' whose state may or may not be explicit (and for
+ * which we will perform the required splitting to make it explicit), test
+ * whether it already possesses a transition corresponding to the provided
+ * character.
+ * @return A list of: whether we had to make it explicit, the (potentially)
+ * new explicit state.
+ */
+ _testAndSplit: function testAndSplit(aState, aStart, aEnd, aChar) {
+ if (aStart < aEnd) { // it's not explicit
+ let statePrime = aState[this._str[aStart]];
+ let length = aEnd - aStart;
+ if (aChar == this._str[statePrime.start + length]) {
+ return [true, aState];
+ }
+ else {
+ // do splitting... aState -> rState -> statePrime
+ let rState = new State(statePrime.start, statePrime.start + length);
+ aState[this._str[statePrime.start]] = rState;
+ statePrime.start += length;
+ rState[this._str[statePrime.start]] = statePrime;
+ return [false, rState];
+ }
+ }
+ else { // it's already explicit
+ if (aState === null) { // bottom case... shouldn't happen, but hey.
+ return [true, aState];
+ }
+ return [(aChar in aState), aState];
+ }
+
+ },
+
+ _update: function update(aState, aStart, aIndex) {
+ let oldR = this._root;
+ let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
+ // because of the way we store the 'end' value as a one-past form, we do
+ // not need to subtract 1 off of aIndex.
+ let [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, //no -1
+ textAtIndex);
+ while (!endPoint) {
+ let rPrime = new State(aIndex, this._infinity);
+ rState[textAtIndex] = rPrime;
+ if (oldR !== this._root)
+ oldR.suffix = rState;
+ oldR = rState;
+ [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
+ [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, // no -1
+ textAtIndex);
+ }
+ if (oldR !== this._root)
+ oldR.suffix = aState;
+
+ return [aState, aStart];
+ },
+
+ _construct: function construct(aStr) {
+ this._str = aStr;
+ // just needs to be longer than the string.
+ this._infinity = aStr.length + 1;
+
+ //this._bottom = new State(0, -1, null);
+ this._root = new State(-1, 0, null); // null === bottom
+ let state = this._root;
+ let start = 0;
+
+ for (let i = 0; i < aStr.length; i++) {
+ [state, start] = this._update(state, start, i); // treat as flowing -1...
+ [state, start] = this._canonize(state, start, i+1); // 1-length string
+ }
+ },
+
+ dump: function SuffixTree_show(aState, aIndent, aKey) {
+ if (aState === undefined)
+ aState = this._root;
+ if (aIndent === undefined) {
+ aIndent = "";
+ aKey = ".";
+ }
+
+ if (aState.isImplicit) {
+ let snip;
+ if (aState.length > 10)
+ snip = this._str.slice(aState.start,
+ Math.min(aState.start+10, this._str.length)) + "...";
+ else
+ snip = this._str.slice(aState.start,
+ Math.min(aState.end, this._str.length));
+ dump(aIndent + aKey + ":" + snip + "(" +
+ aState.start + ":" + aState.end + ")\n");
+ }
+ else
+ dump(aIndent + aKey + ": (explicit:" + aState.start + ":" + aState.end +")\n");
+ let nextIndent = aIndent + " ";
+ let keys = Object.keys(aState).filter(c => c.length == 1);
+ for (let key of keys) {
+ this.dump(aState[key], nextIndent, key);
+ }
+ }
+};
+MultiSuffixTree.prototype = SuffixTree.prototype;
+
+function examplar() {
+ let names = ["AndrewSmith", "AndrewJones", "MarkSmith", "BryanClark",
+ "MarthaJones", "DavidAscher", "DanMosedale", "DavidBienvenu",
+ "JanetDavis", "JosephBryant"];
+ let b = new MultiSuffixTree(names, names);
+ b.dump();
+ dump(b.findMatches("rya") + "\n");
+}