1 files changed, 340 insertions, 0 deletions
diff --git a/mailnews/db/gloda/modules/suffixtree.js b/mailnews/db/gloda/modules/suffixtree.js
new file mode 100644
index 000000000..009ad5f9d
--- /dev/null
+++ b/mailnews/db/gloda/modules/suffixtree.js
@@ -0,0 +1,340 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+this.EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"];
+
+/**
+ * Given a list of strings and a corresponding map of items that those strings
+ *  correspond to, build a suffix tree.
+ */
+function MultiSuffixTree(aStrings, aItems) {
+  if (aStrings.length != aItems.length)
+    throw new Error("Array lengths need to be the same.");
+
+  let s = '';
+  let offsetsToItems = [];
+  let lastLength = 0;
+  for (let i = 0; i < aStrings.length; i++) {
+    s += aStrings[i];
+    offsetsToItems.push(lastLength, s.length, aItems[i]);
+    lastLength = s.length;
+  }
+  
+  this._construct(s);
+  this._offsetsToItems = offsetsToItems;
+  this._numItems = aItems.length;
+}
+
+/**
+ * @constructor
+ */
+function State(aStartIndex, aEndIndex, aSuffix) {
+  this.start = aStartIndex;
+  this.end = aEndIndex;
+  this.suffix = aSuffix;
+}
+
+var dump;
+if (dump === undefined) {
+  dump = function(a) {
+    print(a.slice(0, -1));
+  };
+}
+
+/**
+ * Since objects are basically hash-tables anyways, we simply create an
+ *  attribute whose name is the first letter of the edge string.  (So, the
+ *  edge string can conceptually be a multi-letter string, but since we would
+ *  split it were there any ambiguity, it's okay to just use the single letter.)
+ *  This avoids having to update the attribute name or worry about tripping our
+ *  implementation up.
+ */
+State.prototype = {
+  get isExplicit() {
+    // our end is not inclusive...
+    return (this.end <= this.start);
+  },
+  get isImplicit() {
+    // our end is not inclusive...
+    return (this.end > this.start);
+  },
+  
+  get length() {
+    return this.end - this.start;
+  },
+  
+  toString: function State_toString() {
+    return "[Start: " + this.start + " End: " + this.end +
+           (this.suffix ? " non-null suffix]" : " null suffix]"); 
+  }
+};
+
+/**
+ * Suffix tree implemented using Ukkonen's algorithm.
+ * @constructor
+ */
+function SuffixTree(aStr) {
+  this._construct(aStr);
+}
+
+/**
+ * States are 
+ */
+SuffixTree.prototype = {
+  /**
+   * Find all items matching the provided substring.
+   */
+  findMatches: function findMatches(aSubstring) {
+    let results = [];
+    let state = this._root;
+    let index=0;
+    let end = aSubstring.length;
+    while(index < end) {
+      state = state[aSubstring[index]];
+      // bail if there was no edge
+      if (state === undefined)
+        return results;
+      // bail if the portion of the edge we traversed is not equal to that
+      //  portion of our pattern
+      let actualTraverseLength = Math.min(state.length,
+                                          end - index);
+      if (this._str.substring(state.start,
+                              state.start + actualTraverseLength) !=
+          aSubstring.substring(index, index + actualTraverseLength))
+        return results;
+      index += state.length;
+    }
+    
+    // state should now be the node which itself and all its children match...
+    // The delta is to adjust us to the offset of the last letter of our match;
+    //  the edge we traversed to get here may have found us traversing more
+    //  than we wanted.
+    // index - end captures the over-shoot of the edge traversal,
+    // index - end + 1 captures the fact that we want to find the last letter
+    //  that matched, not just the first letter beyond it
+    // However, if this state is a leaf node (end == 'infinity'), then 'end'
+    //  isn't describing an edge at all and we want to avoid accounting for it.
+    let delta;
+    /*
+    if (state.end != this._infinity)
+      //delta = index - end + 1;
+      delta = end - (index - state.length); 
+    else */
+    delta = index - state.length - end + 1;
+ 
+    this._resultGather(state, results, {}, end, delta, true);
+    return results;
+  },
+  
+  _resultGather: function resultGather(aState, aResults, aPresence,
+                                       aPatLength, aDelta, alreadyAdjusted) {
+    // find the item that this state originated from based on the state's
+    //  start character.  offsetToItem holds [string start index, string end
+    //  index (exclusive), item reference].  So we want to binary search to
+    //  find the string whose start/end index contains the state's start index.
+    let low = 0;
+    let high = this._numItems-1;
+    let mid, stringStart, stringEnd;
+    
+    let patternLast = aState.start - aDelta;
+    while (low <= high) {
+      mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
+      stringStart = this._offsetsToItems[mid*3];
+      let startDelta = stringStart - patternLast;
+      stringEnd = this._offsetsToItems[mid*3+1];
+      let endDelta = stringEnd - patternLast;
+      if (startDelta > 0)
+        high = mid - 1;
+      else if (endDelta <= 0)
+        low = mid + 1;
+      else {
+        break;
+      }
+    }
+    
+    // - The match occurred completely inside a source string.  Success.
+    // - The match spans more than one source strings, and is therefore not
+    //   a match.
+    
+    // at this point, we have located the origin string that corresponds to the
+    //  start index of this state.
+    // - The match terminated with the end of the preceding string, and does
+    //   not match us at all.  We, and potentially our children, are merely
+    //   serving as a unique terminal.
+    // - The 
+
+  let patternFirst = patternLast - (aPatLength - 1);
+
+  if (patternFirst >= stringStart) {
+    if (!(stringStart in aPresence)) {
+      aPresence[stringStart] = true;
+      aResults.push(this._offsetsToItems[mid*3+2]);
+    }
+  }
+    
+    // bail if we had it coming OR
+    // if the result terminates at/part-way through this state, meaning any
+    //  of its children are not going to be actual results, just hangers
+    //  on.
+/*
+    if (bail || (end <= aState.end)) {
+dump("  bailing! (bail was: " + bail + ")\n");
+      return;
+    }
+*/    
+    // process our children...
+    for (let key in aState) {
+      // edges have attributes of length 1...
+      if (key.length == 1) {
+        let statePrime = aState[key];
+        this._resultGather(statePrime, aResults, aPresence, aPatLength,
+                           aDelta + aState.length, //(alreadyAdjusted ? 0 : aState.length),
+                           false);
+      }
+    }
+  },
+
+  /**
+   * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
+   *  which means no work to do and we return immediately) follow that state
+   *  (and then the successive states)'s transitions until we run out of
+   *  transitions.  This happens either when we find an explicit state, or 
+   *  find ourselves partially along an edge (conceptually speaking).  In
+   *  the partial case, we return the state prior to the edge traversal.
+   * (The information about the 'edge' is contained on its target State;
+   *  we can do this because a state is only referenced by one other state.) 
+   */
+  _canonize: function canonize(aState, aStart, aEnd) {
+    if (aEnd <= aStart) {
+      return [aState, aStart];
+    }
+  
+    let statePrime;
+    // we treat an aState of null as 'bottom', which has transitions for every
+    //  letter in the alphabet to 'root'.  rather than create all those
+    //  transitions, we special-case here.
+    if (aState === null)
+      statePrime = this._root;
+    else
+      statePrime = aState[this._str[aStart]];
+    while (statePrime.length <= aEnd - aStart) { // (no 1 adjustment required)
+      aStart += statePrime.length;
+      aState = statePrime;
+      if (aStart < aEnd) {
+        statePrime = aState[this._str[aStart]];
+      }
+    }
+    return [aState, aStart]; 
+  },
+
+  /**
+   * Given a reference 'pair' whose state may or may not be explicit (and for
+   *  which we will perform the required splitting to make it explicit), test
+   *  whether it already possesses a transition corresponding to the provided
+   *  character.
+   * @return A list of: whether we had to make it explicit, the (potentially)
+   *    new explicit state.
+   */
+  _testAndSplit: function testAndSplit(aState, aStart, aEnd, aChar) {
+    if (aStart < aEnd) { // it's not explicit
+      let statePrime = aState[this._str[aStart]];
+      let length = aEnd - aStart;
+      if (aChar == this._str[statePrime.start + length]) {
+        return [true, aState];
+      }
+      else {
+        // do splitting... aState -> rState -> statePrime
+        let rState = new State(statePrime.start, statePrime.start + length);
+        aState[this._str[statePrime.start]] = rState;
+        statePrime.start += length;
+        rState[this._str[statePrime.start]] = statePrime;
+        return [false, rState];
+      }
+    }
+    else { // it's already explicit
+      if (aState === null) { // bottom case... shouldn't happen, but hey. 
+        return [true, aState];
+      }
+      return [(aChar in aState), aState];
+    }
+      
+  },
+
+  _update: function update(aState, aStart, aIndex) {
+    let oldR = this._root;
+    let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
+    // because of the way we store the 'end' value as a one-past form, we do
+    //  not need to subtract 1 off of aIndex.
+    let [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, //no -1
+                                                textAtIndex);
+    while (!endPoint) {
+      let rPrime = new State(aIndex, this._infinity);
+      rState[textAtIndex] = rPrime;
+      if (oldR !== this._root)
+        oldR.suffix = rState;
+      oldR = rState;
+      [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
+      [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, // no -1
+                                              textAtIndex);
+    }
+    if (oldR !== this._root)
+      oldR.suffix = aState;
+    
+    return [aState, aStart];
+  },
+  
+  _construct: function construct(aStr) {
+    this._str = aStr;
+    // just needs to be longer than the string.
+    this._infinity = aStr.length + 1;
+    
+    //this._bottom = new State(0, -1, null);
+    this._root = new State(-1, 0, null); // null === bottom
+    let state = this._root;
+    let start = 0;
+  
+    for (let i = 0; i < aStr.length; i++) {
+      [state, start] = this._update(state, start, i); // treat as flowing -1...
+      [state, start] = this._canonize(state, start, i+1); // 1-length string
+    }
+  },
+  
+  dump: function SuffixTree_show(aState, aIndent, aKey) {
+    if (aState === undefined)
+      aState = this._root;
+    if (aIndent === undefined) {
+      aIndent = "";
+      aKey = ".";
+    }
+    
+    if (aState.isImplicit) {
+      let snip;
+      if (aState.length > 10)
+        snip = this._str.slice(aState.start,
+                           Math.min(aState.start+10, this._str.length)) + "...";
+      else
+        snip =  this._str.slice(aState.start,
+                                Math.min(aState.end, this._str.length)); 
+      dump(aIndent + aKey + ":" + snip + "(" +
+           aState.start + ":" + aState.end + ")\n");
+    }
+    else
+      dump(aIndent + aKey + ": (explicit:" + aState.start + ":" + aState.end +")\n");
+    let nextIndent = aIndent + "  ";
+    let keys = Object.keys(aState).filter(c => c.length == 1);
+    for (let key of keys) {
+      this.dump(aState[key], nextIndent, key);
+    }
+  }
+};
+MultiSuffixTree.prototype = SuffixTree.prototype;
+
+function examplar() {
+  let names = ["AndrewSmith", "AndrewJones", "MarkSmith", "BryanClark",
+               "MarthaJones", "DavidAscher", "DanMosedale", "DavidBienvenu",
+               "JanetDavis", "JosephBryant"];
+  let b = new MultiSuffixTree(names, names);
+  b.dump();
+  dump(b.findMatches("rya") + "\n");
+}