diff options
Diffstat (limited to 'mailnews/db/gloda/modules/suffixtree.js')
-rw-r--r-- | mailnews/db/gloda/modules/suffixtree.js | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/mailnews/db/gloda/modules/suffixtree.js b/mailnews/db/gloda/modules/suffixtree.js new file mode 100644 index 000000000..009ad5f9d --- /dev/null +++ b/mailnews/db/gloda/modules/suffixtree.js @@ -0,0 +1,340 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +this.EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"]; + +/** + * Given a list of strings and a corresponding map of items that those strings + * correspond to, build a suffix tree. + */ +function MultiSuffixTree(aStrings, aItems) { + if (aStrings.length != aItems.length) + throw new Error("Array lengths need to be the same."); + + let s = ''; + let offsetsToItems = []; + let lastLength = 0; + for (let i = 0; i < aStrings.length; i++) { + s += aStrings[i]; + offsetsToItems.push(lastLength, s.length, aItems[i]); + lastLength = s.length; + } + + this._construct(s); + this._offsetsToItems = offsetsToItems; + this._numItems = aItems.length; +} + +/** + * @constructor + */ +function State(aStartIndex, aEndIndex, aSuffix) { + this.start = aStartIndex; + this.end = aEndIndex; + this.suffix = aSuffix; +} + +var dump; +if (dump === undefined) { + dump = function(a) { + print(a.slice(0, -1)); + }; +} + +/** + * Since objects are basically hash-tables anyways, we simply create an + * attribute whose name is the first letter of the edge string. (So, the + * edge string can conceptually be a multi-letter string, but since we would + * split it were there any ambiguity, it's okay to just use the single letter.) + * This avoids having to update the attribute name or worry about tripping our + * implementation up. + */ +State.prototype = { + get isExplicit() { + // our end is not inclusive... + return (this.end <= this.start); + }, + get isImplicit() { + // our end is not inclusive... + return (this.end > this.start); + }, + + get length() { + return this.end - this.start; + }, + + toString: function State_toString() { + return "[Start: " + this.start + " End: " + this.end + + (this.suffix ? " non-null suffix]" : " null suffix]"); + } +}; + +/** + * Suffix tree implemented using Ukkonen's algorithm. + * @constructor + */ +function SuffixTree(aStr) { + this._construct(aStr); +} + +/** + * States are + */ +SuffixTree.prototype = { + /** + * Find all items matching the provided substring. + */ + findMatches: function findMatches(aSubstring) { + let results = []; + let state = this._root; + let index=0; + let end = aSubstring.length; + while(index < end) { + state = state[aSubstring[index]]; + // bail if there was no edge + if (state === undefined) + return results; + // bail if the portion of the edge we traversed is not equal to that + // portion of our pattern + let actualTraverseLength = Math.min(state.length, + end - index); + if (this._str.substring(state.start, + state.start + actualTraverseLength) != + aSubstring.substring(index, index + actualTraverseLength)) + return results; + index += state.length; + } + + // state should now be the node which itself and all its children match... + // The delta is to adjust us to the offset of the last letter of our match; + // the edge we traversed to get here may have found us traversing more + // than we wanted. + // index - end captures the over-shoot of the edge traversal, + // index - end + 1 captures the fact that we want to find the last letter + // that matched, not just the first letter beyond it + // However, if this state is a leaf node (end == 'infinity'), then 'end' + // isn't describing an edge at all and we want to avoid accounting for it. + let delta; + /* + if (state.end != this._infinity) + //delta = index - end + 1; + delta = end - (index - state.length); + else */ + delta = index - state.length - end + 1; + + this._resultGather(state, results, {}, end, delta, true); + return results; + }, + + _resultGather: function resultGather(aState, aResults, aPresence, + aPatLength, aDelta, alreadyAdjusted) { + // find the item that this state originated from based on the state's + // start character. offsetToItem holds [string start index, string end + // index (exclusive), item reference]. So we want to binary search to + // find the string whose start/end index contains the state's start index. + let low = 0; + let high = this._numItems-1; + let mid, stringStart, stringEnd; + + let patternLast = aState.start - aDelta; + while (low <= high) { + mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums + stringStart = this._offsetsToItems[mid*3]; + let startDelta = stringStart - patternLast; + stringEnd = this._offsetsToItems[mid*3+1]; + let endDelta = stringEnd - patternLast; + if (startDelta > 0) + high = mid - 1; + else if (endDelta <= 0) + low = mid + 1; + else { + break; + } + } + + // - The match occurred completely inside a source string. Success. + // - The match spans more than one source strings, and is therefore not + // a match. + + // at this point, we have located the origin string that corresponds to the + // start index of this state. + // - The match terminated with the end of the preceding string, and does + // not match us at all. We, and potentially our children, are merely + // serving as a unique terminal. + // - The + + let patternFirst = patternLast - (aPatLength - 1); + + if (patternFirst >= stringStart) { + if (!(stringStart in aPresence)) { + aPresence[stringStart] = true; + aResults.push(this._offsetsToItems[mid*3+2]); + } + } + + // bail if we had it coming OR + // if the result terminates at/part-way through this state, meaning any + // of its children are not going to be actual results, just hangers + // on. +/* + if (bail || (end <= aState.end)) { +dump(" bailing! (bail was: " + bail + ")\n"); + return; + } +*/ + // process our children... + for (let key in aState) { + // edges have attributes of length 1... + if (key.length == 1) { + let statePrime = aState[key]; + this._resultGather(statePrime, aResults, aPresence, aPatLength, + aDelta + aState.length, //(alreadyAdjusted ? 0 : aState.length), + false); + } + } + }, + + /** + * Given a reference 'pair' of a state and a string (may be 'empty'=explicit, + * which means no work to do and we return immediately) follow that state + * (and then the successive states)'s transitions until we run out of + * transitions. This happens either when we find an explicit state, or + * find ourselves partially along an edge (conceptually speaking). In + * the partial case, we return the state prior to the edge traversal. + * (The information about the 'edge' is contained on its target State; + * we can do this because a state is only referenced by one other state.) + */ + _canonize: function canonize(aState, aStart, aEnd) { + if (aEnd <= aStart) { + return [aState, aStart]; + } + + let statePrime; + // we treat an aState of null as 'bottom', which has transitions for every + // letter in the alphabet to 'root'. rather than create all those + // transitions, we special-case here. + if (aState === null) + statePrime = this._root; + else + statePrime = aState[this._str[aStart]]; + while (statePrime.length <= aEnd - aStart) { // (no 1 adjustment required) + aStart += statePrime.length; + aState = statePrime; + if (aStart < aEnd) { + statePrime = aState[this._str[aStart]]; + } + } + return [aState, aStart]; + }, + + /** + * Given a reference 'pair' whose state may or may not be explicit (and for + * which we will perform the required splitting to make it explicit), test + * whether it already possesses a transition corresponding to the provided + * character. + * @return A list of: whether we had to make it explicit, the (potentially) + * new explicit state. + */ + _testAndSplit: function testAndSplit(aState, aStart, aEnd, aChar) { + if (aStart < aEnd) { // it's not explicit + let statePrime = aState[this._str[aStart]]; + let length = aEnd - aStart; + if (aChar == this._str[statePrime.start + length]) { + return [true, aState]; + } + else { + // do splitting... aState -> rState -> statePrime + let rState = new State(statePrime.start, statePrime.start + length); + aState[this._str[statePrime.start]] = rState; + statePrime.start += length; + rState[this._str[statePrime.start]] = statePrime; + return [false, rState]; + } + } + else { // it's already explicit + if (aState === null) { // bottom case... shouldn't happen, but hey. + return [true, aState]; + } + return [(aChar in aState), aState]; + } + + }, + + _update: function update(aState, aStart, aIndex) { + let oldR = this._root; + let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...) + // because of the way we store the 'end' value as a one-past form, we do + // not need to subtract 1 off of aIndex. + let [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, //no -1 + textAtIndex); + while (!endPoint) { + let rPrime = new State(aIndex, this._infinity); + rState[textAtIndex] = rPrime; + if (oldR !== this._root) + oldR.suffix = rState; + oldR = rState; + [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1 + [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, // no -1 + textAtIndex); + } + if (oldR !== this._root) + oldR.suffix = aState; + + return [aState, aStart]; + }, + + _construct: function construct(aStr) { + this._str = aStr; + // just needs to be longer than the string. + this._infinity = aStr.length + 1; + + //this._bottom = new State(0, -1, null); + this._root = new State(-1, 0, null); // null === bottom + let state = this._root; + let start = 0; + + for (let i = 0; i < aStr.length; i++) { + [state, start] = this._update(state, start, i); // treat as flowing -1... + [state, start] = this._canonize(state, start, i+1); // 1-length string + } + }, + + dump: function SuffixTree_show(aState, aIndent, aKey) { + if (aState === undefined) + aState = this._root; + if (aIndent === undefined) { + aIndent = ""; + aKey = "."; + } + + if (aState.isImplicit) { + let snip; + if (aState.length > 10) + snip = this._str.slice(aState.start, + Math.min(aState.start+10, this._str.length)) + "..."; + else + snip = this._str.slice(aState.start, + Math.min(aState.end, this._str.length)); + dump(aIndent + aKey + ":" + snip + "(" + + aState.start + ":" + aState.end + ")\n"); + } + else + dump(aIndent + aKey + ": (explicit:" + aState.start + ":" + aState.end +")\n"); + let nextIndent = aIndent + " "; + let keys = Object.keys(aState).filter(c => c.length == 1); + for (let key of keys) { + this.dump(aState[key], nextIndent, key); + } + } +}; +MultiSuffixTree.prototype = SuffixTree.prototype; + +function examplar() { + let names = ["AndrewSmith", "AndrewJones", "MarkSmith", "BryanClark", + "MarthaJones", "DavidAscher", "DanMosedale", "DavidBienvenu", + "JanetDavis", "JosephBryant"]; + let b = new MultiSuffixTree(names, names); + b.dump(); + dump(b.findMatches("rya") + "\n"); +} |