/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ this.EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"]; /** * Given a list of strings and a corresponding map of items that those strings * correspond to, build a suffix tree. */ function MultiSuffixTree(aStrings, aItems) { if (aStrings.length != aItems.length) throw new Error("Array lengths need to be the same."); let s = ''; let offsetsToItems = []; let lastLength = 0; for (let i = 0; i < aStrings.length; i++) { s += aStrings[i]; offsetsToItems.push(lastLength, s.length, aItems[i]); lastLength = s.length; } this._construct(s); this._offsetsToItems = offsetsToItems; this._numItems = aItems.length; } /** * @constructor */ function State(aStartIndex, aEndIndex, aSuffix) { this.start = aStartIndex; this.end = aEndIndex; this.suffix = aSuffix; } var dump; if (dump === undefined) { dump = function(a) { print(a.slice(0, -1)); }; } /** * Since objects are basically hash-tables anyways, we simply create an * attribute whose name is the first letter of the edge string. (So, the * edge string can conceptually be a multi-letter string, but since we would * split it were there any ambiguity, it's okay to just use the single letter.) * This avoids having to update the attribute name or worry about tripping our * implementation up. */ State.prototype = { get isExplicit() { // our end is not inclusive... return (this.end <= this.start); }, get isImplicit() { // our end is not inclusive... return (this.end > this.start); }, get length() { return this.end - this.start; }, toString: function State_toString() { return "[Start: " + this.start + " End: " + this.end + (this.suffix ? " non-null suffix]" : " null suffix]"); } }; /** * Suffix tree implemented using Ukkonen's algorithm. * @constructor */ function SuffixTree(aStr) { this._construct(aStr); } /** * States are */ SuffixTree.prototype = { /** * Find all items matching the provided substring. */ findMatches: function findMatches(aSubstring) { let results = []; let state = this._root; let index=0; let end = aSubstring.length; while(index < end) { state = state[aSubstring[index]]; // bail if there was no edge if (state === undefined) return results; // bail if the portion of the edge we traversed is not equal to that // portion of our pattern let actualTraverseLength = Math.min(state.length, end - index); if (this._str.substring(state.start, state.start + actualTraverseLength) != aSubstring.substring(index, index + actualTraverseLength)) return results; index += state.length; } // state should now be the node which itself and all its children match... // The delta is to adjust us to the offset of the last letter of our match; // the edge we traversed to get here may have found us traversing more // than we wanted. // index - end captures the over-shoot of the edge traversal, // index - end + 1 captures the fact that we want to find the last letter // that matched, not just the first letter beyond it // However, if this state is a leaf node (end == 'infinity'), then 'end' // isn't describing an edge at all and we want to avoid accounting for it. let delta; /* if (state.end != this._infinity) //delta = index - end + 1; delta = end - (index - state.length); else */ delta = index - state.length - end + 1; this._resultGather(state, results, {}, end, delta, true); return results; }, _resultGather: function resultGather(aState, aResults, aPresence, aPatLength, aDelta, alreadyAdjusted) { // find the item that this state originated from based on the state's // start character. offsetToItem holds [string start index, string end // index (exclusive), item reference]. So we want to binary search to // find the string whose start/end index contains the state's start index. let low = 0; let high = this._numItems-1; let mid, stringStart, stringEnd; let patternLast = aState.start - aDelta; while (low <= high) { mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums stringStart = this._offsetsToItems[mid*3]; let startDelta = stringStart - patternLast; stringEnd = this._offsetsToItems[mid*3+1]; let endDelta = stringEnd - patternLast; if (startDelta > 0) high = mid - 1; else if (endDelta <= 0) low = mid + 1; else { break; } } // - The match occurred completely inside a source string. Success. // - The match spans more than one source strings, and is therefore not // a match. // at this point, we have located the origin string that corresponds to the // start index of this state. // - The match terminated with the end of the preceding string, and does // not match us at all. We, and potentially our children, are merely // serving as a unique terminal. // - The let patternFirst = patternLast - (aPatLength - 1); if (patternFirst >= stringStart) { if (!(stringStart in aPresence)) { aPresence[stringStart] = true; aResults.push(this._offsetsToItems[mid*3+2]); } } // bail if we had it coming OR // if the result terminates at/part-way through this state, meaning any // of its children are not going to be actual results, just hangers // on. /* if (bail || (end <= aState.end)) { dump(" bailing! (bail was: " + bail + ")\n"); return; } */ // process our children... for (let key in aState) { // edges have attributes of length 1... if (key.length == 1) { let statePrime = aState[key]; this._resultGather(statePrime, aResults, aPresence, aPatLength, aDelta + aState.length, //(alreadyAdjusted ? 0 : aState.length), false); } } }, /** * Given a reference 'pair' of a state and a string (may be 'empty'=explicit, * which means no work to do and we return immediately) follow that state * (and then the successive states)'s transitions until we run out of * transitions. This happens either when we find an explicit state, or * find ourselves partially along an edge (conceptually speaking). In * the partial case, we return the state prior to the edge traversal. * (The information about the 'edge' is contained on its target State; * we can do this because a state is only referenced by one other state.) */ _canonize: function canonize(aState, aStart, aEnd) { if (aEnd <= aStart) { return [aState, aStart]; } let statePrime; // we treat an aState of null as 'bottom', which has transitions for every // letter in the alphabet to 'root'. rather than create all those // transitions, we special-case here. if (aState === null) statePrime = this._root; else statePrime = aState[this._str[aStart]]; while (statePrime.length <= aEnd - aStart) { // (no 1 adjustment required) aStart += statePrime.length; aState = statePrime; if (aStart < aEnd) { statePrime = aState[this._str[aStart]]; } } return [aState, aStart]; }, /** * Given a reference 'pair' whose state may or may not be explicit (and for * which we will perform the required splitting to make it explicit), test * whether it already possesses a transition corresponding to the provided * character. * @return A list of: whether we had to make it explicit, the (potentially) * new explicit state. */ _testAndSplit: function testAndSplit(aState, aStart, aEnd, aChar) { if (aStart < aEnd) { // it's not explicit let statePrime = aState[this._str[aStart]]; let length = aEnd - aStart; if (aChar == this._str[statePrime.start + length]) { return [true, aState]; } else { // do splitting... aState -> rState -> statePrime let rState = new State(statePrime.start, statePrime.start + length); aState[this._str[statePrime.start]] = rState; statePrime.start += length; rState[this._str[statePrime.start]] = statePrime; return [false, rState]; } } else { // it's already explicit if (aState === null) { // bottom case... shouldn't happen, but hey. return [true, aState]; } return [(aChar in aState), aState]; } }, _update: function update(aState, aStart, aIndex) { let oldR = this._root; let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...) // because of the way we store the 'end' value as a one-past form, we do // not need to subtract 1 off of aIndex. let [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, //no -1 textAtIndex); while (!endPoint) { let rPrime = new State(aIndex, this._infinity); rState[textAtIndex] = rPrime; if (oldR !== this._root) oldR.suffix = rState; oldR = rState; [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1 [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, // no -1 textAtIndex); } if (oldR !== this._root) oldR.suffix = aState; return [aState, aStart]; }, _construct: function construct(aStr) { this._str = aStr; // just needs to be longer than the string. this._infinity = aStr.length + 1; //this._bottom = new State(0, -1, null); this._root = new State(-1, 0, null); // null === bottom let state = this._root; let start = 0; for (let i = 0; i < aStr.length; i++) { [state, start] = this._update(state, start, i); // treat as flowing -1... [state, start] = this._canonize(state, start, i+1); // 1-length string } }, dump: function SuffixTree_show(aState, aIndent, aKey) { if (aState === undefined) aState = this._root; if (aIndent === undefined) { aIndent = ""; aKey = "."; } if (aState.isImplicit) { let snip; if (aState.length > 10) snip = this._str.slice(aState.start, Math.min(aState.start+10, this._str.length)) + "..."; else snip = this._str.slice(aState.start, Math.min(aState.end, this._str.length)); dump(aIndent + aKey + ":" + snip + "(" + aState.start + ":" + aState.end + ")\n"); } else dump(aIndent + aKey + ": (explicit:" + aState.start + ":" + aState.end +")\n"); let nextIndent = aIndent + " "; let keys = Object.keys(aState).filter(c => c.length == 1); for (let key of keys) { this.dump(aState[key], nextIndent, key); } } }; MultiSuffixTree.prototype = SuffixTree.prototype; function examplar() { let names = ["AndrewSmith", "AndrewJones", "MarkSmith", "BryanClark", "MarthaJones", "DavidAscher", "DanMosedale", "DavidBienvenu", "JanetDavis", "JosephBryant"]; let b = new MultiSuffixTree(names, names); b.dump(); dump(b.findMatches("rya") + "\n"); }