summaryrefslogtreecommitdiffstats
path: root/mailnews/db/gloda/modules/suffixtree.js
blob: 009ad5f9d413f2cad4bd39f51b70fa26e32abe58 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

this.EXPORTED_SYMBOLS = ["SuffixTree", "MultiSuffixTree"];

/**
 * Given a list of strings and a corresponding map of items that those strings
 *  correspond to, build a suffix tree.
 */
function MultiSuffixTree(aStrings, aItems) {
  if (aStrings.length != aItems.length)
    throw new Error("Array lengths need to be the same.");

  let s = '';
  let offsetsToItems = [];
  let lastLength = 0;
  for (let i = 0; i < aStrings.length; i++) {
    s += aStrings[i];
    offsetsToItems.push(lastLength, s.length, aItems[i]);
    lastLength = s.length;
  }
  
  this._construct(s);
  this._offsetsToItems = offsetsToItems;
  this._numItems = aItems.length;
}

/**
 * @constructor
 */
function State(aStartIndex, aEndIndex, aSuffix) {
  this.start = aStartIndex;
  this.end = aEndIndex;
  this.suffix = aSuffix;
}

var dump;
if (dump === undefined) {
  dump = function(a) {
    print(a.slice(0, -1));
  };
}

/**
 * Since objects are basically hash-tables anyways, we simply create an
 *  attribute whose name is the first letter of the edge string.  (So, the
 *  edge string can conceptually be a multi-letter string, but since we would
 *  split it were there any ambiguity, it's okay to just use the single letter.)
 *  This avoids having to update the attribute name or worry about tripping our
 *  implementation up.
 */
State.prototype = {
  get isExplicit() {
    // our end is not inclusive...
    return (this.end <= this.start);
  },
  get isImplicit() {
    // our end is not inclusive...
    return (this.end > this.start);
  },
  
  get length() {
    return this.end - this.start;
  },
  
  toString: function State_toString() {
    return "[Start: " + this.start + " End: " + this.end +
           (this.suffix ? " non-null suffix]" : " null suffix]"); 
  }
};

/**
 * Suffix tree implemented using Ukkonen's algorithm.
 * @constructor
 */
function SuffixTree(aStr) {
  this._construct(aStr);
}

/**
 * States are 
 */
SuffixTree.prototype = {
  /**
   * Find all items matching the provided substring.
   */
  findMatches: function findMatches(aSubstring) {
    let results = [];
    let state = this._root;
    let index=0;
    let end = aSubstring.length;
    while(index < end) {
      state = state[aSubstring[index]];
      // bail if there was no edge
      if (state === undefined)
        return results;
      // bail if the portion of the edge we traversed is not equal to that
      //  portion of our pattern
      let actualTraverseLength = Math.min(state.length,
                                          end - index);
      if (this._str.substring(state.start,
                              state.start + actualTraverseLength) !=
          aSubstring.substring(index, index + actualTraverseLength))
        return results;
      index += state.length;
    }
    
    // state should now be the node which itself and all its children match...
    // The delta is to adjust us to the offset of the last letter of our match;
    //  the edge we traversed to get here may have found us traversing more
    //  than we wanted.
    // index - end captures the over-shoot of the edge traversal,
    // index - end + 1 captures the fact that we want to find the last letter
    //  that matched, not just the first letter beyond it
    // However, if this state is a leaf node (end == 'infinity'), then 'end'
    //  isn't describing an edge at all and we want to avoid accounting for it.
    let delta;
    /*
    if (state.end != this._infinity)
      //delta = index - end + 1;
      delta = end - (index - state.length); 
    else */
    delta = index - state.length - end + 1;
 
    this._resultGather(state, results, {}, end, delta, true);
    return results;
  },
  
  _resultGather: function resultGather(aState, aResults, aPresence,
                                       aPatLength, aDelta, alreadyAdjusted) {
    // find the item that this state originated from based on the state's
    //  start character.  offsetToItem holds [string start index, string end
    //  index (exclusive), item reference].  So we want to binary search to
    //  find the string whose start/end index contains the state's start index.
    let low = 0;
    let high = this._numItems-1;
    let mid, stringStart, stringEnd;
    
    let patternLast = aState.start - aDelta;
    while (low <= high) {
      mid = low + Math.floor((high - low) / 2); // excessive, especially with js nums
      stringStart = this._offsetsToItems[mid*3];
      let startDelta = stringStart - patternLast;
      stringEnd = this._offsetsToItems[mid*3+1];
      let endDelta = stringEnd - patternLast;
      if (startDelta > 0)
        high = mid - 1;
      else if (endDelta <= 0)
        low = mid + 1;
      else {
        break;
      }
    }
    
    // - The match occurred completely inside a source string.  Success.
    // - The match spans more than one source strings, and is therefore not
    //   a match.
    
    // at this point, we have located the origin string that corresponds to the
    //  start index of this state.
    // - The match terminated with the end of the preceding string, and does
    //   not match us at all.  We, and potentially our children, are merely
    //   serving as a unique terminal.
    // - The 

  let patternFirst = patternLast - (aPatLength - 1);

  if (patternFirst >= stringStart) {
    if (!(stringStart in aPresence)) {
      aPresence[stringStart] = true;
      aResults.push(this._offsetsToItems[mid*3+2]);
    }
  }
    
    // bail if we had it coming OR
    // if the result terminates at/part-way through this state, meaning any
    //  of its children are not going to be actual results, just hangers
    //  on.
/*
    if (bail || (end <= aState.end)) {
dump("  bailing! (bail was: " + bail + ")\n");
      return;
    }
*/    
    // process our children...
    for (let key in aState) {
      // edges have attributes of length 1...
      if (key.length == 1) {
        let statePrime = aState[key];
        this._resultGather(statePrime, aResults, aPresence, aPatLength,
                           aDelta + aState.length, //(alreadyAdjusted ? 0 : aState.length),
                           false);
      }
    }
  },

  /**
   * Given a reference 'pair' of a state and a string (may be 'empty'=explicit,
   *  which means no work to do and we return immediately) follow that state
   *  (and then the successive states)'s transitions until we run out of
   *  transitions.  This happens either when we find an explicit state, or 
   *  find ourselves partially along an edge (conceptually speaking).  In
   *  the partial case, we return the state prior to the edge traversal.
   * (The information about the 'edge' is contained on its target State;
   *  we can do this because a state is only referenced by one other state.) 
   */
  _canonize: function canonize(aState, aStart, aEnd) {
    if (aEnd <= aStart) {
      return [aState, aStart];
    }
  
    let statePrime;
    // we treat an aState of null as 'bottom', which has transitions for every
    //  letter in the alphabet to 'root'.  rather than create all those
    //  transitions, we special-case here.
    if (aState === null)
      statePrime = this._root;
    else
      statePrime = aState[this._str[aStart]];
    while (statePrime.length <= aEnd - aStart) { // (no 1 adjustment required)
      aStart += statePrime.length;
      aState = statePrime;
      if (aStart < aEnd) {
        statePrime = aState[this._str[aStart]];
      }
    }
    return [aState, aStart]; 
  },

  /**
   * Given a reference 'pair' whose state may or may not be explicit (and for
   *  which we will perform the required splitting to make it explicit), test
   *  whether it already possesses a transition corresponding to the provided
   *  character.
   * @return A list of: whether we had to make it explicit, the (potentially)
   *    new explicit state.
   */
  _testAndSplit: function testAndSplit(aState, aStart, aEnd, aChar) {
    if (aStart < aEnd) { // it's not explicit
      let statePrime = aState[this._str[aStart]];
      let length = aEnd - aStart;
      if (aChar == this._str[statePrime.start + length]) {
        return [true, aState];
      }
      else {
        // do splitting... aState -> rState -> statePrime
        let rState = new State(statePrime.start, statePrime.start + length);
        aState[this._str[statePrime.start]] = rState;
        statePrime.start += length;
        rState[this._str[statePrime.start]] = statePrime;
        return [false, rState];
      }
    }
    else { // it's already explicit
      if (aState === null) { // bottom case... shouldn't happen, but hey. 
        return [true, aState];
      }
      return [(aChar in aState), aState];
    }
      
  },

  _update: function update(aState, aStart, aIndex) {
    let oldR = this._root;
    let textAtIndex = this._str[aIndex]; // T sub i (0-based corrected...)
    // because of the way we store the 'end' value as a one-past form, we do
    //  not need to subtract 1 off of aIndex.
    let [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, //no -1
                                                textAtIndex);
    while (!endPoint) {
      let rPrime = new State(aIndex, this._infinity);
      rState[textAtIndex] = rPrime;
      if (oldR !== this._root)
        oldR.suffix = rState;
      oldR = rState;
      [aState, aStart] = this._canonize(aState.suffix, aStart, aIndex); // no -1
      [endPoint, rState] = this._testAndSplit(aState, aStart, aIndex, // no -1
                                              textAtIndex);
    }
    if (oldR !== this._root)
      oldR.suffix = aState;
    
    return [aState, aStart];
  },
  
  _construct: function construct(aStr) {
    this._str = aStr;
    // just needs to be longer than the string.
    this._infinity = aStr.length + 1;
    
    //this._bottom = new State(0, -1, null);
    this._root = new State(-1, 0, null); // null === bottom
    let state = this._root;
    let start = 0;
  
    for (let i = 0; i < aStr.length; i++) {
      [state, start] = this._update(state, start, i); // treat as flowing -1...
      [state, start] = this._canonize(state, start, i+1); // 1-length string
    }
  },
  
  dump: function SuffixTree_show(aState, aIndent, aKey) {
    if (aState === undefined)
      aState = this._root;
    if (aIndent === undefined) {
      aIndent = "";
      aKey = ".";
    }
    
    if (aState.isImplicit) {
      let snip;
      if (aState.length > 10)
        snip = this._str.slice(aState.start,
                           Math.min(aState.start+10, this._str.length)) + "...";
      else
        snip =  this._str.slice(aState.start,
                                Math.min(aState.end, this._str.length)); 
      dump(aIndent + aKey + ":" + snip + "(" +
           aState.start + ":" + aState.end + ")\n");
    }
    else
      dump(aIndent + aKey + ": (explicit:" + aState.start + ":" + aState.end +")\n");
    let nextIndent = aIndent + "  ";
    let keys = Object.keys(aState).filter(c => c.length == 1);
    for (let key of keys) {
      this.dump(aState[key], nextIndent, key);
    }
  }
};
MultiSuffixTree.prototype = SuffixTree.prototype;

function examplar() {
  let names = ["AndrewSmith", "AndrewJones", "MarkSmith", "BryanClark",
               "MarthaJones", "DavidAscher", "DanMosedale", "DavidBienvenu",
               "JanetDavis", "JosephBryant"];
  let b = new MultiSuffixTree(names, names);
  b.dump();
  dump(b.findMatches("rya") + "\n");
}