application/basilisk/components/translation/LanguageDetector.jsm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

this.EXPORTED_SYMBOLS = ["LanguageDetector"];

Components.utils.import("resource://gre/modules/Timer.jsm");
Components.utils.import("resource://gre/modules/XPCOMUtils.jsm");

// Since Emscripten can handle heap growth, but not heap shrinkage, we
// need to refresh the worker after we've processed a particularly large
// string in order to prevent unnecessary resident memory growth.
//
// These values define the cut-off string length and the idle timeout
// (in milliseconds) before destroying a worker. Once a string of the
// maximum size has been processed, the worker is marked for
// destruction, and is terminated as soon as it has been idle for the
// given timeout.
//
// 1.5MB. This is the approximate string length that forces heap growth
// for a 2MB heap.
var LARGE_STRING = 1.5 * 1024 * 1024;
var IDLE_TIMEOUT = 10 * 1000;

const WORKER_URL = "resource:///modules/translation/cld-worker.js";

var workerManager = {
  detectionQueue: [],

  detectLanguage(aParams) {
    return this.workerReady.then(worker => {
      return new Promise(resolve => {
        this.detectionQueue.push({resolve});
        worker.postMessage(aParams);
      });
    }).then(result => {
      // We have our asynchronous result from the worker.
      //
      // Determine if our input was large enough to trigger heap growth,
      // or if we're already waiting to destroy the worker when it's
      // idle. If so, schedule termination after the idle timeout.
      if (aParams.text.length >= LARGE_STRING || this._idleTimeout != null)
        this.flushWorker();

      return result;
    })
  },

  _worker: null,
  _workerReadyPromise: null,

  get workerReady() {
    if (!this._workerReadyPromise)
      this._workerReadyPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = (aMsg) => {
          if (aMsg.data == "ready")
            resolve(worker);
          else
            this.detectionQueue.shift().resolve(aMsg.data);
        };
        this._worker = worker;
      });

    return this._workerReadyPromise;
  },

  // Holds the ID of the current pending idle cleanup setTimeout.
  _idleTimeout: null,

  // Schedule the current worker to be terminated after the idle timeout.
  flushWorker() {
    if (this._idleTimeout != null)
      clearTimeout(this._idleTimeout);

    this._idleTimeout = setTimeout(this._flushWorker.bind(this), IDLE_TIMEOUT);
  },

  // Immediately terminate the worker, as long as there no pending
  // results. Otherwise, reschedule termination until after the next
  // idle timeout.
  _flushWorker() {
    if (this.detectionQueue.length)
      this.flushWorker();
    else {
      if (this._worker)
        this._worker.terminate();

      this._worker = null;
      this._workerReadyPromise = null;
      this._idleTimeout = null;
    }
  },
};

this.LanguageDetector = {
  /**
   * Detect the language of a given string.
   *
   * The argument may be either a string containing the text to analyze,
   * or an object with the following properties:
   *
   *  - 'text' The text to analyze.
   *
   *  - 'isHTML' (optional) A boolean, indicating whether the text
   *      should be analyzed as HTML rather than plain text.
   *
   *  - 'language' (optional) A string indicating the expected language.
   *      For text extracted from HTTP documents, this is expected to
   *      come from the Content-Language header.
   *
   *  - 'tld' (optional) A string indicating the top-level domain of the
   *      document the text was extracted from.
   *
   *  - 'encoding' (optional) A string describing the encoding of the
   *      document the string was extracted from. Note that, regardless
   *      of the value of this property, the 'text' property must be a
   *      UTF-16 JavaScript string.
   *
   * @returns {Promise<Object>}
   * @resolves When detection is finished, with a object containing
   * these fields:
   *  - 'language' (string with a language code)
   *  - 'confident' (boolean) Whether the detector is confident of the
   *      result.
   *  - 'languages' (array) An array of up to three elements, containing
   *      the most prevalent languages detected. It contains a
   *      'languageCode' property, containing the ISO language code of
   *      the language, and a 'percent' property, describing the
   *      approximate percentage of the input which is in that language.
   *      For text of an unknown language, the result may contain an
   *      entry with the languge code 'un', indicating the percent of
   *      the text which is unknown.
   */
  detectLanguage(aParams) {
    if (typeof aParams == "string")
      aParams = { text: aParams };

    return workerManager.detectLanguage(aParams);
  },
};