diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/post.js')
-rw-r--r-- | application/basilisk/components/translation/cld2/post.js | 171 |
1 files changed, 171 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

// Static methods generated by the WebIDL binder end up on the prototype
// instead of the constructor, which is clumsy and not idiomatic. Alias
// detectLanguage onto the constructor so callers can simply write
// LanguageInfo.detectLanguage(...).
LanguageInfo.detectLanguage = LanguageInfo.prototype.detectLanguage;

// Closure's function call optimization is overzealous and tries to turn
// these singleton methods into unbound function calls, so pin their
// receiver to ensureCache explicitly.
ensureCache.alloc = ensureCache.alloc.bind(ensureCache);
ensureCache.prepare = ensureCache.prepare.bind(ensureCache);

// From public/encodings.h. Unfortunately, the WebIDL binder doesn't
// allow us to define or automatically derive these in the IDL.
// Maps encoding names to their numeric encoding ids. Several names are
// aliases that share one id (e.g. "JAPANESE_EUC_JP" and "EUC_JP").
var Encodings = {
  "ISO_8859_1" : 0,
  "ISO_8859_2" : 1,
  "ISO_8859_3" : 2,
  "ISO_8859_4" : 3,
  "ISO_8859_5" : 4,
  "ISO_8859_6" : 5,
  "ISO_8859_7" : 6,
  "ISO_8859_8" : 7,
  "ISO_8859_9" : 8,
  "ISO_8859_10" : 9,
  "JAPANESE_EUC_JP" : 10,
  "EUC_JP" : 10,
  "JAPANESE_SHIFT_JIS" : 11,
  "SHIFT_JIS" : 11,
  "JAPANESE_JIS" : 12,
  "JIS" : 12,
  "CHINESE_BIG5" : 13,
  "BIG5" : 13,
  "CHINESE_GB" : 14,
  "CHINESE_EUC_CN" : 15,
  "EUC_CN" : 15,
  "KOREAN_EUC_KR" : 16,
  "EUC_KR" : 16,
  "UNICODE_UNUSED" : 17,
  "CHINESE_EUC_DEC" : 18,
  "EUC_DEC" : 18,
  "CHINESE_CNS" : 19,
  "CNS" : 19,
  "CHINESE_BIG5_CP950" : 20,
  "BIG5_CP950" : 20,
  "JAPANESE_CP932" : 21,
  "CP932" : 21,
  "UTF8" : 22,
  "UNKNOWN_ENCODING" : 23,
  "ASCII_7BIT" : 24,
  "RUSSIAN_KOI8_R" : 25,
  "KOI8_R" : 25,
  "RUSSIAN_CP1251" : 26,
  "CP1251" : 26,
  "MSFT_CP1252" : 27,
  "CP1252" : 27,
  "RUSSIAN_KOI8_RU" : 28,
  "KOI8_RU" : 28,
  "MSFT_CP1250" : 29,
  "CP1250" : 29,
  "ISO_8859_15" : 30,
  "MSFT_CP1254" : 31,
  "CP1254" : 31,
  "MSFT_CP1257" : 32,
  "CP1257" : 32,
  "ISO_8859_11" : 33,
  "MSFT_CP874" : 34,
  "CP874" : 34,
  "MSFT_CP1256" : 35,
  "CP1256" : 35,
  "MSFT_CP1255" : 36,
  "CP1255" : 36,
  "ISO_8859_8_I" : 37,
  "HEBREW_VISUAL" : 38,
  "CZECH_CP852" : 39,
  "CP852" : 39,
  "CZECH_CSN_369103" : 40,
  "CSN_369103" : 40,
  "MSFT_CP1253" : 41,
  "CP1253" : 41,
  "RUSSIAN_CP866" : 42,
  "CP866" : 42,
  "ISO_8859_13" : 43,
  "ISO_2022_KR" : 44,
  "GBK" : 45,
  "GB18030" : 46,
  "BIG5_HKSCS" : 47,
  "ISO_2022_CN" : 48,
  "TSCII" : 49,
  "TAMIL_MONO" : 50,
  "TAMIL_BI" : 51,
  "JAGRAN" : 52,
  "MACINTOSH_ROMAN" : 53,
  "UTF7" : 54,
  "BHASKAR" : 55,
  "HTCHANAKYA" : 56,
  "UTF16BE" : 57,
  "UTF16LE" : 58,
  "UTF32BE" : 59,
  "UTF32LE" : 60,
  "BINARYENC" : 61,
  "HZ_GB_2312" : 62,
  "UTF8UTF8" : 63,
  "TAM_ELANGO" : 64,
  "TAM_LTTMBARANI" : 65,
  "TAM_SHREE" : 66,
  "TAM_TBOOMIS" : 67,
  "TAM_TMNEWS" : 68,
  "TAM_WEBTAMIL" : 69,
  "KDDI_SHIFT_JIS" : 70,
  "DOCOMO_SHIFT_JIS" : 71,
  "SOFTBANK_SHIFT_JIS" : 72,
  "KDDI_ISO_2022_JP" : 73,
  "ISO_2022_JP" : 73,
  "SOFTBANK_ISO_2022_JP" : 74,
};

// Accept forms both with and without underscores/hyphens: register an
// underscore-free alias for every key that contains one. The message handler
// below strips "_" and "-" from the requested name before the lookup.
// Object.keys() returns a snapshot, so adding aliases while looping is safe.
// NOTE(review): property names are accessed through quoted strings in places,
// presumably so that Closure does not rename them -- keep that form.
for (let code of Object.keys(Encodings)) {
  if (code["includes"]("_"))
    Encodings[code.replace(/_/g, "")] = Encodings[code];
}

addOnPreMain(function() {

  // Handles one detection request.
  //
  // Reads from aMsg["data"]:
  //   "text"     - the text to analyse
  //   "isHTML"   - truthy when the text is HTML; its negation is passed to
  //                LanguageInfo.detectLanguage as the second argument
  //   "tld", "encoding", "language" - optional hints; when all three are
  //                null/undefined the two-argument detection call is used
  //
  // Posts back { "language", "confident", "languages" }, where "languages"
  // holds up to three { "languageCode", "percent" } entries.
  onmessage = function(aMsg) {
    let data = aMsg["data"];

    let langInfo;
    if (data["tld"] == undefined && data["encoding"] == undefined && data["language"] == undefined) {
      langInfo = LanguageInfo.detectLanguage(data["text"], !data["isHTML"]);
    } else {
      // Do our best to find the given encoding in the encodings table.
      // Otherwise, just fall back to unknown.
      let enc = String(data["encoding"]).toUpperCase().replace(/[_-]/g, "");

      let encoding;
      if (Encodings.hasOwnProperty(enc))
        encoding = Encodings[enc];
      else
        encoding = Encodings["UNKNOWN_ENCODING"];

      langInfo = LanguageInfo.detectLanguage(data["text"], !data["isHTML"],
                                             data["tld"] || null,
                                             encoding,
                                             data["language"] || null);
    }

    // langInfo has to be released explicitly through Module.destroy(). Do it
    // in a finally block so the object is still released when reading the
    // results or posting the reply throws; the error itself keeps propagating.
    try {
      postMessage({
        "language": langInfo.getLanguageCode(),
        "confident": langInfo.getIsReliable(),

        "languages": new Array(3).fill(0).map((_, index) => {
          let lang = langInfo.get_languages(index);
          return {
            "languageCode": lang.getLanguageCode(),
            "percent": lang.getPercent(),
          };
        }).filter(lang => {
          // Ignore empty results.
          return lang["languageCode"] != "un" || lang["percent"] > 0;
        }),
      });
    } finally {
      Module.destroy(langInfo);
    }
  };

  // Tell the owner of this worker that requests can now be sent.
  postMessage("ready");
});