diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/post.js')
-rw-r--r-- | application/basilisk/components/translation/cld2/post.js | 171 |
1 files changed, 171 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

// The WebIDL binder places static methods on the prototype rather than on
// the constructor, which is clumsy and not idiomatic. Re-export the method
// on the constructor so it can be called as LanguageInfo.detectLanguage(...).
LanguageInfo.detectLanguage = LanguageInfo.prototype.detectLanguage;

// Closure is overzealous in its function call optimization, and tries to
// turn these singleton methods into unbound function calls. Binding them
// to ensureCache keeps `this` correct however they end up being invoked.
ensureCache.alloc = ensureCache.alloc.bind(ensureCache);
ensureCache.prepare = ensureCache.prepare.bind(ensureCache);

// From public/encodings.h. Unfortunately, the WebIDL binder doesn't
// allow us to define or automatically derive these in the IDL.
// Maps encoding names to the numeric ids passed to
// LanguageInfo.detectLanguage. Several names are aliases that share one id.
var Encodings = {
  'ISO_8859_1': 0,
  'ISO_8859_2': 1,
  'ISO_8859_3': 2,
  'ISO_8859_4': 3,
  'ISO_8859_5': 4,
  'ISO_8859_6': 5,
  'ISO_8859_7': 6,
  'ISO_8859_8': 7,
  'ISO_8859_9': 8,
  'ISO_8859_10': 9,
  'JAPANESE_EUC_JP': 10,
  'EUC_JP': 10,
  'JAPANESE_SHIFT_JIS': 11,
  'SHIFT_JIS': 11,
  'JAPANESE_JIS': 12,
  'JIS': 12,
  'CHINESE_BIG5': 13,
  'BIG5': 13,
  'CHINESE_GB': 14,
  'CHINESE_EUC_CN': 15,
  'EUC_CN': 15,
  'KOREAN_EUC_KR': 16,
  'EUC_KR': 16,
  'UNICODE_UNUSED': 17,
  'CHINESE_EUC_DEC': 18,
  'EUC_DEC': 18,
  'CHINESE_CNS': 19,
  'CNS': 19,
  'CHINESE_BIG5_CP950': 20,
  'BIG5_CP950': 20,
  'JAPANESE_CP932': 21,
  'CP932': 21,
  'UTF8': 22,
  'UNKNOWN_ENCODING': 23,
  'ASCII_7BIT': 24,
  'RUSSIAN_KOI8_R': 25,
  'KOI8_R': 25,
  'RUSSIAN_CP1251': 26,
  'CP1251': 26,
  'MSFT_CP1252': 27,
  'CP1252': 27,
  'RUSSIAN_KOI8_RU': 28,
  'KOI8_RU': 28,
  'MSFT_CP1250': 29,
  'CP1250': 29,
  'ISO_8859_15': 30,
  'MSFT_CP1254': 31,
  'CP1254': 31,
  'MSFT_CP1257': 32,
  'CP1257': 32,
  'ISO_8859_11': 33,
  'MSFT_CP874': 34,
  'CP874': 34,
  'MSFT_CP1256': 35,
  'CP1256': 35,
  'MSFT_CP1255': 36,
  'CP1255': 36,
  'ISO_8859_8_I': 37,
  'HEBREW_VISUAL': 38,
  'CZECH_CP852': 39,
  'CP852': 39,
  'CZECH_CSN_369103': 40,
  'CSN_369103': 40,
  'MSFT_CP1253': 41,
  'CP1253': 41,
  'RUSSIAN_CP866': 42,
  'CP866': 42,
  'ISO_8859_13': 43,
  'ISO_2022_KR': 44,
  'GBK': 45,
  'GB18030': 46,
  'BIG5_HKSCS': 47,
  'ISO_2022_CN': 48,
  'TSCII': 49,
  'TAMIL_MONO': 50,
  'TAMIL_BI': 51,
  'JAGRAN': 52,
  'MACINTOSH_ROMAN': 53,
  'UTF7': 54,
  'BHASKAR': 55,
  'HTCHANAKYA': 56,
  'UTF16BE': 57,
  'UTF16LE': 58,
  'UTF32BE': 59,
  'UTF32LE': 60,
  'BINARYENC': 61,
  'HZ_GB_2312': 62,
  'UTF8UTF8': 63,
  'TAM_ELANGO': 64,
  'TAM_LTTMBARANI': 65,
  'TAM_SHREE': 66,
  'TAM_TBOOMIS': 67,
  'TAM_TMNEWS': 68,
  'TAM_WEBTAMIL': 69,
  'KDDI_SHIFT_JIS': 70,
  'DOCOMO_SHIFT_JIS': 71,
  'SOFTBANK_SHIFT_JIS': 72,
  'KDDI_ISO_2022_JP': 73,
  'ISO_2022_JP': 73,
  'SOFTBANK_ISO_2022_JP': 74,
};

// Accept forms both with and without underscores/hyphens: every key that
// contains an underscore also gets an underscore-free alias. The lookup in
// the message handler strips "_" and "-" from the requested name, so these
// aliases are the keys it actually matches.
for (const code of Object.keys(Encodings)) {
  if (code['includes']("_")) {
    Encodings[code.replace(/_/g, "")] = Encodings[code];
  }
}

addOnPreMain(function() {

  // Handles one detection request per message.
  //
  // Fields read from the message data: 'text', 'isHTML', and the optional
  // hints 'tld', 'encoding' and 'language'.
  //
  // Posts back an object with 'language', 'confident' and 'languages', the
  // latter holding up to three { languageCode, percent } entries.
  onmessage = function(aMsg) {
    const data = aMsg['data'];

    let langInfo;
    // `== undefined` is deliberately loose so that null hints count as absent.
    if (data['tld'] == undefined && data['encoding'] == undefined && data['language'] == undefined) {
      langInfo = LanguageInfo.detectLanguage(data['text'], !data['isHTML']);
    } else {
      // Do our best to find the given encoding in the encodings table.
      // Otherwise, just fall back to unknown.
      const enc = String(data['encoding']).toUpperCase().replace(/[_-]/g, "");

      let encoding;
      if (Encodings.hasOwnProperty(enc)) {
        encoding = Encodings[enc];
      } else {
        encoding = Encodings['UNKNOWN_ENCODING'];
      }

      langInfo = LanguageInfo.detectLanguage(data['text'], !data['isHTML'],
                                             data['tld'] || null,
                                             encoding,
                                             data['language'] || null);
    }

    try {
      postMessage({
        'language': langInfo.getLanguageCode(),
        'confident': langInfo.getIsReliable(),

        // Read the three result slots, then drop the ones that are empty.
        'languages': Array.from({'length': 3}, (_, index) => {
          const lang = langInfo.get_languages(index);
          return {
            'languageCode': lang.getLanguageCode(),
            'percent': lang.getPercent(),
          };
        }).filter(lang => {
          // Ignore empty results.
          return lang['languageCode'] != "un" || lang['percent'] > 0;
        }),
      });
    } finally {
      // Release the result even when reading it or posting the reply throws;
      // otherwise the object would never be destroyed. The guard keeps a
      // missing result from masking the original error.
      if (langInfo) {
        Module.destroy(langInfo);
      }
    }
  };

  postMessage("ready");
});