diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/public')
-rw-r--r-- | application/basilisk/components/translation/cld2/public/compact_lang_det.h | 320 | ||||
-rw-r--r-- | application/basilisk/components/translation/cld2/public/encodings.h | 169 |
2 files changed, 489 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/public/compact_lang_det.h b/application/basilisk/components/translation/cld2/public/compact_lang_det.h new file mode 100644 index 000000000..da59abd63 --- /dev/null +++ b/application/basilisk/components/translation/cld2/public/compact_lang_det.h @@ -0,0 +1,320 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +// NOTE: +// Baybayin (ancient script of the Philippines) is detected as TAGALOG. +// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE. +// HAITIAN_CREOLE is detected as such. +// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly) +// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE. +// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN. +// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian. +// MONTENEGRIN is not detected as such, but likely scores as Serbian. +// CROATIAN is detected in the Latin script +// SERBIAN is detected in the Cyrillic and Latin scripts +// Zhuang is detected in the Latin script only. +// +// The languages X_PIG_LATIN and X_KLINGON are detected in the +// extended calls ExtDetectLanguageSummary(). +// +// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure +// is high enough. 
This happens with non-text input such as the bytes of a +// JPEG, and also with text in languages outside training set. +// +// The following languages are to be detected in multiple scripts: +// AZERBAIJANI (Latin, Cyrillic*, Arabic*) +// BURMESE (Latin, Myanmar) +// HAUSA (Latin, Arabic) +// KASHMIRI (Arabic, Devanagari) +// KAZAKH (Latin, Cyrillic, Arabic) +// KURDISH (Latin*, Arabic) +// KYRGYZ (Cyrillic, Arabic) +// LIMBU (Devanagari, Limbu) +// MONGOLIAN (Cyrillic, Mongolian) +// SANSKRIT (Latin, Devanagari) +// SINDHI (Arabic, Devanagari) +// TAGALOG (Latin, Tagalog) +// TAJIK (Cyrillic, Arabic*) +// TATAR (Latin, Cyrillic, Arabic) +// TURKMEN (Latin, Cyrillic, Arabic) +// UIGHUR (Latin, Cyrillic, Arabic) +// UZBEK (Latin, Cyrillic, Arabic) +// +// * Due to a shortage of training text, AZERBAIJANI is not currently detected +// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in +// Arabic script. +// + +#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ +#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ + +#include <vector> +#include "../internal/lang_script.h" // For Language + +namespace CLD2 { + + // Scan interchange-valid UTF-8 bytes and detect most likely language, + // or set of languages. + // + // Design goals: + // Skip over big stretches of HTML tags + // Able to return ranges of different languages + // Relatively small tables and relatively fast processing + // Thread safe + // + // For HTML documents, tags are skipped, along with <script> ... </script> + // and <style> ... </style> sequences, and entities are expanded. + // + // We distinguish between bytes of the raw input buffer and bytes of non-tag + // text letters. Since tags can be over 50% of the bytes of an HTML Page, + // and are nearly all seven-bit ASCII English, we prefer to distinguish + // language mixture fractions based on just the non-tag text. 
+ // + // Inputs: text and text_length + // Code skips HTML tags and expands HTML entities, unless + // is_plain_text is true + // Outputs: + // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE + // percent3 is an array of the text percentages 0..100 of the top 3 languages + // text_bytes is the amount of non-tag/letters-only text found + // is_reliable set true if the returned Language is some amount more + // probable than the second-best Language. Calculation is a complex function + // of the length of the text and the different-script runs of text. + // Return value: the most likely Language for the majority of the input text + // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text + // defaults to ENGLISH. + // + // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for + // backwards compatibility with a different detector. + // + // The third version may return UNKNOWN_LANGUAGE, and also returns extended + // language codes from lang_script.h + // + + + // Instead of individual arguments, pass in hints as an initialized struct + // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known. + // + // Pass in hints whenever possible; doing so improves detection accuracy. The + // set of passed-in hints are all information that is external to the text + // itself. + // + // The content_language_hint is intended to come from an HTTP header + // Content-Language: field, the tld_hint from the hostname of a URL, the + // encoding_hint from an encoding detector applied to the input + // document, and the language hint from any other context you might have. + // The lang= tags inside an HTML document will be picked up as hints + // by code within the compact language detector. 
+ + typedef struct { + const char* content_language_hint; // "mi,en" boosts Maori and English + const char* tld_hint; // "id" boosts Indonesian + int encoding_hint; // JAPANESE_SHIFT_JIS (SJS) boosts Japanese + Language language_hint; // ITALIAN boosts it + } CLDHints; + + static const int kMaxResultChunkBytes = 65535; // NOTE(review): presumably the cap on ResultChunk::bytes (a uint16) -- confirm + + // For returning a vector of per-language pieces of the input buffer + // Unreliable and too-short chunks are mapped to UNKNOWN_LANGUAGE + typedef struct { + int offset; // Starting byte offset in original buffer + uint16 bytes; // Number of bytes in chunk + uint16 lang1; // Top lang, as full Language. Apply + // static_cast<Language>() to this short value. + } ResultChunk; + typedef std::vector<ResultChunk> ResultChunkVector; + + + // Scan interchange-valid UTF-8 bytes and detect most likely language + Language DetectLanguage( + const char* buffer, + int buffer_length, + bool is_plain_text, + bool* is_reliable); + + // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. + // language3[0] is usually also the return value + Language DetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable); + + // Same as above, with hints supplied + // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. + // language3[0] is usually also the return value + Language DetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // JAPANESE_SHIFT_JIS (SJS) boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable); + + // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended + // languages. 
+ // + // Extended languages are additional interface languages and Unicode + // single-language scripts, from lang_script.h + // + // language3[0] is usually also the return value + Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable); + + // Same as above, with hints supplied + // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended + // languages. + // + // Extended languages are additional Google interface languages and Unicode + // single-language scripts, from lang_script.h + // + // language3[0] is usually also the return value + Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // SJS boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + int* text_bytes, + bool* is_reliable); + + // Same as above, and also returns 3 internal language scores (in + // normalized_score3) as a ratio to normal score for real text in that + // language. Scores close to 1.0 indicate normal text, while scores far away + // from 1.0 indicate badly-skewed text or gibberish + // + Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const char* tld_hint, // "id" boosts Indonesian + int encoding_hint, // SJS boosts Japanese + Language language_hint, // ITALIAN boosts it + Language* language3, + int* percent3, + double* normalized_score3, + int* text_bytes, + bool* is_reliable); + + + // Use this one. + // Hints are collected into a struct. + // Flags (the kCLDFlag* constants) are passed in (normally zero). + // + // Also returns 3 internal language scores as a ratio to + // normal score for real text in that language. 
Scores close to 1.0 indicate + // normal text, while scores far away from 1.0 indicate badly-skewed text or + // gibberish + // + // Returns a vector of chunks in different languages, so that caller may + // spell-check, translate, or otherwise process different parts of the input + // buffer in language-dependent ways. + // + Language ExtDetectLanguageSummary( + const char* buffer, + int buffer_length, + bool is_plain_text, + const CLDHints* cld_hints, + int flags, + Language* language3, + int* percent3, + double* normalized_score3, + ResultChunkVector* resultchunkvector, + int* text_bytes, + bool* is_reliable); + + // Return version text string + // String is "code_version - data_build_date" + const char* DetectLanguageVersion(); + + + // Public use flags, debug output controls + static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads + static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr + static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML + static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr + static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr + static const int kCLDFlagEcho = 0x2000; // Echo input => stderr + + +/*** + +Flag meanings: + kCLDFlagScoreAsQuads + Normally, several languages are detected solely by their Unicode script. + Combined with appropriate lookup tables, this flag forces them instead + to be detected via quadgrams. This can be a useful refinement when looking + for meaningful text in these languages, instead of just character sets. + The default tables do not support this use. + kCLDFlagHtml + For each detection call, write an HTML file to stderr, showing the text + chunks and their detected languages. + kCLDFlagCr + In that HTML file, force a new line for each chunk. + kCLDFlagVerbose + In that HTML file, show every lookup entry. + kCLDFlagQuiet + In that HTML file, suppress most of the output detail. + kCLDFlagEcho + Echo every input buffer to stderr. 
+***/ + +// Debug output: Print the resultchunkvector to file f +void DumpResultChunkVector(FILE* f, const char* src, + ResultChunkVector* resultchunkvector); + +#ifdef CLD2_DYNAMIC_MODE + +// If compiled with dynamic mode, load data from the specified file location. +// If other data has already been loaded, it is discarded and the data is read +// in from the specified file location again (even if the file has not changed). +// WARNING: Before calling this method, language detection will always fail +// and will always return the unknown language. +void loadData(const char* fileName); + +// If compiled with dynamic mode, unload the previously-loaded data. +// WARNING: After calling this method, language detection will no longer work +// and will always return the unknown language. +void unloadData(); + +// Returns true if and only if data has been loaded via a call to loadData(...) +// and has not been subsequently unloaded via a call to unloadData(). +bool isDataLoaded(); + +#endif // #ifdef CLD2_DYNAMIC_MODE + +}; // End namespace CLD2 + +#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ diff --git a/application/basilisk/components/translation/cld2/public/encodings.h b/application/basilisk/components/translation/cld2/public/encodings.h new file mode 100644 index 000000000..1eb8f0a15 --- /dev/null +++ b/application/basilisk/components/translation/cld2/public/encodings.h @@ -0,0 +1,169 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ +#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ + +namespace CLD2 { + +// Text encoding identifiers. Values are explicit and run contiguously from +// 0 (ISO_8859_1) up to the NUM_ENCODINGS sentinel at the end. +enum Encoding { + ISO_8859_1 = 0, // Latin1 (superset of ASCII; cf. ASCII_7BIT) + ISO_8859_2 = 1, // Latin2 + ISO_8859_3 = 2, // + ISO_8859_4 = 3, // Latin4 + ISO_8859_5 = 4, // ISO-8859-5 + ISO_8859_6 = 5, // Arabic + ISO_8859_7 = 6, // Greek + ISO_8859_8 = 7, // Hebrew + ISO_8859_9 = 8, // + ISO_8859_10 = 9, // + JAPANESE_EUC_JP = 10, // EUC_JP + JAPANESE_SHIFT_JIS = 11, // SJS + JAPANESE_JIS = 12, // JIS + CHINESE_BIG5 = 13, // BIG5 + CHINESE_GB = 14, // GB + CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech + // CNS11643EUC, before that EUC-CN(!) + KOREAN_EUC_KR = 16, // KSC + UNICODE_UNUSED = 17, // Unicode + CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was + // CNS11643EUC, before that EUC. + CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was + // CNS11643EUC, before that CNS. + CHINESE_BIG5_CP950 = 20, // BIG5_CP950 + JAPANESE_CP932 = 21, // CP932 + UTF8 = 22, + UNKNOWN_ENCODING = 23, + ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127. + RUSSIAN_KOI8_R = 25, // KOI8R + RUSSIAN_CP1251 = 26, // CP1251 + + //---------------------------------------------------------- + MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii + RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian. + // Misnamed, this is _not_ KOI8-RU but KOI8-U. + // KOI8-U is used much more often than KOI8-RU. 
+ MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european + ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized + //---------------------------------------------------------- + + //---------------------------------------------------------- + MSFT_CP1254 = 31, // used for Turkish + MSFT_CP1257 = 32, // used in Baltic countries + //---------------------------------------------------------- + + //---------------------------------------------------------- + //---------------------------------------------------------- + ISO_8859_11 = 33, // aka TIS-620, used for Thai + MSFT_CP874 = 34, // used for Thai + MSFT_CP1256 = 35, // used for Arabic + + //---------------------------------------------------------- + MSFT_CP1255 = 36, // Logical Hebrew Microsoft + ISO_8859_8_I = 37, // Iso Hebrew Logical + HEBREW_VISUAL = 38, // Iso Hebrew Visual + //---------------------------------------------------------- + + //---------------------------------------------------------- + CZECH_CP852 = 39, + CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS + MSFT_CP1253 = 41, // used for Greek + RUSSIAN_CP866 = 42, + //---------------------------------------------------------- + + //---------------------------------------------------------- + // Handled by iconv in glibc + ISO_8859_13 = 43, + ISO_2022_KR = 44, + GBK = 45, + GB18030 = 46, + BIG5_HKSCS = 47, + ISO_2022_CN = 48, + + //----------------------------------------------------------- + // Following 4 encodings are deprecated (font encodings) + TSCII = 49, + TAMIL_MONO = 50, + TAMIL_BI = 51, + JAGRAN = 52, + + + MACINTOSH_ROMAN = 53, + UTF7 = 54, + + //----------------------------------------------------------- + // Following 2 encodings are deprecated (font encodings) + BHASKAR = 55, // Indic encoding - Devanagari + HTCHANAKYA = 56, // Indic encoding - Devanagari + + //----------------------------------------------------------- + UTF16BE = 57, // big-endian UTF-16 + UTF16LE = 58, // little-endian UTF-16 + UTF32BE = 59, // 
big-endian UTF-32 + UTF32LE = 60, // little-endian UTF-32 + //----------------------------------------------------------- + + //----------------------------------------------------------- + // An encoding that means "This is not text, but it may have some + // simple ASCII text embedded". Intended input conversion + // is to keep strings of >=4 seven-bit ASCII characters + BINARYENC = 61, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some Web pages allow a mixture of HZ-GB and GB-2312 by using + // ~{ ... ~} for 2-byte pairs, and the browsers support this. + HZ_GB_2312 = 62, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Some external vendors make the common input error of + // converting MSFT_CP1252 to UTF8 *twice*. + UTF8UTF8 = 63, + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Following 6 encodings are deprecated (font encodings) + TAM_ELANGO = 64, // Elango - Tamil + TAM_LTTMBARANI = 65, // Barani - Tamil + TAM_SHREE = 66, // Shree - Tamil + TAM_TBOOMIS = 67, // TBoomis - Tamil + TAM_TMNEWS = 68, // TMNews - Tamil + TAM_WEBTAMIL = 69, // Webtamil - Tamil + //----------------------------------------------------------- + + //----------------------------------------------------------- + // Shift_JIS variants used by Japanese cell phone carriers. + KDDI_SHIFT_JIS = 70, + DOCOMO_SHIFT_JIS = 71, + SOFTBANK_SHIFT_JIS = 72, + // ISO-2022-JP variants used by KDDI and SoftBank. + KDDI_ISO_2022_JP = 73, + SOFTBANK_ISO_2022_JP = 74, + //----------------------------------------------------------- + + NUM_ENCODINGS = 75, // Always keep this at the end. It is not a + // valid Encoding enum, it is only used to + // indicate the total number of Encodings. 
+}; + +} // End namespace CLD2 + +#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ + + |