From 6d614170cbfa958564eb5f824234ad5a9e484344 Mon Sep 17 00:00:00 2001 From: "Matt A. Tobin" Date: Fri, 2 Feb 2018 05:06:10 -0500 Subject: Revert "Add Basilisk" This reverts commit e72ef92b5bdc43cd2584198e2e54e951b70299e8. --- .../translation/cld2/public/compact_lang_det.h | 320 --------------------- .../components/translation/cld2/public/encodings.h | 169 ----------- 2 files changed, 489 deletions(-) delete mode 100644 application/basilisk/components/translation/cld2/public/compact_lang_det.h delete mode 100644 application/basilisk/components/translation/cld2/public/encodings.h (limited to 'application/basilisk/components/translation/cld2/public') diff --git a/application/basilisk/components/translation/cld2/public/compact_lang_det.h b/application/basilisk/components/translation/cld2/public/compact_lang_det.h deleted file mode 100644 index da59abd63..000000000 --- a/application/basilisk/components/translation/cld2/public/compact_lang_det.h +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - -// NOTE: -// Baybayin (ancient script of the Philippines) is detected as TAGALOG. -// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE. -// HAITIAN_CREOLE is detected as such. -// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly) -// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE. -// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN. -// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian. -// MONTENEGRIN is not detected as such, but likely scores as Serbian. -// CROATIAN is detected in the Latin script -// SERBIAN is detected in the Cyrililc and Latin scripts -// Zhuang is detected in the Latin script only. -// -// The languages X_PIG_LATIN and X_KLINGON are detected in the -// extended calls ExtDetectLanguageSummary(). -// -// UNKNOWN_LANGUAGE is returned if no language's internal reliablity measure -// is high enough. This happens with non-text input such as the bytes of a -// JPEG, and also with text in languages outside training set. -// -// The following languages are to be detected in multiple scripts: -// AZERBAIJANI (Latin, Cyrillic*, Arabic*) -// BURMESE (Latin, Myanmar) -// HAUSA (Latin, Arabic) -// KASHMIRI (Arabic, Devanagari) -// KAZAKH (Latin, Cyrillic, Arabic) -// KURDISH (Latin*, Arabic) -// KYRGYZ (Cyrillic, Arabic) -// LIMBU (Devanagari, Limbu) -// MONGOLIAN (Cyrillic, Mongolian) -// SANSKRIT (Latin, Devanagari) -// SINDHI (Arabic, Devanagari) -// TAGALOG (Latin, Tagalog) -// TAJIK (Cyrillic, Arabic*) -// TATAR (Latin, Cyrillic, Arabic) -// TURKMEN (Latin, Cyrillic, Arabic) -// UIGHUR (Latin, Cyrillic, Arabic) -// UZBEK (Latin, Cyrillic, Arabic) -// -// * Due to a shortage of training text, AZERBAIJANI is not currently detected -// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in -// Arabic script. -// - -#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ -#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ - -#include -#include "../internal/lang_script.h" // For Language - -namespace CLD2 { - - // Scan interchange-valid UTF-8 bytes and detect most likely language, - // or set of languages. - // - // Design goals: - // Skip over big stretches of HTML tags - // Able to return ranges of different languages - // Relatively small tables and relatively fast processing - // Thread safe - // - // For HTML documents, tags are skipped, along with - // and sequences, and entities are expanded. - // - // We distinguish between bytes of the raw input buffer and bytes of non-tag - // text letters. Since tags can be over 50% of the bytes of an HTML Page, - // and are nearly all seven-bit ASCII English, we prefer to distinguish - // language mixture fractions based on just the non-tag text. - // - // Inputs: text and text_length - // Code skips HTML tags and expands HTML entities, unless - // is_plain_text is true - // Outputs: - // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE - // percent3 is an array of the text percentages 0..100 of the top 3 languages - // text_bytes is the amount of non-tag/letters-only text found - // is_reliable set true if the returned Language is some amount more - // probable then the second-best Language. Calculation is a complex function - // of the length of the text and the different-script runs of text. - // Return value: the most likely Language for the majority of the input text - // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text - // defaults to ENGLISH. - // - // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for - // backwards compatibility with a different detector. - // - // The third version may return UNKNOWN_LANGUAGE, and also returns extended - // language codes from lang_script.h - // - - - // Instead of individual arguments, pass in hints as an initialized struct - // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known. - // - // Pass in hints whenever possible; doing so improves detection accuracy. The - // set of passed-in hints are all information that is external to the text - // itself. - // - // The content_language_hint is intended to come from an HTTP header - // Content-Language: field, the tld_hint from the hostname of a URL, the - // encoding-hint from an encoding detector applied to the input - // document, and the language hint from any other context you might have. - // The lang= tags inside an HTML document will be picked up as hints - // by code within the compact language detector. - - typedef struct { - const char* content_language_hint; // "mi,en" boosts Maori and English - const char* tld_hint; // "id" boosts Indonesian - int encoding_hint; // SJS boosts Japanese - Language language_hint; // ITALIAN boosts it - } CLDHints; - - static const int kMaxResultChunkBytes = 65535; - - // For returning a vector of per-language pieces of the input buffer - // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE - typedef struct { - int offset; // Starting byte offset in original buffer - uint16 bytes; // Number of bytes in chunk - uint16 lang1; // Top lang, as full Language. Apply - // static_cast() to this short value. - } ResultChunk; - typedef std::vector ResultChunkVector; - - - // Scan interchange-valid UTF-8 bytes and detect most likely language - Language DetectLanguage( - const char* buffer, - int buffer_length, - bool is_plain_text, - bool* is_reliable); - - // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. - // language3[0] is usually also the return value - Language DetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - Language* language3, - int* percent3, - int* text_bytes, - bool* is_reliable); - - // Same as above, with hints supplied - // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages. - // language3[0] is usually also the return value - Language DetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - const char* tld_hint, // "id" boosts Indonesian - int encoding_hint, // SJS boosts Japanese - Language language_hint, // ITALIAN boosts it - Language* language3, - int* percent3, - int* text_bytes, - bool* is_reliable); - - // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended - // languages. - // - // Extended languages are additional interface languages and Unicode - // single-language scripts, from lang_script.h - // - // language3[0] is usually also the return value - Language ExtDetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - Language* language3, - int* percent3, - int* text_bytes, - bool* is_reliable); - - // Same as above, with hints supplied - // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended - // languages. - // - // Extended languages are additional Google interface languages and Unicode - // single-language scripts, from lang_script.h - // - // language3[0] is usually also the return value - Language ExtDetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - const char* tld_hint, // "id" boosts Indonesian - int encoding_hint, // SJS boosts Japanese - Language language_hint, // ITALIAN boosts it - Language* language3, - int* percent3, - int* text_bytes, - bool* is_reliable); - - // Same as above, and also returns 3 internal language scores as a ratio to - // normal score for real text in that language. Scores close to 1.0 indicate - // normal text, while scores far away from 1.0 indicate badly-skewed text or - // gibberish - // - Language ExtDetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - const char* tld_hint, // "id" boosts Indonesian - int encoding_hint, // SJS boosts Japanese - Language language_hint, // ITALIAN boosts it - Language* language3, - int* percent3, - double* normalized_score3, - int* text_bytes, - bool* is_reliable); - - - // Use this one. - // Hints are collected into a struct. - // Flags are passed in (normally zero). - // - // Also returns 3 internal language scores as a ratio to - // normal score for real text in that language. Scores close to 1.0 indicate - // normal text, while scores far away from 1.0 indicate badly-skewed text or - // gibberish - // - // Returns a vector of chunks in different languages, so that caller may - // spell-check, translate, or otherwaise process different parts of the input - // buffer in language-dependant ways. - // - Language ExtDetectLanguageSummary( - const char* buffer, - int buffer_length, - bool is_plain_text, - const CLDHints* cld_hints, - int flags, - Language* language3, - int* percent3, - double* normalized_score3, - ResultChunkVector* resultchunkvector, - int* text_bytes, - bool* is_reliable); - - // Return version text string - // String is "code_version - data_build_date" - const char* DetectLanguageVersion(); - - - // Public use flags, debug output controls - static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads - static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr - static const int kCLDFlagCr = 0x0400; // per chunk if HTML - static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr - static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr - static const int kCLDFlagEcho = 0x2000; // Echo input => stderr - - -/*** - -Flag meanings: - kCLDFlagScoreAsQuads - Normally, several languages are detected solely by their Unicode script. - Combined with appropritate lookup tables, this flag forces them instead - to be detected via quadgrams. This can be a useful refinement when looking - for meaningful text in these languages, instead of just character sets. - The default tables do not support this use. - kCLDFlagHtml - For each detection call, write an HTML file to stderr, showing the text - chunks and their detected languages. - kCLDFlagCr - In that HTML file, force a new line for each chunk. - kCLDFlagVerbose - In that HTML file, show every lookup entry. - kCLDFlagQuiet - In that HTML file, suppress most of the output detail. - kCLDFlagEcho - Echo every input buffer to stderr. -***/ - -// Debug output: Print the resultchunkvector to file f -void DumpResultChunkVector(FILE* f, const char* src, - ResultChunkVector* resultchunkvector); - -#ifdef CLD2_DYNAMIC_MODE - -// If compiled with dynamic mode, load data from the specified file location. -// If other data has already been loaded, it is discarded and the data is read -// in from the specified file location again (even if the file has not changed). -// WARNING: Before calling this method, language detection will always fail -// and will always return the unknown language. -void loadData(const char* fileName); - -// If compiled with dynamic mode, unload the previously-loaded data. -// WARNING: After calling this method, language detection will no longer work -// and will always return the unknown language. -void unloadData(); - -// Returns true if and only if data has been loaded via a call to loadData(...) -// and has not been subsequently unladed via a call to unloadDate(). -bool isDataLoaded(); - -#endif // #ifdef CLD2_DYNAMIC_MODE - -}; // End namespace CLD2 - -#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_ diff --git a/application/basilisk/components/translation/cld2/public/encodings.h b/application/basilisk/components/translation/cld2/public/encodings.h deleted file mode 100644 index 1eb8f0a15..000000000 --- a/application/basilisk/components/translation/cld2/public/encodings.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - -#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ -#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ - -namespace CLD2 { - -enum Encoding { - ISO_8859_1 = 0, // ASCII - ISO_8859_2 = 1, // Latin2 - ISO_8859_3 = 2, // - ISO_8859_4 = 3, // Latin4 - ISO_8859_5 = 4, // ISO-8859-5 - ISO_8859_6 = 5, // Arabic - ISO_8859_7 = 6, // Greek - ISO_8859_8 = 7, // Hebrew - ISO_8859_9 = 8, // - ISO_8859_10 = 9, // - JAPANESE_EUC_JP = 10, // EUC_JP - JAPANESE_SHIFT_JIS = 11, // SJS - JAPANESE_JIS = 12, // JIS - CHINESE_BIG5 = 13, // BIG5 - CHINESE_GB = 14, // GB - CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech - // CNS11643EUC, before that EUC-CN(!) - KOREAN_EUC_KR = 16, // KSC - UNICODE_UNUSED = 17, // Unicode - CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was - // CNS11643EUC, before that EUC. - CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was - // CNS11643EUC, before that CNS. - CHINESE_BIG5_CP950 = 20, // BIG5_CP950 - JAPANESE_CP932 = 21, // CP932 - UTF8 = 22, - UNKNOWN_ENCODING = 23, - ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127. - RUSSIAN_KOI8_R = 25, // KOI8R - RUSSIAN_CP1251 = 26, // CP1251 - - //---------------------------------------------------------- - MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii - RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian. - // Misnamed, this is _not_ KOI8-RU but KOI8-U. - // KOI8-U is used much more often than KOI8-RU. - MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european - ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized - //---------------------------------------------------------- - - //---------------------------------------------------------- - MSFT_CP1254 = 31, // used for Turkish - MSFT_CP1257 = 32, // used in Baltic countries - //---------------------------------------------------------- - - //---------------------------------------------------------- - //---------------------------------------------------------- - ISO_8859_11 = 33, // aka TIS-620, used for Thai - MSFT_CP874 = 34, // used for Thai - MSFT_CP1256 = 35, // used for Arabic - - //---------------------------------------------------------- - MSFT_CP1255 = 36, // Logical Hebrew Microsoft - ISO_8859_8_I = 37, // Iso Hebrew Logical - HEBREW_VISUAL = 38, // Iso Hebrew Visual - //---------------------------------------------------------- - - //---------------------------------------------------------- - CZECH_CP852 = 39, - CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS - MSFT_CP1253 = 41, // used for Greek - RUSSIAN_CP866 = 42, - //---------------------------------------------------------- - - //---------------------------------------------------------- - // Handled by iconv in glibc - ISO_8859_13 = 43, - ISO_2022_KR = 44, - GBK = 45, - GB18030 = 46, - BIG5_HKSCS = 47, - ISO_2022_CN = 48, - - //----------------------------------------------------------- - // Following 4 encodings are deprecated (font encodings) - TSCII = 49, - TAMIL_MONO = 50, - TAMIL_BI = 51, - JAGRAN = 52, - - - MACINTOSH_ROMAN = 53, - UTF7 = 54, - - //----------------------------------------------------------- - // Following 2 encodings are deprecated (font encodings) - BHASKAR = 55, // Indic encoding - Devanagari - HTCHANAKYA = 56, // 56 Indic encoding - Devanagari - - //----------------------------------------------------------- - UTF16BE = 57, // big-endian UTF-16 - UTF16LE = 58, // little-endian UTF-16 - UTF32BE = 59, // big-endian UTF-32 - UTF32LE = 60, // little-endian UTF-32 - //----------------------------------------------------------- - - //----------------------------------------------------------- - // An encoding that means "This is not text, but it may have some - // simple ASCII text embedded". Intended input conversion - // is to keep strings of >=4 seven-bit ASCII characters - BINARYENC = 61, - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Some Web pages allow a mixture of HZ-GB and GB-2312 by using - // ~{ ... ~} for 2-byte pairs, and the browsers support this. - HZ_GB_2312 = 62, - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Some external vendors make the common input error of - // converting MSFT_CP1252 to UTF8 *twice*. - UTF8UTF8 = 63, - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Following 6 encodings are deprecated (font encodings) - TAM_ELANGO = 64, // Elango - Tamil - TAM_LTTMBARANI = 65, // Barani - Tamil - TAM_SHREE = 66, // Shree - Tamil - TAM_TBOOMIS = 67, // TBoomis - Tamil - TAM_TMNEWS = 68, // TMNews - Tamil - TAM_WEBTAMIL = 69, // Webtamil - Tamil - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Shift_JIS variants used by Japanese cell phone carriers. - KDDI_SHIFT_JIS = 70, - DOCOMO_SHIFT_JIS = 71, - SOFTBANK_SHIFT_JIS = 72, - // ISO-2022-JP variants used by KDDI and SoftBank. - KDDI_ISO_2022_JP = 73, - SOFTBANK_ISO_2022_JP = 74, - //----------------------------------------------------------- - - NUM_ENCODINGS = 75, // Always keep this at the end. It is not a - // valid Encoding enum, it is only used to - // indicate the total number of Encodings. -}; - -} // End namespace CLD2 - -#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__ - - -- cgit v1.2.3