summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/public
diff options
context:
space:
mode:
authorMatt A. Tobin <email@mattatobin.com>2018-02-02 05:06:10 -0500
committerMatt A. Tobin <email@mattatobin.com>2018-02-02 05:06:10 -0500
commit6d614170cbfa958564eb5f824234ad5a9e484344 (patch)
tree3e1eb384382f30987cb2e64bd654afa8b74fd06b /application/basilisk/components/translation/cld2/public
parent2a6b605d64b19411a300efdbbd7f78c349f90206 (diff)
downloadUXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar
UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.gz
UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.lz
UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.xz
UXP-6d614170cbfa958564eb5f824234ad5a9e484344.zip
Revert "Add Basilisk"
This reverts commit e72ef92b5bdc43cd2584198e2e54e951b70299e8.
Diffstat (limited to 'application/basilisk/components/translation/cld2/public')
-rw-r--r--application/basilisk/components/translation/cld2/public/compact_lang_det.h320
-rw-r--r--application/basilisk/components/translation/cld2/public/encodings.h169
2 files changed, 0 insertions, 489 deletions
diff --git a/application/basilisk/components/translation/cld2/public/compact_lang_det.h b/application/basilisk/components/translation/cld2/public/compact_lang_det.h
deleted file mode 100644
index da59abd63..000000000
--- a/application/basilisk/components/translation/cld2/public/compact_lang_det.h
+++ /dev/null
@@ -1,320 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Author: dsites@google.com (Dick Sites)
-//
-
-// NOTE:
-// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
-// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
-// HAITIAN_CREOLE is detected as such.
-// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
-// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
-// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
-// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
-// MONTENEGRIN is not detected as such, but likely scores as Serbian.
-// CROATIAN is detected in the Latin script
-// SERBIAN is detected in the Cyrillic and Latin scripts
-// Zhuang is detected in the Latin script only.
-//
-// The languages X_PIG_LATIN and X_KLINGON are detected in the
-// extended calls ExtDetectLanguageSummary().
-//
-// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
-// is high enough. This happens with non-text input such as the bytes of a
-// JPEG, and also with text in languages outside the training set.
-//
-// The following languages are to be detected in multiple scripts:
-// AZERBAIJANI (Latin, Cyrillic*, Arabic*)
-// BURMESE (Latin, Myanmar)
-// HAUSA (Latin, Arabic)
-// KASHMIRI (Arabic, Devanagari)
-// KAZAKH (Latin, Cyrillic, Arabic)
-// KURDISH (Latin*, Arabic)
-// KYRGYZ (Cyrillic, Arabic)
-// LIMBU (Devanagari, Limbu)
-// MONGOLIAN (Cyrillic, Mongolian)
-// SANSKRIT (Latin, Devanagari)
-// SINDHI (Arabic, Devanagari)
-// TAGALOG (Latin, Tagalog)
-// TAJIK (Cyrillic, Arabic*)
-// TATAR (Latin, Cyrillic, Arabic)
-// TURKMEN (Latin, Cyrillic, Arabic)
-// UIGHUR (Latin, Cyrillic, Arabic)
-// UZBEK (Latin, Cyrillic, Arabic)
-//
-// * Due to a shortage of training text, AZERBAIJANI is not currently detected
-// in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
-// Arabic script.
-//
-
-#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
-#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
-
-#include <vector>
-#include "../internal/lang_script.h" // For Language
-
-namespace CLD2 {
-
- // Scan interchange-valid UTF-8 bytes and detect most likely language,
- // or set of languages.
- //
- // Design goals:
- // Skip over big stretches of HTML tags
- // Able to return ranges of different languages
- // Relatively small tables and relatively fast processing
- // Thread safe
- //
- // For HTML documents, tags are skipped, along with <script> ... </script>
- // and <style> ... </style> sequences, and entities are expanded.
- //
- // We distinguish between bytes of the raw input buffer and bytes of non-tag
- // text letters. Since tags can be over 50% of the bytes of an HTML Page,
- // and are nearly all seven-bit ASCII English, we prefer to distinguish
- // language mixture fractions based on just the non-tag text.
- //
- // Inputs: text and text_length
- // Code skips HTML tags and expands HTML entities, unless
- // is_plain_text is true
- // Outputs:
- // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
- // percent3 is an array of the text percentages 0..100 of the top 3 languages
- // text_bytes is the amount of non-tag/letters-only text found
- // is_reliable set true if the returned Language is some amount more
- // probable than the second-best Language. Calculation is a complex function
- // of the length of the text and the different-script runs of text.
- // Return value: the most likely Language for the majority of the input text
- // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
- // defaults to ENGLISH.
- //
- // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
- // backwards compatibility with a different detector.
- //
- // The third version may return UNKNOWN_LANGUAGE, and also returns extended
- // language codes from lang_script.h
- //
-
-
- // Instead of individual arguments, pass in hints as an initialized struct
- // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
- //
- // Pass in hints whenever possible; doing so improves detection accuracy. The
- // set of passed-in hints are all information that is external to the text
- // itself.
- //
- // The content_language_hint is intended to come from an HTTP header
- // Content-Language: field, the tld_hint from the hostname of a URL, the
- // encoding-hint from an encoding detector applied to the input
- // document, and the language hint from any other context you might have.
- // The lang= tags inside an HTML document will be picked up as hints
- // by code within the compact language detector.
-
- typedef struct {
- const char* content_language_hint; // "mi,en" boosts Maori and English
- const char* tld_hint; // "id" boosts Indonesian
- int encoding_hint; // SJS boosts Japanese
- Language language_hint; // ITALIAN boosts it
- } CLDHints;
-
- static const int kMaxResultChunkBytes = 65535;
-
- // For returning a vector of per-language pieces of the input buffer
- // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
- typedef struct {
- int offset; // Starting byte offset in original buffer
- uint16 bytes; // Number of bytes in chunk
- uint16 lang1; // Top lang, as full Language. Apply
- // static_cast<Language>() to this short value.
- } ResultChunk;
- typedef std::vector<ResultChunk> ResultChunkVector;
-
-
- // Scan interchange-valid UTF-8 bytes and detect most likely language
- Language DetectLanguage(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- bool* is_reliable);
-
- // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
- // language3[0] is usually also the return value
- Language DetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- Language* language3,
- int* percent3,
- int* text_bytes,
- bool* is_reliable);
-
- // Same as above, with hints supplied
- // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
- // language3[0] is usually also the return value
- Language DetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- const char* tld_hint, // "id" boosts Indonesian
- int encoding_hint, // SJS boosts Japanese
- Language language_hint, // ITALIAN boosts it
- Language* language3,
- int* percent3,
- int* text_bytes,
- bool* is_reliable);
-
- // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
- // languages.
- //
- // Extended languages are additional interface languages and Unicode
- // single-language scripts, from lang_script.h
- //
- // language3[0] is usually also the return value
- Language ExtDetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- Language* language3,
- int* percent3,
- int* text_bytes,
- bool* is_reliable);
-
- // Same as above, with hints supplied
- // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
- // languages.
- //
- // Extended languages are additional Google interface languages and Unicode
- // single-language scripts, from lang_script.h
- //
- // language3[0] is usually also the return value
- Language ExtDetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- const char* tld_hint, // "id" boosts Indonesian
- int encoding_hint, // SJS boosts Japanese
- Language language_hint, // ITALIAN boosts it
- Language* language3,
- int* percent3,
- int* text_bytes,
- bool* is_reliable);
-
- // Same as above, and also returns 3 internal language scores as a ratio to
- // normal score for real text in that language. Scores close to 1.0 indicate
- // normal text, while scores far away from 1.0 indicate badly-skewed text or
- // gibberish
- //
- Language ExtDetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- const char* tld_hint, // "id" boosts Indonesian
- int encoding_hint, // SJS boosts Japanese
- Language language_hint, // ITALIAN boosts it
- Language* language3,
- int* percent3,
- double* normalized_score3,
- int* text_bytes,
- bool* is_reliable);
-
-
- // Use this one.
- // Hints are collected into a struct.
- // Flags are passed in (normally zero).
- //
- // Also returns 3 internal language scores as a ratio to
- // normal score for real text in that language. Scores close to 1.0 indicate
- // normal text, while scores far away from 1.0 indicate badly-skewed text or
- // gibberish
- //
- // Returns a vector of chunks in different languages, so that caller may
- // spell-check, translate, or otherwise process different parts of the input
- // buffer in language-dependent ways.
- //
- Language ExtDetectLanguageSummary(
- const char* buffer,
- int buffer_length,
- bool is_plain_text,
- const CLDHints* cld_hints,
- int flags,
- Language* language3,
- int* percent3,
- double* normalized_score3,
- ResultChunkVector* resultchunkvector,
- int* text_bytes,
- bool* is_reliable);
-
- // Return version text string
- // String is "code_version - data_build_date"
- const char* DetectLanguageVersion();
-
-
- // Public use flags, debug output controls
- static const int kCLDFlagScoreAsQuads = 0x0100; // Force Greek, etc. => quads
- static const int kCLDFlagHtml = 0x0200; // Debug HTML => stderr
- static const int kCLDFlagCr = 0x0400; // <cr> per chunk if HTML
- static const int kCLDFlagVerbose = 0x0800; // More debug HTML => stderr
- static const int kCLDFlagQuiet = 0x1000; // Less debug HTML => stderr
- static const int kCLDFlagEcho = 0x2000; // Echo input => stderr
-
-
-/***
-
-Flag meanings:
- kCLDFlagScoreAsQuads
- Normally, several languages are detected solely by their Unicode script.
- Combined with appropriate lookup tables, this flag forces them instead
- to be detected via quadgrams. This can be a useful refinement when looking
- for meaningful text in these languages, instead of just character sets.
- The default tables do not support this use.
- kCLDFlagHtml
- For each detection call, write an HTML file to stderr, showing the text
- chunks and their detected languages.
- kCLDFlagCr
- In that HTML file, force a new line for each chunk.
- kCLDFlagVerbose
- In that HTML file, show every lookup entry.
- kCLDFlagQuiet
- In that HTML file, suppress most of the output detail.
- kCLDFlagEcho
- Echo every input buffer to stderr.
-***/
-
-// Debug output: Print the resultchunkvector to file f
-void DumpResultChunkVector(FILE* f, const char* src,
- ResultChunkVector* resultchunkvector);
-
-#ifdef CLD2_DYNAMIC_MODE
-
-// If compiled with dynamic mode, load data from the specified file location.
-// If other data has already been loaded, it is discarded and the data is read
-// in from the specified file location again (even if the file has not changed).
-// WARNING: Before calling this method, language detection will always fail
-// and will always return the unknown language.
-void loadData(const char* fileName);
-
-// If compiled with dynamic mode, unload the previously-loaded data.
-// WARNING: After calling this method, language detection will no longer work
-// and will always return the unknown language.
-void unloadData();
-
-// Returns true if and only if data has been loaded via a call to loadData(...)
-// and has not been subsequently unloaded via a call to unloadData().
-bool isDataLoaded();
-
-#endif // #ifdef CLD2_DYNAMIC_MODE
-
-}; // End namespace CLD2
-
-#endif // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
diff --git a/application/basilisk/components/translation/cld2/public/encodings.h b/application/basilisk/components/translation/cld2/public/encodings.h
deleted file mode 100644
index 1eb8f0a15..000000000
--- a/application/basilisk/components/translation/cld2/public/encodings.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Author: dsites@google.com (Dick Sites)
-//
-
-#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-
-namespace CLD2 {
-
-enum Encoding {
- ISO_8859_1 = 0, // ASCII
- ISO_8859_2 = 1, // Latin2
- ISO_8859_3 = 2, //
- ISO_8859_4 = 3, // Latin4
- ISO_8859_5 = 4, // ISO-8859-5
- ISO_8859_6 = 5, // Arabic
- ISO_8859_7 = 6, // Greek
- ISO_8859_8 = 7, // Hebrew
- ISO_8859_9 = 8, //
- ISO_8859_10 = 9, //
- JAPANESE_EUC_JP = 10, // EUC_JP
- JAPANESE_SHIFT_JIS = 11, // SJS
- JAPANESE_JIS = 12, // JIS
- CHINESE_BIG5 = 13, // BIG5
- CHINESE_GB = 14, // GB
- CHINESE_EUC_CN = 15, // Misnamed. Should be EUC_TW. Was Basis Tech
- // CNS11643EUC, before that EUC-CN(!)
- KOREAN_EUC_KR = 16, // KSC
- UNICODE_UNUSED = 17, // Unicode
- CHINESE_EUC_DEC = 18, // Misnamed. Should be EUC_TW. Was
- // CNS11643EUC, before that EUC.
- CHINESE_CNS = 19, // Misnamed. Should be EUC_TW. Was
- // CNS11643EUC, before that CNS.
- CHINESE_BIG5_CP950 = 20, // BIG5_CP950
- JAPANESE_CP932 = 21, // CP932
- UTF8 = 22,
- UNKNOWN_ENCODING = 23,
- ASCII_7BIT = 24, // ISO_8859_1 with all characters <= 127.
- RUSSIAN_KOI8_R = 25, // KOI8R
- RUSSIAN_CP1251 = 26, // CP1251
-
- //----------------------------------------------------------
- MSFT_CP1252 = 27, // 27: CP1252 aka MSFT euro ascii
- RUSSIAN_KOI8_RU = 28, // CP21866 aka KOI8-U, used for Ukrainian.
- // Misnamed, this is _not_ KOI8-RU but KOI8-U.
- // KOI8-U is used much more often than KOI8-RU.
- MSFT_CP1250 = 29, // CP1250 aka MSFT eastern european
- ISO_8859_15 = 30, // aka ISO_8859_0 aka ISO_8859_1 euroized
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- MSFT_CP1254 = 31, // used for Turkish
- MSFT_CP1257 = 32, // used in Baltic countries
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- //----------------------------------------------------------
- ISO_8859_11 = 33, // aka TIS-620, used for Thai
- MSFT_CP874 = 34, // used for Thai
- MSFT_CP1256 = 35, // used for Arabic
-
- //----------------------------------------------------------
- MSFT_CP1255 = 36, // Logical Hebrew Microsoft
- ISO_8859_8_I = 37, // Iso Hebrew Logical
- HEBREW_VISUAL = 38, // Iso Hebrew Visual
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- CZECH_CP852 = 39,
- CZECH_CSN_369103 = 40, // aka ISO_IR_139 aka KOI8_CS
- MSFT_CP1253 = 41, // used for Greek
- RUSSIAN_CP866 = 42,
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- // Handled by iconv in glibc
- ISO_8859_13 = 43,
- ISO_2022_KR = 44,
- GBK = 45,
- GB18030 = 46,
- BIG5_HKSCS = 47,
- ISO_2022_CN = 48,
-
- //-----------------------------------------------------------
- // Following 4 encodings are deprecated (font encodings)
- TSCII = 49,
- TAMIL_MONO = 50,
- TAMIL_BI = 51,
- JAGRAN = 52,
-
-
- MACINTOSH_ROMAN = 53,
- UTF7 = 54,
-
- //-----------------------------------------------------------
- // Following 2 encodings are deprecated (font encodings)
- BHASKAR = 55, // Indic encoding - Devanagari
- HTCHANAKYA = 56, // 56 Indic encoding - Devanagari
-
- //-----------------------------------------------------------
- UTF16BE = 57, // big-endian UTF-16
- UTF16LE = 58, // little-endian UTF-16
- UTF32BE = 59, // big-endian UTF-32
- UTF32LE = 60, // little-endian UTF-32
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // An encoding that means "This is not text, but it may have some
- // simple ASCII text embedded". Intended input conversion
- // is to keep strings of >=4 seven-bit ASCII characters
- BINARYENC = 61,
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
- // ~{ ... ~} for 2-byte pairs, and the browsers support this.
- HZ_GB_2312 = 62,
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Some external vendors make the common input error of
- // converting MSFT_CP1252 to UTF8 *twice*.
- UTF8UTF8 = 63,
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Following 6 encodings are deprecated (font encodings)
- TAM_ELANGO = 64, // Elango - Tamil
- TAM_LTTMBARANI = 65, // Barani - Tamil
- TAM_SHREE = 66, // Shree - Tamil
- TAM_TBOOMIS = 67, // TBoomis - Tamil
- TAM_TMNEWS = 68, // TMNews - Tamil
- TAM_WEBTAMIL = 69, // Webtamil - Tamil
- //-----------------------------------------------------------
-
- //-----------------------------------------------------------
- // Shift_JIS variants used by Japanese cell phone carriers.
- KDDI_SHIFT_JIS = 70,
- DOCOMO_SHIFT_JIS = 71,
- SOFTBANK_SHIFT_JIS = 72,
- // ISO-2022-JP variants used by KDDI and SoftBank.
- KDDI_ISO_2022_JP = 73,
- SOFTBANK_ISO_2022_JP = 74,
- //-----------------------------------------------------------
-
- NUM_ENCODINGS = 75, // Always keep this at the end. It is not a
- // valid Encoding enum, it is only used to
- // indicate the total number of Encodings.
-};
-
-} // End namespace CLD2
-
-#endif // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-
-