Issue #303 Part 1: Move basilisk files from /browser to /application/basilisk

author: wolfbeast <mcwerewolf@gmail.com> 2018-06-04 13:17:38 +0200
committer: wolfbeast <mcwerewolf@gmail.com> 2018-06-04 13:17:38 +0200
commit: a1be17c1cea81ebb1e8b131a662c698d78f3f7f2 (patch)
tree: a92f7de513be600cc07bac458183e9af40e00c06 /browser/components/translation/cld2/public
parent: bf11fdd304898ac675e39b01b280d39550e419d0 (diff)
download: UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.gz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.lz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.xz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.zip
2 files changed, 0 insertions, 489 deletions
diff --git a/browser/components/translation/cld2/public/compact_lang_det.h b/browser/components/translation/cld2/public/compact_lang_det.h
deleted file mode 100644
index da59abd63..000000000
--- a/browser/components/translation/cld2/public/compact_lang_det.h
+++ /dev/null
@@ -1,320 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Author: dsites@google.com (Dick Sites)
-//
-
-// NOTE:
-// Baybayin (ancient script of the Philippines) is detected as TAGALOG.
-// Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
-// HAITIAN_CREOLE is detected as such.
-// NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
-// PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
-// ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as ROMANIAN.
-// BOSNIAN is not detected as such, but likely scores as Croatian or Serbian.
-// MONTENEGRIN is not detected as such, but likely scores as Serbian.
-// CROATIAN is detected in the Latin script
-// SERBIAN is detected in the Cyrillic and Latin scripts
-// Zhuang is detected in the Latin script only.
-//
-// The languages X_PIG_LATIN and X_KLINGON are detected in the
-//  extended calls ExtDetectLanguageSummary().
-//
-// UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
-//  is high enough. This happens with non-text input such as the bytes of a
-//  JPEG, and also with text in languages outside the training set.
-//
-// The following languages are to be detected in multiple scripts:
-//  AZERBAIJANI (Latin, Cyrillic*, Arabic*)
-//  BURMESE (Latin, Myanmar)
-//  HAUSA (Latin, Arabic)
-//  KASHMIRI (Arabic, Devanagari)
-//  KAZAKH (Latin, Cyrillic, Arabic)
-//  KURDISH (Latin*, Arabic)
-//  KYRGYZ (Cyrillic, Arabic)
-//  LIMBU (Devanagari, Limbu)
-//  MONGOLIAN (Cyrillic, Mongolian)
-//  SANSKRIT (Latin, Devanagari)
-//  SINDHI (Arabic, Devanagari)
-//  TAGALOG (Latin, Tagalog)
-//  TAJIK (Cyrillic, Arabic*)
-//  TATAR (Latin, Cyrillic, Arabic)
-//  TURKMEN (Latin, Cyrillic, Arabic)
-//  UIGHUR (Latin, Cyrillic, Arabic)
-//  UZBEK (Latin, Cyrillic, Arabic)
-//
-// * Due to a shortage of training text, AZERBAIJANI is not currently detected
-//   in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
-//   Arabic script.
-//
-
-#ifndef I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
-#define I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
-
-#include <vector>
-#include "../internal/lang_script.h"  // For Language
-
-namespace CLD2 {
-
-  // Scan interchange-valid UTF-8 bytes and detect most likely language,
-  // or set of languages.
-  //
-  // Design goals:
-  //   Skip over big stretches of HTML tags
-  //   Able to return ranges of different languages
-  //   Relatively small tables and relatively fast processing
-  //   Thread safe
-  //
-  // For HTML documents, tags are skipped, along with <script> ... </script>
-  // and <style> ... </style> sequences, and entities are expanded.
-  //
-  // We distinguish between bytes of the raw input buffer and bytes of non-tag
-  // text letters. Since tags can be over 50% of the bytes of an HTML Page,
-  // and are nearly all seven-bit ASCII English, we prefer to distinguish
-  // language mixture fractions based on just the non-tag text.
-  //
-  // Inputs: text and text_length
-  //  Code skips HTML tags and expands HTML entities, unless
-  //  is_plain_text is true
-  // Outputs:
-  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
-  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
-  //  text_bytes is the amount of non-tag/letters-only text found
-  //  is_reliable set true if the returned Language is some amount more
-  //   probable than the second-best Language. Calculation is a complex function
-  //   of the length of the text and the different-script runs of text.
-  // Return value: the most likely Language for the majority of the input text
-  //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
-  //  defaults to ENGLISH.
-  //
-  // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
-  // backwards compatibility with a different detector.
-  //
-  // The third version may return UNKNOWN_LANGUAGE, and also returns extended
-  // language codes from lang_script.h
-  //
-
-
-  // Instead of individual arguments, pass in hints as an initialized struct
-  // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
-  //
-  // Pass in hints whenever possible; doing so improves detection accuracy. The
-  // set of passed-in hints are all information that is external to the text
-  // itself.
-  //
-  // The content_language_hint is intended to come from an HTTP header
-  // Content-Language: field, the tld_hint from the hostname of a URL, the
-  // encoding-hint from an encoding detector applied to the input
-  // document, and the language hint from any other context you might have.
-  // The lang= tags inside an HTML document will be picked up as hints
-  // by code within the compact language detector.
-
-  typedef struct {
-    const char* content_language_hint;      // "mi,en" boosts Maori and English
-    const char* tld_hint;                   // "id" boosts Indonesian
-    int encoding_hint;                      // SJS boosts Japanese
-    Language language_hint;                 // ITALIAN boosts it
-  } CLDHints;
-
-  static const int kMaxResultChunkBytes = 65535;
-
-  // For returning a vector of per-language pieces of the input buffer
-  // Unreliable and too-short are mapped to UNKNOWN_LANGUAGE
-  typedef struct {
-    int offset;                 // Starting byte offset in original buffer
-    uint16 bytes;               // Number of bytes in chunk
-    uint16 lang1;               // Top lang, as full Language. Apply
-                                // static_cast<Language>() to this short value.
-  } ResultChunk;
-  typedef std::vector<ResultChunk> ResultChunkVector;
-
-
-  // Scan interchange-valid UTF-8 bytes and detect most likely language
-  Language DetectLanguage(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          bool* is_reliable);
-
-  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
-  // language3[0] is usually also the return value
-  Language DetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          Language* language3,
-                          int* percent3,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-  // Same as above, with hints supplied
-  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
-  // language3[0] is usually also the return value
-  Language DetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          const char* tld_hint,       // "id" boosts Indonesian
-                          int encoding_hint,          // SJS boosts Japanese
-                          Language language_hint,     // ITALIAN boosts it
-                          Language* language3,
-                          int* percent3,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
-  // languages.
-  //
-  // Extended languages are additional interface languages and Unicode
-  // single-language scripts, from lang_script.h
-  //
-  // language3[0] is usually also the return value
-  Language ExtDetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          Language* language3,
-                          int* percent3,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-  // Same as above, with hints supplied
-  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
-  // languages.
-  //
-  // Extended languages are additional Google interface languages and Unicode
-  // single-language scripts, from lang_script.h
-  //
-  // language3[0] is usually also the return value
-  Language ExtDetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          const char* tld_hint,       // "id" boosts Indonesian
-                          int encoding_hint,          // SJS boosts Japanese
-                          Language language_hint,     // ITALIAN boosts it
-                          Language* language3,
-                          int* percent3,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-  // Same as above, and also returns 3 internal language scores as a ratio to
-  // normal score for real text in that language. Scores close to 1.0 indicate
-  // normal text, while scores far away from 1.0 indicate badly-skewed text or
-  // gibberish
-  //
-  Language ExtDetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          const char* tld_hint,       // "id" boosts Indonesian
-                          int encoding_hint,          // SJS boosts Japanese
-                          Language language_hint,     // ITALIAN boosts it
-                          Language* language3,
-                          int* percent3,
-                          double* normalized_score3,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-
-  // Use this one.
-  // Hints are collected into a struct.
-  // Flags are passed in (normally zero).
-  //
-  // Also returns 3 internal language scores as a ratio to
-  // normal score for real text in that language. Scores close to 1.0 indicate
-  // normal text, while scores far away from 1.0 indicate badly-skewed text or
-  // gibberish
-  //
-  // Returns a vector of chunks in different languages, so that caller may
-  // spell-check, translate, or otherwise process different parts of the input
-  // buffer in language-dependent ways.
-  //
-  Language ExtDetectLanguageSummary(
-                          const char* buffer,
-                          int buffer_length,
-                          bool is_plain_text,
-                          const CLDHints* cld_hints,
-                          int flags,
-                          Language* language3,
-                          int* percent3,
-                          double* normalized_score3,
-                          ResultChunkVector* resultchunkvector,
-                          int* text_bytes,
-                          bool* is_reliable);
-
-  // Return version text string
-  // String is "code_version - data_build_date"
-  const char* DetectLanguageVersion();
-
-
-  // Public use flags, debug output controls
-  static const int kCLDFlagScoreAsQuads = 0x0100;  // Force Greek, etc. => quads
-  static const int kCLDFlagHtml =         0x0200;  // Debug HTML => stderr
-  static const int kCLDFlagCr =           0x0400;  // <cr> per chunk if HTML
-  static const int kCLDFlagVerbose =      0x0800;  // More debug HTML => stderr
-  static const int kCLDFlagQuiet =        0x1000;  // Less debug HTML => stderr
-  static const int kCLDFlagEcho =         0x2000;  // Echo input => stderr
-
-
-/***
-
-Flag meanings:
- kCLDFlagScoreAsQuads
-   Normally, several languages are detected solely by their Unicode script.
-   Combined with appropriate lookup tables, this flag forces them instead
-   to be detected via quadgrams. This can be a useful refinement when looking
-   for meaningful text in these languages, instead of just character sets.
-   The default tables do not support this use.
- kCLDFlagHtml
-   For each detection call, write an HTML file to stderr, showing the text
-   chunks and their detected languages.
- kCLDFlagCr
-   In that HTML file, force a new line for each chunk.
- kCLDFlagVerbose
-   In that HTML file, show every lookup entry.
- kCLDFlagQuiet
-   In that HTML file, suppress most of the output detail.
- kCLDFlagEcho
-  Echo every input buffer to stderr.
-***/
-
-// Debug output: Print the resultchunkvector to file f
-void DumpResultChunkVector(FILE* f, const char* src,
-                           ResultChunkVector* resultchunkvector);
-
-#ifdef CLD2_DYNAMIC_MODE
-
-// If compiled with dynamic mode, load data from the specified file location.
-// If other data has already been loaded, it is discarded and the data is read
-// in from the specified file location again (even if the file has not changed).
-// WARNING: Before calling this method, language detection will always fail
-// and will always return the unknown language.
-void loadData(const char* fileName);
-
-// If compiled with dynamic mode, unload the previously-loaded data.
-// WARNING: After calling this method, language detection will no longer work
-// and will always return the unknown language.
-void unloadData();
-
-// Returns true if and only if data has been loaded via a call to loadData(...)
-// and has not been subsequently unloaded via a call to unloadData().
-bool isDataLoaded();
-
-#endif // #ifdef CLD2_DYNAMIC_MODE
-
-};      // End namespace CLD2
-
-#endif  // I18N_ENCODINGS_CLD2_PUBLIC_COMPACT_LANG_DET_H_
diff --git a/browser/components/translation/cld2/public/encodings.h b/browser/components/translation/cld2/public/encodings.h
deleted file mode 100644
index 1eb8f0a15..000000000
--- a/browser/components/translation/cld2/public/encodings.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Author: dsites@google.com (Dick Sites)
-//
-
-#ifndef I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-#define I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-
-namespace CLD2 {
-
-enum Encoding {
-  ISO_8859_1           =  0,  //   ASCII
-  ISO_8859_2           =  1,  //   Latin2
-  ISO_8859_3           =  2,  //
-  ISO_8859_4           =  3,  //   Latin4
-  ISO_8859_5           =  4,  //   ISO-8859-5
-  ISO_8859_6           =  5,  //   Arabic
-  ISO_8859_7           =  6,  //   Greek
-  ISO_8859_8           =  7,  //   Hebrew
-  ISO_8859_9           =  8,  //
-  ISO_8859_10          =  9,  //
-  JAPANESE_EUC_JP      = 10,  //   EUC_JP
-  JAPANESE_SHIFT_JIS   = 11,  //   SJS
-  JAPANESE_JIS         = 12,  //   JIS
-  CHINESE_BIG5         = 13,  //   BIG5
-  CHINESE_GB           = 14,  //   GB
-  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
-                              // CNS11643EUC, before that   EUC-CN(!)
-  KOREAN_EUC_KR        = 16,  //   KSC
-  UNICODE_UNUSED       = 17,  //   Unicode
-  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was
-                              // CNS11643EUC, before that   EUC.
-  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was
-                              // CNS11643EUC, before that   CNS.
-  CHINESE_BIG5_CP950   = 20,  //   BIG5_CP950
-  JAPANESE_CP932       = 21,  //   CP932
-  UTF8                 = 22,
-  UNKNOWN_ENCODING     = 23,
-  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
-  RUSSIAN_KOI8_R       = 25,  //   KOI8R
-  RUSSIAN_CP1251       = 26,  //   CP1251
-
-  //----------------------------------------------------------
-  MSFT_CP1252          = 27,  // 27: CP1252 aka MSFT euro ascii
-  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
-                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
-                              // KOI8-U is used much more often than KOI8-RU.
-  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
-  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
-  //----------------------------------------------------------
-
-  //----------------------------------------------------------
-  MSFT_CP1254          = 31,  // used for Turkish
-  MSFT_CP1257          = 32,  // used in Baltic countries
-  //----------------------------------------------------------
-
-  //----------------------------------------------------------
-  //----------------------------------------------------------
-  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
-  MSFT_CP874           = 34,  // used for Thai
-  MSFT_CP1256          = 35,  // used for Arabic
-
-  //----------------------------------------------------------
-  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
-  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
-  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
-  //----------------------------------------------------------
-
-  //----------------------------------------------------------
-  CZECH_CP852          = 39,
-  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
-  MSFT_CP1253          = 41,  // used for Greek
-  RUSSIAN_CP866        = 42,
-  //----------------------------------------------------------
-
-  //----------------------------------------------------------
-  // Handled by iconv in glibc
-  ISO_8859_13          = 43,
-  ISO_2022_KR          = 44,
-  GBK                  = 45,
-  GB18030              = 46,
-  BIG5_HKSCS           = 47,
-  ISO_2022_CN          = 48,
-
-  //-----------------------------------------------------------
-  // Following 4 encodings are deprecated (font encodings)
-  TSCII                = 49,
-  TAMIL_MONO           = 50,
-  TAMIL_BI             = 51,
-  JAGRAN               = 52,
-
-
-  MACINTOSH_ROMAN      = 53,
-  UTF7                 = 54,
-
-  //-----------------------------------------------------------
-  // Following 2 encodings are deprecated (font encodings)
-  BHASKAR              = 55,  // Indic encoding - Devanagari
-  HTCHANAKYA           = 56,  // 56 Indic encoding - Devanagari
-
-  //-----------------------------------------------------------
-  UTF16BE              = 57,  // big-endian UTF-16
-  UTF16LE              = 58,  // little-endian UTF-16
-  UTF32BE              = 59,  // big-endian UTF-32
-  UTF32LE              = 60,  // little-endian UTF-32
-  //-----------------------------------------------------------
-
-  //-----------------------------------------------------------
-  // An encoding that means "This is not text, but it may have some
-  // simple ASCII text embedded". Intended input conversion
-  // is to keep strings of >=4 seven-bit ASCII characters
-  BINARYENC            = 61,
-  //-----------------------------------------------------------
-
-  //-----------------------------------------------------------
-  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
-  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
-  HZ_GB_2312           = 62,
-  //-----------------------------------------------------------
-
-  //-----------------------------------------------------------
-  // Some external vendors make the common input error of
-  // converting MSFT_CP1252 to UTF8 *twice*.
-  UTF8UTF8             = 63,
-  //-----------------------------------------------------------
-
-  //-----------------------------------------------------------
-  // Following 6 encodings are deprecated (font encodings)
-  TAM_ELANGO           = 64,  // Elango - Tamil
-  TAM_LTTMBARANI       = 65,  // Barani - Tamil
-  TAM_SHREE            = 66,  // Shree - Tamil
-  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
-  TAM_TMNEWS           = 68,  // TMNews - Tamil
-  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
-  //-----------------------------------------------------------
-
-  //-----------------------------------------------------------
-  // Shift_JIS variants used by Japanese cell phone carriers.
-  KDDI_SHIFT_JIS       = 70,
-  DOCOMO_SHIFT_JIS     = 71,
-  SOFTBANK_SHIFT_JIS   = 72,
-  // ISO-2022-JP variants used by KDDI and SoftBank.
-  KDDI_ISO_2022_JP     = 73,
-  SOFTBANK_ISO_2022_JP = 74,
-  //-----------------------------------------------------------
-
-  NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
-                              // valid Encoding enum, it is only used to
-                              // indicate the total number of Encodings.
-};
-
-}       // End namespace CLD2
-
-#endif  // I18N_ENCODINGS_CLD2_PUBLIC_ENCODINGS_H__
-
-
author	wolfbeast <mcwerewolf@gmail.com>	2018-06-04 13:17:38 +0200
committer	wolfbeast <mcwerewolf@gmail.com>	2018-06-04 13:17:38 +0200
commit	a1be17c1cea81ebb1e8b131a662c698d78f3f7f2 (patch)
tree	a92f7de513be600cc07bac458183e9af40e00c06 /browser/components/translation/cld2/public
parent	bf11fdd304898ac675e39b01b280d39550e419d0 (diff)
download	UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.gz UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.lz UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.xz UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.zip