summaryrefslogtreecommitdiffstats
path: root/browser/components/translation/cld2/internal/lang_script.h
diff options
context:
space:
mode:
authorwolfbeast <mcwerewolf@gmail.com>2018-06-04 13:17:38 +0200
committerwolfbeast <mcwerewolf@gmail.com>2018-06-04 13:17:38 +0200
commita1be17c1cea81ebb1e8b131a662c698d78f3f7f2 (patch)
treea92f7de513be600cc07bac458183e9af40e00c06 /browser/components/translation/cld2/internal/lang_script.h
parentbf11fdd304898ac675e39b01b280d39550e419d0 (diff)
downloadUXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.gz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.lz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.tar.xz
UXP-a1be17c1cea81ebb1e8b131a662c698d78f3f7f2.zip
Issue #303 Part 1: Move basilisk files from /browser to /application/basilisk
Diffstat (limited to 'browser/components/translation/cld2/internal/lang_script.h')
-rw-r--r--browser/components/translation/cld2/internal/lang_script.h187
1 files changed, 0 insertions, 187 deletions
diff --git a/browser/components/translation/cld2/internal/lang_script.h b/browser/components/translation/cld2/internal/lang_script.h
deleted file mode 100644
index 9311707e4..000000000
--- a/browser/components/translation/cld2/internal/lang_script.h
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// File: lang_script.h
-// ================
-//
-// Author: dsites@google.com (Dick Sites)
-//
-// This file declares language and script numbers and names for CLD2,
-// plus routines that access side tables based on these
-//
-
-#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
-#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__
-
-#include "generated_language.h"
-#include "generated_ulscript.h"
-#include "integral_types.h"
-
-
-// NOTE: The script numbers and language numbers here are not guaranteed to be
-// stable. If you want to record a result for posterity, save the
-// ULScriptCode(ULScript ulscript) result as character strings.
-//
-// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
-// specified in an enum. Each script has a human-readable script name and a
-// 4-letter ISO 15924 script code. Each has a C name (largely for use by
-// programs that generate declarations in cld2_generated_scripts.h). Each
-// also has a recognition type
-// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
-//
-// The declarations for a particular version of Unicode are machine-generated in
-// generated_ulscript.h
-//
-// This file includes that one and declares the access routines. The type
-// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
-// which are not quite Unicode Scripts. In particular, the CJK scripts are
-// merged into a single number because CLD2 recognizes the CJK languages from
-// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
-// Katakana.
-
-// Each script has one of these four recognition types.
-// RTypeNone: There is no language associated with this script. In extended
-// language recognition calls, return a fake language number that maps to
-// xx-Cham, with literally "xx" for the language code, and with the script
-// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
-// RTypeOne: The script maps 1:1 to a single language. No letters are examined
-// during recognition and no lookups done.
-// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
-// is done to determine the languages involved.
-// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
-// languages involved.
-//
-// Note that the choice of recognition type is a function of script, not
-// language. In particular, some languages are recognized in multiple scripts
-// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
-// for example).
-
-namespace CLD2 {
-// Access routines over the CLD2 side tables for scripts (ULScript) and languages.
-//----------------------------------------------------------------------------//
-// Functions of ULScript //
-//----------------------------------------------------------------------------//
-
-// If the input is out of range or otherwise unrecognized, it is treated
-// as ULScript_Common (which never participates in language recognition)
-const char* ULScriptName(ULScript ulscript);
-const char* ULScriptCode(ULScript ulscript);
-const char* ULScriptDeclaredName(ULScript ulscript);
-ULScriptRType ULScriptRecognitionType(ULScript ulscript);
-
-// Name can be either full name or ISO code, or can be ISO code embedded in
-// a language-script combination such as "en-Latn-GB"
-ULScript GetULScriptFromName(const char* src);
-
-// Map script into Latin, Cyrillic, Arabic, Other
-int LScript4(ULScript ulscript);
-
-//----------------------------------------------------------------------------//
-// Functions of Language //
-//----------------------------------------------------------------------------//
-
-// The languages recognized by CLD2 are numbered almost arbitrarily,
-// specified in an enum. Each language has a human-readable language name and a
-// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
-// programs that generate declarations in cld2_generated_languages.h).
-// Each has a list of up to four scripts in which it is currently recognized.
-//
-// The declarations for a particular set of recognized languages are
-// machine-generated in
-// generated_language.h
-//
-// The Language enum is intended to match the internal Google Language enum
-// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
-// languages assigned above that. Over time, some languages may be renumbered
-// if they are moved into the Language enum.
-//
-// The Language enum includes the fake language numbers for RTypeNone above.
-//
-
-
-// If the input is out of range or otherwise unrecognized, it is treated
-// as UNKNOWN_LANGUAGE
-//
-// LanguageCode
-// ------------
-// Given the Language, return the language code, e.g. "ko"
-// This is determined by
-// the following (in order of preference):
-// - ISO-639-1 two-letter language code
-// (all except those mentioned below)
-// - ISO-639-2 three-letter bibliographic language code
-// (Tibetan, Dhivehi, Cherokee, Syriac)
-// - Google-specific language code
-// (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
-// Portuguese-Portugal, Portuguese-Brazil, Limbu)
-// - Fake RTypeNone names.
-
-const char* LanguageName(Language lang);
-const char* LanguageCode(Language lang);
-const char* LanguageShortCode(Language lang);
-const char* LanguageDeclaredName(Language lang);
-// Returns the n-th script in which lang is recognized (up to four per language).
-// n is in 0..3. Trailing entries are filled with
-// ULScript_Common (which never participates in language recognition)
-ULScript LanguageRecognizedScript(Language lang, int n);
-
-// Name can be either full name or ISO code, or can be ISO code embedded in
-// a language-script combination such as "en-Latn-GB"
-Language GetLanguageFromName(const char* src);
-
-// Returns which set of statistically-close languages lang is in. 0 means none.
-int LanguageCloseSet(Language lang);
-
-//----------------------------------------------------------------------------//
-// Functions of ULScript and Language //
-//----------------------------------------------------------------------------//
-
-// Most common language in each script
-Language DefaultLanguage(ULScript ulscript);
-
-// For RTypeMany recognition,
-// the CLD2 lookup tables are kept small by encoding a language into one byte.
-// To avoid limiting CLD2 to at most 256 languages, a larger range of external
-// Language numbers is mapped to a smaller range of per-script numbers. At
-// the moment (January 2013) the Latin script has about 90 languages to be
-// recognized, while all the other scripts total about 50 more languages. In
-// addition, the RTypeNone scripts map to about 100 fake languages.
-// So we map all Latin-script languages to one range of 1..255 per-script
-// numbers and map all the other RTypeMany languages to an overlapping range
-// 1..255 of per-script numbers.
-
-uint8 PerScriptNumber(ULScript ulscript, Language lang);
-Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number);
-
-// While the speed-sensitive processing deals with per-script language numbers,
-// there is also a need for slower-path handling of original language numbers
-// and unknown scripts, mostly for processing language hints.
-// These routines let one derive a script class from a bare language.
-// For languages written in multiple scripts, both of these can return true.
-
-bool IsLatnLanguage(Language lang);
-bool IsOthrLanguage(Language lang);
-
-
-//----------------------------------------------------------------------------//
-// Other //
-//----------------------------------------------------------------------------//
-// NOTE(review): lo/hi bound convention and not-found result not shown here; confirm.
-// Utility routine to search alphabetical tables
-int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
-
-} // namespace CLD2
-
-#endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__