diff options
Diffstat (limited to 'browser/components/translation/cld2/internal/lang_script.h')
-rw-r--r-- | browser/components/translation/cld2/internal/lang_script.h | 187 |
1 files changed, 0 insertions, 187 deletions
diff --git a/browser/components/translation/cld2/internal/lang_script.h b/browser/components/translation/cld2/internal/lang_script.h deleted file mode 100644 index 9311707e4..000000000 --- a/browser/components/translation/cld2/internal/lang_script.h +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// File: lang_script.h -// ================ -// -// Author: dsites@google.com (Dick Sites) -// -// This file declares language and script numbers and names for CLD2, -// plus routines that access side tables based on these -// - -#ifndef I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ -#define I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ - -#include "generated_language.h" -#include "generated_ulscript.h" -#include "integral_types.h" - - -// NOTE: The script numbers and language numbers here are not guaranteed to be -// stable. If you want to record a result for posterity, save the -// ULScriptCode(ULScript ulscript) result as character strings. -// -// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, -// specified in an enum. Each script has human-readable script name and a -// 4-letter ISO 15924 script code. Each has a C name (largely for use by -// programs that generate declarations in cld2_generated_scripts.h). Each -// also has a recognition type -// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK -// -// The declarations for a particular version of Unicode are machine-generated in -// generated_scripts.h -// -// This file includes that one and declares the access routines. The type -// involved is called "ULScript" to signify Unicode Letters-Marks Scripts, -// which are not quite Unicode Scripts. In particular, the CJK scripts are -// merged into a single number because CLD2 recognizes the CJK languages from -// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and -// Katakana. - -// Each script has one of these four recognition types. -// RTypeNone: There is no language associated with this script. In extended -// language recognition calls, return a fake language number that maps to -// xx-Cham, with literally "xx" for the language code,and with the script -// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. -// RTypeOne: The script maps 1:1 to a single language. No letters are examined -// during recognition and no lookups done. -// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring -// is done to determine the languages involved. -// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the -// languages involved. -// -// Note that the choice of recognition type is a function of script, not -// language. In particular, some languges are recognized in multiple scripts -// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong -// for example). - -namespace CLD2 { - -//----------------------------------------------------------------------------// -// Functions of ULScript // -//----------------------------------------------------------------------------// - -// If the input is out of range or otherwise unrecognized, it is treated -// as ULScript_Common (which never participates in language recognition) -const char* ULScriptName(ULScript ulscript); -const char* ULScriptCode(ULScript ulscript); -const char* ULScriptDeclaredName(ULScript ulscript); -ULScriptRType ULScriptRecognitionType(ULScript ulscript); - -// Name can be either full name or ISO code, or can be ISO code embedded in -// a language-script combination such as "en-Latn-GB" -ULScript GetULScriptFromName(const char* src); - -// Map script into Latin, Cyrillic, Arabic, Other -int LScript4(ULScript ulscript); - -//----------------------------------------------------------------------------// -// Functions of Language // -//----------------------------------------------------------------------------// - -// The languages recognized by CLD2 are numbered almost arbitrarily, -// specified in an enum. Each language has human-readable language name and a -// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by -// programs that generate declarations in cld2_generated_languagess.h). -// Each has a list of up to four scripts in which it is currently recognized. -// -// The declarations for a particular set of recognized languages are -// machine-generated in -// generated_languages.h -// -// The Language enum is intended to match the internal Google Language enum -// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional -// languages assigned above that. Over time, some languages may be renumbered -// if they are moved into the Language enum. -// -// The Language enum includes the fake language numbers for RTypeNone above. -// - - -// If the input is out of range or otherwise unrecognized, it is treated -// as UNKNOWN_LANGUAGE -// -// LanguageCode -// ------------ -// Given the Language, return the language code, e.g. "ko" -// This is determined by -// the following (in order of preference): -// - ISO-639-1 two-letter language code -// (all except those mentioned below) -// - ISO-639-2 three-letter bibliographic language code -// (Tibetan, Dhivehi, Cherokee, Syriac) -// - Google-specific language code -// (ChineseT ("zh-TW"), Teragram Unknown, Unknown, -// Portuguese-Portugal, Portuguese-Brazil, Limbu) -// - Fake RTypeNone names. - -const char* LanguageName(Language lang); -const char* LanguageCode(Language lang); -const char* LanguageShortCode(Language lang); -const char* LanguageDeclaredName(Language lang); - -// n is in 0..3. Trailing entries are filled with -// ULScript_Common (which never participates in language recognition) -ULScript LanguageRecognizedScript(Language lang, int n); - -// Name can be either full name or ISO code, or can be ISO code embedded in -// a language-script combination such as "en-Latn-GB" -Language GetLanguageFromName(const char* src); - -// Returns which set of statistically-close languages lang is in. 0 means none. -int LanguageCloseSet(Language lang); - -//----------------------------------------------------------------------------// -// Functions of ULScript and Language // -//----------------------------------------------------------------------------// - -// Most common language in each script -Language DefaultLanguage(ULScript ulscript); - -// For RTypeMany recognition, -// the CLD2 lookup tables are kept small by encoding a language into one byte. -// To avoid limiting CLD2 to at most 256 languages, a larger range of external -// Language numbers is mapped to a smaller range of per-script numbers. At -// the moment (January 2013) the Latin script has about 90 languages to be -// recognized, while all the other scripts total about 50 more languages. In -// addition, the RTypeNone scripts map to about 100 fake languages. -// So we map all Latin-script languages to one range of 1..255 per-script -// numbers and map all the other RTypeMany languages to an overlapping range -// 1..255 of per-script numbers. - -uint8 PerScriptNumber(ULScript ulscript, Language lang); -Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number); - -// While the speed-sensitive processing deals with per-script language numbers, -// there is a need for low-performance dealing with original language numbers -// and unknown scripts, mostly for processing language hints. -// These routines let one derive a script class from a bare language. -// For languages written in multiple scripts, both of these can return true. - -bool IsLatnLanguage(Language lang); -bool IsOthrLanguage(Language lang); - - -//----------------------------------------------------------------------------// -// Other // -//----------------------------------------------------------------------------// - -// Utility routine to search alphabetical tables -int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair); - -} // namespace CLD2 - -#endif // I18N_ENCODINGS_CLD2_LANG_SCRIPT_H__ |