Add Basilisk

author: Matt A. Tobin <email@mattatobin.com> 2018-02-02 03:32:58 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2018-02-02 03:32:58 -0500
commit: e72ef92b5bdc43cd2584198e2e54e951b70299e8 (patch)
tree: 01ceb4a897c33eca9e7ccf2bc3aefbe530169fe5 /application/basilisk/components/translation/cld2/internal/lang_script.cc
parent: 0d19b77d3eaa5b8d837bf52c19759e68e42a1c4c (diff)
download: UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.gz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.lz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.xz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.zip
1 files changed, 560 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/lang_script.cc b/application/basilisk/components/translation/cld2/internal/lang_script.cc
new file mode 100644
index 000000000..10fc4557e
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/lang_script.cc
@@ -0,0 +1,560 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// File: lang_script.cc
+// ================
+//
+// Author: dsites@google.com (Dick Sites)
+//
+// This file declares language and script numbers and names for CLD2
+//
+
+#include "lang_script.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "generated_language.h"
+#include "generated_ulscript.h"
+
+namespace CLD2 {
+
+// Language tables
+// Subscripted by enum Language
+extern const int kLanguageToNameSize;
+extern const char* const kLanguageToName[];
+extern const int kLanguageToCodeSize;
+extern const char* const kLanguageToCode[];
+extern const int kLanguageToCNameSize;
+extern const char* const kLanguageToCName[];
+extern const int kLanguageToScriptsSize;
+extern const FourScripts kLanguageToScripts[];
+
+// Subscripted by Language
+extern const int kLanguageToPLangSize;
+extern const uint8 kLanguageToPLang[];
+// Subscripted by per-script language
+extern const uint16 kPLangToLanguageLatn[];
+extern const uint16 kPLangToLanguageOthr[];
+
+// Alphabetical order for binary search
+extern const int kNameToLanguageSize;
+extern const CharIntPair kNameToLanguage[];
+extern const int kCodeToLanguageSize;
+extern const CharIntPair kCodeToLanguage[];
+
+// ULScript tables
+// Subscripted by enum ULScript
+extern const int kULScriptToNameSize;
+extern const char* const kULScriptToName[];
+extern const int kULScriptToCodeSize;
+extern const char* const kULScriptToCode[];
+extern const int kULScriptToCNameSize;
+extern const char* const kULScriptToCName[];
+extern const int kULScriptToRtypeSize;
+extern const ULScriptRType kULScriptToRtype[];
+extern const int kULScriptToDefaultLangSize;
+extern const Language kULScriptToDefaultLang[];
+
+// Alphabetical order for binary search
+extern const int kNameToULScriptSize;
+extern const CharIntPair kNameToULScript[];
+extern const int kCodeToULScriptSize;
+extern const CharIntPair kCodeToULScript[];
+
+
+//
+// File: lang_script.h
+// ================
+//
+// Author: dsites@google.com (Dick Sites)
+//
+// This file declares language and script numbers and names for CLD2
+//
+
+
+// NOTE: The script numbers and language numbers here are not guaranteed to be
+// stable. If you want to record a result for posterity, save the ISO codes
+// as character strings.
+//
+//
+// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
+// specified in an enum. Each script has human-readable script name and a
+// 4-letter ISO 15924 script code. Each has a C name (largely for use by
+// programs that generate declarations in cld2_generated_scripts.h). Each
+// also has a recognition type
+//  r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
+//
+// The declarations for a particular version of Unicode are machine-generated in
+//   cld2_generated_scripts.h
+//
+// This file includes that one and declares the access routines. The type
+// involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
+// which are not quite Unicode Scripts. In particular, the CJK scripts are
+// merged into a single number because CLD2 recognizes the CJK languages from
+// four scripts intermixed: Hani (both Hans  and Hant), Hangul, Hiragana, and
+// Katakana.
+
+// Each script has one of these four recognition types.
+// RTypeNone: There is no language associated with this script. In extended
+//  language recognition calls, return a fake language number that maps to
+//  xx-Cham, with literally "xx" for the language code,and with the script
+//  code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
+// RTypeOne: The script maps 1:1 to a single language. No letters are examined
+//  during recognition and no lookups done.
+// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
+//  is done to determine the languages involved.
+// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
+//  languages involved.
+//
+// Note that the choice of recognition type is a function of script, not
+// language. In particular, some languges are recognized in multiple scripts
+// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
+// for example).
+
+//----------------------------------------------------------------------------//
+// Functions of ULScript                                                      //
+//----------------------------------------------------------------------------//
+
+// If the input is out of range or otherwise unrecognized, it is treated
+// as UNKNOWN_ULSCRIPT (which never participates in language recognition)
+const char* ULScriptName(ULScript ulscript) {
+  int i_ulscript = ulscript;
+  if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  return kULScriptToName[i_ulscript];
+}
+
+const char* ULScriptCode(ULScript ulscript) {
+  int i_ulscript = ulscript;
+  if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  return kULScriptToCode[i_ulscript];
+}
+
+const char* ULScriptDeclaredName(ULScript ulscript) {
+  int i_ulscript = ulscript;
+  if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  return kULScriptToCName[i_ulscript];
+}
+
+ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
+  int i_ulscript = ulscript;
+  if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
+  return kULScriptToRtype[i_ulscript];
+}
+
+
+
+// The languages recognized by CLD2 are numbered almost arbitrarily,
+// specified in an enum. Each language has human-readable language name and a
+// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
+// programs that generate declarations in cld2_generated_languagess.h).
+// Each has a list of up to four scripts in which it is currently recognized.
+//
+// The declarations for a particular set of recognized languages are
+// machine-generated in
+//   cld2_generated_languages.h
+//
+// The Language enum is intended to match the internal Google Language enum
+// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
+// languages assigned above that. Over time, some languages may be renumbered
+// if they are moved into the Language enum.
+//
+// The Language enum includes the fake language numbers for RTypeNone above.
+//
+// In an open-source environment, the Google-specific Language enum is not
+// available. Language decouples the two environments while maintaining
+// internal compatibility.
+
+
+// If the input is out of range or otherwise unrecognized, it is treated
+// as UNKNOWN_LANGUAGE
+//
+// LanguageCode
+// ------------
+// Given the Language, return the language code, e.g. "ko"
+// This is determined by
+// the following (in order of preference):
+// - ISO-639-1 two-letter language code
+//   (all except those mentioned below)
+// - ISO-639-2 three-letter bibliographic language code
+//   (Tibetan, Dhivehi, Cherokee, Syriac)
+// - Google-specific language code
+//   (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
+//   Portuguese-Portugal, Portuguese-Brazil, Limbu)
+// - Fake RTypeNone names.
+
+//----------------------------------------------------------------------------//
+// Functions of Language                                                      //
+//----------------------------------------------------------------------------//
+
+const char* LanguageName(Language lang) {
+  int i_lang = lang;
+  if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
+  if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
+  return kLanguageToName[i_lang];
+}
+const char* LanguageCode(Language lang) {
+  int i_lang = lang;
+  if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
+  if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
+  return kLanguageToCode[i_lang];
+}
+
+const char* LanguageDeclaredName(Language lang) {
+  int i_lang = lang;
+  if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
+  if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
+  return kLanguageToCName[i_lang];
+}
+
+// n is in 0..3. Trailing entries are filled with
+// UNKNOWN_LANGUAGE (which never participates in language recognition)
+ULScript LanguageRecognizedScript(Language lang, int n) {
+  int i_lang = lang;
+  if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
+  if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
+  return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
+}
+
+// Given the Language, returns its string name used as the output by
+// the lang/enc identifier, e.g. "Korean"
+// "invalid_language" if the input is invalid.
+// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
+// used to subtract out HTML, link farms, DNA strings, and alittle English porn
+const char* ExtLanguageName(const Language lang) {
+  return LanguageName(lang);
+}
+
+// Given the Language, return the language code, e.g. "ko"
+const char* ExtLanguageCode(const Language lang) {
+  return LanguageCode(lang);
+}
+
+
+// Given the Language, returns its Language enum spelling, for use by
+// programs that create C declarations, e.g. "KOREAN"
+// "UNKNOWN_LANGUAGE" if the input is invalid.
+const char* ExtLanguageDeclaredName(const Language lang) {
+  return LanguageDeclaredName(lang);
+}
+
+
+extern const int kCloseSetSize = 10;
+
+// Returns which set of statistically-close languages lang is in. 0 means none.
+int LanguageCloseSet(Language lang) {
+  // Scaffolding
+  // id ms         # INDONESIAN MALAY coef=0.4698    Problematic w/o extra words
+  // bo dz         # TIBETAN DZONGKHA coef=0.4571
+  // cs sk         # CZECH SLOVAK coef=0.4273
+  // zu xh         # ZULU XHOSA coef=0.3716
+  //
+  // bs hr sr srm  # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
+  // hi mr bh ne   # HINDI MARATHI BIHARI NEPALI
+  // no nn da      # NORWEGIAN NORWEGIAN_N DANISH
+  // gl es pt      # GALICIAN SPANISH PORTUGUESE
+  // rw rn         # KINYARWANDA RUNDI
+
+  if (lang == INDONESIAN) {return 1;}
+  if (lang == MALAY) {return 1;}
+
+  if (lang == TIBETAN) {return 2;}
+  if (lang == DZONGKHA) {return 2;}
+
+  if (lang == CZECH) {return 3;}
+  if (lang == SLOVAK) {return 3;}
+
+  if (lang == ZULU) {return 4;}
+  if (lang == XHOSA) {return 4;}
+
+  if (lang == BOSNIAN) {return 5;}
+  if (lang == CROATIAN) {return 5;}
+  if (lang == SERBIAN) {return 5;}
+  if (lang == MONTENEGRIN) {return 5;}
+
+  if (lang == HINDI) {return 6;}
+  if (lang == MARATHI) {return 6;}
+  if (lang == BIHARI) {return 6;}
+  if (lang == NEPALI) {return 6;}
+
+  if (lang == NORWEGIAN) {return 7;}
+  if (lang == NORWEGIAN_N) {return 7;}
+  if (lang == DANISH) {return 7;}
+
+  if (lang == GALICIAN) {return 8;}
+  if (lang == SPANISH) {return 8;}
+  if (lang == PORTUGUESE) {return 8;}
+
+  if (lang == KINYARWANDA) {return 9;}
+  if (lang == RUNDI) {return 9;}
+
+  return 0;
+}
+
+//----------------------------------------------------------------------------//
+// Functions of ULScript and Language                                         //
+//----------------------------------------------------------------------------//
+
+Language DefaultLanguage(ULScript ulscript) {
+  if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
+  if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
+  return kULScriptToDefaultLang[ulscript];
+}
+
+uint8 PerScriptNumber(ULScript ulscript, Language lang) {
+  if (ulscript < 0) {return 0;}
+  if (ulscript >= NUM_ULSCRIPTS) {return 0;}
+  if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
+  if (lang >= kLanguageToPLangSize) {return 0;}
+  return kLanguageToPLang[lang];
+}
+
+Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
+  if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
+  if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
+  if ((kULScriptToRtype[ulscript] == RTypeNone) ||
+      (kULScriptToRtype[ulscript] == RTypeOne)) {
+    return kULScriptToDefaultLang[ulscript];
+  }
+
+  if (ulscript == ULScript_Latin) {
+     return static_cast<Language>(kPLangToLanguageLatn[perscript_number]);
+  } else {
+     return static_cast<Language>(kPLangToLanguageOthr[perscript_number]);
+  }
+}
+
+// Return true if language can be in the Latin script
+bool IsLatnLanguage(Language lang) {
+  if (lang >= kLanguageToPLangSize) {return false;}
+  return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]);
+}
+
+// Return true if language can be in a non-Latin script
+bool IsOthrLanguage(Language lang) {
+  if (lang >= kLanguageToPLangSize) {return false;}
+  return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]);
+}
+
+
+//----------------------------------------------------------------------------//
+// Other                                                                      //
+//----------------------------------------------------------------------------//
+
+// Returns mid if key found in lo <= mid < hi, else -1
+int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
+  // binary search
+  while (lo < hi) {
+    int mid = (lo + hi) >> 1;
+    if (strcmp(key, cipair[mid].s) < 0) {
+      hi = mid;
+    } else if (strcmp(key, cipair[mid].s) > 0) {
+      lo = mid + 1;
+    } else {
+      return mid;
+    }
+  }
+  return -1;
+}
+
+Language MakeLang(int i) {return static_cast<Language>(i);}
+
+// Name can be either full name or ISO code, or can be ISO code embedded in
+// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
+Language GetLanguageFromName(const char* src) {
+  const char* hyphen1 = strchr(src, '-');
+  const char* hyphen2 = NULL;
+  if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
+
+  int match = -1;
+  if (hyphen1 == NULL) {
+    // Bare name. Look at full name, then code
+    match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
+    if (match >= 0) {return MakeLang(kNameToLanguage[match].i);}    // aa
+    match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
+    if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa
+    return UNKNOWN_LANGUAGE;
+  }
+
+  if (hyphen2 == NULL) {
+    // aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
+    match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
+    if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa-bb
+
+    int len = strlen(src);
+    if (len >= 16) {return UNKNOWN_LANGUAGE;}   // Real codes are shorter
+
+    char temp[16];
+    int hyphen1_offset = hyphen1 - src;
+    // Take off part after hyphen1
+    memcpy(temp, src, len);
+    temp[hyphen1_offset] = '\0';
+    match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
+    if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa
+
+    return UNKNOWN_LANGUAGE;
+  }
+
+  // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
+  match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
+  if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa-bb-cc
+
+
+  int len = strlen(src);
+  if (len >= 16) {return UNKNOWN_LANGUAGE;}   // Real codes are shorter
+
+  char temp[16];
+  int hyphen1_offset = hyphen1 - src;
+  int hyphen2_offset = hyphen2 - src;
+  // Take off part after hyphen2
+  memcpy(temp, src, len);
+  temp[hyphen2_offset] = '\0';
+  match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
+  if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa-bb
+
+
+  // Take off part between hyphen1 and hyphen2
+  int len2 = len - hyphen2_offset;
+  memcpy(temp, src, len);
+  memcpy(&temp[hyphen1_offset], hyphen2, len2);
+  temp[hyphen1_offset + len2] = '\0';
+  match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
+  if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa-cc
+
+
+  // Take off everything after hyphen1
+  memcpy(temp, src, len);
+  temp[hyphen1_offset] = '\0';
+  match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
+  if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);}    // aa
+
+
+  return UNKNOWN_LANGUAGE;
+}
+
+
+// Name can be either full name or ISO code, or can be ISO code embedded in
+// a language-script combination such as "en-Latn-GB"
+// MORE WORK to do here. also kLanguageToScripts [4] is bogus
+// if bare language name, no script, want  zh, ja, ko to Hani, pt to Latn, etc.
+// Something like map code to Language, then Language to kLanguageToScripts[x][0]
+// ADD BIAS: kLanguageToScripts lists default script first
+// If total mismatch, reutrn Latn
+//   if (strcmp(src, "nd") == 0) {return NDEBELE;}         // [nd was wrong]
+//   if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
+
+ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
+
+ULScript GetULScriptFromName(const char* src) {
+  const char* hyphen1 = strchr(src, '-');
+  const char* hyphen2 = NULL;
+  if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
+
+  int match = -1;
+  if (hyphen1 == NULL) {
+    // Bare name. Look at full name, then code, then try backmapping as Language
+    match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
+    if (match >= 0) {return MakeULScr(kNameToULScript[match].i);}    // aa
+    match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
+    if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // aa
+
+    Language backmap_me = GetLanguageFromName(src);
+    if (backmap_me != UNKNOWN_LANGUAGE) {
+      return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
+    }
+    return ULScript_Latin;
+  }
+
+  if (hyphen2 == NULL) {
+    // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
+    if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
+    if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
+    if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
+    if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
+    if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
+    match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
+    if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // aa-bb
+
+    int len = strlen(src);
+    if (len >= 16) {return ULScript_Latin;}   // Real codes are shorter
+
+    char temp[16];
+    int hyphen1_offset = hyphen1 - src;
+    int len1 = len - hyphen1_offset - 1;    // Exclude the hyphen
+    // Take off part before hyphen1
+    memcpy(temp, hyphen1 + 1, len1);
+    temp[len1] = '\0';
+    match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
+    if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // bb
+
+    // Take off part after hyphen1
+    memcpy(temp, src, len);
+    temp[hyphen1_offset] = '\0';
+    match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
+    if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // aa
+
+    return ULScript_Latin;
+  }
+
+  // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
+  if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
+  if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
+  if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
+  match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
+  if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // aa-bb-cc
+
+  int len = strlen(src);
+  if (len >= 16) {return ULScript_Latin;}   // Real codes are shorter
+
+  char temp[16];
+  int hyphen1_offset = hyphen1 - src;
+  int hyphen2_offset = hyphen2 - src;
+  int len2 = len - hyphen2_offset - 1;                // Exclude the hyphen
+  int lenmid = hyphen2_offset - hyphen1_offset - 1;   // Exclude the hyphen
+  // Keep part between hyphen1 and hyphen2
+  memcpy(temp, hyphen1 + 1, lenmid);
+  temp[lenmid] = '\0';
+  match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
+  if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // bb
+
+  // Keep part after hyphen2
+  memcpy(temp, hyphen2 + 1, len2);
+  temp[len2] = '\0';
+  match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
+  if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // cc
+
+  // Keep part before hyphen1
+  memcpy(temp, src, len);
+  temp[hyphen1_offset] = '\0';
+  match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
+  if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);}    // aa
+
+  return ULScript_Latin;
+}
+
+// Map script into Latin, Cyrillic, Arabic, Other
+int LScript4(ULScript ulscript) {
+  if (ulscript == ULScript_Latin) {return 0;}
+  if (ulscript == ULScript_Cyrillic) {return 1;}
+  if (ulscript == ULScript_Arabic) {return 2;}
+  return 3;
+}
+
+}  // namespace CLD2
+
author	Matt A. Tobin <email@mattatobin.com>	2018-02-02 03:32:58 -0500
committer	Matt A. Tobin <email@mattatobin.com>	2018-02-02 03:32:58 -0500
commit	e72ef92b5bdc43cd2584198e2e54e951b70299e8 (patch)
tree	01ceb4a897c33eca9e7ccf2bc3aefbe530169fe5 /application/basilisk/components/translation/cld2/internal/lang_script.cc
parent	0d19b77d3eaa5b8d837bf52c19759e68e42a1c4c (diff)
download	UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.gz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.lz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.xz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.zip