summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc
diff options
context:
space:
mode:
authorMatt A. Tobin <email@mattatobin.com>2019-12-16 13:57:01 -0500
committerMatt A. Tobin <email@mattatobin.com>2019-12-16 13:57:01 -0500
commit06494f307850c576868831bd28a61464eab1f359 (patch)
treef281f5c46c3e0b73c7eabe22f02622dc013b0c35 /application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc
parente7d4713e0765c79feddf2384d343d10595fa5cb3 (diff)
downloadUXP-06494f307850c576868831bd28a61464eab1f359.tar
UXP-06494f307850c576868831bd28a61464eab1f359.tar.gz
UXP-06494f307850c576868831bd28a61464eab1f359.tar.lz
UXP-06494f307850c576868831bd28a61464eab1f359.tar.xz
UXP-06494f307850c576868831bd28a61464eab1f359.zip
Remove Basilisk from the Unified XUL Platform repository
Development will proceed at https://github.com/MoonchildProductions/Basilisk
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc')
-rw-r--r--application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc1649
1 files changed, 0 insertions, 1649 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc b/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc
deleted file mode 100644
index 9bde8a86a..000000000
--- a/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc
+++ /dev/null
@@ -1,1649 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//
-// Author: dsites@google.com (Dick Sites)
-//
-
-#include "compact_lang_det_hint_code.h"
-
-#include <stdlib.h> // for abs()
-#include <stdio.h> // for sprintf()
-#include <string.h> //
-#include "lang_script.h"
-#include "port.h"
-
-using namespace std;
-
-namespace CLD2 {
-
-static const int kCLDPriorEncodingWeight = 4; // 100x more likely
-static const int kCLDPriorLanguageWeight = 8; // 10000x more likely
-
-
-// Tables to map lang="..." language code lists to actual languages.
-// based on scraping and hand-edits, dsites June 2011
-
-// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
-
-// For close pairs like ms/id, more weight on TLD and lang=
-// Alternately, weaker boost but mark others of set as negative;
-// makes "neither" an easier result.
-// lang=en low weight 4
-// tld=lu boost lu maaybe 4. but lang= alwyas overcomes tld and encoding
-// (except maybe en)
-
-// TLD to separate, e.g., burundi from rwanda
-
-// Encoding lookup: OneLangProb array
-// TLD lookup: tld OneLangProb pairs
-
-
-typedef struct {
- const char* const langtag; // Lowercased, hyphen only lookup key
- const char* const langcode; // Canonical language codes; two if ambiguous
- OneCLDLangPrior onelangprior1;
- OneCLDLangPrior onelangprior2;
-} LangTagLookup;
-
-typedef struct {
- const char* const tld; // Lowercased, hyphen only lookup key
- OneCLDLangPrior onelangprior1;
- OneCLDLangPrior onelangprior2;
-} TLDLookup;
-
-
-#define W2 (2 << 10) // 3**2 = 10x more likely
-#define W4 (4 << 10) // 3**4 = 100x more likely
-#define W6 (6 << 10) // 3**6 = 1000x more likely
-#define W8 (8 << 10) // 3**8 = 10K x more likely
-#define W10 (10 << 10) // 3**10 = 100K x more likely
-#define W12 (12 << 10) // 3**12 = 1M x more likely
-
-// TODO: more about ba hr sr sr-ME and sl
-// Temporary state of affairs:
-// BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
-// Eventually, we want to do all four, but it requires a CLD change to handle
-// up to six languages per quadgram.
-
-
-// Close pairs boost one of pair, demote other.
-// Statistically close pairs:
-// INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
-//
-// INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
-// TIBETAN DZONGKHA coef=0.4571
-// CZECH SLOVAK coef=0.4273
-// NORWEGIAN NORWEGIAN_N coef=0.4182
-//
-// HINDI MARATHI coef=0.3795
-// ZULU XHOSA coef=0.3716
-//
-// DANISH NORWEGIAN coef=0.3672 Usually OK
-// BIHARI HINDI coef=0.3668 Usually OK
-// ICELANDIC FAROESE coef=0.3519 Usually OK
-
-//
-// Table to look up lang= tags longer than three characters
-// Overrides table below, which is truncated at first hyphen
-// In alphabetical order for binary search
-static const int kCLDTable1Size = 213;
-static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
- {"abkhazian", "ab", ABKHAZIAN + W10, 0},
- {"afar", "aa", AFAR + W10, 0},
- {"afrikaans", "af", AFRIKAANS + W10, 0},
- {"akan", "ak", AKAN + W10, 0},
- {"albanian", "sq", ALBANIAN + W10, 0},
- {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous
- {"amharic", "am", AMHARIC + W10, 0},
- {"arabic", "ar", ARABIC + W10, 0},
- {"argentina", "es", SPANISH + W10, 0},
- {"armenian", "hy", ARMENIAN + W10, 0},
- {"assamese", "as", ASSAMESE + W10, 0},
- {"aymara", "ay", AYMARA + W10, 0},
- {"azerbaijani", "az", AZERBAIJANI + W10, 0},
-
- {"bangla", "bn", BENGALI + W10, 0},
- {"bashkir", "ba", BASHKIR + W10, 0},
- {"basque", "eu", BASQUE + W10, 0},
- {"belarusian", "be", BELARUSIAN + W10, 0},
- {"bengali", "bn", BENGALI + W10, 0},
- {"bihari", "bh", BIHARI + W10, HINDI - W4},
- {"bislama", "bi", BISLAMA + W10, 0},
- {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
- {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous
- {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous
- {"breton", "br", BRETON + W10, 0},
- {"bulgarian", "bg", BULGARIAN + W10, 0},
- {"burmese", "my", BURMESE + W10, 0}, // Myanmar
-
- {"catalan", "ca", CATALAN + W10, 0},
- {"cherokee", "chr", CHEROKEE + W10, 0},
- {"chichewa", "ny", NYANJA + W10, 0},
-
- {"chinese", "zh", CHINESE + W10, 0},
- {"chinese-t", "zhT", CHINESE_T + W10, 0},
- {"chineset", "zhT", CHINESE_T + W10, 0},
- {"corsican", "co", CORSICAN + W10, 0},
- {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
- {"croatian", "hr", CROATIAN + W10, 0},
- {"czech", "cs", CZECH + W10, SLOVAK - W4},
-
- {"danish", "da", DANISH + W10, NORWEGIAN - W4},
- {"deutsch", "de", GERMAN + W10, 0},
- {"dhivehi", "dv", DHIVEHI + W10, 0},
- {"dutch", "nl", DUTCH + W10, 0},
- {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4},
-
- {"ell-gr", "el", GREEK + W10, 0},
- {"english", "en", ENGLISH + W4, 0},
- {"esperanto", "eo", ESPERANTO + W10, 0},
- {"estonian", "et", ESTONIAN + W10, 0},
- {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
- {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding
-
- {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
- {"fijian", "fj", FIJIAN + W10, 0},
- {"finnish", "fi", FINNISH + W10, 0},
- {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII
- {"francais", "fr", FRENCH + W10, 0},
- {"french", "fr", FRENCH + W10, 0},
- {"frisian", "fy", FRISIAN + W10, 0},
-
- {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous
- {"galician", "gl", GALICIAN + W10, 0},
- {"ganda", "lg", GANDA + W10, 0},
- {"georgian", "ka", GEORGIAN + W10, 0},
- {"german", "de", GERMAN + W10, 0},
- {"greek", "el", GREEK + W10, 0},
- {"greenlandic", "kl", GREENLANDIC + W10, 0},
- {"guarani", "gn", GUARANI + W10, 0},
- {"gujarati", "gu", GUJARATI + W10, 0},
-
- {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
- {"hausa", "ha", HAUSA + W10, 0},
- {"hawaiian", "haw", HAWAIIAN + W10, 0},
- {"hebrew", "he", HEBREW + W10, 0},
- {"hindi", "hi", HINDI + W10, MARATHI - W4},
- {"hn-in", "hi", HINDI + W10, MARATHI - W4},
- {"hungarian", "hu", HUNGARIAN + W10, 0},
-
- {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
- {"igbo", "ig", IGBO + W10, 0},
- {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
- {"interlingua", "ia", INTERLINGUA + W10, 0},
- {"interlingue", "ie", INTERLINGUE + W10, 0},
- // 1:2 iu-Cans ik-Latn
- {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
- {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
- {"ir-ie", "ga", IRISH + W10, 0}, // Irish
- {"irish", "ga", IRISH + W10, 0},
- {"italian", "it", ITALIAN + W10, 0},
-
- {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding
- {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
- {"japanese", "ja", JAPANESE + W10, 0},
- {"javanese", "jw", JAVANESE + W10, 0},
-
- {"kannada", "kn", KANNADA + W10, 0},
- {"kashmiri", "ks", KASHMIRI + W10, 0},
- {"kazakh", "kk", KAZAKH + W10, 0},
- {"khasi", "kha", KHASI + W10, 0},
- {"khmer", "km", KHMER + W10, 0},
- {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
- {"klingon", "tlh", X_KLINGON + W10, 0},
- {"korean", "ko", KOREAN + W10, 0},
- {"kurdish", "ku", KURDISH + W10, 0},
- {"kyrgyz", "ky", KYRGYZ + W10, 0},
-
- {"laothian", "lo", LAOTHIAN + W10, 0},
- {"latin", "la", LATIN + W10, 0},
- {"latvian", "lv", LATVIAN + W10, 0},
- {"limbu", "sit", LIMBU + W10, 0},
- {"lingala", "ln", LINGALA + W10, 0},
- {"lithuanian", "lt", LITHUANIAN + W10, 0},
- {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
-
- {"macedonian", "mk", MACEDONIAN + W10, 0},
- {"malagasy", "mg", MALAGASY + W10, 0},
- {"malay", "ms", MALAY + W10, INDONESIAN - W4},
- {"malayalam", "ml", MALAYALAM + W10, 0},
- {"maltese", "mt", MALTESE + W10, 0},
- {"manx", "gv", MANX + W10, 0},
- {"maori", "mi", MAORI + W10, 0},
- {"marathi", "mr", MARATHI + W10, HINDI - W4},
- {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
- {"moldavian", "mo", ROMANIAN + W10, 0},
- {"mongolian", "mn", MONGOLIAN + W10, 0},
- {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
- {"myanmar", "my", BURMESE + W10, 0}, // Myanmar
- {"nauru", "na", NAURU + W10, 0},
- {"ndebele", "nr", NDEBELE + W10, 0},
- {"nepali", "ne", NEPALI + W10, 0},
- {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
- {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
- {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
- {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
- {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk
- {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
- {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
- {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
- {"nyanja", "ny", NYANJA + W10, 0},
-
- {"occitan", "oc", OCCITAN + W10, 0},
- {"oriya", "or", ORIYA + W10, 0},
- {"oromo", "om", OROMO + W10, 0},
- {"parsi", "fa", PERSIAN + W10, 0},
-
- {"pashto", "ps", PASHTO + W10, 0},
- {"pedi", "nso", PEDI + W10, 0},
- {"persian", "fa", PERSIAN + W10, 0},
- {"polish", "pl", POLISH + W10, 0},
- {"polska", "pl", POLISH + W10, 0},
- {"polski", "pl", POLISH + W10, 0},
- {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII
- {"portuguese", "pt", PORTUGUESE + W10, 0},
- {"punjabi", "pa", PUNJABI + W10, 0},
-
- {"quechua", "qu", QUECHUA + W10, 0},
-
- {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
- {"romanian", "ro", ROMANIAN + W10, 0},
- {"rundi", "rn", RUNDI + W10, 0},
- {"russian", "ru", RUSSIAN + W10, 0},
-
- {"samoan", "sm", SAMOAN + W10, 0},
- {"sango", "sg", SANGO + W10, 0},
- {"sanskrit", "sa", SANSKRIT + W10, 0},
- {"scots", "sco", SCOTS + W10, ENGLISH - W4},
- {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
- {"serbian", "sr", SERBIAN + W10, 0},
- {"seselwa", "crs", SESELWA + W10, 0},
- {"sesotho", "st", SESOTHO + W10, 0},
- {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding
- {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding
- {"shona", "sn", SHONA + W10, 0},
- {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous
- {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
- {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
- {"sindhi", "sd", SINDHI + W10, 0},
- {"sinhalese", "si", SINHALESE + W10, 0},
- {"siswant", "ss", SISWANT + W10, 0},
- {"sit-np", "sit", LIMBU + W10, 0},
- {"slovak", "sk", SLOVAK + W10, CZECH - W4},
- {"slovenian", "sl", SLOVENIAN + W10, 0},
- {"somali", "so", SOMALI + W10, 0},
- {"spanish", "es", SPANISH + W10, 0},
- {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
- {"sundanese", "su", SUNDANESE + W10, 0},
- {"suomi", "fi", FINNISH + W10, 0}, // Finnish
- {"swahili", "sw", SWAHILI + W10, 0},
- {"swedish", "sv", SWEDISH + W10, 0},
- {"syriac", "syr", SYRIAC + W10, 0},
-
- {"tagalog", "tl", TAGALOG + W10, 0},
- {"tajik", "tg", TAJIK + W10, 0},
- {"tamil", "ta", TAMIL + W10, 0},
- {"tatar", "tt", TATAR + W10, 0},
- {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet
- {"tchinese", "zhT", CHINESE_T + W10, 0},
- {"telugu", "te", TELUGU + W10, 0},
- {"thai", "th", THAI + W10, 0},
- {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
- {"tigrinya", "ti", TIGRINYA + W10, 0},
- {"tonga", "to", TONGA + W10, 0},
- {"tsonga", "ts", TSONGA + W10, 0},
- {"tswana", "tn", TSWANA + W10, 0},
- {"tt-ru", "tt", TATAR + W10, 0},
- {"tur-tr", "tr", TURKISH + W10, 0},
- {"turkish", "tr", TURKISH + W10, 0},
- {"turkmen", "tk", TURKMEN + W10, 0},
- {"uighur", "ug", UIGHUR + W10, 0},
- {"ukrainian", "uk", UKRAINIAN + W10, 0},
- {"urdu", "ur", URDU + W10, 0},
- {"uzbek", "uz", UZBEK + W10, 0},
-
- {"venda", "ve", VENDA + W10, 0},
- {"vietnam", "vi", VIETNAMESE + W10, 0},
- {"vietnamese", "vi", VIETNAMESE + W10, 0},
- {"volapuk", "vo", VOLAPUK + W10, 0},
-
- {"welsh", "cy", WELSH + W10, 0},
- {"wolof", "wo", WOLOF + W10, 0},
-
- {"xhosa", "xh", XHOSA + W10, ZULU - W4},
-
- {"yiddish", "yi", YIDDISH + W10, 0},
- {"yoruba", "yo", YORUBA + W10, 0},
-
- {"zh-classical", "zhT", CHINESE_T + W10, 0},
- {"zh-cn", "zh", CHINESE + W10, 0},
- {"zh-hans", "zh", CHINESE + W10, 0},
- {"zh-hant", "zhT", CHINESE_T + W10, 0},
- {"zh-hk", "zhT", CHINESE_T + W10, 0},
- {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
- {"zh-sg", "zhT", CHINESE_T + W10, 0},
- {"zh-tw", "zhT", CHINESE_T + W10, 0},
- {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese
- {"zhuang", "za", ZHUANG + W10, 0},
- {"zulu", "zu", ZULU + W10, XHOSA - W4},
-};
-
-
-
-// Table to look up lang= tags of two/three characters after truncate at hyphen
-// In alphabetical order for binary search
-static const int kCLDTable2Size = 257;
-static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
- {"aa", "aa", AFAR + W10, 0},
- {"ab", "ab", ABKHAZIAN + W10, 0},
- {"af", "af", AFRIKAANS + W10, 0},
- {"ak", "ak", AKAN + W10, 0},
- {"al", "sq", ALBANIAN + W10, 0}, // Albania
- {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian
- {"ar", "ar", ARABIC + W10, 0},
- {"ara", "ar", ARABIC + W10, 0},
- {"arm", "hy", ARMENIAN + W10, 0}, // Armenia
- {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic
- {"as", "as", ASSAMESE + W10, 0},
- {"at", "de", GERMAN + W10, 0}, // Austria
- {"au", "de", GERMAN + W10, 0}, // Austria
- {"ay", "ay", AYMARA + W10, 0},
- {"az", "az", AZERBAIJANI + W10, 0},
- {"aze", "az", AZERBAIJANI + W10, 0},
-
- {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia
- {"be", "be", BELARUSIAN + W10, 0},
- {"bel", "be", BELARUSIAN + W10, 0},
- {"bg", "bg", BULGARIAN + W10, 0},
- {"bh", "bh", BIHARI + W10, HINDI - W4},
- {"bi", "bi", BISLAMA + W10, 0},
- {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding
- {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia
- {"bn", "bn", BENGALI + W10, 0},
- {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
- // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
- {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
- {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
-
- {"ca", "ca", CATALAN + W10, 0},
- {"cat", "ca", CATALAN + W10, 0},
- {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland
- {"chn", "zh", CHINESE + W10, 0},
- {"chr", "chr", CHEROKEE + W10, 0},
- {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish
- {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker.
- // Offset by 2 so that TLD=tw or
- // enc=big5 will put zhT ahead
- {"co", "co", CORSICAN + W10, 0},
- {"cro", "hr", CROATIAN + W10, 0}, // Croatia
- {"crs", "crs", SESELWA + W10, 0},
- {"cs", "cs", CZECH + W10, SLOVAK - W4},
- {"ct", "ca", CATALAN + W10, 0},
- {"cy", "cy", WELSH + W10, 0},
- {"cym", "cy", WELSH + W10, 0},
- {"cz", "cs", CZECH + W10, SLOVAK - W4},
-
- {"da", "da", DANISH + W10, NORWEGIAN - W4},
- {"dan", "da", DANISH + W10, NORWEGIAN - W4},
- {"de", "de", GERMAN + W10, 0},
- {"deu", "de", GERMAN + W10, 0},
- {"div", "dv", DHIVEHI + W10, 0},
- {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark
- {"dut", "nl", DUTCH + W10, 0}, // Dutch
- {"dv", "dv", DHIVEHI + W10, 0},
- {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
-
- {"ee", "et", ESTONIAN + W10, 0}, // Estonia
- {"eg", "ar", ARABIC + W10, 0}, // Egypt
- {"el", "el", GREEK + W10, 0},
- {"en", "en", ENGLISH + W4, 0},
- {"eng", "en", ENGLISH + W4, 0},
- {"eo", "eo", ESPERANTO + W10, 0},
- {"er", "ur", URDU + W10, 0}, // "Erdu"
- {"es", "es", SPANISH + W10, 0},
- {"esp", "es", SPANISH + W10, 0},
- {"est", "et", ESTONIAN + W10, 0},
- {"et", "et", ESTONIAN + W10, 0},
- {"eu", "eu", BASQUE + W10, 0},
-
- {"fa", "fa", PERSIAN + W10, 0},
- {"far", "fa", PERSIAN + W10, 0},
- {"fi", "fi", FINNISH + W10, 0},
- {"fil", "tl", TAGALOG + W10, 0}, // Philippines
- {"fj", "fj", FIJIAN + W10, 0},
- {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
- {"fr", "fr", FRENCH + W10, 0},
- {"fra", "fr", FRENCH + W10, 0},
- {"fre", "fr", FRENCH + W10, 0},
- {"fy", "fy", FRISIAN + W10, 0},
-
- {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician
- {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either
- {"gal", "gl", GALICIAN + W10, 0},
- {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding
- {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding
- {"gd", "gd", SCOTS_GAELIC + W10, 0},
- {"ge", "ka", GEORGIAN + W10, 0}, // Georgia
- {"geo", "ka", GEORGIAN + W10, 0},
- {"ger", "de", GERMAN + W10, 0},
- {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse
- {"gn", "gn", GUARANI + W10, 0},
- {"gr", "el", GREEK + W10, 0}, // Greece
- {"gu", "gu", GUJARATI + W10, 0},
- {"gv", "gv", MANX + W10, 0},
-
- {"ha", "ha", HAUSA + W10, 0},
- {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti
- {"haw", "haw", HAWAIIAN + W10, 0},
- {"hb", "he", HEBREW + W10, 0},
- {"he", "he", HEBREW + W10, 0},
- {"heb", "he", HEBREW + W10, 0},
- {"hi", "hi", HINDI + W10, MARATHI - W4},
- {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong
- {"hr", "hr", CROATIAN + W10, 0},
- {"ht", "ht", HAITIAN_CREOLE + W10, 0},
- {"hu", "hu", HUNGARIAN + W10, 0},
- {"hun", "hu", HUNGARIAN + W10, 0},
- {"hy", "hy", ARMENIAN + W10, 0},
-
- {"ia", "ia", INTERLINGUA + W10, 0},
- {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland
- {"id", "id", INDONESIAN + W10, MALAY - W4},
- {"ids", "id", INDONESIAN + W10, MALAY - W4},
- {"ie", "ie", INTERLINGUE + W10, 0},
- {"ig", "ig", IGBO + W10, 0},
- // 1:2 iu-Cans ik-Latn
- {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
- {"in", "id", INDONESIAN + W10, MALAY - W4},
- {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia
- {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
- {"is", "is", ICELANDIC + W10, FAROESE - W4},
- {"it", "it", ITALIAN + W10, 0},
- {"ita", "it", ITALIAN + W10, 0},
- {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
- {"iw", "he", HEBREW + W10, 0},
-
- {"ja", "ja", JAPANESE + W10, 0},
- {"jp", "ja", JAPANESE + W10, 0}, // Japan
- {"jpn", "ja", JAPANESE + W10, 0},
- {"jv", "jw", JAVANESE + W10, 0},
- {"jw", "jw", JAVANESE + W10, 0},
-
- {"ka", "ka", GEORGIAN + W10, 0},
- {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua
- {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan
- {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia)
- {"kha", "kha", KHASI + W10, 0},
- {"kk", "kk", KAZAKH + W10, 0}, // Kazakh
- {"kl", "kl", GREENLANDIC + W10, 0},
- {"km", "km", KHMER + W10, 0},
- {"kn", "kn", KANNADA + W10, 0},
- {"ko", "ko", KOREAN + W10, 0},
- {"kor", "ko", KOREAN + W10, 0},
- {"kr", "ko", KOREAN + W10, 0}, // Country code Korea
- {"ks", "ks", KASHMIRI + W10, 0},
- {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding
- {"ku", "ku", KURDISH + W10, 0},
- {"ky", "ky", KYRGYZ + W10, 0},
- {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan
- {"la", "la", LATIN + W10, 0},
- {"lao", "lo", LAOTHIAN + W10, 0}, // Laos
-
- {"lb", "lb", LUXEMBOURGISH + W10, 0},
- {"lg", "lg", GANDA + W10, 0},
- {"lit", "lt", LITHUANIAN + W10, 0},
- {"ln", "ln", LINGALA + W10, 0},
- {"lo", "lo", LAOTHIAN + W10, 0},
- {"lt", "lt", LITHUANIAN + W10, 0},
- {"ltu", "lt", LITHUANIAN + W10, 0},
- {"lv", "lv", LATVIAN + W10, 0},
-
- {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
- {"mg", "mg", MALAGASY + W10, 0},
- {"mi", "mi", MAORI + W10, 0},
- {"mk", "mk", MACEDONIAN + W10, 0},
- {"ml", "ml", MALAYALAM + W10, 0},
- {"mn", "mn", MONGOLIAN + W10, 0},
- {"mo", "mo", ROMANIAN + W10, 0},
- {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian
- {"mr", "mr", MARATHI + W10, HINDI - W4},
- {"ms", "ms", MALAY + W10, INDONESIAN - W4},
- {"mt", "mt", MALTESE + W10, 0},
- {"mx", "es", SPANISH + W10, 0}, // Mexico
- {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
-
- {"na", "na", NAURU + W10, 0},
- {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
- {"ne", "ne", NEPALI + W10, 0},
- {"nl", "nl", DUTCH + W10, 0},
- {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
- {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
- {"nr", "nr", NDEBELE + W10, 0},
- {"nso", "nso", PEDI + W10, 0},
- {"ny", "ny", NYANJA + W10, 0},
-
- {"oc", "oc", OCCITAN + W10, 0},
- {"om", "om", OROMO + W10, 0},
- {"or", "or", ORIYA + W10, 0},
-
- {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab
- {"per", "fa", PERSIAN + W10, 0},
- {"ph", "tl", TAGALOG + W10, 0}, // Philippines
- {"pk", "ur", URDU + W10, 0}, // Pakistan
- {"pl", "pl", POLISH + W10, 0},
- {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi
- {"pol", "pl", POLISH + W10, 0},
- {"por", "pt", PORTUGUESE + W10, 0},
- {"ps", "ps", PASHTO + W10, 0},
- {"pt", "pt", PORTUGUESE + W10, 0},
- {"ptg", "pt", PORTUGUESE + W10, 0},
- {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code
- {"qu", "qu", QUECHUA + W10, 0},
-
- {"rm", "rm", RHAETO_ROMANCE + W10, 0},
- {"rn", "rn", RUNDI + W10, 0},
- {"ro", "ro", ROMANIAN + W10, 0},
- {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code
- {"ru", "ru", RUSSIAN + W10, 0},
- {"rus", "ru", RUSSIAN + W10, 0},
- {"rw", "rw", KINYARWANDA + W10, 0},
-
- {"sa", "sa", SANSKRIT + W10, 0},
- {"sco", "sco", SCOTS + W10, ENGLISH - W4},
- {"sd", "sd", SINDHI + W10, 0},
- {"se", "sv", SWEDISH + W10, 0},
- {"sg", "sg", SANGO + W10, 0},
- {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovinia
- {"sk", "sk", SLOVAK + W10, CZECH - W4},
- {"sl", "sl", SLOVENIAN + W10, 0},
- {"slo", "sl", SLOVENIAN + W10, 0},
- {"sm", "sm", SAMOAN + W10, 0},
- {"sn", "sn", SHONA + W10, 0},
- {"so", "so", SOMALI + W10, 0},
- {"sp", "es", SPANISH + W10, 0},
- {"sq", "sq", ALBANIAN + W10, 0},
- {"sr", "sr", SERBIAN + W10, 0},
- {"srb", "sr", SERBIAN + W10, 0},
- {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin
- {"srp", "sr", SERBIAN + W10, 0},
- {"ss", "ss", SISWANT + W10, 0},
- {"st", "st", SESOTHO + W10, 0},
- {"su", "su", SUNDANESE + W10, 0},
- {"sv", "sv", SWEDISH + W10, 0},
- {"sve", "sv", SWEDISH + W10, 0},
- {"sw", "sw", SWAHILI + W10, 0},
- {"swe", "sv", SWEDISH + W10, 0},
- {"sy", "syr", SYRIAC + W10, 0},
- {"syr", "syr", SYRIAC + W10, 0},
-
- {"ta", "ta", TAMIL + W10, 0},
- {"te", "te", TELUGU + W10, 0},
- {"tg", "tg", TAJIK + W10, 0},
- {"th", "th", THAI + W10, 0},
- {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet
- {"tj", "tg", TAJIK + W10, 0}, // Tajikistan
- {"tk", "tk", TURKMEN + W10, 0},
- {"tl", "tl", TAGALOG + W10, 0},
- {"tlh", "tlh", X_KLINGON + W10, 0},
- {"tn", "tn", TSWANA + W10, 0},
- {"to", "to", TONGA + W10, 0},
- {"tr", "tr", TURKISH + W10, 0},
- {"ts", "ts", TSONGA + W10, 0},
- {"tt", "tt", TATAR + W10, 0},
- {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan
- {"twi", "ak", AKAN + W10, 0}, // Twi => Akan
-
- {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine
- {"ug", "ug", UIGHUR + W10, 0},
- {"uk", "uk", UKRAINIAN + W10, 0},
- {"ur", "ur", URDU + W10, 0},
- {"uz", "uz", UZBEK + W10, 0},
-
- {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan
- {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan
- {"ve", "ve", VENDA + W10, 0},
- {"vi", "vi", VIETNAMESE + W10, 0},
- {"vie", "vi", VIETNAMESE + W10, 0},
- {"vn", "vi", VIETNAMESE + W10, 0},
- {"vo", "vo", VOLAPUK + W10, 0},
-
- {"wo", "wo", WOLOF + W10, 0},
-
- {"xh", "xh", XHOSA + W10, ZULU - W4},
- {"xho", "xh", XHOSA + W10, ZULU - W4},
-
- {"yi", "yi", YIDDISH + W10, 0},
- {"yo", "yo", YORUBA + W10, 0},
-
- {"za", "za", ZHUANG + W10, 0},
- {"zh", "zh", CHINESE + W10, 0},
- {"zht", "zhT", CHINESE_T + W10, 0},
- {"zu", "zu", ZULU + W10, XHOSA - W4},
-};
-
-
-// Possibly map to tl:
-// -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
-// -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
-// -LangTags tl-Latn /7val.com/ ,war 1 Waray
-
-
-
-// Table to look up country TLD (no general TLD)
-// In alphabetical order for binary search
-static const int kCLDTable3Size = 181;
-static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
- {"ac", JAPANESE + W2, 0},
- {"ad", CATALAN + W4, 0},
- {"ae", ARABIC + W4, 0},
- {"af", PASHTO + W4, PERSIAN + W4},
- {"ag", GERMAN + W2, 0}, // meager
- // {"ai", 0, 0}, // meager
- {"al", ALBANIAN + W4, 0},
- {"am", ARMENIAN + W4, 0},
- {"an", DUTCH + W4, 0}, // meager
- {"ao", PORTUGUESE + W4, 0},
- // {"aq", 0, 0}, // meager
- {"ar", SPANISH + W4, 0},
- // {"as", 0, 0},
- {"at", GERMAN + W4, 0},
- {"au", ENGLISH + W2, 0},
- {"aw", DUTCH + W4, 0},
- {"ax", SWEDISH + W4, 0},
- {"az", AZERBAIJANI + W4, 0},
-
- {"ba", BOSNIAN + W8, CROATIAN - W4},
- // {"bb", 0, 0},
- {"bd", BENGALI + W4, 0},
- {"be", DUTCH + W4, FRENCH + W4},
- {"bf", FRENCH + W4, 0},
- {"bg", BULGARIAN + W4, 0},
- {"bh", ARABIC + W4, 0},
- {"bi", RUNDI + W4, FRENCH + W4},
- {"bj", FRENCH + W4, 0},
- {"bm", ENGLISH + W2, 0},
- {"bn", MALAY + W4, INDONESIAN - W4},
- {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA
- {"br", PORTUGUESE + W4, 0},
- // {"bs", 0, 0},
- {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha
- {"bw", TSWANA + W4, 0},
- {"by", BELARUSIAN + W4, 0},
- // {"bz", 0, 0},
-
- {"ca", FRENCH + W4, ENGLISH + W2},
- {"cat", CATALAN + W4, 0},
- {"cc", 0, 0},
- {"cd", FRENCH + W4, 0},
- {"cf", FRENCH + W4, 0},
- {"cg", FRENCH + W4, 0},
- {"ch", GERMAN + W4, FRENCH + W4},
- {"ci", FRENCH + W4, 0},
- // {"ck", 0, 0},
- {"cl", SPANISH + W4, 0},
- {"cm", FRENCH + W4, 0},
- {"cn", CHINESE + W4, 0},
- {"co", SPANISH + W4, 0},
- {"cr", SPANISH + W4, 0},
- {"cu", SPANISH + W4, 0},
- {"cv", PORTUGUESE + W4, 0},
- // {"cx", 0, 0},
- {"cy", GREEK + W4, TURKISH + W4},
- {"cz", CZECH + W4, SLOVAK - W4},
-
- {"de", GERMAN + W4, 0},
- {"dj", 0, 0},
- {"dk", DANISH + W4, NORWEGIAN - W4},
- {"dm", 0, 0},
- {"do", SPANISH + W4, 0},
- {"dz", FRENCH + W4, ARABIC + W4},
-
- {"ec", SPANISH + W4, 0},
- {"ee", ESTONIAN + W4, 0},
- {"eg", ARABIC + W4, 0},
- {"er", AFAR + W4, 0},
- {"es", SPANISH + W4, 0},
- {"et", AMHARIC + W4, AFAR + W4},
-
- {"fi", FINNISH + W4, 0},
- {"fj", FIJIAN + W4, 0},
- // {"fk", 0, 0},
- // {"fm", 0, 0},
- {"fo", FAROESE + W4, ICELANDIC - W4},
- {"fr", FRENCH + W4, 0},
-
- {"ga", FRENCH + W4, 0},
- {"gd", 0, 0},
- {"ge", GEORGIAN + W4, 0},
- {"gf", FRENCH + W4, 0},
- // {"gg", 0, 0},
- // {"gh", 0, 0},
- // {"gi", 0, 0},
- {"gl", GREENLANDIC + W4, DANISH + W4},
- // {"gm", 0, 0},
- {"gn", FRENCH + W4, 0},
- // {"gp", 0, 0},
- // {"gq", 0, 0},
- {"gr", GREEK + W4, 0},
- // {"gs", 0, 0},
- {"gt", SPANISH + W4, 0},
- // {"gu", 0, 0},
- // {"gy", 0, 0},
-
- {"hk", CHINESE_T + W4, 0},
- // {"hm", 0, 0},
- {"hn", SPANISH + W4, 0},
- {"hr", CROATIAN + W8, BOSNIAN - W4},
- {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
- {"hu", HUNGARIAN + W4, 0},
-
- {"id", INDONESIAN + W4, MALAY - W4},
- {"ie", IRISH + W4, 0},
- {"il", HEBREW + W4, 0},
- {"im", MANX + W4, 0},
- // {"in", 0, 0},
- // {"io", 0, 0},
- {"iq", ARABIC + W4, 0},
- {"ir", PERSIAN + W4, 0},
- {"is", ICELANDIC + W4, FAROESE - W4},
- {"it", ITALIAN + W4, 0},
-
- // {"je", 0, 0},
- // {"jm", 0, 0},
- {"jo", ARABIC + W4, 0},
- {"jp", JAPANESE + W4, 0},
-
- // {"ke", 0, 0},
- {"kg", KYRGYZ + W4, 0},
- {"kh", KHMER + W4, 0},
- // {"ki", 0, 0},
- {"km", FRENCH + W4, 0},
- // {"kn", 0, 0},
- {"kp", KOREAN + W4, 0},
- {"kr", KOREAN + W4, 0},
- {"kw", ARABIC + W4, 0},
- // {"ky", 0, 0},
- {"kz", KAZAKH + W4, 0},
-
- {"la", LAOTHIAN + W4, 0},
- {"lb", ARABIC + W4, FRENCH + W4},
- // {"lc", 0, 0},
- {"li", GERMAN + W4, 0},
- {"lk", SINHALESE + W4, 0},
- // {"lr", 0, 0},
- {"ls", SESOTHO + W4, 0},
- {"lt", LITHUANIAN + W4, 0},
- {"lu", LUXEMBOURGISH + W4},
- {"lv", LATVIAN + W4, 0},
- {"ly", ARABIC + W4, 0},
-
- {"ma", FRENCH + W4, 0},
- {"mc", FRENCH + W4, 0},
- {"md", ROMANIAN + W4, 0},
- {"me", MONTENEGRIN + W8, SERBIAN - W4},
- {"mg", FRENCH + W4, 0},
- {"mk", MACEDONIAN + W4, 0},
- {"ml", FRENCH + W4, 0},
- {"mm", BURMESE + W4, 0},
- {"mn", MONGOLIAN + W4, 0},
- {"mo", CHINESE_T + W4, PORTUGUESE + W4},
- // {"mp", 0, 0},
- {"mq", FRENCH + W4, 0},
- {"mr", FRENCH + W4, ARABIC + W4},
- // {"ms", 0, 0},
- {"mt", MALTESE + W4, 0},
- // {"mu", 0, 0},
- {"mv", DHIVEHI + W4, 0},
- // {"mw", 0, 0},
- {"mx", SPANISH + W4, 0},
- {"my", MALAY + W4, INDONESIAN - W4},
- {"mz", PORTUGUESE + W4, 0},
-
- {"na", 0, 0}, // Namibia
- {"nc", FRENCH + W4, 0},
- {"ne", FRENCH + W4, 0},
- {"nf", FRENCH + W4, 0},
- // {"ng", 0, 0},
- {"ni", SPANISH + W4, 0},
- {"nl", DUTCH + W4, 0},
- {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
- {"np", NEPALI + W4, 0},
- {"nr", NAURU + W4, 0},
- {"nu", SWEDISH + W4, 0},
- {"nz", MAORI + W4, ENGLISH + W2},
-
- {"om", ARABIC + W4, 0},
-
- {"pa", SPANISH + W4, 0},
- {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA
- {"pf", FRENCH + W4, 0},
- // {"pg", 0, 0},
- {"ph", TAGALOG + W4, 0},
- {"pk", URDU + W4, 0},
- {"pl", POLISH + W4, 0},
- // {"pn", 0, 0},
- {"pr", SPANISH + W4, 0},
- {"ps", ARABIC + W4, 0},
- {"pt", PORTUGUESE + W4, 0},
- {"py", SPANISH + W4, GUARANI + W2},
-
- {"qa", ARABIC + W4, 0},
-
- {"re", FRENCH + W4, 0},
- {"ro", ROMANIAN + W4, 0},
- {"rs", SERBIAN + W8, MONTENEGRIN - W4},
- {"ru", RUSSIAN + W4, 0},
- {"rw", KINYARWANDA + W4, FRENCH + W2},
-
- {"sa", ARABIC + W4, 0},
- // {"sb", 0, 0},
- {"sc", SESELWA + W4, 0},
- {"sd", ARABIC + W4, 0},
- {"se", SWEDISH + W4, 0},
- // {"sg", 0, 0},
- // {"sh", 0, 0},
- {"si", SLOVENIAN + W4, 0},
- {"sk", SLOVAK + W4, CZECH - W4},
- // {"sl", 0, 0},
- {"sm", ITALIAN + W4, 0},
- {"sn", FRENCH + W4, 0},
- // {"sr", 0, 0},
- {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07
- // {"st", 0, 0},
- {"su", RUSSIAN + W4, 0},
- {"sv", SPANISH + W4, 0},
- {"sy", ARABIC + W4, 0},
- // {"sz", 0, 0},
-
- // {"tc", 0, 0},
- {"td", FRENCH + W4, 0},
- // {"tf", 0, 0},
- {"tg", FRENCH + W4, 0},
- {"th", THAI + W4, 0},
- // Tibet has no country code (see .cn)
- {"tj", TAJIK + W4, 0},
- // {"tk", 0, 0},
- // {"tl", 0, 0},
- {"tm", TURKISH + W4, 0},
- {"tn", FRENCH + W4, ARABIC + W4},
- // {"to", 0, 0},
- {"tp", JAPANESE + W4, 0},
- {"tr", TURKISH + W4, 0},
- // {"tt", 0, 0},
- // {"tv", 0, 0},
- {"tw", CHINESE_T + W4, 0},
- {"tz", SWAHILI + W4, AKAN + W4},
-
- {"ua", UKRAINIAN + W4, 0},
- {"ug", GANDA + W4, 0},
- {"uk", ENGLISH + W2, 0},
- {"us", ENGLISH + W2, 0},
- {"uy", SPANISH + W4, 0},
- {"uz", UZBEK + W4, 0},
-
- {"va", ITALIAN + W4, LATIN + W2},
- // {"vc", 0, 0},
- {"ve", SPANISH + W4, 0},
- // {"vg", 0, 0},
- // {"vi", 0, 0},
- {"vn", VIETNAMESE + W4, 0},
- // {"vu", 0, 0},
-
- {"wf", FRENCH + W4, 0},
- // {"ws", 0, 0},
-
- {"ye", ARABIC + W4, 0},
-
- {"za", AFRIKAANS + W4, 0},
- // {"zm", 0, 0},
- // {"zw", 0, 0},
-};
-
-#undef W2
-#undef W4
-#undef W6
-#undef W8
-#undef W10
-#undef W12
-
-
-
-
-
-inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
- *olp = (*olp & 0x3ff) + (w << 10);
-}
-inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
- *olp = (*olp & ~0x3ff) + lang;
-}
-
-OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
- return (w << 10) + lang;
-}
-
-inline int MaxInt(int a, int b) {
- return (a >= b) ? a : b;
-}
-
-// Merge in another language prior, taking max if already there
-void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
- if (olp == 0) {return;}
- Language target_lang = GetCLDPriorLang(olp);
- for (int i = 0; i < lps->n; ++i) {
- if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
- int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
- GetCLDPriorWeight(olp));
- SetCLDPriorWeight(new_weight, &lps->prior[i]);
- return;
- }
- }
- // Not found; add it if room
- if (lps->n >= kMaxOneCLDLangPrior) {return;}
- lps->prior[lps->n++] = olp;
-}
-
-// Merge in another language prior, boosting 10x if already there
-void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
- if (olp == 0) {return;}
- Language target_lang = GetCLDPriorLang(olp);
- for (int i = 0; i < lps->n; ++i) {
- if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
- int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
- SetCLDPriorWeight(new_weight, &lps->prior[i]);
- return;
- }
- }
- // Not found; add it if room
- if (lps->n >= kMaxOneCLDLangPrior) {return;}
- lps->prior[lps->n++] = olp;
-}
-
-
-// Trim language priors to no more than max_entries, keeping largest abs weights
-void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
- if (lps->n <= max_entries) {return;}
-
- // Insertion sort in-place by abs(weight)
- for (int i = 0; i < lps->n; ++i) {
- OneCLDLangPrior temp_olp = lps->prior[i];
- int w = abs(GetCLDPriorWeight(temp_olp));
- int kk = i;
- for (; kk > 0; --kk) {
- if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
- // Move down and continue
- lps->prior[kk] = lps->prior[kk - 1];
- } else {
- // abs(weight[kk - 1]) >= w, time to stop
- break;
- }
- }
- lps->prior[kk] = temp_olp;
- }
-
- lps->n = max_entries;
-}
-
-int CountCommas(const string& langtags) {
- int commas = 0;
- for (int i = 0; i < static_cast<int>(langtags.size()); ++i) {
- if (langtags[i] == ',') {++commas;}
- }
- return commas;
-}
-
-// Binary lookup on language tag
-const LangTagLookup* DoLangTagLookup(const char* key,
- const LangTagLookup* tbl, int tbl_size) {
- // Key is always in range [lo..hi)
- int lo = 0;
- int hi = tbl_size;
- while (lo < hi) {
- int mid = (lo + hi) >> 1;
- int comp = strcmp(tbl[mid].langtag, key);
- if (comp < 0) {
- lo = mid + 1;
- } else if (comp > 0) {
- hi = mid;
- } else {
- return &tbl[mid];
- }
- }
- return NULL;
-}
-
-// Binary lookup on tld
-const TLDLookup* DoTLDLookup(const char* key,
- const TLDLookup* tbl, int tbl_size) {
- // Key is always in range [lo..hi)
- int lo = 0;
- int hi = tbl_size;
- while (lo < hi) {
- int mid = (lo + hi) >> 1;
- int comp = strcmp(tbl[mid].tld, key);
- if (comp < 0) {
- lo = mid + 1;
- } else if (comp > 0) {
- hi = mid;
- } else {
- return &tbl[mid];
- }
- }
- return NULL;
-}
-
-
-
-// Trim language tag string to canonical form for each language
-// Input is from GetLangTagsFromHtml(), already lowercased
-string TrimCLDLangTagsHint(const string& langtags) {
- string retval;
- if (langtags.empty()) {return retval;}
- int commas = CountCommas(langtags);
- if (commas > 4) {return retval;} // Ignore if too many language tags
-
- char temp[20];
- int pos = 0;
- while (pos < static_cast<int>(langtags.size())) {
- int comma = langtags.find(',', pos);
- if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
- int len = comma - pos;
- if (len <= 16) {
- // Short enough to use
- memcpy(temp, &langtags[pos], len);
- temp[len] = '\0';
- const LangTagLookup* entry = DoLangTagLookup(temp,
- kCLDLangTagsHintTable1,
- kCLDTable1Size);
- if (entry != NULL) {
- // First table hit
- retval.append(entry->langcode); // may be "code1,code2"
- retval.append(1, ',');
- } else {
- // Try second table with language code truncated at first hyphen
- char* hyphen = strchr(temp, '-');
- if (hyphen != NULL) {*hyphen = '\0';}
- len = strlen(temp);
- if (len <= 3) { // Short enough to use
- entry = DoLangTagLookup(temp,
- kCLDLangTagsHintTable2,
- kCLDTable2Size);
- if (entry != NULL) {
- // Second table hit
- retval.append(entry->langcode); // may be "code1,code2"
- retval.append(1, ',');
- }
- }
- }
- }
- pos = comma + 1;
- }
-
- // Remove trainling comma, if any
- if (!retval.empty()) {retval.resize(retval.size() - 1);}
- return retval;
-}
-
-
-
-//==============================================================================
-
-// Little state machine to scan insides of language attribute quoted-string.
-// Each language code is lowercased and copied to the output string. Underscore
-// is mapped to minus. Space, tab, and comma are all mapped to comma, and
-// multiple consecutive commas are removed.
-// Each language code in the output list will be followed by a single comma.
-
-// There are three states, and we start in state 1:
-// State 0: After a letter.
-// Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
-// State 1: Just after a comma.
-// Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2]
-// State 2: Skipping.
-// All characters except comma skip and stay in [2]. comma goes to [1]
-
-// The thing that is copied is kLangCodeRemap[c] when going to state 0,
-// and always comma when going to state 1 or 2. The design depends on copying
-// a comma at the *beginning* of skipping, and in state 2 never doing a copy.
-
-// We pack all this into 8 bits:
-// +--+---+---+
-// |78|654|321|
-// +--+---+---+
-//
-// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
-// where . is always zero
-// Of these 3 bits, low two are next state ss, high bit is copy bit C.
-// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
-
-#define SKIP0 0
-#define SKIP1 1
-#define SKIP2 2
-#define COPY0 4 // copy kLangCodeRemap[c]
-#define COPY1 5 // copy ','
-#define COPY2 6 // copy ','
-
-// These combined actions pack three states into one byte.
-// Ninth bit must be zero, so all state 2 values must be skips.
-// state[2] state[1] state[0]
-#define LTR ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
-#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
-#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
-#define Bad ((SKIP2 << 6) + (COPY2 << 3) + COPY2)
-
-// Treat as letter: a-z, A-Z
-// Treat as minus: 2D minus, 5F underscore
-// Treat as comma: 09 tab, 20 space, 2C comma
-
-static const unsigned char kLangCodeAction[256] = {
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
-
- Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
- LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
- Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
- LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,
-
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
-
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
- Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
-};
-
-// This does lowercasing, maps underscore to minus, and maps tab/space to comma
-static const unsigned char kLangCodeRemap[256] = {
- 0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-
- 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
- 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore
- 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
- 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0,
-
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-};
-
-#undef LTR
-#undef MINUS
-#undef COMMA
-#undef Bad
-
-#undef SKIP0
-#undef SKIP1
-#undef SKIP2
-#undef COPY0
-#undef COPY1
-#undef COPY2
-
-
-// Find opening '<' for HTML tag
-// Note: this is all somewhat insensitive to mismatched quotes
-int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
- int i = pos;
- // Advance i by 4 if none of the next 4 bytes are '<'
- for (i = pos; i < (max_pos - 3); i += 4) {
- // Fast check for any <
- const char* p = &utf8_body[i];
- uint32 s0123 = UNALIGNED_LOAD32(p);
- uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<<
- if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
- // At least one byte is '<'
- break;
- }
- }
- // Continue, advancing i by 1
- for (; i < max_pos; ++i) {
- if (utf8_body[i] == '<') {return i;}
- }
- return -1;
-}
-
-
-// Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
-int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
- // Always outside quotes
- for (int i = pos; i < max_pos; ++i) {
- char c = utf8_body[i];
- if (c == '>') {return i;}
- if (c == '<') {return i - 1;}
- if (c == '&') {return i - 1;}
- }
- return -1; // nothing found
-}
-
-// Find opening quote or apostrophe, skipping spaces
-// Note: this is all somewhat insensitive to mismatched quotes
-int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
- for (int i = pos; i < max_pos; ++i) {
- char c = utf8_body[i];
- if (c == '"') {return i;}
- if (c == '\'') {return i;}
- if (c != ' ') {return -1;}
- }
- return -1;
-}
-
-// Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
-int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
- // Always outside quotes
- for (int i = pos; i < max_pos; ++i) {
- char c = utf8_body[i];
- if (c == '"') {return i;}
- if (c == '\'') {return i;}
- if (c == '>') {return i - 1;}
- if (c == '=') {return i - 1;}
- if (c == '<') {return i - 1;}
- if (c == '&') {return i - 1;}
- }
- return -1; // nothing found
-}
-
-int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
- // Outside quotes/apostrophes loop
- for (int i = pos; i < max_pos; ++i) {
- char c = utf8_body[i];
- if (c == '=') { // Found bare equal sign inside tag
- return i;
- } else if (c == '"') {
- // Inside quotes loop
- int j;
- for (j = i + 1; j < max_pos; ++j) {
- if (utf8_body[j] == '"') {
- break;
- } else if (utf8_body[j] == '\\') {
- ++j;
- }
- }
- i = j;
- } else if (c == '\'') {
- // Inside apostrophes loop
- int j;
- for (j = i + 1; j < max_pos; ++j) {
- if (utf8_body[j] == '\'') {
- break;
- } else if (utf8_body[j] == '\\') {
- ++j;
- }
- }
- i = j;
- }
-
- }
- return -1; // nothing found
-}
-
-// Scan backwards for case-insensitive string s in [min_pos..pos)
-// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
-// Cheap lowercase. Control codes will masquerade as 20..3f
-bool FindBefore(const char* utf8_body,
- int32 min_pos, int32 pos, const char* s) {
- int len = strlen(s);
- if ((pos - min_pos) < len) {return false;} // Too small to fit s
-
- // Skip trailing spaces
- int i = pos;
- while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;}
- i -= len;
- if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found
-
- const char* p = &utf8_body[i];
- for (int j = 0; j < len; ++j) {
- if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
- }
- return true; // All bytes equal at i
-}
-
-// Scan forwards for case-insensitive string s in [pos..max_pos)
-// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
-// Cheap lowercase. Control codes will masquerade as 20..3f
-// Allows but does not require quoted/apostrophe string
-bool FindAfter(const char* utf8_body,
- int32 pos, int32 max_pos, const char* s) {
- int len = strlen(s);
- if ((max_pos - pos) < len) {return false;} // Too small to fit s
-
- // Skip leading spaces, quote, apostrophe
- int i = pos;
- while (i < (max_pos - len)) {
- unsigned char c = utf8_body[i];
- if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
- else {break;}
- }
-
- const char* p = &utf8_body[i];
- for (int j = 0; j < len; ++j) {
- if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
- }
- return true; // All bytes equal
-}
-
-
-
-// Copy attribute value in [pos..max_pos)
-// pos is just after an opening quote/apostrophe and max_pos is the ending one
-// String must all be on a single line.
-// Return slightly-normalized language list, empty or ending in comma
-// Does lowercasing and removes excess punctuation/space
-string CopyOneQuotedString(const char* utf8_body,
- int32 pos, int32 max_pos) {
- string s;
- int state = 1; // Front is logically just after a comma
- for (int i = pos; i < max_pos; ++i) {
- unsigned char c = utf8_body[i];
- int e = kLangCodeAction[c] >> (3 * state);
- state = e & 3; // Update to next state
- if ((e & 4) != 0) {
- // Copy a remapped byte if going to state 0, else copy a comma
- if (state == 0) {
- s.append(1, kLangCodeRemap[c]);
- } else {
- s.append(1, ',');
- }
- }
- }
-
- // Add final comma if needed
- if (state == 0) {
- s.append(1, ',');
- }
- return s;
-}
-
-// Find and copy attribute value: quoted string in [pos..max_pos)
-// Return slightly-normalized language list, empty or ending in comma
-string CopyQuotedString(const char* utf8_body,
- int32 pos, int32 max_pos) {
- int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
- if (start_quote < 0) {return string("");}
- int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
- if (end_quote < 0) {return string("");}
-
- return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
-}
-
-// Add hints to vector of langpriors
-// Input is from GetLangTagsFromHtml(), already lowercased
-void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
- if (langtags.empty()) {return;}
- int commas = CountCommas(langtags);
- if (commas > 4) {return;} // Ignore if too many language tags
-
- char temp[20];
- int pos = 0;
- while (pos < static_cast<int>(langtags.size())) {
- int comma = langtags.find(',', pos);
- if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
- int len = comma - pos;
- if (len <= 16) {
- // Short enough to use
- memcpy(temp, &langtags[pos], len);
- temp[len] = '\0';
- const LangTagLookup* entry = DoLangTagLookup(temp,
- kCLDLangTagsHintTable1,
- kCLDTable1Size);
- if (entry != NULL) {
- // First table hit
- MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
- MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
- } else {
- // Try second table with language code truncated at first hyphen
- char* hyphen = strchr(temp, '-');
- if (hyphen != NULL) {*hyphen = '\0';}
- len = strlen(temp);
- if (len <= 3) { // Short enough to use
- entry = DoLangTagLookup(temp,
- kCLDLangTagsHintTable2,
- kCLDTable2Size);
- if (entry != NULL) {
- // Second table hit
- MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
- MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
- }
- }
- }
- }
- pos = comma + 1;
- }
-}
-
-// Add hints to vector of langpriors
-// Input is string after HTTP header Content-Language:
-void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
- string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
- SetCLDLangTagsHint(langtags, langpriors);
-}
-
-// Add hints to vector of langpriors
-// Input is last element of hostname (no dot), e.g. from GetTLD()
-void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
- int len = strlen(tld);
- if (len > 3) {return;} // Ignore if more than three letters
- char local_tld[4];
- strncpy(local_tld, tld, 4);
- local_tld[3] = '\0'; // Safety move
- // Lowercase
- for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
- const TLDLookup* entry = DoTLDLookup(local_tld,
- kCLDTLDHintTable,
- kCLDTable3Size);
- if (entry != NULL) {
- // Table hit
- MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
- MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
- }
-}
-
-// Add hints to vector of langpriors
-// Input is from DetectEncoding()
-void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
- OneCLDLangPrior olp;
- switch (enc) {
- case CHINESE_GB:
- case GBK:
- case GB18030:
- case ISO_2022_CN:
- case HZ_GB_2312:
- olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
- MergeCLDLangPriorsBoost(olp, langpriors);
- break;
- case CHINESE_BIG5:
- case CHINESE_BIG5_CP950:
- case BIG5_HKSCS:
- olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
- MergeCLDLangPriorsBoost(olp, langpriors);
- break;
- case JAPANESE_EUC_JP:
- case JAPANESE_SHIFT_JIS:
- case JAPANESE_CP932:
- case JAPANESE_JIS: // ISO-2022-JP
- olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
- MergeCLDLangPriorsBoost(olp, langpriors);
- break;
- case KOREAN_EUC_KR:
- case ISO_2022_KR:
- olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
- MergeCLDLangPriorsBoost(olp, langpriors);
- break;
-
- default:
- break;
- }
-}
-
-// Add hints to vector of langpriors
-// Input is from random source
-void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
- OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
- MergeCLDLangPriorsBoost(olp, langpriors);
-}
-
-
-// Make printable string of priors
-string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
- string retval;
- for (int i = 0; i < langpriors->n; ++i) {
- char temp[64];
- sprintf(temp, "%s.%d ",
- LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
- GetCLDPriorWeight(langpriors->prior[i]));
- retval.append(temp);
- }
- return retval;
-}
-
-
-
-
-// Look for
-// <html lang="en">
-// <doc xml:lang="en">
-// <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
-// <meta http-equiv="content-language" content="en-GB" />
-// <meta name="language" content="Srpski">
-// <meta name="DC.language" scheme="RFCOMMA766" content="en">
-// <SPAN id="msg1" class="info" lang='en'>
-//
-// Do not trigger on
-// <!-- lang=french ...-->
-// <font lang=postscript ...>
-// <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
-// <META name="Author" lang="fr" content="Arnaud Le Hors">
-//
-// Stop fairly quickly on mismatched quotes
-//
-// Allowed language characters
-// a-z A-Z -_ , space\t
-// Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
-// zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
-// de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation)
-// GB2312 => gb
-// Big5 => big
-// zh_CN.gb18030_C => zh-cn
-//
-// Remove duplicates and extra spaces as we go
-// Lowercase as we go.
-
-// Get language tag hints from HTML body
-// Normalize: remove spaces and make lowercase comma list
-
-string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
- int32 max_scan_bytes) {
- string retval;
- if (max_scan_bytes > utf8_body_len) {
- max_scan_bytes = utf8_body_len;
- }
-
- int32 k = 0;
- while (k < max_scan_bytes) {
- int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
- if (start_tag < 0) {break;}
- int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
- // FindTagEnd exits on < > &
- if (end_tag < 0) {break;}
-
- // Skip <!--...>
- // Skip <font ...>
- // Skip <script ...>
- // Skip <link ...>
- // Skip <img ...>
- // Skip <a ...>
- if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
- FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
- FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
- FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
- FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
- FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
- k = end_tag + 1;
- continue;
- }
-
- // Remember <meta ...>
- bool in_meta = false;
- if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
- in_meta = true;
- }
-
- // Scan for each equal sign inside tag
- bool content_is_lang = false;
- int32 kk = start_tag + 1;
- int32 equal_sign;
- while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
- // eq exits on < > &
-
- // Look inside a meta tag
- // <meta ... http-equiv="content-language" ...>
- // <meta ... name="language" ...>
- // <meta ... name="dc.language" ...>
- if (in_meta) {
- if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
- FindAfter(utf8_body, equal_sign + 1, end_tag,
- "content-language ")) {
- content_is_lang = true;
- } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
- (FindAfter(utf8_body, equal_sign + 1, end_tag,
- "dc.language ") ||
- FindAfter(utf8_body, equal_sign + 1, end_tag,
- "language "))) {
- content_is_lang = true;
- }
- }
-
- // Look inside any tag
- // <meta ... content="lang-list" ...>
- // <... lang="lang-list" ...>
- // <... xml:lang="lang-list" ...>
- if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
- " content")) ||
- FindBefore(utf8_body, kk, equal_sign, " lang") ||
- FindBefore(utf8_body, kk, equal_sign, ":lang")) {
- string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
-
- // Append new lang tag(s) if not a duplicate
- if (!temp.empty() && (retval.find(temp) == string::npos)) {
- retval.append(temp);
- }
- }
-
- kk = equal_sign + 1;
- }
- k = end_tag + 1;
- }
-
- // Strip last comma
- if (retval.size() > 1) {
- retval.erase(retval.size() - 1);
- }
- return retval;
-}
-
-} // End namespace CLD2
-
-//==============================================================================