diff options
author | Matt A. Tobin <email@mattatobin.com> | 2019-12-16 13:57:01 -0500 |
---|---|---|
committer | Matt A. Tobin <email@mattatobin.com> | 2019-12-16 13:57:01 -0500 |
commit | 06494f307850c576868831bd28a61464eab1f359 (patch) | |
tree | f281f5c46c3e0b73c7eabe22f02622dc013b0c35 /application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc | |
parent | e7d4713e0765c79feddf2384d343d10595fa5cb3 (diff) | |
download | UXP-06494f307850c576868831bd28a61464eab1f359.tar UXP-06494f307850c576868831bd28a61464eab1f359.tar.gz UXP-06494f307850c576868831bd28a61464eab1f359.tar.lz UXP-06494f307850c576868831bd28a61464eab1f359.tar.xz UXP-06494f307850c576868831bd28a61464eab1f359.zip |
Remove Basilisk from the Unified XUL Platform repository
Development will proceed at https://github.com/MoonchildProductions/Basilisk
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc | 1649 |
1 files changed, 0 insertions, 1649 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc b/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc deleted file mode 100644 index 9bde8a86a..000000000 --- a/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.cc +++ /dev/null @@ -1,1649 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - -#include "compact_lang_det_hint_code.h" - -#include <stdlib.h> // for abs() -#include <stdio.h> // for sprintf() -#include <string.h> // -#include "lang_script.h" -#include "port.h" - -using namespace std; - -namespace CLD2 { - -static const int kCLDPriorEncodingWeight = 4; // 100x more likely -static const int kCLDPriorLanguageWeight = 8; // 10000x more likely - - -// Tables to map lang="..." language code lists to actual languages. -// based on scraping and hand-edits, dsites June 2011 - -// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary - -// For close pairs like ms/id, more weight on TLD and lang= -// Alternately, weaker boost but mark others of set as negative; -// makes "neither" an easier result. -// lang=en low weight 4 -// tld=lu boost lu maaybe 4. 
but lang= alwyas overcomes tld and encoding -// (except maybe en) - -// TLD to separate, e.g., burundi from rwanda - -// Encoding lookup: OneLangProb array -// TLD lookup: tld OneLangProb pairs - - -typedef struct { - const char* const langtag; // Lowercased, hyphen only lookup key - const char* const langcode; // Canonical language codes; two if ambiguous - OneCLDLangPrior onelangprior1; - OneCLDLangPrior onelangprior2; -} LangTagLookup; - -typedef struct { - const char* const tld; // Lowercased, hyphen only lookup key - OneCLDLangPrior onelangprior1; - OneCLDLangPrior onelangprior2; -} TLDLookup; - - -#define W2 (2 << 10) // 3**2 = 10x more likely -#define W4 (4 << 10) // 3**4 = 100x more likely -#define W6 (6 << 10) // 3**6 = 1000x more likely -#define W8 (8 << 10) // 3**8 = 10K x more likely -#define W10 (10 << 10) // 3**10 = 100K x more likely -#define W12 (12 << 10) // 3**12 = 1M x more likely - -// TODO: more about ba hr sr sr-ME and sl -// Temporary state of affairs: -// BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN -// Eventually, we want to do all four, but it requires a CLD change to handle -// up to six languages per quadgram. - - -// Close pairs boost one of pair, demote other. 
-// Statistically close pairs: -// INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used -// -// INDONESIAN MALAY coef=0.4698 Problematic w/o extra words -// TIBETAN DZONGKHA coef=0.4571 -// CZECH SLOVAK coef=0.4273 -// NORWEGIAN NORWEGIAN_N coef=0.4182 -// -// HINDI MARATHI coef=0.3795 -// ZULU XHOSA coef=0.3716 -// -// DANISH NORWEGIAN coef=0.3672 Usually OK -// BIHARI HINDI coef=0.3668 Usually OK -// ICELANDIC FAROESE coef=0.3519 Usually OK - -// -// Table to look up lang= tags longer than three characters -// Overrides table below, which is truncated at first hyphen -// In alphabetical order for binary search -static const int kCLDTable1Size = 213; -static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { - {"abkhazian", "ab", ABKHAZIAN + W10, 0}, - {"afar", "aa", AFAR + W10, 0}, - {"afrikaans", "af", AFRIKAANS + W10, 0}, - {"akan", "ak", AKAN + W10, 0}, - {"albanian", "sq", ALBANIAN + W10, 0}, - {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous - {"amharic", "am", AMHARIC + W10, 0}, - {"arabic", "ar", ARABIC + W10, 0}, - {"argentina", "es", SPANISH + W10, 0}, - {"armenian", "hy", ARMENIAN + W10, 0}, - {"assamese", "as", ASSAMESE + W10, 0}, - {"aymara", "ay", AYMARA + W10, 0}, - {"azerbaijani", "az", AZERBAIJANI + W10, 0}, - - {"bangla", "bn", BENGALI + W10, 0}, - {"bashkir", "ba", BASHKIR + W10, 0}, - {"basque", "eu", BASQUE + W10, 0}, - {"belarusian", "be", BELARUSIAN + W10, 0}, - {"bengali", "bn", BENGALI + W10, 0}, - {"bihari", "bh", BIHARI + W10, HINDI - W4}, - {"bislama", "bi", BISLAMA + W10, 0}, - {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian - {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous - {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous - {"breton", "br", BRETON + W10, 0}, - {"bulgarian", "bg", BULGARIAN + W10, 0}, - {"burmese", "my", BURMESE + W10, 0}, // Myanmar - - {"catalan", "ca", CATALAN + W10, 0}, - {"cherokee", "chr", CHEROKEE + W10, 0}, - 
{"chichewa", "ny", NYANJA + W10, 0}, - - {"chinese", "zh", CHINESE + W10, 0}, - {"chinese-t", "zhT", CHINESE_T + W10, 0}, - {"chineset", "zhT", CHINESE_T + W10, 0}, - {"corsican", "co", CORSICAN + W10, 0}, - {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based - {"croatian", "hr", CROATIAN + W10, 0}, - {"czech", "cs", CZECH + W10, SLOVAK - W4}, - - {"danish", "da", DANISH + W10, NORWEGIAN - W4}, - {"deutsch", "de", GERMAN + W10, 0}, - {"dhivehi", "dv", DHIVEHI + W10, 0}, - {"dutch", "nl", DUTCH + W10, 0}, - {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, - - {"ell-gr", "el", GREEK + W10, 0}, - {"english", "en", ENGLISH + W4, 0}, - {"esperanto", "eo", ESPERANTO + W10, 0}, - {"estonian", "et", ESTONIAN + W10, 0}, - {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding - {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding - - {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, - {"fijian", "fj", FIJIAN + W10, 0}, - {"finnish", "fi", FINNISH + W10, 0}, - {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII - {"francais", "fr", FRENCH + W10, 0}, - {"french", "fr", FRENCH + W10, 0}, - {"frisian", "fy", FRISIAN + W10, 0}, - - {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous - {"galician", "gl", GALICIAN + W10, 0}, - {"ganda", "lg", GANDA + W10, 0}, - {"georgian", "ka", GEORGIAN + W10, 0}, - {"german", "de", GERMAN + W10, 0}, - {"greek", "el", GREEK + W10, 0}, - {"greenlandic", "kl", GREENLANDIC + W10, 0}, - {"guarani", "gn", GUARANI + W10, 0}, - {"gujarati", "gu", GUJARATI + W10, 0}, - - {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, - {"hausa", "ha", HAUSA + W10, 0}, - {"hawaiian", "haw", HAWAIIAN + W10, 0}, - {"hebrew", "he", HEBREW + W10, 0}, - {"hindi", "hi", HINDI + W10, MARATHI - W4}, - {"hn-in", "hi", HINDI + W10, MARATHI - W4}, - {"hungarian", "hu", HUNGARIAN + W10, 0}, - - {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, - {"igbo", "ig", IGBO + W10, 0}, - {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, - 
{"interlingua", "ia", INTERLINGUA + W10, 0}, - {"interlingue", "ie", INTERLINGUE + W10, 0}, - // 1:2 iu-Cans ik-Latn - {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 - {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 - {"ir-ie", "ga", IRISH + W10, 0}, // Irish - {"irish", "ga", IRISH + W10, 0}, - {"italian", "it", ITALIAN + W10, 0}, - - {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding - {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding - {"japanese", "ja", JAPANESE + W10, 0}, - {"javanese", "jw", JAVANESE + W10, 0}, - - {"kannada", "kn", KANNADA + W10, 0}, - {"kashmiri", "ks", KASHMIRI + W10, 0}, - {"kazakh", "kk", KAZAKH + W10, 0}, - {"khasi", "kha", KHASI + W10, 0}, - {"khmer", "km", KHMER + W10, 0}, - {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, - {"klingon", "tlh", X_KLINGON + W10, 0}, - {"korean", "ko", KOREAN + W10, 0}, - {"kurdish", "ku", KURDISH + W10, 0}, - {"kyrgyz", "ky", KYRGYZ + W10, 0}, - - {"laothian", "lo", LAOTHIAN + W10, 0}, - {"latin", "la", LATIN + W10, 0}, - {"latvian", "lv", LATVIAN + W10, 0}, - {"limbu", "sit", LIMBU + W10, 0}, - {"lingala", "ln", LINGALA + W10, 0}, - {"lithuanian", "lt", LITHUANIAN + W10, 0}, - {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, - - {"macedonian", "mk", MACEDONIAN + W10, 0}, - {"malagasy", "mg", MALAGASY + W10, 0}, - {"malay", "ms", MALAY + W10, INDONESIAN - W4}, - {"malayalam", "ml", MALAYALAM + W10, 0}, - {"maltese", "mt", MALTESE + W10, 0}, - {"manx", "gv", MANX + W10, 0}, - {"maori", "mi", MAORI + W10, 0}, - {"marathi", "mr", MARATHI + W10, HINDI - W4}, - {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, - {"moldavian", "mo", ROMANIAN + W10, 0}, - {"mongolian", "mn", MONGOLIAN + W10, 0}, - {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, - {"myanmar", "my", BURMESE + W10, 0}, // Myanmar - {"nauru", "na", NAURU + W10, 0}, - {"ndebele", "nr", NDEBELE + W10, 0}, - {"nepali", "ne", NEPALI + W10, 0}, - {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // 
Bokmaal - {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, - {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal - {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, - {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk - {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, - {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, - {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, - {"nyanja", "ny", NYANJA + W10, 0}, - - {"occitan", "oc", OCCITAN + W10, 0}, - {"oriya", "or", ORIYA + W10, 0}, - {"oromo", "om", OROMO + W10, 0}, - {"parsi", "fa", PERSIAN + W10, 0}, - - {"pashto", "ps", PASHTO + W10, 0}, - {"pedi", "nso", PEDI + W10, 0}, - {"persian", "fa", PERSIAN + W10, 0}, - {"polish", "pl", POLISH + W10, 0}, - {"polska", "pl", POLISH + W10, 0}, - {"polski", "pl", POLISH + W10, 0}, - {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII - {"portuguese", "pt", PORTUGUESE + W10, 0}, - {"punjabi", "pa", PUNJABI + W10, 0}, - - {"quechua", "qu", QUECHUA + W10, 0}, - - {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, - {"romanian", "ro", ROMANIAN + W10, 0}, - {"rundi", "rn", RUNDI + W10, 0}, - {"russian", "ru", RUSSIAN + W10, 0}, - - {"samoan", "sm", SAMOAN + W10, 0}, - {"sango", "sg", SANGO + W10, 0}, - {"sanskrit", "sa", SANSKRIT + W10, 0}, - {"scots", "sco", SCOTS + W10, ENGLISH - W4}, - {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, - {"serbian", "sr", SERBIAN + W10, 0}, - {"seselwa", "crs", SESELWA + W10, 0}, - {"sesotho", "st", SESOTHO + W10, 0}, - {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding - {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding - {"shona", "sn", SHONA + W10, 0}, - {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous - {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous - {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous - {"sindhi", "sd", SINDHI + W10, 0}, - {"sinhalese", "si", SINHALESE + W10, 0}, - {"siswant", "ss", 
SISWANT + W10, 0}, - {"sit-np", "sit", LIMBU + W10, 0}, - {"slovak", "sk", SLOVAK + W10, CZECH - W4}, - {"slovenian", "sl", SLOVENIAN + W10, 0}, - {"somali", "so", SOMALI + W10, 0}, - {"spanish", "es", SPANISH + W10, 0}, - {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin - {"sundanese", "su", SUNDANESE + W10, 0}, - {"suomi", "fi", FINNISH + W10, 0}, // Finnish - {"swahili", "sw", SWAHILI + W10, 0}, - {"swedish", "sv", SWEDISH + W10, 0}, - {"syriac", "syr", SYRIAC + W10, 0}, - - {"tagalog", "tl", TAGALOG + W10, 0}, - {"tajik", "tg", TAJIK + W10, 0}, - {"tamil", "ta", TAMIL + W10, 0}, - {"tatar", "tt", TATAR + W10, 0}, - {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet - {"tchinese", "zhT", CHINESE_T + W10, 0}, - {"telugu", "te", TELUGU + W10, 0}, - {"thai", "th", THAI + W10, 0}, - {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, - {"tigrinya", "ti", TIGRINYA + W10, 0}, - {"tonga", "to", TONGA + W10, 0}, - {"tsonga", "ts", TSONGA + W10, 0}, - {"tswana", "tn", TSWANA + W10, 0}, - {"tt-ru", "tt", TATAR + W10, 0}, - {"tur-tr", "tr", TURKISH + W10, 0}, - {"turkish", "tr", TURKISH + W10, 0}, - {"turkmen", "tk", TURKMEN + W10, 0}, - {"uighur", "ug", UIGHUR + W10, 0}, - {"ukrainian", "uk", UKRAINIAN + W10, 0}, - {"urdu", "ur", URDU + W10, 0}, - {"uzbek", "uz", UZBEK + W10, 0}, - - {"venda", "ve", VENDA + W10, 0}, - {"vietnam", "vi", VIETNAMESE + W10, 0}, - {"vietnamese", "vi", VIETNAMESE + W10, 0}, - {"volapuk", "vo", VOLAPUK + W10, 0}, - - {"welsh", "cy", WELSH + W10, 0}, - {"wolof", "wo", WOLOF + W10, 0}, - - {"xhosa", "xh", XHOSA + W10, ZULU - W4}, - - {"yiddish", "yi", YIDDISH + W10, 0}, - {"yoruba", "yo", YORUBA + W10, 0}, - - {"zh-classical", "zhT", CHINESE_T + W10, 0}, - {"zh-cn", "zh", CHINESE + W10, 0}, - {"zh-hans", "zh", CHINESE + W10, 0}, - {"zh-hant", "zhT", CHINESE_T + W10, 0}, - {"zh-hk", "zhT", CHINESE_T + W10, 0}, - {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT - {"zh-sg", "zhT", CHINESE_T + W10, 0}, - 
{"zh-tw", "zhT", CHINESE_T + W10, 0}, - {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese - {"zhuang", "za", ZHUANG + W10, 0}, - {"zulu", "zu", ZULU + W10, XHOSA - W4}, -}; - - - -// Table to look up lang= tags of two/three characters after truncate at hyphen -// In alphabetical order for binary search -static const int kCLDTable2Size = 257; -static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { - {"aa", "aa", AFAR + W10, 0}, - {"ab", "ab", ABKHAZIAN + W10, 0}, - {"af", "af", AFRIKAANS + W10, 0}, - {"ak", "ak", AKAN + W10, 0}, - {"al", "sq", ALBANIAN + W10, 0}, // Albania - {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian - {"ar", "ar", ARABIC + W10, 0}, - {"ara", "ar", ARABIC + W10, 0}, - {"arm", "hy", ARMENIAN + W10, 0}, // Armenia - {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic - {"as", "as", ASSAMESE + W10, 0}, - {"at", "de", GERMAN + W10, 0}, // Austria - {"au", "de", GERMAN + W10, 0}, // Austria - {"ay", "ay", AYMARA + W10, 0}, - {"az", "az", AZERBAIJANI + W10, 0}, - {"aze", "az", AZERBAIJANI + W10, 0}, - - {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia - {"be", "be", BELARUSIAN + W10, 0}, - {"bel", "be", BELARUSIAN + W10, 0}, - {"bg", "bg", BULGARIAN + W10, 0}, - {"bh", "bh", BIHARI + W10, HINDI - W4}, - {"bi", "bi", BISLAMA + W10, 0}, - {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding - {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia - {"bn", "bn", BENGALI + W10, 0}, - {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, - // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win - {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil - {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian - - {"ca", "ca", CATALAN + W10, 0}, - {"cat", "ca", CATALAN + W10, 0}, - {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland - {"chn", "zh", CHINESE + W10, 0}, - {"chr", "chr", CHEROKEE + W10, 0}, - {"ckb", "ku", KURDISH + W10, 0}, // 
Central Kurdish - {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. - // Offset by 2 so that TLD=tw or - // enc=big5 will put zhT ahead - {"co", "co", CORSICAN + W10, 0}, - {"cro", "hr", CROATIAN + W10, 0}, // Croatia - {"crs", "crs", SESELWA + W10, 0}, - {"cs", "cs", CZECH + W10, SLOVAK - W4}, - {"ct", "ca", CATALAN + W10, 0}, - {"cy", "cy", WELSH + W10, 0}, - {"cym", "cy", WELSH + W10, 0}, - {"cz", "cs", CZECH + W10, SLOVAK - W4}, - - {"da", "da", DANISH + W10, NORWEGIAN - W4}, - {"dan", "da", DANISH + W10, NORWEGIAN - W4}, - {"de", "de", GERMAN + W10, 0}, - {"deu", "de", GERMAN + W10, 0}, - {"div", "dv", DHIVEHI + W10, 0}, - {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark - {"dut", "nl", DUTCH + W10, 0}, // Dutch - {"dv", "dv", DHIVEHI + W10, 0}, - {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, - - {"ee", "et", ESTONIAN + W10, 0}, // Estonia - {"eg", "ar", ARABIC + W10, 0}, // Egypt - {"el", "el", GREEK + W10, 0}, - {"en", "en", ENGLISH + W4, 0}, - {"eng", "en", ENGLISH + W4, 0}, - {"eo", "eo", ESPERANTO + W10, 0}, - {"er", "ur", URDU + W10, 0}, // "Erdu" - {"es", "es", SPANISH + W10, 0}, - {"esp", "es", SPANISH + W10, 0}, - {"est", "et", ESTONIAN + W10, 0}, - {"et", "et", ESTONIAN + W10, 0}, - {"eu", "eu", BASQUE + W10, 0}, - - {"fa", "fa", PERSIAN + W10, 0}, - {"far", "fa", PERSIAN + W10, 0}, - {"fi", "fi", FINNISH + W10, 0}, - {"fil", "tl", TAGALOG + W10, 0}, // Philippines - {"fj", "fj", FIJIAN + W10, 0}, - {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, - {"fr", "fr", FRENCH + W10, 0}, - {"fra", "fr", FRENCH + W10, 0}, - {"fre", "fr", FRENCH + W10, 0}, - {"fy", "fy", FRISIAN + W10, 0}, - - {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician - {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either - {"gal", "gl", GALICIAN + W10, 0}, - {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding - {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding - {"gd", "gd", SCOTS_GAELIC + W10, 0}, - {"ge", "ka", GEORGIAN + 
W10, 0}, // Georgia - {"geo", "ka", GEORGIAN + W10, 0}, - {"ger", "de", GERMAN + W10, 0}, - {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse - {"gn", "gn", GUARANI + W10, 0}, - {"gr", "el", GREEK + W10, 0}, // Greece - {"gu", "gu", GUJARATI + W10, 0}, - {"gv", "gv", MANX + W10, 0}, - - {"ha", "ha", HAUSA + W10, 0}, - {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti - {"haw", "haw", HAWAIIAN + W10, 0}, - {"hb", "he", HEBREW + W10, 0}, - {"he", "he", HEBREW + W10, 0}, - {"heb", "he", HEBREW + W10, 0}, - {"hi", "hi", HINDI + W10, MARATHI - W4}, - {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong - {"hr", "hr", CROATIAN + W10, 0}, - {"ht", "ht", HAITIAN_CREOLE + W10, 0}, - {"hu", "hu", HUNGARIAN + W10, 0}, - {"hun", "hu", HUNGARIAN + W10, 0}, - {"hy", "hy", ARMENIAN + W10, 0}, - - {"ia", "ia", INTERLINGUA + W10, 0}, - {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland - {"id", "id", INDONESIAN + W10, MALAY - W4}, - {"ids", "id", INDONESIAN + W10, MALAY - W4}, - {"ie", "ie", INTERLINGUE + W10, 0}, - {"ig", "ig", IGBO + W10, 0}, - // 1:2 iu-Cans ik-Latn - {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 - {"in", "id", INDONESIAN + W10, MALAY - W4}, - {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia - {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 - {"is", "is", ICELANDIC + W10, FAROESE - W4}, - {"it", "it", ITALIAN + W10, 0}, - {"ita", "it", ITALIAN + W10, 0}, - {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 - {"iw", "he", HEBREW + W10, 0}, - - {"ja", "ja", JAPANESE + W10, 0}, - {"jp", "ja", JAPANESE + W10, 0}, // Japan - {"jpn", "ja", JAPANESE + W10, 0}, - {"jv", "jw", JAVANESE + W10, 0}, - {"jw", "jw", JAVANESE + W10, 0}, - - {"ka", "ka", GEORGIAN + W10, 0}, - {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua - {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan - {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) - {"kha", "kha", KHASI + W10, 0}, - {"kk", "kk", KAZAKH + W10, 0}, // Kazakh - {"kl", "kl", 
GREENLANDIC + W10, 0}, - {"km", "km", KHMER + W10, 0}, - {"kn", "kn", KANNADA + W10, 0}, - {"ko", "ko", KOREAN + W10, 0}, - {"kor", "ko", KOREAN + W10, 0}, - {"kr", "ko", KOREAN + W10, 0}, // Country code Korea - {"ks", "ks", KASHMIRI + W10, 0}, - {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding - {"ku", "ku", KURDISH + W10, 0}, - {"ky", "ky", KYRGYZ + W10, 0}, - {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan - {"la", "la", LATIN + W10, 0}, - {"lao", "lo", LAOTHIAN + W10, 0}, // Laos - - {"lb", "lb", LUXEMBOURGISH + W10, 0}, - {"lg", "lg", GANDA + W10, 0}, - {"lit", "lt", LITHUANIAN + W10, 0}, - {"ln", "ln", LINGALA + W10, 0}, - {"lo", "lo", LAOTHIAN + W10, 0}, - {"lt", "lt", LITHUANIAN + W10, 0}, - {"ltu", "lt", LITHUANIAN + W10, 0}, - {"lv", "lv", LATVIAN + W10, 0}, - - {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, - {"mg", "mg", MALAGASY + W10, 0}, - {"mi", "mi", MAORI + W10, 0}, - {"mk", "mk", MACEDONIAN + W10, 0}, - {"ml", "ml", MALAYALAM + W10, 0}, - {"mn", "mn", MONGOLIAN + W10, 0}, - {"mo", "mo", ROMANIAN + W10, 0}, - {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian - {"mr", "mr", MARATHI + W10, HINDI - W4}, - {"ms", "ms", MALAY + W10, INDONESIAN - W4}, - {"mt", "mt", MALTESE + W10, 0}, - {"mx", "es", SPANISH + W10, 0}, // Mexico - {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia - - {"na", "na", NAURU + W10, 0}, - {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, - {"ne", "ne", NEPALI + W10, 0}, - {"nl", "nl", DUTCH + W10, 0}, - {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, - {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, - {"nr", "nr", NDEBELE + W10, 0}, - {"nso", "nso", PEDI + W10, 0}, - {"ny", "ny", NYANJA + W10, 0}, - - {"oc", "oc", OCCITAN + W10, 0}, - {"om", "om", OROMO + W10, 0}, - {"or", "or", ORIYA + W10, 0}, - - {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab - {"per", "fa", PERSIAN + W10, 0}, - {"ph", "tl", TAGALOG + W10, 0}, // Philippines - {"pk", "ur", URDU + W10, 0}, // Pakistan - {"pl", "pl", POLISH + W10, 
0}, - {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi - {"pol", "pl", POLISH + W10, 0}, - {"por", "pt", PORTUGUESE + W10, 0}, - {"ps", "ps", PASHTO + W10, 0}, - {"pt", "pt", PORTUGUESE + W10, 0}, - {"ptg", "pt", PORTUGUESE + W10, 0}, - {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code - {"qu", "qu", QUECHUA + W10, 0}, - - {"rm", "rm", RHAETO_ROMANCE + W10, 0}, - {"rn", "rn", RUNDI + W10, 0}, - {"ro", "ro", ROMANIAN + W10, 0}, - {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code - {"ru", "ru", RUSSIAN + W10, 0}, - {"rus", "ru", RUSSIAN + W10, 0}, - {"rw", "rw", KINYARWANDA + W10, 0}, - - {"sa", "sa", SANSKRIT + W10, 0}, - {"sco", "sco", SCOTS + W10, ENGLISH - W4}, - {"sd", "sd", SINDHI + W10, 0}, - {"se", "sv", SWEDISH + W10, 0}, - {"sg", "sg", SANGO + W10, 0}, - {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovinia - {"sk", "sk", SLOVAK + W10, CZECH - W4}, - {"sl", "sl", SLOVENIAN + W10, 0}, - {"slo", "sl", SLOVENIAN + W10, 0}, - {"sm", "sm", SAMOAN + W10, 0}, - {"sn", "sn", SHONA + W10, 0}, - {"so", "so", SOMALI + W10, 0}, - {"sp", "es", SPANISH + W10, 0}, - {"sq", "sq", ALBANIAN + W10, 0}, - {"sr", "sr", SERBIAN + W10, 0}, - {"srb", "sr", SERBIAN + W10, 0}, - {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin - {"srp", "sr", SERBIAN + W10, 0}, - {"ss", "ss", SISWANT + W10, 0}, - {"st", "st", SESOTHO + W10, 0}, - {"su", "su", SUNDANESE + W10, 0}, - {"sv", "sv", SWEDISH + W10, 0}, - {"sve", "sv", SWEDISH + W10, 0}, - {"sw", "sw", SWAHILI + W10, 0}, - {"swe", "sv", SWEDISH + W10, 0}, - {"sy", "syr", SYRIAC + W10, 0}, - {"syr", "syr", SYRIAC + W10, 0}, - - {"ta", "ta", TAMIL + W10, 0}, - {"te", "te", TELUGU + W10, 0}, - {"tg", "tg", TAJIK + W10, 0}, - {"th", "th", THAI + W10, 0}, - {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet - {"tj", "tg", TAJIK + W10, 0}, // Tajikistan - {"tk", "tk", TURKMEN + W10, 0}, - {"tl", "tl", TAGALOG + W10, 0}, - {"tlh", "tlh", X_KLINGON + W10, 0}, - {"tn", "tn", TSWANA + W10, 
0}, - {"to", "to", TONGA + W10, 0}, - {"tr", "tr", TURKISH + W10, 0}, - {"ts", "ts", TSONGA + W10, 0}, - {"tt", "tt", TATAR + W10, 0}, - {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan - {"twi", "ak", AKAN + W10, 0}, // Twi => Akan - - {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine - {"ug", "ug", UIGHUR + W10, 0}, - {"uk", "uk", UKRAINIAN + W10, 0}, - {"ur", "ur", URDU + W10, 0}, - {"uz", "uz", UZBEK + W10, 0}, - - {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan - {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan - {"ve", "ve", VENDA + W10, 0}, - {"vi", "vi", VIETNAMESE + W10, 0}, - {"vie", "vi", VIETNAMESE + W10, 0}, - {"vn", "vi", VIETNAMESE + W10, 0}, - {"vo", "vo", VOLAPUK + W10, 0}, - - {"wo", "wo", WOLOF + W10, 0}, - - {"xh", "xh", XHOSA + W10, ZULU - W4}, - {"xho", "xh", XHOSA + W10, ZULU - W4}, - - {"yi", "yi", YIDDISH + W10, 0}, - {"yo", "yo", YORUBA + W10, 0}, - - {"za", "za", ZHUANG + W10, 0}, - {"zh", "zh", CHINESE + W10, 0}, - {"zht", "zhT", CHINESE_T + W10, 0}, - {"zu", "zu", ZULU + W10, XHOSA - W4}, -}; - - -// Possibly map to tl: -// -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano -// -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano -// -LangTags tl-Latn /7val.com/ ,war 1 Waray - - - -// Table to look up country TLD (no general TLD) -// In alphabetical order for binary search -static const int kCLDTable3Size = 181; -static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { - {"ac", JAPANESE + W2, 0}, - {"ad", CATALAN + W4, 0}, - {"ae", ARABIC + W4, 0}, - {"af", PASHTO + W4, PERSIAN + W4}, - {"ag", GERMAN + W2, 0}, // meager - // {"ai", 0, 0}, // meager - {"al", ALBANIAN + W4, 0}, - {"am", ARMENIAN + W4, 0}, - {"an", DUTCH + W4, 0}, // meager - {"ao", PORTUGUESE + W4, 0}, - // {"aq", 0, 0}, // meager - {"ar", SPANISH + W4, 0}, - // {"as", 0, 0}, - {"at", GERMAN + W4, 0}, - {"au", ENGLISH + W2, 0}, - {"aw", DUTCH + W4, 0}, - {"ax", SWEDISH + W4, 0}, - {"az", AZERBAIJANI + W4, 0}, - - {"ba", BOSNIAN + W8, CROATIAN 
- W4}, - // {"bb", 0, 0}, - {"bd", BENGALI + W4, 0}, - {"be", DUTCH + W4, FRENCH + W4}, - {"bf", FRENCH + W4, 0}, - {"bg", BULGARIAN + W4, 0}, - {"bh", ARABIC + W4, 0}, - {"bi", RUNDI + W4, FRENCH + W4}, - {"bj", FRENCH + W4, 0}, - {"bm", ENGLISH + W2, 0}, - {"bn", MALAY + W4, INDONESIAN - W4}, - {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA - {"br", PORTUGUESE + W4, 0}, - // {"bs", 0, 0}, - {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha - {"bw", TSWANA + W4, 0}, - {"by", BELARUSIAN + W4, 0}, - // {"bz", 0, 0}, - - {"ca", FRENCH + W4, ENGLISH + W2}, - {"cat", CATALAN + W4, 0}, - {"cc", 0, 0}, - {"cd", FRENCH + W4, 0}, - {"cf", FRENCH + W4, 0}, - {"cg", FRENCH + W4, 0}, - {"ch", GERMAN + W4, FRENCH + W4}, - {"ci", FRENCH + W4, 0}, - // {"ck", 0, 0}, - {"cl", SPANISH + W4, 0}, - {"cm", FRENCH + W4, 0}, - {"cn", CHINESE + W4, 0}, - {"co", SPANISH + W4, 0}, - {"cr", SPANISH + W4, 0}, - {"cu", SPANISH + W4, 0}, - {"cv", PORTUGUESE + W4, 0}, - // {"cx", 0, 0}, - {"cy", GREEK + W4, TURKISH + W4}, - {"cz", CZECH + W4, SLOVAK - W4}, - - {"de", GERMAN + W4, 0}, - {"dj", 0, 0}, - {"dk", DANISH + W4, NORWEGIAN - W4}, - {"dm", 0, 0}, - {"do", SPANISH + W4, 0}, - {"dz", FRENCH + W4, ARABIC + W4}, - - {"ec", SPANISH + W4, 0}, - {"ee", ESTONIAN + W4, 0}, - {"eg", ARABIC + W4, 0}, - {"er", AFAR + W4, 0}, - {"es", SPANISH + W4, 0}, - {"et", AMHARIC + W4, AFAR + W4}, - - {"fi", FINNISH + W4, 0}, - {"fj", FIJIAN + W4, 0}, - // {"fk", 0, 0}, - // {"fm", 0, 0}, - {"fo", FAROESE + W4, ICELANDIC - W4}, - {"fr", FRENCH + W4, 0}, - - {"ga", FRENCH + W4, 0}, - {"gd", 0, 0}, - {"ge", GEORGIAN + W4, 0}, - {"gf", FRENCH + W4, 0}, - // {"gg", 0, 0}, - // {"gh", 0, 0}, - // {"gi", 0, 0}, - {"gl", GREENLANDIC + W4, DANISH + W4}, - // {"gm", 0, 0}, - {"gn", FRENCH + W4, 0}, - // {"gp", 0, 0}, - // {"gq", 0, 0}, - {"gr", GREEK + W4, 0}, - // {"gs", 0, 0}, - {"gt", SPANISH + W4, 0}, - // {"gu", 0, 0}, - // {"gy", 0, 0}, - - {"hk", CHINESE_T + W4, 0}, - // {"hm", 
0, 0}, - {"hn", SPANISH + W4, 0}, - {"hr", CROATIAN + W8, BOSNIAN - W4}, - {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, - {"hu", HUNGARIAN + W4, 0}, - - {"id", INDONESIAN + W4, MALAY - W4}, - {"ie", IRISH + W4, 0}, - {"il", HEBREW + W4, 0}, - {"im", MANX + W4, 0}, - // {"in", 0, 0}, - // {"io", 0, 0}, - {"iq", ARABIC + W4, 0}, - {"ir", PERSIAN + W4, 0}, - {"is", ICELANDIC + W4, FAROESE - W4}, - {"it", ITALIAN + W4, 0}, - - // {"je", 0, 0}, - // {"jm", 0, 0}, - {"jo", ARABIC + W4, 0}, - {"jp", JAPANESE + W4, 0}, - - // {"ke", 0, 0}, - {"kg", KYRGYZ + W4, 0}, - {"kh", KHMER + W4, 0}, - // {"ki", 0, 0}, - {"km", FRENCH + W4, 0}, - // {"kn", 0, 0}, - {"kp", KOREAN + W4, 0}, - {"kr", KOREAN + W4, 0}, - {"kw", ARABIC + W4, 0}, - // {"ky", 0, 0}, - {"kz", KAZAKH + W4, 0}, - - {"la", LAOTHIAN + W4, 0}, - {"lb", ARABIC + W4, FRENCH + W4}, - // {"lc", 0, 0}, - {"li", GERMAN + W4, 0}, - {"lk", SINHALESE + W4, 0}, - // {"lr", 0, 0}, - {"ls", SESOTHO + W4, 0}, - {"lt", LITHUANIAN + W4, 0}, - {"lu", LUXEMBOURGISH + W4}, - {"lv", LATVIAN + W4, 0}, - {"ly", ARABIC + W4, 0}, - - {"ma", FRENCH + W4, 0}, - {"mc", FRENCH + W4, 0}, - {"md", ROMANIAN + W4, 0}, - {"me", MONTENEGRIN + W8, SERBIAN - W4}, - {"mg", FRENCH + W4, 0}, - {"mk", MACEDONIAN + W4, 0}, - {"ml", FRENCH + W4, 0}, - {"mm", BURMESE + W4, 0}, - {"mn", MONGOLIAN + W4, 0}, - {"mo", CHINESE_T + W4, PORTUGUESE + W4}, - // {"mp", 0, 0}, - {"mq", FRENCH + W4, 0}, - {"mr", FRENCH + W4, ARABIC + W4}, - // {"ms", 0, 0}, - {"mt", MALTESE + W4, 0}, - // {"mu", 0, 0}, - {"mv", DHIVEHI + W4, 0}, - // {"mw", 0, 0}, - {"mx", SPANISH + W4, 0}, - {"my", MALAY + W4, INDONESIAN - W4}, - {"mz", PORTUGUESE + W4, 0}, - - {"na", 0, 0}, // Namibia - {"nc", FRENCH + W4, 0}, - {"ne", FRENCH + W4, 0}, - {"nf", FRENCH + W4, 0}, - // {"ng", 0, 0}, - {"ni", SPANISH + W4, 0}, - {"nl", DUTCH + W4, 0}, - {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, - {"np", NEPALI + W4, 0}, - {"nr", NAURU + W4, 0}, - {"nu", SWEDISH + W4, 0}, - {"nz", MAORI + W4, ENGLISH + 
W2}, - - {"om", ARABIC + W4, 0}, - - {"pa", SPANISH + W4, 0}, - {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA - {"pf", FRENCH + W4, 0}, - // {"pg", 0, 0}, - {"ph", TAGALOG + W4, 0}, - {"pk", URDU + W4, 0}, - {"pl", POLISH + W4, 0}, - // {"pn", 0, 0}, - {"pr", SPANISH + W4, 0}, - {"ps", ARABIC + W4, 0}, - {"pt", PORTUGUESE + W4, 0}, - {"py", SPANISH + W4, GUARANI + W2}, - - {"qa", ARABIC + W4, 0}, - - {"re", FRENCH + W4, 0}, - {"ro", ROMANIAN + W4, 0}, - {"rs", SERBIAN + W8, MONTENEGRIN - W4}, - {"ru", RUSSIAN + W4, 0}, - {"rw", KINYARWANDA + W4, FRENCH + W2}, - - {"sa", ARABIC + W4, 0}, - // {"sb", 0, 0}, - {"sc", SESELWA + W4, 0}, - {"sd", ARABIC + W4, 0}, - {"se", SWEDISH + W4, 0}, - // {"sg", 0, 0}, - // {"sh", 0, 0}, - {"si", SLOVENIAN + W4, 0}, - {"sk", SLOVAK + W4, CZECH - W4}, - // {"sl", 0, 0}, - {"sm", ITALIAN + W4, 0}, - {"sn", FRENCH + W4, 0}, - // {"sr", 0, 0}, - {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07 - // {"st", 0, 0}, - {"su", RUSSIAN + W4, 0}, - {"sv", SPANISH + W4, 0}, - {"sy", ARABIC + W4, 0}, - // {"sz", 0, 0}, - - // {"tc", 0, 0}, - {"td", FRENCH + W4, 0}, - // {"tf", 0, 0}, - {"tg", FRENCH + W4, 0}, - {"th", THAI + W4, 0}, - // Tibet has no country code (see .cn) - {"tj", TAJIK + W4, 0}, - // {"tk", 0, 0}, - // {"tl", 0, 0}, - {"tm", TURKISH + W4, 0}, - {"tn", FRENCH + W4, ARABIC + W4}, - // {"to", 0, 0}, - {"tp", JAPANESE + W4, 0}, - {"tr", TURKISH + W4, 0}, - // {"tt", 0, 0}, - // {"tv", 0, 0}, - {"tw", CHINESE_T + W4, 0}, - {"tz", SWAHILI + W4, AKAN + W4}, - - {"ua", UKRAINIAN + W4, 0}, - {"ug", GANDA + W4, 0}, - {"uk", ENGLISH + W2, 0}, - {"us", ENGLISH + W2, 0}, - {"uy", SPANISH + W4, 0}, - {"uz", UZBEK + W4, 0}, - - {"va", ITALIAN + W4, LATIN + W2}, - // {"vc", 0, 0}, - {"ve", SPANISH + W4, 0}, - // {"vg", 0, 0}, - // {"vi", 0, 0}, - {"vn", VIETNAMESE + W4, 0}, - // {"vu", 0, 0}, - - {"wf", FRENCH + W4, 0}, - // {"ws", 0, 0}, - - {"ye", ARABIC + W4, 0}, - - {"za", AFRIKAANS + W4, 0}, - // {"zm", 0, 0}, 
- // {"zw", 0, 0}, -}; - -#undef W2 -#undef W4 -#undef W6 -#undef W8 -#undef W10 -#undef W12 - - - - - -inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { - *olp = (*olp & 0x3ff) + (w << 10); -} -inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { - *olp = (*olp & ~0x3ff) + lang; -} - -OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { - return (w << 10) + lang; -} - -inline int MaxInt(int a, int b) { - return (a >= b) ? a : b; -} - -// Merge in another language prior, taking max if already there -void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { - if (olp == 0) {return;} - Language target_lang = GetCLDPriorLang(olp); - for (int i = 0; i < lps->n; ++i) { - if (GetCLDPriorLang(lps->prior[i]) == target_lang) { - int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), - GetCLDPriorWeight(olp)); - SetCLDPriorWeight(new_weight, &lps->prior[i]); - return; - } - } - // Not found; add it if room - if (lps->n >= kMaxOneCLDLangPrior) {return;} - lps->prior[lps->n++] = olp; -} - -// Merge in another language prior, boosting 10x if already there -void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { - if (olp == 0) {return;} - Language target_lang = GetCLDPriorLang(olp); - for (int i = 0; i < lps->n; ++i) { - if (GetCLDPriorLang(lps->prior[i]) == target_lang) { - int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; - SetCLDPriorWeight(new_weight, &lps->prior[i]); - return; - } - } - // Not found; add it if room - if (lps->n >= kMaxOneCLDLangPrior) {return;} - lps->prior[lps->n++] = olp; -} - - -// Trim language priors to no more than max_entries, keeping largest abs weights -void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { - if (lps->n <= max_entries) {return;} - - // Insertion sort in-place by abs(weight) - for (int i = 0; i < lps->n; ++i) { - OneCLDLangPrior temp_olp = lps->prior[i]; - int w = abs(GetCLDPriorWeight(temp_olp)); - int kk = i; - for (; kk > 0; --kk) { - if 
(abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { - // Move down and continue - lps->prior[kk] = lps->prior[kk - 1]; - } else { - // abs(weight[kk - 1]) >= w, time to stop - break; - } - } - lps->prior[kk] = temp_olp; - } - - lps->n = max_entries; -} - -int CountCommas(const string& langtags) { - int commas = 0; - for (int i = 0; i < static_cast<int>(langtags.size()); ++i) { - if (langtags[i] == ',') {++commas;} - } - return commas; -} - -// Binary lookup on language tag -const LangTagLookup* DoLangTagLookup(const char* key, - const LangTagLookup* tbl, int tbl_size) { - // Key is always in range [lo..hi) - int lo = 0; - int hi = tbl_size; - while (lo < hi) { - int mid = (lo + hi) >> 1; - int comp = strcmp(tbl[mid].langtag, key); - if (comp < 0) { - lo = mid + 1; - } else if (comp > 0) { - hi = mid; - } else { - return &tbl[mid]; - } - } - return NULL; -} - -// Binary lookup on tld -const TLDLookup* DoTLDLookup(const char* key, - const TLDLookup* tbl, int tbl_size) { - // Key is always in range [lo..hi) - int lo = 0; - int hi = tbl_size; - while (lo < hi) { - int mid = (lo + hi) >> 1; - int comp = strcmp(tbl[mid].tld, key); - if (comp < 0) { - lo = mid + 1; - } else if (comp > 0) { - hi = mid; - } else { - return &tbl[mid]; - } - } - return NULL; -} - - - -// Trim language tag string to canonical form for each language -// Input is from GetLangTagsFromHtml(), already lowercased -string TrimCLDLangTagsHint(const string& langtags) { - string retval; - if (langtags.empty()) {return retval;} - int commas = CountCommas(langtags); - if (commas > 4) {return retval;} // Ignore if too many language tags - - char temp[20]; - int pos = 0; - while (pos < static_cast<int>(langtags.size())) { - int comma = langtags.find(',', pos); - if (comma == string::npos) {comma = langtags.size();} // fake trailing comma - int len = comma - pos; - if (len <= 16) { - // Short enough to use - memcpy(temp, &langtags[pos], len); - temp[len] = '\0'; - const LangTagLookup* entry = 
DoLangTagLookup(temp, - kCLDLangTagsHintTable1, - kCLDTable1Size); - if (entry != NULL) { - // First table hit - retval.append(entry->langcode); // may be "code1,code2" - retval.append(1, ','); - } else { - // Try second table with language code truncated at first hyphen - char* hyphen = strchr(temp, '-'); - if (hyphen != NULL) {*hyphen = '\0';} - len = strlen(temp); - if (len <= 3) { // Short enough to use - entry = DoLangTagLookup(temp, - kCLDLangTagsHintTable2, - kCLDTable2Size); - if (entry != NULL) { - // Second table hit - retval.append(entry->langcode); // may be "code1,code2" - retval.append(1, ','); - } - } - } - } - pos = comma + 1; - } - - // Remove trainling comma, if any - if (!retval.empty()) {retval.resize(retval.size() - 1);} - return retval; -} - - - -//============================================================================== - -// Little state machine to scan insides of language attribute quoted-string. -// Each language code is lowercased and copied to the output string. Underscore -// is mapped to minus. Space, tab, and comma are all mapped to comma, and -// multiple consecutive commas are removed. -// Each language code in the output list will be followed by a single comma. - -// There are three states, and we start in state 1: -// State 0: After a letter. -// Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2] -// State 1: Just after a comma. -// Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2] -// State 2: Skipping. -// All characters except comma skip and stay in [2]. comma goes to [1] - -// The thing that is copied is kLangCodeRemap[c] when going to state 0, -// and always comma when going to state 1 or 2. The design depends on copying -// a comma at the *beginning* of skipping, and in state 2 never doing a copy. - -// We pack all this into 8 bits: -// +--+---+---+ -// |78|654|321| -// +--+---+---+ -// -// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78 -// where . 
// is always zero
// Of these 3 bits, low two are next state ss, high bit is copy bit C.
// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma

#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4   // copy kLangCodeRemap[c]
#define COPY1 5   // copy ','
#define COPY2 6   // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
//
// In state 1 a comma has just been written (or nothing has been written yet),
// so a minus or any other non-letter must start skipping WITHOUT copying a
// further comma: state[1] is SKIP2 for MINUS and Bad. Using COPY2 there would
// emit doubled or leading commas (e.g. "en, 1fr" -> "en,,"), breaking the
// rule that each language code is followed by a single comma.
//
//              state[2]       state[1]       state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (SKIP2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (SKIP2 << 3) + COPY2)

// Treat as letter: a-z, A-Z
// Treat as minus: 2D minus, 5F underscore
// Treat as comma: 09 tab, 20 space, 2C comma

// Packed per-byte actions, indexed by input byte; one row per 16 code points
static const unsigned char kLangCodeAction[256] = {
  // 00-0F (09 tab)
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  // 10-1F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  // 20-2F (20 space, 2C comma, 2D minus)
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  // 30-3F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // 40-4F (A-O)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 50-5F (P-Z, 5F underscore)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  // 60-6F (a-o)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 70-7F (p-z)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  // 80-BF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // C0-FF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};

// This does lowercasing, maps
underscore to minus, and maps tab/space to comma -static const unsigned char kLangCodeRemap[256] = { - 0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - - 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', - 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore - 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', - 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0, - - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, -}; - -#undef LTR -#undef MINUS -#undef COMMA -#undef Bad - -#undef SKIP0 -#undef SKIP1 -#undef SKIP2 -#undef COPY0 -#undef COPY1 -#undef COPY2 - - -// Find opening '<' for HTML tag -// Note: this is all somewhat insensitive to mismatched quotes -int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { - int i = pos; - // Advance i by 4 if none of the next 4 bytes are '<' - for (i = pos; i < (max_pos - 3); i += 4) { - // Fast check for any < - const char* p = &utf8_body[i]; - uint32 s0123 = UNALIGNED_LOAD32(p); - uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< - if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { - // At least one byte is '<' - break; - } - } - // Continue, advancing i by 1 - for (; i < max_pos; ++i) { - if (utf8_body[i] == '<') {return i;} - } - return -1; -} - - -// Find closing '>' for HTML tag. 
Also stop on < and & (simplistic parsing) -int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { - // Always outside quotes - for (int i = pos; i < max_pos; ++i) { - char c = utf8_body[i]; - if (c == '>') {return i;} - if (c == '<') {return i - 1;} - if (c == '&') {return i - 1;} - } - return -1; // nothing found -} - -// Find opening quote or apostrophe, skipping spaces -// Note: this is all somewhat insensitive to mismatched quotes -int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { - for (int i = pos; i < max_pos; ++i) { - char c = utf8_body[i]; - if (c == '"') {return i;} - if (c == '\'') {return i;} - if (c != ' ') {return -1;} - } - return -1; -} - -// Find closing quot/apos. Also stop on = > < and & (simplistic parsing) -int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { - // Always outside quotes - for (int i = pos; i < max_pos; ++i) { - char c = utf8_body[i]; - if (c == '"') {return i;} - if (c == '\'') {return i;} - if (c == '>') {return i - 1;} - if (c == '=') {return i - 1;} - if (c == '<') {return i - 1;} - if (c == '&') {return i - 1;} - } - return -1; // nothing found -} - -int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { - // Outside quotes/apostrophes loop - for (int i = pos; i < max_pos; ++i) { - char c = utf8_body[i]; - if (c == '=') { // Found bare equal sign inside tag - return i; - } else if (c == '"') { - // Inside quotes loop - int j; - for (j = i + 1; j < max_pos; ++j) { - if (utf8_body[j] == '"') { - break; - } else if (utf8_body[j] == '\\') { - ++j; - } - } - i = j; - } else if (c == '\'') { - // Inside apostrophes loop - int j; - for (j = i + 1; j < max_pos; ++j) { - if (utf8_body[j] == '\'') { - break; - } else if (utf8_body[j] == '\\') { - ++j; - } - } - i = j; - } - - } - return -1; // nothing found -} - -// Scan backwards for case-insensitive string s in [min_pos..pos) -// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] -// Cheap lowercase. 
Control codes will masquerade as 20..3f -bool FindBefore(const char* utf8_body, - int32 min_pos, int32 pos, const char* s) { - int len = strlen(s); - if ((pos - min_pos) < len) {return false;} // Too small to fit s - - // Skip trailing spaces - int i = pos; - while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} - i -= len; - if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found - - const char* p = &utf8_body[i]; - for (int j = 0; j < len; ++j) { - if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte - } - return true; // All bytes equal at i -} - -// Scan forwards for case-insensitive string s in [pos..max_pos) -// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] -// Cheap lowercase. Control codes will masquerade as 20..3f -// Allows but does not require quoted/apostrophe string -bool FindAfter(const char* utf8_body, - int32 pos, int32 max_pos, const char* s) { - int len = strlen(s); - if ((max_pos - pos) < len) {return false;} // Too small to fit s - - // Skip leading spaces, quote, apostrophe - int i = pos; - while (i < (max_pos - len)) { - unsigned char c = utf8_body[i]; - if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} - else {break;} - } - - const char* p = &utf8_body[i]; - for (int j = 0; j < len; ++j) { - if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte - } - return true; // All bytes equal -} - - - -// Copy attribute value in [pos..max_pos) -// pos is just after an opening quote/apostrophe and max_pos is the ending one -// String must all be on a single line. 
-// Return slightly-normalized language list, empty or ending in comma -// Does lowercasing and removes excess punctuation/space -string CopyOneQuotedString(const char* utf8_body, - int32 pos, int32 max_pos) { - string s; - int state = 1; // Front is logically just after a comma - for (int i = pos; i < max_pos; ++i) { - unsigned char c = utf8_body[i]; - int e = kLangCodeAction[c] >> (3 * state); - state = e & 3; // Update to next state - if ((e & 4) != 0) { - // Copy a remapped byte if going to state 0, else copy a comma - if (state == 0) { - s.append(1, kLangCodeRemap[c]); - } else { - s.append(1, ','); - } - } - } - - // Add final comma if needed - if (state == 0) { - s.append(1, ','); - } - return s; -} - -// Find and copy attribute value: quoted string in [pos..max_pos) -// Return slightly-normalized language list, empty or ending in comma -string CopyQuotedString(const char* utf8_body, - int32 pos, int32 max_pos) { - int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); - if (start_quote < 0) {return string("");} - int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); - if (end_quote < 0) {return string("");} - - return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); -} - -// Add hints to vector of langpriors -// Input is from GetLangTagsFromHtml(), already lowercased -void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { - if (langtags.empty()) {return;} - int commas = CountCommas(langtags); - if (commas > 4) {return;} // Ignore if too many language tags - - char temp[20]; - int pos = 0; - while (pos < static_cast<int>(langtags.size())) { - int comma = langtags.find(',', pos); - if (comma == string::npos) {comma = langtags.size();} // fake trailing comma - int len = comma - pos; - if (len <= 16) { - // Short enough to use - memcpy(temp, &langtags[pos], len); - temp[len] = '\0'; - const LangTagLookup* entry = DoLangTagLookup(temp, - kCLDLangTagsHintTable1, - kCLDTable1Size); - if (entry != NULL) { - // 
First table hit - MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); - MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); - } else { - // Try second table with language code truncated at first hyphen - char* hyphen = strchr(temp, '-'); - if (hyphen != NULL) {*hyphen = '\0';} - len = strlen(temp); - if (len <= 3) { // Short enough to use - entry = DoLangTagLookup(temp, - kCLDLangTagsHintTable2, - kCLDTable2Size); - if (entry != NULL) { - // Second table hit - MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); - MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); - } - } - } - } - pos = comma + 1; - } -} - -// Add hints to vector of langpriors -// Input is string after HTTP header Content-Language: -void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { - string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); - SetCLDLangTagsHint(langtags, langpriors); -} - -// Add hints to vector of langpriors -// Input is last element of hostname (no dot), e.g. 
from GetTLD() -void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { - int len = strlen(tld); - if (len > 3) {return;} // Ignore if more than three letters - char local_tld[4]; - strncpy(local_tld, tld, 4); - local_tld[3] = '\0'; // Safety move - // Lowercase - for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} - const TLDLookup* entry = DoTLDLookup(local_tld, - kCLDTLDHintTable, - kCLDTable3Size); - if (entry != NULL) { - // Table hit - MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); - MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); - } -} - -// Add hints to vector of langpriors -// Input is from DetectEncoding() -void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { - OneCLDLangPrior olp; - switch (enc) { - case CHINESE_GB: - case GBK: - case GB18030: - case ISO_2022_CN: - case HZ_GB_2312: - olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); - MergeCLDLangPriorsBoost(olp, langpriors); - break; - case CHINESE_BIG5: - case CHINESE_BIG5_CP950: - case BIG5_HKSCS: - olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); - MergeCLDLangPriorsBoost(olp, langpriors); - break; - case JAPANESE_EUC_JP: - case JAPANESE_SHIFT_JIS: - case JAPANESE_CP932: - case JAPANESE_JIS: // ISO-2022-JP - olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); - MergeCLDLangPriorsBoost(olp, langpriors); - break; - case KOREAN_EUC_KR: - case ISO_2022_KR: - olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); - MergeCLDLangPriorsBoost(olp, langpriors); - break; - - default: - break; - } -} - -// Add hints to vector of langpriors -// Input is from random source -void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { - OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); - MergeCLDLangPriorsBoost(olp, langpriors); -} - - -// Make printable string of priors -string DumpCLDLangPriors(const CLDLangPriors* langpriors) { - string retval; - for (int i = 0; i < langpriors->n; 
++i) { - char temp[64]; - sprintf(temp, "%s.%d ", - LanguageCode(GetCLDPriorLang(langpriors->prior[i])), - GetCLDPriorWeight(langpriors->prior[i])); - retval.append(temp); - } - return retval; -} - - - - -// Look for -// <html lang="en"> -// <doc xml:lang="en"> -// <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US"> -// <meta http-equiv="content-language" content="en-GB" /> -// <meta name="language" content="Srpski"> -// <meta name="DC.language" scheme="RFCOMMA766" content="en"> -// <SPAN id="msg1" class="info" lang='en'> -// -// Do not trigger on -// <!-- lang=french ...--> -// <font lang=postscript ...> -// <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" /> -// <META name="Author" lang="fr" content="Arnaud Le Hors"> -// -// Stop fairly quickly on mismatched quotes -// -// Allowed language characters -// a-z A-Z -_ , space\t -// Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr -// zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue -// de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) -// GB2312 => gb -// Big5 => big -// zh_CN.gb18030_C => zh-cn -// -// Remove duplicates and extra spaces as we go -// Lowercase as we go. 
- -// Get language tag hints from HTML body -// Normalize: remove spaces and make lowercase comma list - -string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, - int32 max_scan_bytes) { - string retval; - if (max_scan_bytes > utf8_body_len) { - max_scan_bytes = utf8_body_len; - } - - int32 k = 0; - while (k < max_scan_bytes) { - int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); - if (start_tag < 0) {break;} - int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); - // FindTagEnd exits on < > & - if (end_tag < 0) {break;} - - // Skip <!--...> - // Skip <font ...> - // Skip <script ...> - // Skip <link ...> - // Skip <img ...> - // Skip <a ...> - if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") || - FindAfter(utf8_body, start_tag + 1, end_tag, "font ") || - FindAfter(utf8_body, start_tag + 1, end_tag, "script ") || - FindAfter(utf8_body, start_tag + 1, end_tag, "link ") || - FindAfter(utf8_body, start_tag + 1, end_tag, "img ") || - FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) { - k = end_tag + 1; - continue; - } - - // Remember <meta ...> - bool in_meta = false; - if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) { - in_meta = true; - } - - // Scan for each equal sign inside tag - bool content_is_lang = false; - int32 kk = start_tag + 1; - int32 equal_sign; - while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) { - // eq exits on < > & - - // Look inside a meta tag - // <meta ... http-equiv="content-language" ...> - // <meta ... name="language" ...> - // <meta ... 
name="dc.language" ...> - if (in_meta) { - if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") && - FindAfter(utf8_body, equal_sign + 1, end_tag, - "content-language ")) { - content_is_lang = true; - } else if (FindBefore(utf8_body, kk, equal_sign, " name") && - (FindAfter(utf8_body, equal_sign + 1, end_tag, - "dc.language ") || - FindAfter(utf8_body, equal_sign + 1, end_tag, - "language "))) { - content_is_lang = true; - } - } - - // Look inside any tag - // <meta ... content="lang-list" ...> - // <... lang="lang-list" ...> - // <... xml:lang="lang-list" ...> - if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign, - " content")) || - FindBefore(utf8_body, kk, equal_sign, " lang") || - FindBefore(utf8_body, kk, equal_sign, ":lang")) { - string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag); - - // Append new lang tag(s) if not a duplicate - if (!temp.empty() && (retval.find(temp) == string::npos)) { - retval.append(temp); - } - } - - kk = equal_sign + 1; - } - k = end_tag + 1; - } - - // Strip last comma - if (retval.size() > 1) { - retval.erase(retval.size() - 1); - } - return retval; -} - -} // End namespace CLD2 - -//============================================================================== |