summary refs log tree commit diff stats
path: root/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h
diff options
context:
space:
mode:
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h')
-rw-r--r-- application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h 95
1 files changed, 95 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h b/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h
new file mode 100644
index 000000000..df8948a27
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/compact_lang_det_hint_code.h
@@ -0,0 +1,95 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#ifndef I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
+#define I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
+
+
+#include <string>
+#include "integral_types.h"
+#include "lang_script.h"
+#include "../public/encodings.h"
+
+namespace CLD2 {
+
+// One packed <Language, weight> prior; weight in [-32..31] (powers of 2**1.6 ~= 3.03)
+// Layout: full Language in the bottom 10 bits, weight in the top 6 bits
+typedef int16 OneCLDLangPrior;
+
+const int kMaxOneCLDLangPrior = 14;  // capacity of CLDLangPriors::prior[]
+struct CLDLangPriors {
+  int32 n;  // value reported by GetCLDLangPriorCount(); zeroed by InitCLDLangPriors()
+  OneCLDLangPrior prior[kMaxOneCLDLangPrior];  // packed <Language, weight> entries
+};
+
+// Readers of a packed prior are exposed inline here; the setters are hidden in the .cc
+// Returns the weight field of olp: the packed value shifted right by 10 bits,
+// i.e. past the 10 bits that hold the Language
+inline int GetCLDPriorWeight(OneCLDLangPrior olp) { return olp >> 10; }
+inline Language GetCLDPriorLang(OneCLDLangPrior olp) {  // decode the Language field
+  return static_cast<Language>(olp & ((1 << 10) - 1));  // keep only the low 10 bits
+}
+
+// Returns lps->n, the stored prior count. Takes a pointer-to-const because it
+// only reads, so callers holding a const CLDLangPriors* can use it as well.
+inline int32 GetCLDLangPriorCount(const CLDLangPriors* lps) { return lps->n; }
+
+// Resets *lps to an empty set of priors by zeroing its count;
+// the prior[] slots themselves are not touched.
+inline void InitCLDLangPriors(CLDLangPriors* lps) { lps->n = 0; }
+
+// Trims *lps to no more than max_entries priors, keeping the largest absolute weights
+void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps);
+
+// Returns the language tag string with each language trimmed to canonical form
+// langtags is expected to come from GetLangTagsFromHtml(), already lowercased
+std::string TrimCLDLangTagsHint(const std::string& langtags);
+
+// Adds hints taken from a language tag string to the priors in *langpriors
+// langtags is expected to come from GetLangTagsFromHtml(), already lowercased
+void SetCLDLangTagsHint(const std::string& langtags, CLDLangPriors* langpriors);
+
+// Adds hints taken from contentlang to the priors in *langpriors
+// contentlang is the HTTP content-language value
+void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors);
+
+// Adds hints taken from a top-level domain to the priors in *langpriors
+// tld is expected to come from GetTLD(), already lowercased
+void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors);
+
+// Adds hints taken from a text encoding to the priors in *langpriors
+// enc is expected to come from DetectEncoding()
+void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors);
+
+// Adds a hint for one explicitly given Language to the priors in *langpriors
+// lang may come from any source ("random source", i.e. not one of the inputs above)
+void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors);
+
+// Returns a printable string describing the priors in *langpriors
+std::string DumpCLDLangPriors(const CLDLangPriors* langpriors);
+
+
+// Gets language tag hints from an HTML body; max_scan_bytes presumably caps the scan (confirm in .cc)
+// Result is normalized: spaces removed, lowercased, returned as a comma list
+std::string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
+ int32 max_scan_bytes);
+
+} // End namespace CLD2
+
+#endif // I18N_ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_HINT_CODE_H__
+