summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/internal/compact_lang_det.cc
diff options
context:
space:
mode:
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/compact_lang_det.cc')
-rw-r--r--application/basilisk/components/translation/cld2/internal/compact_lang_det.cc322
1 files changed, 322 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/compact_lang_det.cc b/application/basilisk/components/translation/cld2/internal/compact_lang_det.cc
new file mode 100644
index 000000000..ccc6588fc
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/compact_lang_det.cc
@@ -0,0 +1,322 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../public/compact_lang_det.h"
+#include "../public/encodings.h"
+#include "compact_lang_det_impl.h"
+#include "integral_types.h"
+#include "lang_script.h"
+
+namespace CLD2 {
+
+// String is "code_version - data_scrape_date"
+//static const char* kDetectLanguageVersion = "V2.0 - 20130715";
+
+
+// Large-table version for all ~160 languages
+// Small-table version for all ~60 languages
+
+// Scan interchange-valid UTF-8 bytes and detect most likely language
+Language DetectLanguage(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ bool* is_reliable) {
+ bool allow_extended_lang = false;
+ Language language3[3];
+ int percent3[3];
+ double normalized_score3[3];
+ int text_bytes;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ const char* tld_hint = "";
+ int encoding_hint = UNKNOWN_ENCODING;
+ Language language_hint = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ &text_bytes,
+ is_reliable);
+ // Default to English
+ if (lang == UNKNOWN_LANGUAGE) {
+ lang = ENGLISH;
+ }
+ return lang;
+}
+
+// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
+Language DetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ Language* language3,
+ int* percent3,
+ int* text_bytes,
+ bool* is_reliable) {
+ double normalized_score3[3];
+ bool allow_extended_lang = false;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ const char* tld_hint = "";
+ int encoding_hint = UNKNOWN_ENCODING;
+ Language language_hint = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ text_bytes,
+ is_reliable);
+ // Default to English
+ if (lang == UNKNOWN_LANGUAGE) {
+ lang = ENGLISH;
+ }
+ return lang;
+}
+
+// Same as above, with hints supplied
+// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
+Language DetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ const char* tld_hint, // "id" boosts Indonesian
+ int encoding_hint, // SJS boosts Japanese
+ Language language_hint, // ITALIAN boosts it
+ Language* language3,
+ int* percent3,
+ int* text_bytes,
+ bool* is_reliable) {
+ double normalized_score3[3];
+ bool allow_extended_lang = false;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ text_bytes,
+ is_reliable);
+ // Default to English
+ if (lang == UNKNOWN_LANGUAGE) {
+ lang = ENGLISH;
+ }
+ return lang;
+}
+
+
+// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
+// languages.
+// Extended languages are additional Google interface languages and Unicode
+// single-language scripts, from ext_lang_enc.h
+Language ExtDetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ Language* language3,
+ int* percent3,
+ int* text_bytes,
+ bool* is_reliable) {
+ double normalized_score3[3];
+ bool allow_extended_lang = true;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ const char* tld_hint = "";
+ int encoding_hint = UNKNOWN_ENCODING;
+ Language language_hint = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ text_bytes,
+ is_reliable);
+ // Do not default to English
+ return lang;
+}
+
+// Same as above, with hints supplied
+// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
+// languages.
+// Extended languages are additional Google interface languages and Unicode
+// single-language scripts, from ext_lang_enc.h
+Language ExtDetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ const char* tld_hint, // "id" boosts Indonesian
+ int encoding_hint, // SJS boosts Japanese
+ Language language_hint, // ITALIAN boosts it
+ Language* language3,
+ int* percent3,
+ int* text_bytes,
+ bool* is_reliable) {
+ double normalized_score3[3];
+ bool allow_extended_lang = true;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ text_bytes,
+ is_reliable);
+ // Do not default to English
+ return lang;
+}
+
+// Same as above, and also returns internal language scores as a ratio to
+// normal score for real text in that language. Scores close to 1.0 indicate
+// normal text, while scores far away from 1.0 indicate badly-skewed text or
+// gibberish
+//
+Language ExtDetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ const char* tld_hint, // "id" boosts Indonesian
+ int encoding_hint, // SJS boosts Japanese
+ Language language_hint, // ITALIAN boosts it
+ Language* language3,
+ int* percent3,
+ double* normalized_score3,
+ int* text_bytes,
+ bool* is_reliable) {
+ bool allow_extended_lang = true;
+ int flags = 0;
+ Language plus_one = UNKNOWN_LANGUAGE;
+ CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ &cldhints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ NULL,
+ text_bytes,
+ is_reliable);
+ // Do not default to English
+ return lang;
+}
+
+// Use this one.
+// Hints are collected into a struct.
+// Flags are passed in (normally zero).
+//
+// Also returns 3 internal language scores as a ratio to
+// normal score for real text in that language. Scores close to 1.0 indicate
+// normal text, while scores far away from 1.0 indicate badly-skewed text or
+// gibberish
+//
+// Returns a vector of chunks in different languages, so that caller may
+// spell-check, translate, or otherwaise process different parts of the input
+// buffer in language-dependant ways.
+//
+Language ExtDetectLanguageSummary(
+ const char* buffer,
+ int buffer_length,
+ bool is_plain_text,
+ const CLDHints* cld_hints,
+ int flags,
+ Language* language3,
+ int* percent3,
+ double* normalized_score3,
+ ResultChunkVector* resultchunkvector,
+ int* text_bytes,
+ bool* is_reliable) {
+ bool allow_extended_lang = true;
+ Language plus_one = UNKNOWN_LANGUAGE;
+
+ Language lang = DetectLanguageSummaryV2(
+ buffer,
+ buffer_length,
+ is_plain_text,
+ cld_hints,
+ allow_extended_lang,
+ flags,
+ plus_one,
+ language3,
+ percent3,
+ normalized_score3,
+ resultchunkvector,
+ text_bytes,
+ is_reliable);
+ // Do not default to English
+ return lang;
+}
+
+} // End namespace CLD2
+