diff options
author | Matt A. Tobin <email@mattatobin.com> | 2018-02-02 05:06:10 -0500 |
---|---|---|
committer | Matt A. Tobin <email@mattatobin.com> | 2018-02-02 05:06:10 -0500 |
commit | 6d614170cbfa958564eb5f824234ad5a9e484344 (patch) | |
tree | 3e1eb384382f30987cb2e64bd654afa8b74fd06b /application/basilisk/components/translation/cld2/internal/cldutil.cc | |
parent | 2a6b605d64b19411a300efdbbd7f78c349f90206 (diff) | |
download | UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.gz UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.lz UXP-6d614170cbfa958564eb5f824234ad5a9e484344.tar.xz UXP-6d614170cbfa958564eb5f824234ad5a9e484344.zip |
Revert "Add Basilisk"
This reverts commit e72ef92b5bdc43cd2584198e2e54e951b70299e8.
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/cldutil.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/cldutil.cc | 620 |
1 files changed, 0 insertions, 620 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/cldutil.cc b/application/basilisk/components/translation/cld2/internal/cldutil.cc deleted file mode 100644 index ecda9a53e..000000000 --- a/application/basilisk/components/translation/cld2/internal/cldutil.cc +++ /dev/null @@ -1,620 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// Updated 2014.01 for dual table lookup -// - -#include "cldutil.h" -#include <string> - -#include "cld2tablesummary.h" -#include "integral_types.h" -#include "port.h" -#include "utf8statetable.h" - -namespace CLD2 { - -// Caller supplies the right tables in scoringcontext - -// Runtime routines for hashing, looking up, and scoring -// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. -// Unigrams and bigrams are for CJK languages only, including simplified/ -// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and -// Zhuang Han characters. Surrounding spaces are not considered. -// Quadgrams and octagrams for for non-CJK and include two bits indicating -// preceding and trailing spaces (word boundaries). - - -static const int kMinCJKUTF8CharBytes = 3; - -static const int kMinGramCount = 3; -static const int kMaxGramCount = 16; - -static const int UTFmax = 4; // Max number of bytes in a UTF-8 character - - // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF - static const uint8 kSkipSpaceVowelContinue[256] = { - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, - 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - }; - - // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF - static const uint8 kSkipSpaceContinue[256] = { - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - }; - - - // Always advances one UTF-8 character - static const uint8 kAdvanceOneChar[256] = { - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, - }; - - // Advances *only* on space (or illegal byte) - static const uint8 kAdvanceOneCharSpace[256] = { - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - }; - - -// Routines to access a hash table of <key:wordhash, value:probs> pairs -// Buckets have 4-byte wordhash for sizes < 32K buckets, but only -// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as -// bucket subscript. -// Probs is a packed: three languages plus a subscript for probability table -// Buckets have all the keys together, then all the values.Key array never -// crosses a cache-line boundary, so no-match case takes exactly one cache miss. -// Match case may sometimes take an additional cache miss on value access. -// -// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 -// byte buckets with single cache miss. -// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. -//------------------------------------------------------------------------------ - -//----------------------------------------------------------------------------// -// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // -//----------------------------------------------------------------------------// - -//----------------------------------------------------------------------------// -// Scoring single groups of letters // -//----------------------------------------------------------------------------// - -// BIGRAM, QUADGRAM, OCTAGRAM score one => tote -// Input: 4-byte entry of 3 language numbers and one probability subscript, plus -// an accumulator tote. (language 0 means unused entry) -// Output: running sums in tote updated -void ProcessProbV2Tote(uint32 probs, Tote* tote) { - uint8 prob123 = (probs >> 0) & 0xff; - const uint8* prob123_entry = LgProb2TblEntry(prob123); - - uint8 top1 = (probs >> 8) & 0xff; - if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} - uint8 top2 = (probs >> 16) & 0xff; - if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} - uint8 top3 = (probs >> 24) & 0xff; - if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} -} - -// Return score for a particular per-script language, or zero -int GetLangScore(uint32 probs, uint8 pslang) { - uint8 prob123 = (probs >> 0) & 0xff; - const uint8* prob123_entry = LgProb2TblEntry(prob123); - int retval = 0; - uint8 top1 = (probs >> 8) & 0xff; - if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);} - uint8 top2 = (probs >> 16) & 0xff; - if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);} - uint8 top3 = (probs >> 24) & 0xff; - if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);} - return retval; -} - -//----------------------------------------------------------------------------// -// Routines to accumulate probabilities // -//----------------------------------------------------------------------------// - - -// BIGRAM, using hash table, always advancing by 1 char -// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj -// Score all bigrams in isrc, using languages that have bigrams (CJK) -// Return number of bigrams that hit in the hash table -int DoBigramScoreV3(const CLD2TableSummary* bigram_obj, - const char* isrc, int srclen, Tote* chunk_tote) { - int hit_count = 0; - const char* src = isrc; - - // Hashtable-based CJK bigram lookup - const uint8* usrc = reinterpret_cast<const uint8*>(src); - const uint8* usrclimit1 = usrc + srclen - UTFmax; - - while (usrc < usrclimit1) { - int len = kAdvanceOneChar[usrc[0]]; - int len2 = kAdvanceOneChar[usrc[len]] + len; - - if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible - // Lookup and score this bigram - // Always ignore pre/post spaces - uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2); - uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); - // Now go indirect on the subscript - probs = bigram_obj->kCLDTableInd[probs & - ~bigram_obj->kCLDTableKeyMask]; - - // Process the bigram - if (probs != 0) { - ProcessProbV2Tote(probs, chunk_tote); - ++hit_count; - } - } - usrc += len; // Advance by one char - } - - return hit_count; -} - - -// Score up to 64KB of a single script span in one pass -// Make a dummy entry off the end to calc length of last span -// Return offset of first unused input byte -int GetUniHits(const char* text, - int letter_offset, int letter_limit, - ScoringContext* scoringcontext, - ScoringHitBuffer* hitbuffer) { - const char* isrc = &text[letter_offset]; - const char* src = isrc; - // Limit is end, which has extra 20 20 20 00 past len - const char* srclimit = &text[letter_limit]; - - // Local copies - const UTF8PropObj* unigram_obj = - scoringcontext->scoringtables->unigram_obj; - int next_base = hitbuffer->next_base; - int next_base_limit = hitbuffer->maxscoringhits; - - // Visit all unigrams - if (src[0] == ' ') {++src;} // skip any initial space - while (src < srclimit) { - const uint8* usrc = reinterpret_cast<const uint8*>(src); - int len = kAdvanceOneChar[usrc[0]]; - src += len; - // Look up property of one UTF-8 character and advance over it. - // Updates usrc and len (bad interface design), hence increment above - int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len); - if (propval > 0) { - // Save indirect subscript for later scoring; 1 or 2 langprobs - int indirect_subscr = propval; - hitbuffer->base[next_base].offset = src - text; // Offset in text - hitbuffer->base[next_base].indirect = indirect_subscr; - ++next_base; - } - - if (next_base >= next_base_limit) {break;} - } - - hitbuffer->next_base = next_base; - - // Make a dummy entry off the end to calc length of last span - int dummy_offset = src - text; - hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; - hitbuffer->base[hitbuffer->next_base].indirect = 0; - - return src - text; -} - -// Score up to 64KB of a single script span, doing both delta-bi and -// distinct bis in one pass -void GetBiHits(const char* text, - int letter_offset, int letter_limit, - ScoringContext* scoringcontext, - ScoringHitBuffer* hitbuffer) { - const char* isrc = &text[letter_offset]; - const char* src = isrc; - // Limit is end - const char* srclimit1 = &text[letter_limit]; - - // Local copies - const CLD2TableSummary* deltabi_obj = - scoringcontext->scoringtables->deltabi_obj; - const CLD2TableSummary* distinctbi_obj = - scoringcontext->scoringtables->distinctbi_obj; - int next_delta = hitbuffer->next_delta; - int next_delta_limit = hitbuffer->maxscoringhits; - int next_distinct = hitbuffer->next_distinct; - // We can do 2 inserts per loop, so -1 - int next_distinct_limit = hitbuffer->maxscoringhits - 1; - - while (src < srclimit1) { - const uint8* usrc = reinterpret_cast<const uint8*>(src); - int len = kAdvanceOneChar[usrc[0]]; - int len2 = kAdvanceOneChar[usrc[len]] + len; - - if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible - // Lookup and this bigram and save <offset, indirect> - uint32 bihash = BiHashV2(src, len2); - uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash); - // Now go indirect on the subscript - if (probs != 0) { - // Save indirect subscript for later scoring; 1 langprob - int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask; - hitbuffer->delta[next_delta].offset = src - text; - hitbuffer->delta[next_delta].indirect = indirect_subscr; - ++next_delta; - } - // Lookup this distinct bigram and save <offset, indirect> - probs = QuadHashV3Lookup4(distinctbi_obj, bihash); - if (probs != 0) { - int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask; - hitbuffer->distinct[next_distinct].offset = src - text; - hitbuffer->distinct[next_distinct].indirect = indirect_subscr; - ++next_distinct; - } - } - src += len; // Advance by one char (not two) - - // Almost always srclimit hit first - if (next_delta >= next_delta_limit) {break;} - if (next_distinct >= next_distinct_limit) {break;} - } - - hitbuffer->next_delta = next_delta; - hitbuffer->next_distinct = next_distinct; - - // Make a dummy entry off the end to calc length of last span - int dummy_offset = src - text; - hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; - hitbuffer->delta[hitbuffer->next_delta].indirect = 0; - hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; - hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; -} - -// Score up to 64KB of a single script span in one pass -// Make a dummy entry off the end to calc length of last span -// Return offset of first unused input byte -int GetQuadHits(const char* text, - int letter_offset, int letter_limit, - ScoringContext* scoringcontext, - ScoringHitBuffer* hitbuffer) { - const char* isrc = &text[letter_offset]; - const char* src = isrc; - // Limit is end, which has extra 20 20 20 00 past len - const char* srclimit = &text[letter_limit]; - - // Local copies - const CLD2TableSummary* quadgram_obj = - scoringcontext->scoringtables->quadgram_obj; - const CLD2TableSummary* quadgram_obj2 = - scoringcontext->scoringtables->quadgram_obj2; - int next_base = hitbuffer->next_base; - int next_base_limit = hitbuffer->maxscoringhits; - - // Run a little cache of last quad hits to catch overly-repetitive "text" - // We don't care if we miss a couple repetitions at scriptspan boundaries - int next_prior_quadhash = 0; - uint32 prior_quadhash[2] = {0, 0}; - - // Visit all quadgrams - if (src[0] == ' ') {++src;} // skip any initial space - while (src < srclimit) { - // Find one quadgram - const char* src_end = src; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - const char* src_mid = src_end; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; - int len = src_end - src; - // Hash the quadgram - uint32 quadhash = QuadHashV2(src, len); - - // Filter out recent repeats - if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) { - // Look up this quadgram and save <offset, indirect> - uint32 indirect_flag = 0; // For dual tables - const CLD2TableSummary* hit_obj = quadgram_obj; - uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); - if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) { - // Try lookup in dual table if not found in first one - // Note: we need to know later which of two indirect tables to use. - indirect_flag = 0x80000000u; - hit_obj = quadgram_obj2; - probs = QuadHashV3Lookup4(quadgram_obj2, quadhash); - } - if (probs != 0) { - // Round-robin two entries of actual hits - prior_quadhash[next_prior_quadhash] = quadhash; - next_prior_quadhash = (next_prior_quadhash + 1) & 1; - - // Save indirect subscript for later scoring; 1 or 2 langprobs - int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask; - hitbuffer->base[next_base].offset = src - text; // Offset in text - // Flip the high bit for table2 - hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag; - ++next_base; - } - } - - // Advance: all the way past word if at end-of-word, else 2 chars - if (src_end[0] == ' ') { - src = src_end; - } else { - src = src_mid; - } - - // Skip over space at end of word, or ASCII vowel in middle of word - // Use kAdvanceOneCharSpace instead to get rid of vowel hack - if (src < srclimit) { - src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; - } else { - // Advancing by 4/8/16 can overshoot, but we are about to exit anyway - src = srclimit; - } - - if (next_base >= next_base_limit) {break;} - } - - hitbuffer->next_base = next_base; - - // Make a dummy entry off the end to calc length of last span - int dummy_offset = src - text; - hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; - hitbuffer->base[hitbuffer->next_base].indirect = 0; - - return src - text; -} - -// inputs: -// const tables -// const char* isrc, int srclen (in sscriptbuffer) -// intermediates: -// vector of octa <offset, probs> (which need indirect table to decode) -// vector of distinct <offset, probs> (which need indirect table to decode) - -// Score up to 64KB of a single script span, doing both delta-octa and -// distinct words in one pass -void GetOctaHits(const char* text, - int letter_offset, int letter_limit, - ScoringContext* scoringcontext, - ScoringHitBuffer* hitbuffer) { - const char* isrc = &text[letter_offset]; - const char* src = isrc; - // Limit is end+1, to include extra space char (0x20) off the end - const char* srclimit = &text[letter_limit + 1]; - - // Local copies - const CLD2TableSummary* deltaocta_obj = - scoringcontext->scoringtables->deltaocta_obj; - int next_delta = hitbuffer->next_delta; - int next_delta_limit = hitbuffer->maxscoringhits; - - const CLD2TableSummary* distinctocta_obj = - scoringcontext->scoringtables->distinctocta_obj; - int next_distinct = hitbuffer->next_distinct; - // We can do 2 inserts per loop, so -1 - int next_distinct_limit = hitbuffer->maxscoringhits - 1; - - // Run a little cache of last octa hits to catch overly-repetitive "text" - // We don't care if we miss a couple repetitions at scriptspan boundaries - int next_prior_octahash = 0; - uint64 prior_octahash[2] = {0, 0}; - - // Score all words truncated to 8 characters - int charcount = 0; - // Skip any initial space - if (src[0] == ' ') {++src;} - - // Begin the first word - const char* prior_word_start = src; - const char* word_start = src; - const char* word_end = word_start; - while (src < srclimit) { - // Terminate previous word or continue current word - if (src[0] == ' ') { - int len = word_end - word_start; - // Hash the word - uint64 wordhash40 = OctaHash40(word_start, len); - uint32 probs; - - // Filter out recent repeats. Unlike quads, we update even if no hit, - // so we can get hits on same word if separated by non-hit words - if ((wordhash40 != prior_octahash[0]) && - (wordhash40 != prior_octahash[1])) { - // Round-robin two entries of words - prior_octahash[next_prior_octahash] = wordhash40; - next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1 - - // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical - // function of the two word hashs. For words A B C, B-A and C-B are good - // enough and fast. We use the same table as distinct single words - // Do not look up a pair of identical words -- all pairs hash to zero - // Both 1- and 2-word distinct lookups are in distinctocta_obj now - // Do this first, because it has the lowest offset - uint64 tmp_prior_hash = prior_octahash[next_prior_octahash]; - if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) { - uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40); - probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash); - if (probs != 0) { - int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; - hitbuffer->distinct[next_distinct].offset = prior_word_start - text; - hitbuffer->distinct[next_distinct].indirect = indirect_subscr; - ++next_distinct; - } - } - - // (2) Lookup this distinct word and save <offset, indirect> - probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40); - if (probs != 0) { - int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; - hitbuffer->distinct[next_distinct].offset = word_start - text; - hitbuffer->distinct[next_distinct].indirect = indirect_subscr; - ++next_distinct; - } - - // (3) Lookup this word and save <offset, indirect> - probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40); - if (probs != 0) { - // Save indirect subscript for later scoring; 1 langprob - int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask; - hitbuffer->delta[next_delta].offset = word_start - text; - hitbuffer->delta[next_delta].indirect = indirect_subscr; - ++next_delta; - } - } - - // Begin the next word - charcount = 0; - prior_word_start = word_start; - word_start = src + 1; // Over the space - word_end = word_start; - } else { - ++charcount; - } - - // Advance to next char - src += UTF8OneCharLen(src); - if (charcount <= 8) { - word_end = src; - } - // Almost always srclimit hit first - if (next_delta >= next_delta_limit) {break;} - if (next_distinct >= next_distinct_limit) {break;} - } - - hitbuffer->next_delta = next_delta; - hitbuffer->next_distinct = next_distinct; - - // Make a dummy entry off the end to calc length of last span - int dummy_offset = src - text; - hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; - hitbuffer->delta[hitbuffer->next_delta].indirect = 0; - hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; - hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; -} - - -//----------------------------------------------------------------------------// -// Reliability calculations, for single language and between languages // -//----------------------------------------------------------------------------// - -// Return reliablity of result 0..100 for top two scores -// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable -// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) -// Threshold is uni/quadgram increment count, bounded above and below. -// -// Requiring a factor of 3 improvement (e.g. +1 log base 3) -// for each scored quadgram is too stringent, so I've backed this off to a -// factor of 2 (e.g. +5/8 log base 3). -// -// I also somewhat lowered the Min/MaxGramCount limits above -// -// Added: if fewer than 8 quads/unis, max reliability is 12*n percent -// -int ReliabilityDelta(int value1, int value2, int gramcount) { - int max_reliability_percent = 100; - if (gramcount < 8) { - max_reliability_percent = 12 * gramcount; - } - int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above - if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 - fully_reliable_thresh = kMinGramCount; - } else if (fully_reliable_thresh > kMaxGramCount) { - fully_reliable_thresh = kMaxGramCount; - } - - int delta = value1 - value2; - if (delta >= fully_reliable_thresh) {return max_reliability_percent;} - if (delta <= 0) {return 0;} - return minint(max_reliability_percent, - (100 * delta) / fully_reliable_thresh); -} - -// Return reliablity of result 0..100 for top score vs. expected mainsteam score -// Values are score per 1024 bytes of input -// ratio = max(top/mainstream, mainstream/top) -// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable -// Change: short-text word scoring can give unusually good results. -// Let top exceed mainstream by 4x at 50% reliable -// -// dsites April 2010: These could be tightened up. It would be -// reasonable with newer data and round-robin table allocation to start ramping -// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass, -// but just barely. -// -// dsites March 2013: Tightened up a bit. -static const double kRatio100 = 1.5; -static const double kRatio0 = 4.0; -int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) { - if (expected_score_1kb == 0) {return 100;} // No reliability data available yet - if (actual_score_1kb == 0) {return 0;} // zero score = unreliable - double ratio; - if (expected_score_1kb > actual_score_1kb) { - ratio = (1.0 * expected_score_1kb) / actual_score_1kb; - } else { - ratio = (1.0 * actual_score_1kb) / expected_score_1kb; - } - // Ratio 1.0 .. 1.5 scores 100% - // Ratio 2.0 scores 80% - // Linear decline, to ratio 4.0 scores 0% - if (ratio <= kRatio100) {return 100;} - if (ratio > kRatio0) {return 0;} - - int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100); - return percent_good; -} - -// Create a langprob packed value from its parts. -// qprob is quantized [0..12] -// We use Latn script to represent any RTypeMany language -uint32 MakeLangProb(Language lang, int qprob) { - uint32 pslang = PerScriptNumber(ULScript_Latin, lang); - uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob]; - return retval; -} - -} // End namespace CLD2 - - - - - |