diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/tote.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/tote.cc | 265 |
1 files changed, 0 insertions, 265 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/tote.cc b/application/basilisk/components/translation/cld2/internal/tote.cc deleted file mode 100644 index fbaba7d5c..000000000 --- a/application/basilisk/components/translation/cld2/internal/tote.cc +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - -#include "tote.h" -#include "lang_script.h" // For LanguageCode in Dump - -#include <stdio.h> -#include <string.h> // For memset - -namespace CLD2 { - -// Take a set of <key, value> pairs and tote them up. -// After explicitly sorting, retrieve top key, value pairs -// Normal use is key=per-script language and value = probability score -Tote::Tote() { - in_use_mask_ = 0; - byte_count_ = 0; - score_count_ = 0; - // No need to initialize values -} - -Tote::~Tote() { -} - -void Tote::Reinit() { - in_use_mask_ = 0; - byte_count_ = 0; - score_count_ = 0; - // No need to initialize values -} -// Increment count of quadgrams/trigrams/unigrams scored -void Tote::AddScoreCount() { - ++score_count_; -} - - -void Tote::Add(uint8 ikey, int idelta) { - int key_group = ikey >> 2; - uint64 groupmask = (1ULL << key_group); - if ((in_use_mask_ & groupmask) == 0) { - // Initialize this group - gscore_[key_group] = 0; - in_use_mask_ |= groupmask; - } - score_[ikey] += idelta; -} - - -// Return current top three keys -void Tote::CurrentTopThreeKeys(int* key3) const { - key3[0] = -1; - key3[1] = -1; - key3[2] = -1; - int score3[3] = {-1, -1, -1}; - uint64 tempmask = in_use_mask_; - int base = 0; - while (tempmask != 0) { - if (tempmask & 1) { - // Look at four in-use keys - for (int i = 0; i < 4; ++i) { - int insert_me = score_[base + i]; - // Favor lower numbers on ties - if (insert_me > score3[2]) { - // Insert - int insert_at = 2; - if (insert_me > score3[1]) { - score3[2] = score3[1]; - key3[2] = key3[1]; - insert_at = 1; - if (insert_me > score3[0]) { - score3[1] = score3[0]; - key3[1] = key3[0]; - insert_at = 0; - } - } - score3[insert_at] = insert_me; - key3[insert_at] = base + i; - } - } - } - tempmask >>= 1; - base += 4; - } -} - - -// Take a set of <key, value> pairs and tote them up. -// After explicitly sorting, retrieve top key, value pairs -// 0xFFFF in key signifies unused -DocTote::DocTote() { - // No need to initialize score_ or value_ - incr_count_ = 0; - sorted_ = 0; - memset(closepair_, 0, sizeof(closepair_)); - memset(key_, 0xFF, sizeof(key_)); -} - -DocTote::~DocTote() { -} - -void DocTote::Reinit() { - // No need to initialize score_ or value_ - incr_count_ = 0; - sorted_ = 0; - memset(closepair_, 0, sizeof(closepair_)); - memset(key_, 0xFF, sizeof(key_)); - runningscore_.Reinit(); -} - -// Weight reliability by ibytes -// Also see three-way associative comments above for Tote -void DocTote::Add(uint16 ikey, int ibytes, - int score, int ireliability) { - ++incr_count_; - - // Look for existing entry in top 2 positions of 3, times 8 columns - int sub0 = ikey & 15; - if (key_[sub0] == ikey) { - value_[sub0] += ibytes; - score_[sub0] += score; - reliability_[sub0] += ireliability * ibytes; - return; - } - // Look for existing entry in other of top 2 positions of 3, times 8 columns - int sub1 = sub0 ^ 8; - if (key_[sub1] == ikey) { - value_[sub1] += ibytes; - score_[sub1] += score; - reliability_[sub1] += ireliability * ibytes; - return; - } - // Look for existing entry in third position of 3, times 8 columns - int sub2 = (ikey & 7) + 16; - if (key_[sub2] == ikey) { - value_[sub2] += ibytes; - score_[sub2] += score; - reliability_[sub2] += ireliability * ibytes; - return; - } - - // Allocate new entry - int alloc = -1; - if (key_[sub0] == kUnusedKey) { - alloc = sub0; - } else if (key_[sub1] == kUnusedKey) { - alloc = sub1; - } else if (key_[sub2] == kUnusedKey) { - alloc = sub2; - } else { - // All choices allocated, need to replace smallest one - alloc = sub0; - if (value_[sub1] < value_[alloc]) {alloc = sub1;} - if (value_[sub2] < value_[alloc]) {alloc = sub2;} - } - key_[alloc] = ikey; - value_[alloc] = ibytes; - score_[alloc] = score; - reliability_[alloc] = ireliability * ibytes; - return; -} - -// Find subscript of a given packed language, or -1 -int DocTote::Find(uint16 ikey) { - if (sorted_) { - // Linear search if sorted - for (int sub = 0; sub < kMaxSize_; ++sub) { - if (key_[sub] == ikey) {return sub;} - } - return -1; - } - - // Look for existing entry - int sub0 = ikey & 15; - if (key_[sub0] == ikey) { - return sub0; - } - int sub1 = sub0 ^ 8; - if (key_[sub1] == ikey) { - return sub1; - } - int sub2 = (ikey & 7) + 16; - if (key_[sub2] == ikey) { - return sub2; - } - - return -1; -} - -// Return current top key -int DocTote::CurrentTopKey() { - int top_key = 0; - int top_value = -1; - for (int sub = 0; sub < kMaxSize_; ++sub) { - if (key_[sub] == kUnusedKey) {continue;} - if (top_value < value_[sub]) { - top_value = value_[sub]; - top_key = key_[sub]; - } - } - return top_key; -} - - -// Sort first n entries by decreasing order of value -// If key==0 other fields are not valid, treat value as -1 -void DocTote::Sort(int n) { - // This is n**2, but n is small - for (int sub = 0; sub < n; ++sub) { - if (key_[sub] == kUnusedKey) {value_[sub] = -1;} - - // Bubble sort key[sub] and entry[sub] - for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { - if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} - if (value_[sub] < value_[sub2]) { - // swap - uint16 tmpk = key_[sub]; - key_[sub] = key_[sub2]; - key_[sub2] = tmpk; - - int tmpv = value_[sub]; - value_[sub] = value_[sub2]; - value_[sub2] = tmpv; - - double tmps = score_[sub]; - score_[sub] = score_[sub2]; - score_[sub2] = tmps; - - int tmpr = reliability_[sub]; - reliability_[sub] = reliability_[sub2]; - reliability_[sub2] = tmpr; - } - } - } - sorted_ = 1; -} - -void DocTote::Dump(FILE* f) { - fprintf(f, "DocTote::Dump\n"); - for (int sub = 0; sub < kMaxSize_; ++sub) { - if (key_[sub] != kUnusedKey) { - Language lang = static_cast<Language>(key_[sub]); - fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), - value_[sub], score_[sub], reliability_[sub]); - } - } - fprintf(f, " %d chunks scored<br>\n", incr_count_); -} - -} // End namespace CLD2 - |