summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/internal/tote.cc
diff options
context:
space:
mode:
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/tote.cc')
-rw-r--r--application/basilisk/components/translation/cld2/internal/tote.cc265
1 files changed, 265 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/tote.cc b/application/basilisk/components/translation/cld2/internal/tote.cc
new file mode 100644
index 000000000..fbaba7d5c
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/tote.cc
@@ -0,0 +1,265 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#include "tote.h"
+#include "lang_script.h" // For LanguageCode in Dump
+
+#include <stdio.h>
+#include <string.h> // For memset
+
+namespace CLD2 {
+
+// Take a set of <key, value> pairs and tote them up.
+// After explicitly sorting, retrieve top key, value pairs
+// Normal use is key=per-script language and value = probability score
+Tote::Tote() {
+ in_use_mask_ = 0;
+ byte_count_ = 0;
+ score_count_ = 0;
+ // No need to initialize values
+}
+
+Tote::~Tote() {
+}
+
+void Tote::Reinit() {
+ in_use_mask_ = 0;
+ byte_count_ = 0;
+ score_count_ = 0;
+ // No need to initialize values
+}
+// Increment count of quadgrams/trigrams/unigrams scored
+void Tote::AddScoreCount() {
+ ++score_count_;
+}
+
+
+void Tote::Add(uint8 ikey, int idelta) {
+ int key_group = ikey >> 2;
+ uint64 groupmask = (1ULL << key_group);
+ if ((in_use_mask_ & groupmask) == 0) {
+ // Initialize this group
+ gscore_[key_group] = 0;
+ in_use_mask_ |= groupmask;
+ }
+ score_[ikey] += idelta;
+}
+
+
+// Return current top three keys
+void Tote::CurrentTopThreeKeys(int* key3) const {
+ key3[0] = -1;
+ key3[1] = -1;
+ key3[2] = -1;
+ int score3[3] = {-1, -1, -1};
+ uint64 tempmask = in_use_mask_;
+ int base = 0;
+ while (tempmask != 0) {
+ if (tempmask & 1) {
+ // Look at four in-use keys
+ for (int i = 0; i < 4; ++i) {
+ int insert_me = score_[base + i];
+ // Favor lower numbers on ties
+ if (insert_me > score3[2]) {
+ // Insert
+ int insert_at = 2;
+ if (insert_me > score3[1]) {
+ score3[2] = score3[1];
+ key3[2] = key3[1];
+ insert_at = 1;
+ if (insert_me > score3[0]) {
+ score3[1] = score3[0];
+ key3[1] = key3[0];
+ insert_at = 0;
+ }
+ }
+ score3[insert_at] = insert_me;
+ key3[insert_at] = base + i;
+ }
+ }
+ }
+ tempmask >>= 1;
+ base += 4;
+ }
+}
+
+
+// Take a set of <key, value> pairs and tote them up.
+// After explicitly sorting, retrieve top key, value pairs
+// 0xFFFF in key signifies unused
+DocTote::DocTote() {
+ // No need to initialize score_ or value_
+ incr_count_ = 0;
+ sorted_ = 0;
+ memset(closepair_, 0, sizeof(closepair_));
+ memset(key_, 0xFF, sizeof(key_));
+}
+
+DocTote::~DocTote() {
+}
+
+void DocTote::Reinit() {
+ // No need to initialize score_ or value_
+ incr_count_ = 0;
+ sorted_ = 0;
+ memset(closepair_, 0, sizeof(closepair_));
+ memset(key_, 0xFF, sizeof(key_));
+ runningscore_.Reinit();
+}
+
+// Weight reliability by ibytes
+// Also see three-way associative comments above for Tote
+void DocTote::Add(uint16 ikey, int ibytes,
+ int score, int ireliability) {
+ ++incr_count_;
+
+ // Look for existing entry in top 2 positions of 3, times 8 columns
+ int sub0 = ikey & 15;
+ if (key_[sub0] == ikey) {
+ value_[sub0] += ibytes;
+ score_[sub0] += score;
+ reliability_[sub0] += ireliability * ibytes;
+ return;
+ }
+ // Look for existing entry in other of top 2 positions of 3, times 8 columns
+ int sub1 = sub0 ^ 8;
+ if (key_[sub1] == ikey) {
+ value_[sub1] += ibytes;
+ score_[sub1] += score;
+ reliability_[sub1] += ireliability * ibytes;
+ return;
+ }
+ // Look for existing entry in third position of 3, times 8 columns
+ int sub2 = (ikey & 7) + 16;
+ if (key_[sub2] == ikey) {
+ value_[sub2] += ibytes;
+ score_[sub2] += score;
+ reliability_[sub2] += ireliability * ibytes;
+ return;
+ }
+
+ // Allocate new entry
+ int alloc = -1;
+ if (key_[sub0] == kUnusedKey) {
+ alloc = sub0;
+ } else if (key_[sub1] == kUnusedKey) {
+ alloc = sub1;
+ } else if (key_[sub2] == kUnusedKey) {
+ alloc = sub2;
+ } else {
+ // All choices allocated, need to replace smallest one
+ alloc = sub0;
+ if (value_[sub1] < value_[alloc]) {alloc = sub1;}
+ if (value_[sub2] < value_[alloc]) {alloc = sub2;}
+ }
+ key_[alloc] = ikey;
+ value_[alloc] = ibytes;
+ score_[alloc] = score;
+ reliability_[alloc] = ireliability * ibytes;
+ return;
+}
+
+// Find subscript of a given packed language, or -1
+int DocTote::Find(uint16 ikey) {
+ if (sorted_) {
+ // Linear search if sorted
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
+ if (key_[sub] == ikey) {return sub;}
+ }
+ return -1;
+ }
+
+ // Look for existing entry
+ int sub0 = ikey & 15;
+ if (key_[sub0] == ikey) {
+ return sub0;
+ }
+ int sub1 = sub0 ^ 8;
+ if (key_[sub1] == ikey) {
+ return sub1;
+ }
+ int sub2 = (ikey & 7) + 16;
+ if (key_[sub2] == ikey) {
+ return sub2;
+ }
+
+ return -1;
+}
+
+// Return current top key
+int DocTote::CurrentTopKey() {
+ int top_key = 0;
+ int top_value = -1;
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
+ if (key_[sub] == kUnusedKey) {continue;}
+ if (top_value < value_[sub]) {
+ top_value = value_[sub];
+ top_key = key_[sub];
+ }
+ }
+ return top_key;
+}
+
+
+// Sort first n entries by decreasing order of value
+// If key==0 other fields are not valid, treat value as -1
+void DocTote::Sort(int n) {
+ // This is n**2, but n is small
+ for (int sub = 0; sub < n; ++sub) {
+ if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
+
+ // Bubble sort key[sub] and entry[sub]
+ for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
+ if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
+ if (value_[sub] < value_[sub2]) {
+ // swap
+ uint16 tmpk = key_[sub];
+ key_[sub] = key_[sub2];
+ key_[sub2] = tmpk;
+
+ int tmpv = value_[sub];
+ value_[sub] = value_[sub2];
+ value_[sub2] = tmpv;
+
+ double tmps = score_[sub];
+ score_[sub] = score_[sub2];
+ score_[sub2] = tmps;
+
+ int tmpr = reliability_[sub];
+ reliability_[sub] = reliability_[sub2];
+ reliability_[sub2] = tmpr;
+ }
+ }
+ }
+ sorted_ = 1;
+}
+
+void DocTote::Dump(FILE* f) {
+ fprintf(f, "DocTote::Dump\n");
+ for (int sub = 0; sub < kMaxSize_; ++sub) {
+ if (key_[sub] != kUnusedKey) {
+ Language lang = static_cast<Language>(key_[sub]);
+ fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
+ value_[sub], score_[sub], reliability_[sub]);
+ }
+ }
+ fprintf(f, " %d chunks scored<br>\n", incr_count_);
+}
+
+} // End namespace CLD2
+