summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/internal/tote.h
diff options
context:
space:
mode:
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/tote.h')
-rw-r--r--application/basilisk/components/translation/cld2/internal/tote.h112
1 files changed, 112 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/tote.h b/application/basilisk/components/translation/cld2/internal/tote.h
new file mode 100644
index 000000000..cd45592ec
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/tote.h
@@ -0,0 +1,112 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#ifndef I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
+#define I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_
+
+#include <stdio.h>
+#include "integral_types.h" // for uint8 etc
+
+namespace CLD2 {
+
+
+// Take a set of <key, score> pairs and tote them up.
+// Key is an 8-bit per-script language.
+// After explicitly sorting, retrieve top key, score pairs.
+// Normal use is key=per-script language.
+// The main data structure is an array of 256 uint16 counts. We normally
+// expect this to be initialized, added-to about 60 times, then the top three
+// items found. To reduce the initial and final time, we also keep a bit vector
+// of unused (and uninitialized) parts: 64 bits, each covering four keys.
+class Tote {
+ public:
+ Tote();
+ ~Tote();
+ void Reinit(); // Defined out of line; presumably resets the tote for reuse
+ void AddScoreCount(); // Defined out of line; presumably bumps score_count_
+ void Add(uint8 ikey, int idelta); // Defined out of line; presumably adds idelta to ikey's score
+ void AddBytes(int ibytes) {byte_count_ += ibytes;} // Accumulates into byte_count_
+ void CurrentTopThreeKeys(int* key3) const; // Presumably fills key3[0..2] -- TODO confirm
+ int GetScoreCount() const {return score_count_;}
+ int GetByteCount() const {return byte_count_;}
+ int GetScore(int i) const {return score_[i];} // No bounds check; i must be 0..255
+ void SetScoreCount(uint16 v) {score_count_ = v;}
+ void SetScore(int i, int v) {score_[i] = v;} // No bounds check; v is narrowed to uint16
+
+ private:
+ uint64 in_use_mask_; // 64 bits, one for each group of 4 scores.
+ // 0 = not initialized, not used
+ int byte_count_; // Bytes of text scored
+ int score_count_; // Number of quadgrams/etc. scored
+ union {
+ uint64 gscore_[64]; // For alignment and clearing quickly; overlays score_
+ uint16 score_[256]; // Probability score sum, indexed by key
+ };
+
+};
+
+
+// Take a set of <key, score, reliability> triples and tote them up.
+// Key is a 16-bit full language.
+// After explicitly sorting, retrieve top key, score, reliability triples.
+class DocTote {
+ public:
+ DocTote();
+ ~DocTote();
+ void Reinit(); // Defined out of line; presumably resets the tote for reuse
+ void Add(uint16 ikey, int ibytes, int score, int ireliability); // Defined out of line
+ int Find(uint16 ikey); // Defined out of line; presumably returns ikey's slot -- TODO confirm
+ void AddClosePair(int subscr, int val) {closepair_[subscr] += val;} // No bounds check on subscr
+ int CurrentTopKey();
+ Tote* RunningScore() {return &runningscore_;} // Non-owning pointer to the embedded Tote
+ void Sort(int n); // Defined out of line; see sorted_ for its effect on Add
+ void Dump(FILE* f); // Defined out of line; presumably writes the contents to f
+
+ int GetIncrCount() const {return incr_count_;}
+ int GetClosePair(int subscr) const {return closepair_[subscr];} // No bounds check on subscr
+ int MaxSize() const {return kMaxSize_;} // Length of the key_/value_/score_/reliability_ arrays
+ uint16 Key(int i) const {return key_[i];} // Indexed accessors below do no bounds checking
+ int Value(int i) const {return value_[i];} // byte count
+ int Score(int i) const {return score_[i];} // sum lg prob
+ int Reliability(int i) const {return reliability_[i];}
+ void SetKey(int i, int v) {key_[i] = v;} // v is narrowed to uint16
+ void SetValue(int i, int v) {value_[i] = v;}
+ void SetScore(int i, int v) {score_[i] = v;}
+ void SetReliability(int i, int v) {reliability_[i] = v;}
+
+ static const uint16 kUnusedKey = 0xFFFF; // Same value the key_ comment gives for "unassigned"
+
+ private:
+ static const int kMaxSize_ = 24; // Slots in each of the parallel per-language arrays
+ static const int kMaxClosePairSize_ = 8; // Slots in closepair_
+
+ int incr_count_; // Number of Add calls
+ int sorted_; // Contents have been sorted, cannot Add
+ Tote runningscore_; // Top lang scores across entire doc, for
+ // helping resolve close pairs
+ // Align at multiple of 8 bytes
+ int closepair_[kMaxClosePairSize_];
+ uint16 key_[kMaxSize_]; // Lang unassigned = 0xFFFF, valid = 1..1023
+ int value_[kMaxSize_]; // Bytecount this lang
+ int score_[kMaxSize_]; // Probability score sum
+ int reliability_[kMaxSize_]; // Percentage 0..100
+};
+
+} // End namespace CLD2
+
+#endif // I18N_ENCODINGS_CLD2_INTERNAL_TOTE_H_