summaryrefslogtreecommitdiffstats
path: root/browser/components/translation/cld2/internal/offsetmap.cc
diff options
context:
space:
mode:
Diffstat (limited to 'browser/components/translation/cld2/internal/offsetmap.cc')
-rw-r--r--browser/components/translation/cld2/internal/offsetmap.cc569
1 files changed, 569 insertions, 0 deletions
diff --git a/browser/components/translation/cld2/internal/offsetmap.cc b/browser/components/translation/cld2/internal/offsetmap.cc
new file mode 100644
index 000000000..84609a71f
--- /dev/null
+++ b/browser/components/translation/cld2/internal/offsetmap.cc
@@ -0,0 +1,569 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+//
+
+#include "offsetmap.h"
+
+#include <string.h> // for strcmp
+#include <stdio.h> // for fprintf, stderr, fclose, etc
+#include <algorithm> // for min
+
+using namespace std;
+
+namespace CLD2 {
+
+// Constructor, destructor
+OffsetMap::OffsetMap() {
+ Clear();
+}
+
+OffsetMap::~OffsetMap() {
+}
+
+// Clear the map
+// After:
+// next_diff_sub_ is 0
+// Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1]
+// which is a fake range of width 0 mapping 0=>0
+void OffsetMap::Clear() {
+ diffs_.clear();
+ pending_op_ = COPY_OP;
+ pending_length_ = 0;
+ next_diff_sub_ = 0;
+ current_lo_aoffset_ = 0;
+ current_hi_aoffset_ = 0;
+ current_lo_aprimeoffset_ = 0;
+ current_hi_aprimeoffset_ = 0;
+ current_diff_ = 0;
+ max_aoffset_ = 0; // Largest seen so far
+ max_aprimeoffset_ = 0; // Largest seen so far
+}
+
+static inline char OpPart(const char c) {
+ return (c >> 6) & 3;
+}
+static inline char LenPart(const char c) {
+ return c & 0x3f;
+}
+
+// Print map to file, for debugging
+void OffsetMap::Printmap(const char* filename) {
+ FILE* fout;
+ bool needs_close = false;
+ if (strcmp(filename, "stdout") == 0) {
+ fout = stdout;
+ } else if (strcmp(filename, "stderr") == 0) {
+ fout = stderr;
+ } else {
+ fout = fopen(filename, "w");
+ needs_close = true;
+ }
+ if (fout == NULL) {
+ fprintf(stderr, "%s did not open\n", filename);
+ return;
+ }
+
+ Flush(); // Make sure any pending entry gets printed
+ fprintf(fout, "Offsetmap: %ld bytes\n", diffs_.size());
+ for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
+ fprintf(fout, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
+ if ((i % 20) == 19) {fprintf(fout, "\n");}
+ }
+ fprintf(fout, "\n");
+ if (needs_close) {
+ fclose(fout);
+ }
+}
+
+// Reset to offset 0
+void OffsetMap::Reset() {
+ MaybeFlushAll();
+
+ next_diff_sub_ = 0;
+ current_lo_aoffset_ = 0;
+ current_hi_aoffset_ = 0;
+ current_lo_aprimeoffset_ = 0;
+ current_hi_aprimeoffset_ = 0;
+ current_diff_ = 0;
+}
+
+// Add to mapping from A to A', specifying how many next bytes are
+// identical in A and A'
+void OffsetMap::Copy(int bytes) {
+ if (bytes == 0) {return;}
+ max_aoffset_ += bytes; // Largest seen so far
+ max_aprimeoffset_ += bytes; // Largest seen so far
+ if (pending_op_ == COPY_OP) {
+ pending_length_ += bytes;
+ } else {
+ Flush();
+ pending_op_ = COPY_OP;
+ pending_length_ = bytes;
+ }
+}
+
+// Add to mapping from A to A', specifying how many next bytes are
+// inserted in A' while not advancing in A at all
+void OffsetMap::Insert(int bytes){
+ if (bytes == 0) {return;}
+ max_aprimeoffset_ += bytes; // Largest seen so far
+ if (pending_op_ == INSERT_OP) {
+ pending_length_ += bytes;
+ } else if ((bytes == 1) &&
+ (pending_op_ == DELETE_OP) && (pending_length_ == 1)) {
+ // Special-case exactly delete(1) insert(1) +> copy(1);
+ // all others backmap inserts to after deletes
+ pending_op_ = COPY_OP;
+ } else {
+ Flush();
+ pending_op_ = INSERT_OP;
+ pending_length_ = bytes;
+ }
+}
+
+// Add to mapping from A to A', specifying how many next bytes are
+// deleted from A while not advancing in A' at all
+void OffsetMap::Delete(int bytes){
+ if (bytes == 0) {return;}
+ max_aoffset_ += bytes; // Largest seen so far
+ if (pending_op_ == DELETE_OP) {
+ pending_length_ += bytes;
+ } else if ((bytes == 1) &&
+ (pending_op_ == INSERT_OP) && (pending_length_ == 1)) {
+ // Special-case exactly insert(1) delete(1) => copy(1);
+ // all others backmap deletes to after insertss
+ pending_op_ = COPY_OP;
+ } else {
+ Flush();
+ pending_op_ = DELETE_OP;
+ pending_length_ = bytes;
+ }
+}
+
+void OffsetMap::Flush() {
+ if (pending_length_ == 0) {
+ return;
+ }
+ // We may be emitting a copy op just after a copy op because +1 -1 cancelled
+ // inbetween. If the lengths don't need a prefix byte, combine them
+ if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
+ char c = diffs_[diffs_.size() - 1];
+ MapOp prior_op = static_cast<MapOp>(OpPart(c));
+ int prior_len = LenPart(c);
+ if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
+ diffs_[diffs_.size() - 1] += pending_length_;
+ pending_length_ = 0;
+ return;
+ }
+ }
+ if (pending_length_ > 0x3f) {
+ bool non_zero_emitted = false;
+ for (int shift = 30; shift > 0; shift -= 6) {
+ int prefix = (pending_length_ >> shift) & 0x3f;
+ if ((prefix > 0) || non_zero_emitted) {
+ Emit(PREFIX_OP, prefix);
+ non_zero_emitted = true;
+ }
+ }
+ }
+ Emit(pending_op_, pending_length_ & 0x3f);
+ pending_length_ = 0;
+}
+
+
+// Add one more entry to copy one byte off the end, then flush
+void OffsetMap::FlushAll() {
+ Copy(1);
+ Flush();
+}
+
+// Flush all if necessary
+void OffsetMap::MaybeFlushAll() {
+ if ((0 < pending_length_) || diffs_.empty()) {
+ FlushAll();
+ }
+}
+
+// Len may be 0, for example as the low piece of length=64
+void OffsetMap::Emit(MapOp op, int len) {
+ char c = (static_cast<char>(op) << 6) | (len & 0x3f);
+ diffs_.push_back(c);
+}
+
+void OffsetMap::DumpString() {
+ for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
+ fprintf(stderr, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i]));
+ }
+ fprintf(stderr, "\n");
+
+ // Print running table of correspondences
+ fprintf(stderr, " op A => A' (A forward-maps to A')\n");
+ int aoffset = 0;
+ int aprimeoffset = 0;
+ int length = 0;
+ for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) {
+ char c = diffs_[i];
+ MapOp op = static_cast<MapOp>(OpPart(c));
+ int len = LenPart(c);
+ length = (length << 6) + len;
+ if (op == COPY_OP) {
+ aoffset += length;
+ aprimeoffset += length;
+ length = 0;
+ } else if (op == INSERT_OP) {
+ aoffset += 0;
+ aprimeoffset += length;
+ length = 0;
+ } else if (op == DELETE_OP) {
+ aoffset += length;
+ aprimeoffset += 0;
+ length = 0;
+ } else { // (op == PREFIX_OP)
+ // Do nothing else
+ }
+ fprintf(stderr, "[%3d] %c%02d %6d %6d%s\n",
+ i, "&=+-"[op], len,
+ aoffset, aprimeoffset,
+ (next_diff_sub_ == i) ? " <==next_diff_sub_" : "");
+
+ }
+ fprintf(stderr, "\n");
+}
+
+void OffsetMap::DumpWindow() {
+ fprintf(stderr, "DumpWindow(A => A'): max_aoffset_ = %d, "
+ "max_aprimeoffset_ = %d, next_diff_sub_ = %d<br>\n",
+ max_aoffset_, max_aprimeoffset_, next_diff_sub_);
+ fprintf(stderr, "A [%u..%u)\n",
+ current_lo_aoffset_, current_hi_aoffset_);
+ fprintf(stderr, "A' [%u..%u)\n",
+ current_lo_aprimeoffset_, current_hi_aprimeoffset_);
+ fprintf(stderr, " diff = %d\n", current_diff_);
+ DumpString();
+}
+
+//----------------------------------------------------------------------------//
+// The guts of the 2013 design //
+// If there are three ranges a b c in diffs_, we can be in one of five //
+// states: LEFT of a, in ranges a b c, or RIGHT of c //
+// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
+// position next_diff_sub_ //
+// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
+// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
+// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
+// next_diff_sub_=diffs_.size() //
+// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
+// correspond to each other. If range i is active, next_diff_sub_ is at //
+// the first byte of range i+1. Because of the length-prefix operator, //
+// an individual range item in diffs_ may be multiple bytes //
+// In all cases aprimeoffset = aoffset + current_diff_ //
+// i.e. current_diff_ = aprimeoffset - aoffset //
+// //
+// In the degenerate case of diffs_.empty(), there are only two states //
+// LEFT and RIGHT and the mapping is the identity mapping. //
+// The initial state is LEFT. //
+// It is an error to move left into LEFT or right into RIGHT, but the code //
+// below is robust in these cases. //
+//----------------------------------------------------------------------------//
+
+void OffsetMap::SetLeft() {
+ current_lo_aoffset_ = 0;
+ current_hi_aoffset_ = 0;
+ current_lo_aprimeoffset_ = 0;
+ current_hi_aprimeoffset_ = 0;
+ current_diff_ = 0;
+ next_diff_sub_ = 0;
+}
+
+void OffsetMap::SetRight() {
+ current_lo_aoffset_ = max_aoffset_;
+ current_hi_aoffset_ = max_aoffset_;
+ current_lo_aprimeoffset_ = max_aprimeoffset_;
+ current_hi_aprimeoffset_ = max_aprimeoffset_;
+ current_diff_ = max_aprimeoffset_ - max_aoffset_;
+ next_diff_sub_ = 0;
+}
+
+// Back up over previous range, 1..5 bytes
+// Return subscript at the beginning of that. Pins at 0
+int OffsetMap::Backup(int sub) {
+ if (sub <= 0) {return 0;}
+ --sub;
+ while ((0 < sub) &&
+ (static_cast<MapOp>(OpPart(diffs_[sub - 1]) == PREFIX_OP))) {
+ --sub;
+ }
+ return sub;
+}
+
+// Parse next range, 1..5 bytes
+// Return subscript just off the end of that
+int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
+ *op = PREFIX_OP;
+ *length = 0;
+ char c;
+ while ((sub < static_cast<int>(diffs_.size())) && (*op == PREFIX_OP)) {
+ c = diffs_[sub++];
+ *op = static_cast<MapOp>(OpPart(c));
+ int len = LenPart(c);
+ *length = (*length << 6) + len;
+ }
+ // If mal-formed or in RIGHT, this will return with op = PREFIX_OP
+ // Mal-formed can include a trailing prefix byte with no following op
+ return sub;
+}
+
+// Parse previous range, 1..5 bytes
+// Return current subscript
+int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
+ sub = Backup(sub);
+ return ParseNext(sub, op, length);
+}
+
+// Quick debugging dump; does not parse multi-byte items, so just length & 0x3f
+void OffsetMap::PrintPosition(const char* str) {
+ MapOp op = PREFIX_OP;
+ int length = 0;
+ if ((0 < next_diff_sub_) && (next_diff_sub_ <= static_cast<int>(diffs_.size()))) {
+ op = static_cast<MapOp>(OpPart(diffs_[next_diff_sub_ - 1]));
+ length = LenPart(diffs_[next_diff_sub_ - 1]);
+ }
+ fprintf(stderr, "%s[%d] %c%02d = A[%d..%d) ==> A'[%d..%d)\n",
+ str,
+ next_diff_sub_, "&=+-"[op], length,
+ current_lo_aoffset_, current_hi_aoffset_,
+ current_lo_aprimeoffset_, current_hi_aprimeoffset_);
+}
+
+// Move active window one range to the right
+// Return true if move was OK
+bool OffsetMap::MoveRight() {
+ // If at last range or RIGHT, set to RIGHT, return error
+ if (next_diff_sub_ >= static_cast<int>(diffs_.size())) {
+ SetRight();
+ return false;
+ }
+ // Actually OK to move right
+ MapOp op;
+ int length;
+ bool retval = true;
+ // If mal-formed or in RIGHT, this will return with op = PREFIX_OP
+ next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);
+
+ current_lo_aoffset_ = current_hi_aoffset_;
+ current_lo_aprimeoffset_ = current_hi_aprimeoffset_;
+ if (op == COPY_OP) {
+ current_hi_aoffset_ = current_lo_aoffset_ + length;
+ current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
+ } else if (op == INSERT_OP) {
+ current_hi_aoffset_ = current_lo_aoffset_ + 0;
+ current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
+ } else if (op == DELETE_OP) {
+ current_hi_aoffset_ = current_lo_aoffset_ + length;
+ current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + 0;
+ } else {
+ SetRight();
+ retval = false;
+ }
+ current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
+ return retval;
+}
+
+// Move active window one range to the left
+// Return true if move was OK
+bool OffsetMap::MoveLeft() {
+ // If at first range or LEFT, set to LEFT, return error
+ if (next_diff_sub_ <= 0) {
+ SetLeft();
+ return false;
+ }
+ // Back up over current active window
+ next_diff_sub_ = Backup(next_diff_sub_);
+ if (next_diff_sub_ <= 0) {
+ SetLeft();
+ return false;
+ }
+ // Actually OK to move left
+ MapOp op;
+ int length;
+ bool retval = true;
+ // If mal-formed or in LEFT, this will return with op = PREFIX_OP
+ next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);
+
+ current_hi_aoffset_ = current_lo_aoffset_;
+ current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
+ if (op == COPY_OP) {
+ current_lo_aoffset_ = current_hi_aoffset_ - length;
+ current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
+ } else if (op == INSERT_OP) {
+ current_lo_aoffset_ = current_hi_aoffset_ - 0;
+ current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
+ } else if (op == DELETE_OP) {
+ current_lo_aoffset_ = current_hi_aoffset_ - length;
+ current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
+ } else {
+ SetLeft();
+ retval = false;
+ }
+ current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
+ return true;
+}
+
+// Map an offset in A' to the corresponding offset in A
+int OffsetMap::MapBack(int aprimeoffset){
+ MaybeFlushAll();
+ if (aprimeoffset < 0) {return 0;}
+ if (max_aprimeoffset_ <= aprimeoffset) {
+ return (aprimeoffset - max_aprimeoffset_) + max_aoffset_;
+ }
+
+ // If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_,
+ // use current mapping, else move window left/right
+ bool ok = true;
+ while (ok && (aprimeoffset < current_lo_aprimeoffset_)) {
+ ok = MoveLeft();
+ }
+ while (ok && (current_hi_aprimeoffset_ <= aprimeoffset)) {
+ ok = MoveRight();
+ }
+ // So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_
+
+ int aoffset = aprimeoffset - current_diff_;
+ if (aoffset >= current_hi_aoffset_) {
+ // A' is in an insert region, all bytes of which backmap to A=hi_aoffset_
+ aoffset = current_hi_aoffset_;
+ }
+ return aoffset;
+}
+
+// Map an offset in A to the corresponding offset in A'
+int OffsetMap::MapForward(int aoffset){
+ MaybeFlushAll();
+ if (aoffset < 0) {return 0;}
+ if (max_aoffset_ <= aoffset) {
+ return (aoffset - max_aoffset_) + max_aprimeoffset_;
+ }
+
+ // If current_lo_aoffset_ <= aoffset < current_hi_aoffset_,
+ // use current mapping, else move window left/right
+ bool ok = true;
+ while (ok && (aoffset < current_lo_aoffset_)) {
+ ok = MoveLeft();
+ }
+ while (ok && (current_hi_aoffset_ <= aoffset)) {
+ ok = MoveRight();
+ }
+
+ int aprimeoffset = aoffset + current_diff_;
+ if (aprimeoffset >= current_hi_aprimeoffset_) {
+ // A is in a delete region, all bytes of which map to A'=hi_aprimeoffset_
+ aprimeoffset = current_hi_aprimeoffset_;
+ }
+ return aprimeoffset;
+}
+
+
+// static
+bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
+ bool ok = true;
+ while (ok && (source->next_diff_sub_ != source->diffs_.size())) {
+ ok = source->MoveRight();
+ if (source->current_lo_aoffset_ != source->current_hi_aoffset_) {
+ return false;
+ }
+ dest->Insert(
+ source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_);
+ }
+ return true;
+}
+
+// static
+bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
+ bool ok = true;
+ while (ok && (source->next_diff_sub_ != source->diffs_.size())) {
+ ok = source->MoveRight();
+ if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) {
+ return false;
+ }
+ dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_);
+ }
+ return true;
+}
+
+// static
+void OffsetMap::ComposeOffsetMap(
+ OffsetMap* g, OffsetMap* f, OffsetMap* h) {
+ h->Clear();
+ f->Reset();
+ g->Reset();
+
+ int lo = 0;
+ for (;;) {
+ // Consume delete operations in f. This moves A without moving
+ // A' and A''.
+ if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
+ if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
+ // fprintf(stderr,
+ // "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
+ }
+
+ // FlushAll(), called by Reset(), MapForward() or MapBack(), has
+ // added an extra COPY_OP to f and g, so this function has
+ // composed an extra COPY_OP in h from those. To avoid
+ // FlushAll() adds one more extra COPY_OP to h later, dispatch
+ // Flush() right now.
+ h->Flush();
+ return;
+ }
+
+ // Consume insert operations in g. This moves A'' without moving A
+ // and A'.
+ if (lo >= f->current_hi_aprimeoffset_) {
+ if (!CopyDeletes(f, h)) {
+ // fprintf(stderr,
+ // "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
+ }
+ }
+
+ // Compose one operation which moves A' from lo to hi.
+ int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
+ if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
+ g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
+ h->Copy(hi - lo);
+ } else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
+ h->Delete(hi - lo);
+ } else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
+ h->Insert(hi - lo);
+ }
+
+ lo = hi;
+ }
+}
+
+// For testing only -- force a mapping
+void OffsetMap::StuffIt(const string& diffs,
+ int max_aoffset, int max_aprimeoffset) {
+ Clear();
+ diffs_ = diffs;
+ max_aoffset_ = max_aoffset;
+ max_aprimeoffset_ = max_aprimeoffset;
+}
+
+
+} // namespace CLD2
+