diff options
Diffstat (limited to 'browser/components/translation/cld2/internal/offsetmap.cc')
-rw-r--r-- | browser/components/translation/cld2/internal/offsetmap.cc | 569 |
1 files changed, 569 insertions, 0 deletions
diff --git a/browser/components/translation/cld2/internal/offsetmap.cc b/browser/components/translation/cld2/internal/offsetmap.cc new file mode 100644 index 000000000..84609a71f --- /dev/null +++ b/browser/components/translation/cld2/internal/offsetmap.cc @@ -0,0 +1,569 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// +// + +#include "offsetmap.h" + +#include <string.h> // for strcmp +#include <stdio.h> // for fprintf, stderr, fclose, etc +#include <algorithm> // for min + +using namespace std; + +namespace CLD2 { + +// Constructor, destructor +OffsetMap::OffsetMap() { + Clear(); +} + +OffsetMap::~OffsetMap() { +} + +// Clear the map +// After: +// next_diff_sub_ is 0 +// Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1] +// which is a fake range of width 0 mapping 0=>0 +void OffsetMap::Clear() { + diffs_.clear(); + pending_op_ = COPY_OP; + pending_length_ = 0; + next_diff_sub_ = 0; + current_lo_aoffset_ = 0; + current_hi_aoffset_ = 0; + current_lo_aprimeoffset_ = 0; + current_hi_aprimeoffset_ = 0; + current_diff_ = 0; + max_aoffset_ = 0; // Largest seen so far + max_aprimeoffset_ = 0; // Largest seen so far +} + +static inline char OpPart(const char c) { + return (c >> 6) & 3; +} +static inline char LenPart(const char c) { + return c & 0x3f; +} + +// Print map to file, for debugging +void OffsetMap::Printmap(const char* filename) { + FILE* fout; + bool needs_close = false; + if (strcmp(filename, "stdout") == 0) { + fout = stdout; + } else if (strcmp(filename, "stderr") == 0) { + fout = stderr; + } else { + fout = fopen(filename, "w"); + needs_close = true; + } + if (fout == NULL) { + fprintf(stderr, "%s did not open\n", filename); + return; + } + + Flush(); // Make sure any pending entry gets printed + fprintf(fout, "Offsetmap: %ld bytes\n", diffs_.size()); + for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) { + fprintf(fout, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i])); + if ((i % 20) == 19) {fprintf(fout, "\n");} + } + fprintf(fout, "\n"); + if (needs_close) { + fclose(fout); + } +} + +// Reset to offset 0 +void OffsetMap::Reset() { + MaybeFlushAll(); + + next_diff_sub_ = 0; + current_lo_aoffset_ = 0; + current_hi_aoffset_ = 0; + current_lo_aprimeoffset_ = 0; + current_hi_aprimeoffset_ = 0; + current_diff_ = 0; +} + +// Add to mapping from A to A', specifying how many next bytes are +// identical in A and A' +void OffsetMap::Copy(int bytes) { + if (bytes == 0) {return;} + max_aoffset_ += bytes; // Largest seen so far + max_aprimeoffset_ += bytes; // Largest seen so far + if (pending_op_ == COPY_OP) { + pending_length_ += bytes; + } else { + Flush(); + pending_op_ = COPY_OP; + pending_length_ = bytes; + } +} + +// Add to mapping from A to A', specifying how many next bytes are +// inserted in A' while not advancing in A at all +void OffsetMap::Insert(int bytes){ + if (bytes == 0) {return;} + max_aprimeoffset_ += bytes; // Largest seen so far + if (pending_op_ == INSERT_OP) { + pending_length_ += bytes; + } else if ((bytes == 1) && + (pending_op_ == DELETE_OP) && (pending_length_ == 1)) { + // Special-case exactly delete(1) insert(1) +> copy(1); + // all others backmap inserts to after deletes + pending_op_ = COPY_OP; + } else { + Flush(); + pending_op_ = INSERT_OP; + pending_length_ = bytes; + } +} + +// Add to mapping from A to A', specifying how many next bytes are +// deleted from A while not advancing in A' at all +void OffsetMap::Delete(int bytes){ + if (bytes == 0) {return;} + max_aoffset_ += bytes; // Largest seen so far + if (pending_op_ == DELETE_OP) { + pending_length_ += bytes; + } else if ((bytes == 1) && + (pending_op_ == INSERT_OP) && (pending_length_ == 1)) { + // Special-case exactly insert(1) delete(1) => copy(1); + // all others backmap deletes to after insertss + pending_op_ = COPY_OP; + } else { + Flush(); + pending_op_ = DELETE_OP; + pending_length_ = bytes; + } +} + +void OffsetMap::Flush() { + if (pending_length_ == 0) { + return; + } + // We may be emitting a copy op just after a copy op because +1 -1 cancelled + // inbetween. If the lengths don't need a prefix byte, combine them + if ((pending_op_ == COPY_OP) && !diffs_.empty()) { + char c = diffs_[diffs_.size() - 1]; + MapOp prior_op = static_cast<MapOp>(OpPart(c)); + int prior_len = LenPart(c); + if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) { + diffs_[diffs_.size() - 1] += pending_length_; + pending_length_ = 0; + return; + } + } + if (pending_length_ > 0x3f) { + bool non_zero_emitted = false; + for (int shift = 30; shift > 0; shift -= 6) { + int prefix = (pending_length_ >> shift) & 0x3f; + if ((prefix > 0) || non_zero_emitted) { + Emit(PREFIX_OP, prefix); + non_zero_emitted = true; + } + } + } + Emit(pending_op_, pending_length_ & 0x3f); + pending_length_ = 0; +} + + +// Add one more entry to copy one byte off the end, then flush +void OffsetMap::FlushAll() { + Copy(1); + Flush(); +} + +// Flush all if necessary +void OffsetMap::MaybeFlushAll() { + if ((0 < pending_length_) || diffs_.empty()) { + FlushAll(); + } +} + +// Len may be 0, for example as the low piece of length=64 +void OffsetMap::Emit(MapOp op, int len) { + char c = (static_cast<char>(op) << 6) | (len & 0x3f); + diffs_.push_back(c); +} + +void OffsetMap::DumpString() { + for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) { + fprintf(stderr, "%c%02d ", "&=+-"[OpPart(diffs_[i])], LenPart(diffs_[i])); + } + fprintf(stderr, "\n"); + + // Print running table of correspondences + fprintf(stderr, " op A => A' (A forward-maps to A')\n"); + int aoffset = 0; + int aprimeoffset = 0; + int length = 0; + for (int i = 0; i < static_cast<int>(diffs_.size()); ++i) { + char c = diffs_[i]; + MapOp op = static_cast<MapOp>(OpPart(c)); + int len = LenPart(c); + length = (length << 6) + len; + if (op == COPY_OP) { + aoffset += length; + aprimeoffset += length; + length = 0; + } else if (op == INSERT_OP) { + aoffset += 0; + aprimeoffset += length; + length = 0; + } else if (op == DELETE_OP) { + aoffset += length; + aprimeoffset += 0; + length = 0; + } else { // (op == PREFIX_OP) + // Do nothing else + } + fprintf(stderr, "[%3d] %c%02d %6d %6d%s\n", + i, "&=+-"[op], len, + aoffset, aprimeoffset, + (next_diff_sub_ == i) ? " <==next_diff_sub_" : ""); + + } + fprintf(stderr, "\n"); +} + +void OffsetMap::DumpWindow() { + fprintf(stderr, "DumpWindow(A => A'): max_aoffset_ = %d, " + "max_aprimeoffset_ = %d, next_diff_sub_ = %d<br>\n", + max_aoffset_, max_aprimeoffset_, next_diff_sub_); + fprintf(stderr, "A [%u..%u)\n", + current_lo_aoffset_, current_hi_aoffset_); + fprintf(stderr, "A' [%u..%u)\n", + current_lo_aprimeoffset_, current_hi_aprimeoffset_); + fprintf(stderr, " diff = %d\n", current_diff_); + DumpString(); +} + +//----------------------------------------------------------------------------// +// The guts of the 2013 design // +// If there are three ranges a b c in diffs_, we can be in one of five // +// states: LEFT of a, in ranges a b c, or RIGHT of c // +// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ // +// position next_diff_sub_ // +// There also are mapping constants max_aoffset_ and max_aprimeoffset_ // +// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 // +// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and // +// next_diff_sub_=diffs_.size() // +// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes // +// correspond to each other. If range i is active, next_diff_sub_ is at // +// the first byte of range i+1. Because of the length-prefix operator, // +// an individual range item in diffs_ may be multiple bytes // +// In all cases aprimeoffset = aoffset + current_diff_ // +// i.e. current_diff_ = aprimeoffset - aoffset // +// // +// In the degenerate case of diffs_.empty(), there are only two states // +// LEFT and RIGHT and the mapping is the identity mapping. // +// The initial state is LEFT. // +// It is an error to move left into LEFT or right into RIGHT, but the code // +// below is robust in these cases. // +//----------------------------------------------------------------------------// + +void OffsetMap::SetLeft() { + current_lo_aoffset_ = 0; + current_hi_aoffset_ = 0; + current_lo_aprimeoffset_ = 0; + current_hi_aprimeoffset_ = 0; + current_diff_ = 0; + next_diff_sub_ = 0; +} + +void OffsetMap::SetRight() { + current_lo_aoffset_ = max_aoffset_; + current_hi_aoffset_ = max_aoffset_; + current_lo_aprimeoffset_ = max_aprimeoffset_; + current_hi_aprimeoffset_ = max_aprimeoffset_; + current_diff_ = max_aprimeoffset_ - max_aoffset_; + next_diff_sub_ = 0; +} + +// Back up over previous range, 1..5 bytes +// Return subscript at the beginning of that. Pins at 0 +int OffsetMap::Backup(int sub) { + if (sub <= 0) {return 0;} + --sub; + while ((0 < sub) && + (static_cast<MapOp>(OpPart(diffs_[sub - 1]) == PREFIX_OP))) { + --sub; + } + return sub; +} + +// Parse next range, 1..5 bytes +// Return subscript just off the end of that +int OffsetMap::ParseNext(int sub, MapOp* op, int* length) { + *op = PREFIX_OP; + *length = 0; + char c; + while ((sub < static_cast<int>(diffs_.size())) && (*op == PREFIX_OP)) { + c = diffs_[sub++]; + *op = static_cast<MapOp>(OpPart(c)); + int len = LenPart(c); + *length = (*length << 6) + len; + } + // If mal-formed or in RIGHT, this will return with op = PREFIX_OP + // Mal-formed can include a trailing prefix byte with no following op + return sub; +} + +// Parse previous range, 1..5 bytes +// Return current subscript +int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) { + sub = Backup(sub); + return ParseNext(sub, op, length); +} + +// Quick debugging dump; does not parse multi-byte items, so just length & 0x3f +void OffsetMap::PrintPosition(const char* str) { + MapOp op = PREFIX_OP; + int length = 0; + if ((0 < next_diff_sub_) && (next_diff_sub_ <= static_cast<int>(diffs_.size()))) { + op = static_cast<MapOp>(OpPart(diffs_[next_diff_sub_ - 1])); + length = LenPart(diffs_[next_diff_sub_ - 1]); + } + fprintf(stderr, "%s[%d] %c%02d = A[%d..%d) ==> A'[%d..%d)\n", + str, + next_diff_sub_, "&=+-"[op], length, + current_lo_aoffset_, current_hi_aoffset_, + current_lo_aprimeoffset_, current_hi_aprimeoffset_); +} + +// Move active window one range to the right +// Return true if move was OK +bool OffsetMap::MoveRight() { + // If at last range or RIGHT, set to RIGHT, return error + if (next_diff_sub_ >= static_cast<int>(diffs_.size())) { + SetRight(); + return false; + } + // Actually OK to move right + MapOp op; + int length; + bool retval = true; + // If mal-formed or in RIGHT, this will return with op = PREFIX_OP + next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length); + + current_lo_aoffset_ = current_hi_aoffset_; + current_lo_aprimeoffset_ = current_hi_aprimeoffset_; + if (op == COPY_OP) { + current_hi_aoffset_ = current_lo_aoffset_ + length; + current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length; + } else if (op == INSERT_OP) { + current_hi_aoffset_ = current_lo_aoffset_ + 0; + current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length; + } else if (op == DELETE_OP) { + current_hi_aoffset_ = current_lo_aoffset_ + length; + current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + 0; + } else { + SetRight(); + retval = false; + } + current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_; + return retval; +} + +// Move active window one range to the left +// Return true if move was OK +bool OffsetMap::MoveLeft() { + // If at first range or LEFT, set to LEFT, return error + if (next_diff_sub_ <= 0) { + SetLeft(); + return false; + } + // Back up over current active window + next_diff_sub_ = Backup(next_diff_sub_); + if (next_diff_sub_ <= 0) { + SetLeft(); + return false; + } + // Actually OK to move left + MapOp op; + int length; + bool retval = true; + // If mal-formed or in LEFT, this will return with op = PREFIX_OP + next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length); + + current_hi_aoffset_ = current_lo_aoffset_; + current_hi_aprimeoffset_ = current_lo_aprimeoffset_; + if (op == COPY_OP) { + current_lo_aoffset_ = current_hi_aoffset_ - length; + current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length; + } else if (op == INSERT_OP) { + current_lo_aoffset_ = current_hi_aoffset_ - 0; + current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length; + } else if (op == DELETE_OP) { + current_lo_aoffset_ = current_hi_aoffset_ - length; + current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0; + } else { + SetLeft(); + retval = false; + } + current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_; + return true; +} + +// Map an offset in A' to the corresponding offset in A +int OffsetMap::MapBack(int aprimeoffset){ + MaybeFlushAll(); + if (aprimeoffset < 0) {return 0;} + if (max_aprimeoffset_ <= aprimeoffset) { + return (aprimeoffset - max_aprimeoffset_) + max_aoffset_; + } + + // If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_, + // use current mapping, else move window left/right + bool ok = true; + while (ok && (aprimeoffset < current_lo_aprimeoffset_)) { + ok = MoveLeft(); + } + while (ok && (current_hi_aprimeoffset_ <= aprimeoffset)) { + ok = MoveRight(); + } + // So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_ + + int aoffset = aprimeoffset - current_diff_; + if (aoffset >= current_hi_aoffset_) { + // A' is in an insert region, all bytes of which backmap to A=hi_aoffset_ + aoffset = current_hi_aoffset_; + } + return aoffset; +} + +// Map an offset in A to the corresponding offset in A' +int OffsetMap::MapForward(int aoffset){ + MaybeFlushAll(); + if (aoffset < 0) {return 0;} + if (max_aoffset_ <= aoffset) { + return (aoffset - max_aoffset_) + max_aprimeoffset_; + } + + // If current_lo_aoffset_ <= aoffset < current_hi_aoffset_, + // use current mapping, else move window left/right + bool ok = true; + while (ok && (aoffset < current_lo_aoffset_)) { + ok = MoveLeft(); + } + while (ok && (current_hi_aoffset_ <= aoffset)) { + ok = MoveRight(); + } + + int aprimeoffset = aoffset + current_diff_; + if (aprimeoffset >= current_hi_aprimeoffset_) { + // A is in a delete region, all bytes of which map to A'=hi_aprimeoffset_ + aprimeoffset = current_hi_aprimeoffset_; + } + return aprimeoffset; +} + + +// static +bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) { + bool ok = true; + while (ok && (source->next_diff_sub_ != source->diffs_.size())) { + ok = source->MoveRight(); + if (source->current_lo_aoffset_ != source->current_hi_aoffset_) { + return false; + } + dest->Insert( + source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_); + } + return true; +} + +// static +bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) { + bool ok = true; + while (ok && (source->next_diff_sub_ != source->diffs_.size())) { + ok = source->MoveRight(); + if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) { + return false; + } + dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_); + } + return true; +} + +// static +void OffsetMap::ComposeOffsetMap( + OffsetMap* g, OffsetMap* f, OffsetMap* h) { + h->Clear(); + f->Reset(); + g->Reset(); + + int lo = 0; + for (;;) { + // Consume delete operations in f. This moves A without moving + // A' and A''. + if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) { + if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) { + // fprintf(stderr, + // "ComposeOffsetMap ERROR, f is longer than g.<br>\n"); + } + + // FlushAll(), called by Reset(), MapForward() or MapBack(), has + // added an extra COPY_OP to f and g, so this function has + // composed an extra COPY_OP in h from those. To avoid + // FlushAll() adds one more extra COPY_OP to h later, dispatch + // Flush() right now. + h->Flush(); + return; + } + + // Consume insert operations in g. This moves A'' without moving A + // and A'. + if (lo >= f->current_hi_aprimeoffset_) { + if (!CopyDeletes(f, h)) { + // fprintf(stderr, + // "ComposeOffsetMap ERROR, g is longer than f.<br>\n"); + } + } + + // Compose one operation which moves A' from lo to hi. + int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_); + if (f->current_lo_aoffset_ != f->current_hi_aoffset_ && + g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) { + h->Copy(hi - lo); + } else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) { + h->Delete(hi - lo); + } else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) { + h->Insert(hi - lo); + } + + lo = hi; + } +} + +// For testing only -- force a mapping +void OffsetMap::StuffIt(const string& diffs, + int max_aoffset, int max_aprimeoffset) { + Clear(); + diffs_ = diffs; + max_aoffset_ = max_aoffset; + max_aprimeoffset_ = max_aprimeoffset; +} + + +} // namespace CLD2 + |