summaryrefslogtreecommitdiffstats
path: root/application/basilisk/components/translation/cld2/internal/offsetmap.h
blob: 6325554009aa5b1b4cf0657cca1de7441e899e4f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

#ifndef UTIL_UTF8_OFFSETMAP_H_
#define UTIL_UTF8_OFFSETMAP_H_

#include <string>                       // for string
#include "integral_types.h"             // for uint32

// ***************************** OffsetMap **************************
//
// An OffsetMap object is a container for a mapping from offsets in one text
// buffer A' to offsets in another text buffer A. It is most useful when A' is
// built from A via substitutions that occasionally do not preserve byte length.
//
// A series of operators are used to build the correspondence map, then
// calls can be made to map an offset in A' to an offset in A, or vice versa.
// The map starts with offset 0 in A corresponding to offset 0 in A'.
// The mapping is then built sequentially, adding on byte ranges that are
// identical in A and A', byte ranges that are inserted in A', and byte ranges
// that are deleted from A. All bytes beyond those specified when building the
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
// end of the map.
//
// The internal data structure records positions at which bytes are added or
// deleted. Using the map is O(1) when increasing the A' or A offset
// monotonically, and O(n) when accessing random offsets, where n is the
// number of differences.
//

namespace CLD2 {

class OffsetMap {
 public:
  // Constructor, destructor
  OffsetMap();
  ~OffsetMap();

  // Clear the map
  void Clear();

  // Add to  mapping from A to A', specifying how many next bytes correspond
  // in A and A'
  void Copy(int bytes);

  // Add to mapping from A to A', specifying how many next bytes are
  // inserted in A' while not advancing in A at all
  void Insert(int bytes);

  // Add to mapping from A to A', specifying how many next bytes are
  // deleted from A while not advancing in A' at all
  void Delete(int bytes);

  // Print map to file, for debugging
  void Printmap(const char* filename);

  // [Finish building map,] Re-position to offset 0
  // This call is optional; MapForward and MapBack finish building the map
  // if necessary
  void Reset();

  // Map an offset in A' to the corresponding offset in A
  int MapBack(int aprimeoffset);

  // Map an offset in A to the corresponding offset in A'
  int MapForward(int aoffset);

  // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
  // from A' to A'' and h is from A to A''.
  //
  // Note that g->MoveForward(f->MoveForward(aoffset)) always equals
  // to h->MoveForward(aoffset), while
  // f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
  // to h->MoveBack(aprimeprimeoffset). This happens when deletion in
  // f and insertion in g are at the same place.  For example,
  //
  // A    1   2   3   4
  //      ^   |  ^    ^
  //      |   | /     |  f
  //      v   vv      v
  // A'   1'  2'      3'
  //      ^   ^^      ^
  //      |   | \     |  g
  //      v   |  v    v
  // A''  1'' 2'' 3'' 4''
  //
  // results in:
  //
  // A    1   2   3   4
  //      ^   ^\  ^   ^
  //      |   | \ |   |  h
  //      v   |  vv   v
  // A''  1'' 2'' 3'' 4''
  //
  // 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
  // the latter figure.
  static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);

  // For debugging only; writes to stderr
  void DumpWindow();

  // For testing only -- force a mapping
  void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);

 private:
  enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};

  void Flush();
  void FlushAll();
  void MaybeFlushAll();
  void Emit(MapOp op, int len);

  void SetLeft();
  void SetRight();

  // Back up over previous range, 1..5 bytes
  // Return subscript at the beginning of that. Pins at 0
  int Backup(int sub);

  // Parse next range, 1..5 bytes
  // Return subscript just off the end of that
  int ParseNext(int sub, MapOp* op, int* length);

  // Parse previous range, 1..5 bytes
  // Return current subscript
  int ParsePrevious(int sub, MapOp* op, int* length);

  void PrintPosition(const char* str);

  bool MoveRight();     // Returns true if OK
  bool MoveLeft();      // Returns true if OK
  void DumpString();

  // Copies insert operations from source to dest. Returns true if no
  // other operations are found.
  static bool CopyInserts(OffsetMap* source, OffsetMap* dest);

  // Copies delete operations from source to dest. Returns true if no other
  // operations are found.
  static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);

  std::string diffs_;
  MapOp pending_op_;
  uint32 pending_length_;

  // Offsets in the ranges below correspond to each other, with A' = A + diff
  int next_diff_sub_;
  int current_lo_aoffset_;
  int current_hi_aoffset_;
  int current_lo_aprimeoffset_;
  int current_hi_aprimeoffset_;
  int current_diff_;
  int max_aoffset_;
  int max_aprimeoffset_;
};

}  // namespace CLD2

#endif  // UTIL_UTF8_OFFSETMAP_H_