Add Basilisk

author: Matt A. Tobin <email@mattatobin.com> 2018-02-02 03:32:58 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2018-02-02 03:32:58 -0500
commit: e72ef92b5bdc43cd2584198e2e54e951b70299e8 (patch)
tree: 01ceb4a897c33eca9e7ccf2bc3aefbe530169fe5 /application/basilisk/components/translation/cld2/internal/utf8statetable.h
parent: 0d19b77d3eaa5b8d837bf52c19759e68e42a1c4c (diff)
download: UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.gz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.lz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.xz
UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.zip
1 files changed, 283 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/utf8statetable.h b/application/basilisk/components/translation/cld2/internal/utf8statetable.h
new file mode 100644
index 000000000..55c00f45e
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/utf8statetable.h
@@ -0,0 +1,283 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// State Table follower for scanning UTF-8 strings without converting to
+// 32- or 16-bit Unicode values.
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+#ifndef UTIL_UTF8_UTF8STATETABLE_H_
+#define UTIL_UTF8_UTF8STATETABLE_H_
+
+#include <string>
+#include "integral_types.h"             // for uint8, uint32, uint16
+#include "stringpiece.h"
+
+
+namespace CLD2 {
+
+class OffsetMap;
+
+
+// One replacement record, four bytes in total. It compactly encodes how many
+// bytes 0..255 to delete in making a string replacement, how many bytes
+// 0..255 to add, and the offset 0..64k-1 of the replacement in remap_string.
+struct RemapEntry {
+  uint8 delete_bytes;   // number of bytes to delete, 0..255
+  uint8 add_bytes;      // number of bytes to add, 0..255
+  uint16 bytes_offset;  // offset of the replacement string in remap_string
+};
+
+// Exit type codes for state tables. All but the first get stuffed into
+// one-byte table entries (state_table is declared const uint8* below); the
+// first is only generated by executable code. To distinguish them from
+// next-state entries, these must be contiguous and all <= kExitNone.
+typedef enum {
+  kExitDstSpaceFull = 239,  // only generated by code, never a table entry
+  kExitIllegalStructure,  // 240
+  kExitOK,                // 241
+  kExitReject,            // 242
+  kExitReplace1,          // 243
+  kExitReplace2,          // 244
+  kExitReplace3,          // 245
+  kExitReplace21,         // 246
+  kExitReplace31,         // 247
+  kExitReplace32,         // 248
+  kExitReplaceOffset1,    // 249
+  kExitReplaceOffset2,    // 250
+  kExitReplace1S0,        // 251
+  kExitSpecial,           // 252
+  kExitDoAgain,           // 253
+  kExitRejectAlt,         // 254
+  kExitNone               // 255
+} ExitReason;
+
+typedef enum {  // same codes as ExitReason, renumbered from 0x7fff upward
+  kExitDstSpaceFull_2 = 32767,       // 0x7fff
+  kExitIllegalStructure_2,  // 32768    0x8000
+  kExitOK_2,                // 32769    0x8001
+  kExitReject_2,            // 32770    0x8002
+  kExitReplace1_2,          // 32771    0x8003
+  kExitReplace2_2,          // 32772    0x8004
+  kExitReplace3_2,          // 32773    0x8005
+  kExitReplace21_2,         // 32774    0x8006
+  kExitReplace31_2,         // 32775    0x8007
+  kExitReplace32_2,         // 32776    0x8008
+  kExitReplaceOffset1_2,    // 32777    0x8009
+  kExitReplaceOffset2_2,    // 32778    0x800a
+  kExitReplace1S0_2,        // 32779    0x800b
+  kExitSpecial_2,           // 32780    0x800c
+  kExitDoAgain_2,           // 32781    0x800d
+  kExitRejectAlt_2,         // 32782    0x800e
+  kExitNone_2               // 32783    0x800f
+} ExitReason_2;
+
+
+// This struct represents one entire state table with one-byte entries. The
+// three initialized byte areas are state_table, remap_base, and remap_string.
+// state0 and state0_size give the byte offset and length within state_table
+// of the initial state -- table lookups are expected to start and end in this
+// state, but for truncated UTF-8 strings, may end in a different state; these
+// two fields allow a quick test for that condition. entry_shift is 8 for
+// tables subscripted by a full byte value and 6 for space-optimized tables
+// subscripted by only the six significant bits of UTF-8 continuation bytes.
+typedef struct {
+  const uint32 state0;           // byte offset of the initial state
+  const uint32 state0_size;      // byte length of the initial state
+  const uint32 total_size;       // presumably size of state_table -- confirm
+  const int max_expand;          // not described in this header
+  const int entry_shift;         // 8 = full-byte subscript, 6 = six-bit
+  const int bytes_per_entry;     // presumably 1 for this struct -- confirm
+  const uint32 losub;            // not described in this header
+  const uint32 hiadd;            // not described in this header
+  const uint8* state_table;      // one-byte table entries
+  const RemapEntry* remap_base;  // replacement records, see RemapEntry
+  const uint8* remap_string;     // bytes located by RemapEntry::bytes_offset
+  const uint8* fast_state;       // not described in this header
+} UTF8StateMachineObj;
+
+// Near-duplicate of UTF8StateMachineObj for tables with two-byte entries
+typedef struct {
+  const uint32 state0;           // offset of the initial state
+  const uint32 state0_size;      // length of the initial state
+  const uint32 total_size;       // presumably size of state_table -- confirm
+  const int max_expand;          // not described in this header
+  const int entry_shift;         // 8 = full-byte subscript, 6 = six-bit
+  const int bytes_per_entry;     // presumably 2 for this struct -- confirm
+  const uint32 losub;            // not described in this header
+  const uint32 hiadd;            // not described in this header
+  const unsigned short* state_table;  // the only member whose type differs
+  const RemapEntry* remap_base;  // replacement records, see RemapEntry
+  const uint8* remap_string;     // bytes located by RemapEntry::bytes_offset
+  const uint8* fast_state;       // not described in this header
+} UTF8StateMachineObj_2;
+
+
+typedef UTF8StateMachineObj UTF8PropObj;         // taken by property lookups
+typedef UTF8StateMachineObj UTF8ScanObj;         // taken by UTF8GenericScan
+typedef UTF8StateMachineObj UTF8ReplaceObj;      // taken by UTF8GenericReplace
+typedef UTF8StateMachineObj_2 UTF8PropObj_2;     // two-byte property table
+typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2;  // two-byte replace table
+// NOT IMPLEMENTED: typedef UTF8StateMachineObj_2 UTF8ScanObj_2;
+
+
+// Look up the property of one UTF-8 character and advance over it.
+// Returns 0 if the input length is zero.
+// Returns 0 and advances one byte if the input is ill-formed.
+uint8 UTF8GenericProperty(const UTF8PropObj* st,  // property state table
+                          const uint8** src,      // in/out: input position
+                          int* srclen);  // input length; presumably updated
+
+// Look up property of one UTF-8 character (assumed to be valid).
+// Faster than UTF8GenericProperty; takes no length and cannot advance src.
+bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src);
+
+
+// BigOneByte versions are needed for tables > 240 states, but most
+// won't need the TwoByte versions.
+
+// Look up the property of one UTF-8 character and advance over it.
+// Returns 0 if the input length is zero.
+// Returns 0 and advances one byte if the input is ill-formed.
+uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,  // property table
+                          const uint8** src,  // in/out: input position
+                          int* srclen);  // input length; presumably updated
+
+
+// Look up property of one UTF-8 character (assumed to be valid).
+// BigOneByte counterpart of UTF8HasGenericProperty; cannot advance src.
+bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src);
+
+// TwoByte versions are needed for tables > 240 states that don't fit onto
+// BigOneByte -- rare ultimate fallback
+
+// Look up the property of one UTF-8 character and advance over it.
+// Returns 0 if the input length is zero.
+// Returns 0 and advances one byte if the input is ill-formed.
+uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st,  // two-byte table
+                          const uint8** src,  // in/out: input position
+                          int* srclen);  // input length; presumably updated
+
+// Look up property of one UTF-8 character (assumed to be valid).
+// Faster than UTF8GenericPropertyTwoByte; takes no length, cannot advance src.
+bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src);
+
+// Scan a UTF-8 stringpiece based on a state table, always scanning complete
+// UTF-8 characters. Sets the number of bytes scanned and returns the reason
+// for exiting (presumably an ExitReason value; confirm in the implementation).
+int UTF8GenericScan(const UTF8ScanObj* st,   // scan state table
+                    const StringPiece& str,  // text to scan
+                    int* bytes_consumed);    // out: number of bytes scanned
+
+
+
+// Scan a UTF-8 stringpiece based on a state table, copying to the output
+//   stringpiece and doing text replacements.
+// Always scans complete UTF-8 characters.
+// Sets the number of bytes consumed from input and the number filled to output.
+// Returns the reason for exiting (presumably an ExitReason value -- confirm).
+// Also writes an optional OffsetMap. Pass NULL to skip writing one.
+int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    bool is_plain_text,        // not described in this header
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed,        // out; not described further here
+                    OffsetMap* offsetmap);     // optional; NULL to skip
+
+// Older overload of UTF8GenericReplace that takes no OffsetMap argument
+int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    bool is_plain_text,        // not described in this header
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed);       // out; not described further here
+
+// Older overload of UTF8GenericReplace without is_plain_text or an OffsetMap
+int UTF8GenericReplace(const UTF8ReplaceObj* st,  // replacement state table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed);       // out; not described further here
+
+
+// TwoByte version is needed for tables > about 256 states, such
+// as the table for full Unicode 4.1 canonical + compatibility mapping
+
+// Scan a UTF-8 stringpiece based on a state table with two-byte entries,
+//   copying to the output stringpiece
+//   and doing text replacements.
+// Always scans complete UTF-8 characters.
+// Sets the number of bytes consumed from input and the number filled to output.
+// Returns the reason for exiting (presumably an ExitReason_2 value -- confirm).
+// Also writes an optional OffsetMap. Pass NULL to skip writing one.
+int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    bool is_plain_text,        // not described in this header
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed,        // out; not described further here
+                    OffsetMap* offsetmap);     // optional; NULL to skip
+
+// Older overload of UTF8GenericReplaceTwoByte that takes no OffsetMap argument
+int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    bool is_plain_text,        // not described in this header
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed);       // out; not described further here
+
+// Older overload of UTF8GenericReplaceTwoByte without is_plain_text/OffsetMap
+int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,  // two-byte table
+                    const StringPiece& istr,   // input text
+                    StringPiece& ostr,         // output piece that is filled
+                    int* bytes_consumed,       // out: bytes consumed from istr
+                    int* bytes_filled,         // out: bytes filled into ostr
+                    int* chars_changed);       // out; not described further here
+
+
+static const unsigned char kUTF8LenTbl[256] = {  // length by first byte value
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00-0x1F
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20-0x3F
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40-0x5F
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60-0x7F
+
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80-0x9F
+  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0-0xBF
+  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0-0xDF
+  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4   // 0xE0-0xEF, 0xF0-0xFF
+};
+
+inline int UTF8OneCharLen(const char* in) {  // kUTF8LenTbl entry for byte *in
+  return kUTF8LenTbl[static_cast<uint8>(*in)];  // unsigned, so index is 0..255
+}
+
+// Adjust a stringpiece, in place, to encompass complete UTF-8 characters.
+// The data pointer will be increased by 0..3 bytes to get to a character
+// boundary, and the length will then be decreased by 0..3 bytes
+// to encompass the last complete character.
+// This is useful especially when a UTF-8 string must be put cleanly into a
+// buffer with a fixed maximum size, such as a MySQL buffer.
+void UTF8TrimToChars(StringPiece* istr);  // in/out: the piece to trim
+
+}       // End namespace CLD2
+
+#endif  // UTIL_UTF8_UTF8STATETABLE_H_
author	Matt A. Tobin <email@mattatobin.com>	2018-02-02 03:32:58 -0500
committer	Matt A. Tobin <email@mattatobin.com>	2018-02-02 03:32:58 -0500
commit	e72ef92b5bdc43cd2584198e2e54e951b70299e8 (patch)
tree	01ceb4a897c33eca9e7ccf2bc3aefbe530169fe5 /application/basilisk/components/translation/cld2/internal/utf8statetable.h
parent	0d19b77d3eaa5b8d837bf52c19759e68e42a1c4c (diff)
download	UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.gz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.lz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.tar.xz UXP-e72ef92b5bdc43cd2584198e2e54e951b70299e8.zip