diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/utf8statetable.h')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/utf8statetable.h | 283 |
1 files changed, 0 insertions, 283 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/utf8statetable.h b/application/basilisk/components/translation/cld2/internal/utf8statetable.h deleted file mode 100644 index 55c00f45e..000000000 --- a/application/basilisk/components/translation/cld2/internal/utf8statetable.h +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// State Table follower for scanning UTF-8 strings without converting to -// 32- or 16-bit Unicode values. -// -// Author: dsites@google.com (Dick Sites) -// - -#ifndef UTIL_UTF8_UTF8STATETABLE_H_ -#define UTIL_UTF8_UTF8STATETABLE_H_ - -#include <string> -#include "integral_types.h" // for uint8, uint32, uint16 -#include "stringpiece.h" - - -namespace CLD2 { - -class OffsetMap; - - -// These four-byte entries compactly encode how many bytes 0..255 to delete -// in making a string replacement, how many bytes to add 0..255, and the offset -// 0..64k-1 of the replacement string in remap_string. -struct RemapEntry { - uint8 delete_bytes; - uint8 add_bytes; - uint16 bytes_offset; -}; - -// Exit type codes for state tables. All but the first get stuffed into -// signed one-byte entries. The first is only generated by executable code. -// To distinguish from next-state entries, these must be contiguous and -// all <= kExitNone -typedef enum { - kExitDstSpaceFull = 239, - kExitIllegalStructure, // 240 - kExitOK, // 241 - kExitReject, // ... - kExitReplace1, - kExitReplace2, - kExitReplace3, - kExitReplace21, - kExitReplace31, - kExitReplace32, - kExitReplaceOffset1, - kExitReplaceOffset2, - kExitReplace1S0, - kExitSpecial, - kExitDoAgain, - kExitRejectAlt, - kExitNone // 255 -} ExitReason; - -typedef enum { - kExitDstSpaceFull_2 = 32767, // 0x7fff - kExitIllegalStructure_2, // 32768 0x8000 - kExitOK_2, // 32769 0x8001 - kExitReject_2, // ... - kExitReplace1_2, - kExitReplace2_2, - kExitReplace3_2, - kExitReplace21_2, - kExitReplace31_2, - kExitReplace32_2, - kExitReplaceOffset1_2, - kExitReplaceOffset2_2, - kExitReplace1S0_2, - kExitSpecial_2, - kExitDoAgain_2, - kExitRejectAlt_2, - kExitNone_2 // 32783 0x800f -} ExitReason_2; - - -// This struct represents one entire state table. The three initialized byte -// areas are state_table, remap_base, and remap_string. state0 and state0_size -// give the byte offset and length within state_table of the initial state -- -// table lookups are expected to start and end in this state, but for -// truncated UTF-8 strings, may end in a different state. These allow a quick -// test for that condition. entry_shift is 8 for tables subscripted by a full -// byte value and 6 for space-optimized tables subscripted by only six -// significant bits in UTF-8 continuation bytes. -typedef struct { - const uint32 state0; - const uint32 state0_size; - const uint32 total_size; - const int max_expand; - const int entry_shift; - const int bytes_per_entry; - const uint32 losub; - const uint32 hiadd; - const uint8* state_table; - const RemapEntry* remap_base; - const uint8* remap_string; - const uint8* fast_state; -} UTF8StateMachineObj; - -// Near-duplicate declaration for tables with two-byte entries -typedef struct { - const uint32 state0; - const uint32 state0_size; - const uint32 total_size; - const int max_expand; - const int entry_shift; - const int bytes_per_entry; - const uint32 losub; - const uint32 hiadd; - const unsigned short* state_table; - const RemapEntry* remap_base; - const uint8* remap_string; - const uint8* fast_state; -} UTF8StateMachineObj_2; - - -typedef UTF8StateMachineObj UTF8PropObj; -typedef UTF8StateMachineObj UTF8ScanObj; -typedef UTF8StateMachineObj UTF8ReplaceObj; -typedef UTF8StateMachineObj_2 UTF8PropObj_2; -typedef UTF8StateMachineObj_2 UTF8ReplaceObj_2; -// NOT IMPLEMENTED typedef UTF8StateMachineObj_2 UTF8ScanObj_2; - - -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericProperty(const UTF8PropObj* st, - const uint8** src, - int* srclen); - -// Look up property of one UTF-8 character (assumed to be valid). -// (This is a faster version of UTF8GenericProperty.) -bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src); - - -// BigOneByte versions are needed for tables > 240 states, but most -// won't need the TwoByte versions. - -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, - const uint8** src, - int* srclen); - - -// TwoByte versions are needed for tables > 240 states that don't fit onto -// BigOneByte -- rare ultimate fallback - -// Look up property of one UTF-8 character (assumed to be valid). -// (This is a faster version of UTF8GenericProperty.) -bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src); - -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, - const uint8** src, - int* srclen); - -// Look up property of one UTF-8 character (assumed to be valid). -// (This is a faster version of UTF8GenericProperty.) -bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src); - -// Scan a UTF-8 stringpiece based on a state table. -// Always scan complete UTF-8 characters -// Set number of bytes scanned. Return reason for exiting -int UTF8GenericScan(const UTF8ScanObj* st, - const StringPiece& str, - int* bytes_consumed); - - - -// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece -// and doing text replacements. -// Always scan complete UTF-8 characters -// Set number of bytes consumed from input, number filled to output. -// Return reason for exiting -// Also writes an optional OffsetMap. Pass NULL to skip writing one. -int UTF8GenericReplace(const UTF8ReplaceObj* st, - const StringPiece& istr, - StringPiece& ostr, - bool is_plain_text, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed, - OffsetMap* offsetmap); - -// Older version without offsetmap -int UTF8GenericReplace(const UTF8ReplaceObj* st, - const StringPiece& istr, - StringPiece& ostr, - bool is_plain_text, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed); - -// Older version without is_plain_text or offsetmap -int UTF8GenericReplace(const UTF8ReplaceObj* st, - const StringPiece& istr, - StringPiece& ostr, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed); - - -// TwoByte version is needed for tables > about 256 states, such -// as the table for full Unicode 4.1 canonical + compatibility mapping - -// Scan a UTF-8 stringpiece based on state table with two-byte entries, -// copying to output stringpiece -// and doing text replacements. -// Always scan complete UTF-8 characters -// Set number of bytes consumed from input, number filled to output. -// Return reason for exiting -// Also writes an optional OffsetMap. Pass NULL to skip writing one. -int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, - const StringPiece& istr, - StringPiece& ostr, - bool is_plain_text, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed, - OffsetMap* offsetmap); - -// Older version without offsetmap -int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, - const StringPiece& istr, - StringPiece& ostr, - bool is_plain_text, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed); - -// Older version without is_plain_text or offsetmap -int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, - const StringPiece& istr, - StringPiece& ostr, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed); - - -static const unsigned char kUTF8LenTbl[256] = { - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 -}; - -inline int UTF8OneCharLen(const char* in) { - return kUTF8LenTbl[*reinterpret_cast<const uint8*>(in)]; -} - -// Adjust a stringpiece to encompass complete UTF-8 characters. -// The data pointer will be increased by 0..3 bytes to get to a character -// boundary, and the length will then be decreased by 0..3 bytes -// to encompass the last complete character. -// This is useful especially when a UTF-8 string must be put into a fixed- -// maximum-size buffer cleanly, such as a MySQL buffer. -void UTF8TrimToChars(StringPiece* istr); - -} // End namespace CLD2 - -#endif // UTIL_UTF8_UTF8STATETABLE_H_ |