diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/utf8statetable.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/utf8statetable.cc | 1369 |
1 files changed, 0 insertions, 1369 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/utf8statetable.cc b/application/basilisk/components/translation/cld2/internal/utf8statetable.cc deleted file mode 100644 index 8c97123d8..000000000 --- a/application/basilisk/components/translation/cld2/internal/utf8statetable.cc +++ /dev/null @@ -1,1369 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// State Table follower for scanning UTF-8 strings without converting to -// 32- or 16-bit Unicode values. -// - -#ifdef COMPILER_MSVC -// MSVC warns: warning C4309: 'initializing' : truncation of constant value -// But the value is in fact not truncated. 0xFF still comes out 0xFF at -// runtime. -#pragma warning ( disable : 4309 ) -#endif - -#include "utf8statetable.h" - -#include <stdint.h> // for uintptr_t -#include <string.h> // for NULL, memcpy, memmove - -#include "integral_types.h" // for uint8, uint32, int8 -#include "stringpiece.h" -#include "offsetmap.h" - - -namespace CLD2 { - -static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish - // optional next-state field - // after replacement text -static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish - // HTML replacement vs. plaintext - - -/** - * This code implements a little interpreter for UTF8 state - * tables. There are three kinds of quite-similar state tables, - * property, scanning, and replacement. 
Each state in one of - * these tables consists of an array of 256 or 64 one-byte - * entries. The state is subscripted by an incoming source byte, - * and the entry either specifies the next state or specifies an - * action. Space-optimized tables have full 256-entry states for - * the first byte of a UTF-8 character, but only 64-entry states - * for continuation bytes. Space-optimized tables may only be - * used with source input that has been checked to be - * structurally- (or stronger interchange-) valid. - * - * A property state table has an unsigned one-byte property for - * each possible UTF-8 character. One-byte character properties - * are in the state[0] array, while for other lengths the - * state[0] array gives the next state, which contains the - * property value for two-byte characters or yet another state - * for longer ones. The code simply loads the right number of - * next-state values, then returns the final byte as property - * value. There are no actions specified in property tables. - * States are typically shared for multi-byte UTF-8 characters - * that all have the same property value. - * - * A scanning state table has entries that are either a - * next-state specifier for bytes that are accepted by the - * scanner, or an exit action for the last byte of each - * character that is rejected by the scanner. - * - * Scanning long strings involves a tight loop that picks up one - * byte at a time and follows next-state value back to state[0] - * for each accepted UTF-8 character. Scanning stops at the end - * of the string or at the first character encountered that has - * an exit action such as "reject". Timing information is given - * below. - * - * Since so much of Google's text is 7-bit-ASCII values - * (approximately 94% of the bytes of web documents), the - * scanning interpreter has two speed optimizations. 
One checks - * 8 bytes at a time to see if they are all in the range lo..hi, - * as specified in constants in the overall statetable object. - * The check involves ORing together four 4-byte values that - * overflow into the high bit of some byte when a byte is out of - * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This - * loop is about 8x faster than the one-byte-at-a-time loop. - * - * If checking for exit bytes in the 0x00-0x1F and 7F range is - * unneeded, an even faster loop just looks at the high bits of - * 8 bytes at once, and is about 1.33x faster than the lo..hi - * loop. - * - * Exit from the scanning routines backs up to the first byte of - * the rejected character, so the text spanned is always a - * complete number of UTF-8 characters. The normal scanning exit - * is at the first rejected character, or at the end of the - * input text. Scanning also exits on any detected ill-formed - * character or at a special do-again action built into some - * exit-optimized tables. The do-again action gets back to the - * top of the scanning loop to retry eight-byte ASCII scans. It - * is typically put into state tables after four seven-bit-ASCII - * characters in a row are seen, to allow restarting the fast - * scan after some slower processing of multi-byte characters. - * - * A replacement state table is similar to a scanning state - * table but has more extensive actions. The default - * byte-at-a-time loop copies one byte from source to - * destination and goes to the next state. The replacement - * actions overwrite 1-3 bytes of the destination with different - * bytes, possibly shortening the output by 1 or 2 bytes. The - * replacement bytes come from within the state table, from - * dummy states inserted just after any state that contains a - * replacement action. This gives a quick address calculation for - * the replacement byte(s) and gives some cache locality. 
- * - * Additional replacement actions use one or two bytes from - * within dummy states to index a side table of more-extensive - * replacements. The side table specifies a length of 0..15 - * destination bytes to overwrite and a length of 0..127 bytes - * to overwrite them with, plus the actual replacement bytes. - * - * This side table uses one extra bit to specify a pair of - * replacements, the first to be used in an HTML context and the - * second to be used in a plaintext context. This allows - * replacements that are spelled with "<" in the former - * context and "<" in the latter. - * - * The side table also uses an extra bit to specify a non-zero - * next state after a replacement. This allows a combination - * replacement and state change, used to implement a limited - * version of the Boyer-Moore algorithm for multi-character - * replacement without backtracking. This is useful when there - * are overlapping replacements, such as ch => x and also c => - * y, the latter to be used only if the character after c is not - * h. in this case, the state[0] table's entry for c would - * change c to y and also have a next-state of say n, and the - * state[n] entry for h would specify a replacement of the two - * bytes yh by x. No backtracking is needed. - * - * A replacement table may also include the exit actions of a - * scanning state table, so some character sequences can - * terminate early. - * - * During replacement, an optional data structure called an - * offset map can be updated to reflect each change in length - * between source and destination. This offset map can later be - * used to map destination-string offsets to corresponding - * source-string offsets or vice versa. - * - * The routines below also have variants in which state-table - * entries are all two bytes instead of one byte. This allows - * tables with more than 240 total states, but takes up twice as - * much space per state. 
- * -**/ - -// Return true if current Tbl pointer is within state0 range -// Note that unsigned compare checks both ends of range simultaneously -static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { - const uint8* Tbl0 = &st->state_table[st->state0]; - return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); -} - -static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, - const unsigned short int* Tbl) { - const unsigned short int* Tbl0 = &st->state_table[st->state0]; - // Word difference, not byte difference - return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); -} - -// UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of -// UTF8MachineObj. - -static bool IsPropObj(const UTF8StateMachineObj& obj) { - return obj.fast_state == NULL - && obj.max_expand == 0; -} - -static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) { - return obj.fast_state == NULL - && obj.max_expand == 0; -} - -static bool IsScanObj(const UTF8StateMachineObj& obj) { - return obj.fast_state != NULL - && obj.max_expand == 0; -} - -static bool IsReplaceObj(const UTF8StateMachineObj& obj) { - // Normally, obj.fast_state != NULL, but the handwritten tables - // in utf8statetable_unittest don't handle fast_states. 
- return obj.max_expand > 0; -} - -static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) { - return obj.max_expand > 0; -} - -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericProperty(const UTF8PropObj* st, - const uint8** src, - int* srclen) { - if (*srclen <= 0) { - return 0; - } - - const uint8* lsrc = *src; - const uint8* Tbl_0 = &st->state_table[st->state0]; - const uint8* Tbl = Tbl_0; - int e; - int eshift = st->entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - *src += 1; - *srclen -= 1; - } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - *src += 2; - *srclen -= 2; - } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - *src += 3; - *srclen -= 3; - }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[3]]; - *src += 4; - *srclen -= 4; - } else { // Ill-formed - e = 0; - *src += 1; - *srclen -= 1; - } - return e; -} - -bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { - const uint8* lsrc = reinterpret_cast<const uint8*>(src); - const uint8* Tbl_0 = &st.state_table[st.state0]; - const uint8* Tbl = Tbl_0; - int e; - int eshift = st.entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - } else if ((c & 0xe0) == 0xc0) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = 
Tbl[lsrc[1]]; - } else if ((c & 0xf0) == 0xe0) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - } else { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[3]]; - } - return e; -} - - -// BigOneByte versions are needed for tables > 240 states, but most -// won't need the TwoByte versions. -// Internally, to next-to-last offset is multiplied by 16 and the last -// offset is relative instead of absolute. -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, - const uint8** src, - int* srclen) { - if (*srclen <= 0) { - return 0; - } - - const uint8* lsrc = *src; - const uint8* Tbl_0 = &st->state_table[st->state0]; - const uint8* Tbl = Tbl_0; - int e; - int eshift = st->entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - *src += 1; - *srclen -= 1; - } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - *src += 2; - *srclen -= 2; - } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range - e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; - Tbl = &Tbl[e << eshift]; // Relative +/- - e = Tbl[lsrc[2]]; - *src += 3; - *srclen -= 3; - }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range - e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; - Tbl = &Tbl[e << eshift]; // Relative +/- - e = Tbl[lsrc[3]]; - *src += 
4; - *srclen -= 4; - } else { // Ill-formed - e = 0; - *src += 1; - *srclen -= 1; - } - return e; -} - -// BigOneByte versions are needed for tables > 240 states, but most -// won't need the TwoByte versions. -bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { - const uint8* lsrc = reinterpret_cast<const uint8*>(src); - const uint8* Tbl_0 = &st.state_table[st.state0]; - const uint8* Tbl = Tbl_0; - int e; - int eshift = st.entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - } else if ((c & 0xe0) == 0xc0) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - } else if ((c & 0xf0) == 0xe0) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range - e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; - Tbl = &Tbl[e << eshift]; // Relative +/- - e = Tbl[lsrc[2]]; - } else { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range - e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; - Tbl = &Tbl[e << eshift]; // Relative +/- - e = Tbl[lsrc[3]]; - } - return e; -} - - -// TwoByte versions are needed for tables > 240 states -// Look up property of one UTF-8 character and advance over it -// Return 0 if input length is zero -// Return 0 and advance one byte if input is ill-formed -uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, - const uint8** src, - int* srclen) { - if (*srclen <= 0) { - return 0; - } - - const uint8* lsrc = *src; - const unsigned short* Tbl_0 = &st->state_table[st->state0]; - const unsigned short* Tbl = Tbl_0; - int e; - int eshift = st->entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - *src += 1; - *srclen -= 1; - } else if (((c & 0xe0) 
== 0xc0) && (*srclen >= 2)) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - *src += 2; - *srclen -= 2; - } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - *src += 3; - *srclen -= 3; - }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[3]]; - *src += 4; - *srclen -= 4; - } else { // Ill-formed - e = 0; - *src += 1; - *srclen -= 1; - } - return e; -} - -// TwoByte versions are needed for tables > 240 states -bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { - const uint8* lsrc = reinterpret_cast<const uint8*>(src); - const unsigned short* Tbl_0 = &st.state_table[st.state0]; - const unsigned short* Tbl = Tbl_0; - int e; - int eshift = st.entry_shift; - - // Short series of tests faster than switch, optimizes 7-bit ASCII - unsigned char c = lsrc[0]; - if (static_cast<signed char>(c) >= 0) { // one byte - e = Tbl[c]; - } else if ((c & 0xe0) == 0xc0) { // two bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - } else if ((c & 0xf0) == 0xe0) { // three bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - } else { // four bytes - e = Tbl[c]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[1]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[2]]; - Tbl = &Tbl_0[e << eshift]; - e = Tbl[lsrc[3]]; - } - return e; -} - - -// Approximate speeds on 2.8 GHz Pentium 4: -// GenericScan 1-byte loop 300 MB/sec * -// GenericScan 4-byte loop 1200 MB/sec -// GenericScan 8-byte loop 2400 MB/sec * -// GenericScanFastAscii 4-byte loop 3000 MB/sec -// GenericScanFastAscii 8-byte loop 3200 MB/sec * -// -// * Implemented below. 
FastAscii loop is memory-bandwidth constrained. - -// Scan a UTF-8 stringpiece based on state table. -// Always scan complete UTF-8 characters -// Set number of bytes scanned. Return reason for exiting -int UTF8GenericScan(const UTF8ScanObj* st, - const StringPiece& str, - int* bytes_consumed) { - int eshift = st->entry_shift; // 6 (space optimized) or 8 - // int nEntries = (1 << eshift); // 64 or 256 entries per state - - const uint8* isrc = - reinterpret_cast<const uint8*>(str.data()); - const uint8* src = isrc; - const int len = str.length(); - const uint8* srclimit = isrc + len; - const uint8* srclimit8 = srclimit - 7; - *bytes_consumed = 0; - if (len == 0) return kExitOK; - - const uint8* Tbl_0 = &st->state_table[st->state0]; - -DoAgain: - // Do state-table scan - int e = 0; - uint8 c; - - // Do fast for groups of 8 identity bytes. - // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop, - // including slowing slightly on cr/lf/ht - //---------------------------- - const uint8* Tbl2 = &st->fast_state[0]; - uint32 losub = st->losub; - uint32 hiadd = st->hiadd; - while (src < srclimit8) { - uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; - uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; - src += 8; - // This is a fast range check for all bytes in [lowsub..0x80-hiadd) - uint32 temp = (s0123 - losub) | (s0123 + hiadd) | - (s4567 - losub) | (s4567 + hiadd); - if ((temp & 0x80808080) != 0) { - // We typically end up here on cr/lf/ht; src was incremented - int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | - (Tbl2[src[-6]] | Tbl2[src[-5]]); - if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange - e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | - (Tbl2[src[-2]] | Tbl2[src[-1]]); - if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange - // Else OK, go around again - } - } - //---------------------------- - - // Byte-at-a-time scan - //---------------------------- - const uint8* Tbl = Tbl_0; - while (src < srclimit) { - c = 
*src; - e = Tbl[c]; - src++; - if (e >= kExitIllegalStructure) {break;} - Tbl = &Tbl_0[e << eshift]; - } - //---------------------------- - - - // Exit possibilities: - // Some exit code, !state0, back up over last char - // Some exit code, state0, back up one byte exactly - // source consumed, !state0, back up over partial char - // source consumed, state0, exit OK - // For illegal byte in state0, avoid backup up over PREVIOUS char - // For truncated last char, back up to beginning of it - - if (e >= kExitIllegalStructure) { - // Back up over exactly one byte of rejected/illegal UTF-8 character - src--; - // Back up more if needed - if (!InStateZero(st, Tbl)) { - do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); - } - } else if (!InStateZero(st, Tbl)) { - // Back up over truncated UTF-8 character - e = kExitIllegalStructure; - do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); - } else { - // Normal termination, source fully consumed - e = kExitOK; - } - - if (e == kExitDoAgain) { - // Loop back up to the fast scan - goto DoAgain; - } - - *bytes_consumed = src - isrc; - return e; -} - -// Scan a UTF-8 stringpiece based on state table. -// Always scan complete UTF-8 characters -// Set number of bytes scanned. 
Return reason for exiting -// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid -int UTF8GenericScanFastAscii(const UTF8ScanObj* st, - const StringPiece& str, - int* bytes_consumed) { - const uint8* isrc = - reinterpret_cast<const uint8*>(str.data()); - const uint8* src = isrc; - const int len = str.length(); - const uint8* srclimit = isrc + len; - const uint8* srclimit8 = srclimit - 7; - *bytes_consumed = 0; - if (len == 0) return kExitOK; - - int n; - int rest_consumed; - int exit_reason; - do { - // Skip 8 bytes of ASCII at a whack; no endianness issue - while ((src < srclimit8) && - (((reinterpret_cast<const uint32*>(src)[0] | - reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { - src += 8; - } - // Run state table on the rest - n = src - isrc; - StringPiece str2(str.data() + n, str.length() - n); - exit_reason = UTF8GenericScan(st, str2, &rest_consumed); - src += rest_consumed; - } while ( exit_reason == kExitDoAgain ); - - *bytes_consumed = src - isrc; - return exit_reason; -} - -// Hack to change halfwidth katakana to match an old UTF8CharToLower() - -// Return number of src bytes skipped -static int DoSpecialFixup(const unsigned char c, - const unsigned char** srcp, const unsigned char* srclimit, - unsigned char** dstp, unsigned char* dstlimit) { - return 0; -} - - -// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece -// and doing text replacements. -// DO NOT CALL DIRECTLY. 
Use UTF8GenericReplace() below -// Needs caller to loop on kExitDoAgain -static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, - const StringPiece& istr, - StringPiece& ostr, - bool is_plain_text, - int* bytes_consumed, - int* bytes_filled, - int* chars_changed, - OffsetMap* offsetmap) { - int eshift = st->entry_shift; - int nEntries = (1 << eshift); // 64 or 256 entries per state - const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); - const int ilen = istr.length(); - const uint8* copystart = isrc; - const uint8* src = isrc; - const uint8* srclimit = src + ilen; - *bytes_consumed = 0; - *bytes_filled = 0; - *chars_changed = 0; - - const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); - const int olen = ostr.length(); - uint8* dst = const_cast<uint8*>(odst); - uint8* dstlimit = dst + olen; - - int total_changed = 0; - - // Invariant condition during replacements: - // remaining dst size >= remaining src size - if ((dstlimit - dst) < (srclimit - src)) { - if (offsetmap != NULL) { - offsetmap->Copy(src - copystart); - copystart = src; - } - return kExitDstSpaceFull; - } - const uint8* Tbl_0 = &st->state_table[st->state0]; - - Do_state_table: - // Do state-table scan, copying as we go - const uint8* Tbl = Tbl_0; - int e = 0; - uint8 c = 0; - - Do_state_table_newe: - - //---------------------------- - while (src < srclimit) { - c = *src; - e = Tbl[c]; - *dst = c; - src++; - dst++; - if (e >= kExitIllegalStructure) {break;} - Tbl = &Tbl_0[e << eshift]; - } - //---------------------------- - - // Exit possibilities: - // Replacement code, do the replacement and loop - // Some other exit code, state0, back up one byte exactly - // Some other exit code, !state0, back up over last char - // source consumed, state0, exit OK - // source consumed, !state0, back up over partial char - // For illegal byte in state0, avoid backup up over PREVIOUS char - // For truncated last char, back up to beginning of it - - if (e >= kExitIllegalStructure) { - 
// Switch on exit code; most loop back to top - int offset = 0; - switch (e) { - // These all make the output string the same size or shorter - // No checking needed - case kExitReplace31: // del 2, add 1 bytes to change - dst -= 2; - if (offsetmap != NULL) { - offsetmap->Copy(src - copystart - 2); - offsetmap->Delete(2); - copystart = src; - } - dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; - total_changed++; - goto Do_state_table; - case kExitReplace32: // del 3, add 2 bytes to change - dst--; - if (offsetmap != NULL) { - offsetmap->Copy(src - copystart - 1); - offsetmap->Delete(1); - copystart = src; - } - dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; - dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; - total_changed++; - goto Do_state_table; - case kExitReplace21: // del 2, add 1 bytes to change - dst--; - if (offsetmap != NULL) { - offsetmap->Copy(src - copystart - 1); - offsetmap->Delete(1); - copystart = src; - } - dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; - total_changed++; - goto Do_state_table; - case kExitReplace3: // update 3 bytes to change - dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; - // Fall into next case - case kExitReplace2: // update 2 bytes to change - dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; - // Fall into next case - case kExitReplace1: // update 1 byte to change - dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; - total_changed++; - goto Do_state_table; - case kExitReplace1S0: // update 1 byte to change, 256-entry state - dst[-1] = (unsigned char)Tbl[c + (256 * 1)]; - total_changed++; - goto Do_state_table; - // These can make the output string longer than the input - case kExitReplaceOffset2: - if ((nEntries != 256) && InStateZero(st, Tbl)) { - // For space-optimized table, we need multiples of 256 bytes - // in state0 and multiples of nEntries in other states - offset += ((unsigned char)Tbl[c + (256 * 2)] << 8); - } else { - offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); - } - // Fall into next 
case - case kExitSpecial: // Apply special fixups [read: hacks] - case kExitReplaceOffset1: - if ((nEntries != 256) && InStateZero(st, Tbl)) { - // For space-optimized table, we need multiples of 256 bytes - // in state0 and multiples of nEntries in other states - offset += (unsigned char)Tbl[c + (256 * 1)]; - } else { - offset += (unsigned char)Tbl[c + (nEntries * 1)]; - } - { - const RemapEntry* re = &st->remap_base[offset]; - int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; - int add_len = re->add_bytes & ~kHtmlPlaintextFlag; - - // Special-case non-HTML replacement of five sensitive entities - // " & ' < > - // 0022 0026 0027 003c 003e - // A replacement creating one of these is expressed as a pair of - // entries, one for HTML output and one for plaintext output. - // The first of the pair has the high bit of add_bytes set. - if (re->add_bytes & kHtmlPlaintextFlag) { - // Use this entry for plain text - if (!is_plain_text) { - // Use very next entry for HTML text (same back/delete length) - re = &st->remap_base[offset + 1]; - add_len = re->add_bytes & ~kHtmlPlaintextFlag; - } - } - - int string_offset = re->bytes_offset; - // After the replacement, need (dstlimit - newdst) >= (srclimit - src) - uint8* newdst = dst - del_len + add_len; - if ((dstlimit - newdst) < (srclimit - src)) { - // Won't fit; don't do the replacement. 
Caller may realloc and retry - e = kExitDstSpaceFull; - break; // exit, backing up over this char for later retry - } - dst -= del_len; - memcpy(dst, &st->remap_string[string_offset], add_len); - dst += add_len; - total_changed++; - if (offsetmap != NULL) { - if (add_len > del_len) { - offsetmap->Copy(src - copystart); - offsetmap->Insert(add_len - del_len); - copystart = src; - } else if (add_len < del_len) { - offsetmap->Copy(src - copystart + add_len - del_len); - offsetmap->Delete(del_len - add_len); - copystart = src; - } - } - if (re->delete_bytes & kReplaceAndResumeFlag) { - // There is a non-zero target state at the end of the - // replacement string - e = st->remap_string[string_offset + add_len]; - Tbl = &Tbl_0[e << eshift]; - goto Do_state_table_newe; - } - } - if (e == kExitRejectAlt) {break;} - if (e != kExitSpecial) {goto Do_state_table;} - - // case kExitSpecial: // Apply special fixups [read: hacks] - // In this routine, do either UTF8CharToLower() - // fullwidth/halfwidth mapping or - // voiced mapping or - // semi-voiced mapping - - // First, do EXIT_REPLACE_OFFSET1 action (above) - // Second: do additional code fixup - { - int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit); - if (offsetmap != NULL) { - if (srcdel != 0) { - offsetmap->Copy(src - copystart - srcdel); - offsetmap->Delete(srcdel); - copystart = src; - } - } - } - goto Do_state_table; - - case kExitIllegalStructure: // structurally illegal byte; quit - case kExitReject: // NUL or illegal code encountered; quit - case kExitRejectAlt: // Apply replacement, then exit - default: // and all other exits - break; - } // End switch (e) - - // Exit possibilities: - // Some other exit code, state0, back up one byte exactly - // Some other exit code, !state0, back up over last char - - // Back up over exactly one byte of rejected/illegal UTF-8 character - src--; - dst--; - // Back up more if needed - if (!InStateZero(st, Tbl)) { - do {src--;dst--;} while ((src > isrc) && ((src[0] & 
0xc0) == 0x80)); - } - } else if (!InStateZero(st, Tbl)) { - // src >= srclimit, !state0 - // Back up over truncated UTF-8 character - e = kExitIllegalStructure; - do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); - } else { - // src >= srclimit, state0 - // Normal termination, source fully consumed - e = kExitOK; - } - - if (offsetmap != NULL) { - if (src > copystart) { - offsetmap->Copy(src - copystart); - copystart = src; - } - } - - // Possible return values here: - // kExitDstSpaceFull caller may realloc and retry from middle - // kExitIllegalStructure caller my overwrite/truncate - // kExitOK all done and happy - // kExitReject caller may overwrite/truncate - // kExitDoAgain LOOP NOT DONE; caller must retry from middle - // (may do fast ASCII loop first) - // kExitPlaceholder -unused- - // kExitNone -unused- - *bytes_consumed = src - isrc; - *bytes_filled = dst - odst; - *chars_changed = total_changed; - return e; -} - -// TwoByte versions are needed for tables > 240 states, such -// as the table for full Unicode 4.1 canonical + compatibility mapping - -// Scan a UTF-8 stringpiece based on state table with two-byte entries, -// copying to output stringpiece -// and doing text replacements. -// DO NOT CALL DIRECTLY. 
// Use UTF8GenericReplaceTwoByte() below
// Needs caller to loop on kExitDoAgain_2
//
//  st             table object; this function reads entry_shift, state0,
//                 state_table, remap_base and remap_string from it
//  istr           input bytes to scan
//  ostr           output buffer; its bytes are written through a const_cast
//                 of ostr.data(). It must be at least as long as istr, else
//                 kExitDstSpaceFull_2 is returned before anything is copied
//  is_plain_text  for paired HTML/plaintext remap entries, true keeps the
//                 first (plaintext) entry and false uses the one after it
//  bytes_consumed, bytes_filled, chars_changed
//                 zeroed on entry; set to the final counts on the normal
//                 return path at the bottom of the function
//  offsetmap      optional; when non-NULL it receives Copy/Insert/Delete
//                 calls describing what was done to the output
// Returns the exit code e; see the list just above the final return.
static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st,
                                             const StringPiece& istr,
                                             StringPiece& ostr,
                                             bool is_plain_text,
                                             int* bytes_consumed,
                                             int* bytes_filled,
                                             int* chars_changed,
                                             OffsetMap* offsetmap) {
  int eshift = st->entry_shift;
  int nEntries = (1 << eshift);       // 64 or 256 entries per state
  const uint8* isrc = reinterpret_cast<const uint8*>(istr.data());
  const int ilen = istr.length();
  const uint8* copystart = isrc;      // First input byte not yet reported
                                      // to offsetmap
  const uint8* src = isrc;
  const uint8* srclimit = src + ilen;
  *bytes_consumed = 0;
  *bytes_filled = 0;
  *chars_changed = 0;

  const uint8* odst = reinterpret_cast<const uint8*>(ostr.data());
  const int olen = ostr.length();
  uint8* dst = const_cast<uint8*>(odst);
  uint8* dstlimit = dst + olen;

  // Redundant: already zeroed a few lines up
  *chars_changed = 0;

  int total_changed = 0;

  // src_lll and dst_lll are assigned here and after the scan loop but are
  // not read anywhere in this function
  int src_lll = srclimit - src;
  int dst_lll = dstlimit - dst;


  // Invariant condition during replacements:
  //  remaining dst size >= remaining src size
  if ((dstlimit - dst) < (srclimit - src)) {
    // Nothing has been consumed yet, so the three output counts stay zero
    if (offsetmap != NULL) {
      offsetmap->Copy(src - copystart);
      copystart = src;
    }
    return kExitDstSpaceFull_2;
  }
  const unsigned short* Tbl_0 = &st->state_table[st->state0];

 Do_state_table_2:
  // Do state-table scan, copying as we go
  // Jumping back to this label restarts the scan in state0
  const unsigned short* Tbl = Tbl_0;
  int e = 0;
  uint8 c = 0;

 Do_state_table_newe_2:
  // Jumping back to this label keeps whatever Tbl was just set to

  //----------------------------
  while (src < srclimit) {
    c = *src;
    e = Tbl[c];
    *dst = c;
    src++;
    dst++;
    if (e >= kExitIllegalStructure_2) {break;}
    Tbl = &Tbl_0[e << eshift];
  }
  //----------------------------
  src_lll = src - isrc;
  dst_lll = dst - odst;

  // Exit possibilities:
  //  Replacement code, do the replacement and loop
  //  Some other exit code, state0, back up one byte exactly
  //  Some other exit code, !state0, back up over last char
  //  source consumed, state0, exit OK
  //  source consumed, !state0, back up over partial char
  // For illegal byte in state0, avoid backing up over PREVIOUS char
  // For truncated last char, back up to beginning of it

  if (e >= kExitIllegalStructure_2) {
    // Switch on exit code; most loop back to top
    // In every replacement case below, the bytes being replaced have already
    // been copied to the output, so they are patched in place via dst[-k]
    int offset = 0;
    switch (e) {
    // These all make the output string the same size or shorter
    // No checking needed
    case kExitReplace31_2:    // shrink output by 2 bytes, rewrite last byte
      dst -= 2;
      if (offsetmap != NULL) {
        offsetmap->Copy(src - copystart - 2);
        offsetmap->Delete(2);
        copystart = src;
      }
      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
      total_changed++;
      goto Do_state_table_2;
    case kExitReplace32_2:    // shrink output by 1 byte, rewrite last 2 bytes
      dst--;
      if (offsetmap != NULL) {
        offsetmap->Copy(src - copystart - 1);
        offsetmap->Delete(1);
        copystart = src;
      }
      dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
      total_changed++;
      goto Do_state_table_2;
    case kExitReplace21_2:    // shrink output by 1 byte, rewrite last byte
      dst--;
      if (offsetmap != NULL) {
        offsetmap->Copy(src - copystart - 1);
        offsetmap->Delete(1);
        copystart = src;
      }
      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
      total_changed++;
      goto Do_state_table_2;
    case kExitReplace3_2:    // update 3 bytes to change
      dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff);
      // Fall into next case
    case kExitReplace2_2:    // update 2 bytes to change
      dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff);
      // Fall into next case
    case kExitReplace1_2:    // update 1 byte to change
      dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
      total_changed++;
      goto Do_state_table_2;
    case kExitReplace1S0_2:     // update 1 byte to change, 256-entry state
      dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
      total_changed++;
      goto Do_state_table_2;
    // These can make the output string longer than the input
    case kExitReplaceOffset2_2:
      // Contributes the high byte of the remap_base index
      if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
        // For space-optimized table, we need multiples of 256 bytes
        // in state0 and multiples of nEntries in other states
        offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8);
      } else {
        offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8);
      }
      // Fall into next case
    case kExitReplaceOffset1_2:
      // Contributes the low byte of the remap_base index
      if ((nEntries != 256) && InStateZero_2(st, Tbl)) {
        // For space-optimized table, we need multiples of 256 bytes
        // in state0 and multiples of nEntries in other states
        offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff);
      } else {
        offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff);
      }
      {
        const RemapEntry* re = &st->remap_base[offset];
        int del_len = re->delete_bytes & ~kReplaceAndResumeFlag;
        int add_len = re->add_bytes & ~kHtmlPlaintextFlag;
        // Special-case non-HTML replacement of five sensitive entities
        //   " & ' < >
        //   0022 0026 0027 003c 003e
        // A replacement creating one of these is expressed as a pair of
        // entries, one for HTML output and one for plaintext output.
        // The first of the pair has the high bit of add_bytes set.
        if (re->add_bytes & kHtmlPlaintextFlag) {
          // Use this entry for plain text
          if (!is_plain_text) {
            // Use very next entry for HTML text (same back/delete length)
            re = &st->remap_base[offset + 1];
            add_len = re->add_bytes & ~kHtmlPlaintextFlag;
          }
        }

        // Start of the replacement text within st->remap_string
        int string_offset = re->bytes_offset;
        // After the replacement, need (dstlimit - newdst) >= (srclimit - src)
        uint8* newdst = dst - del_len + add_len;
        if ((dstlimit - newdst) < (srclimit - src)) {
          // Won't fit; don't do the replacement. Caller may realloc and retry
          e = kExitDstSpaceFull_2;
          break;    // exit, backing up over this char for later retry
        }
        dst -= del_len;
        memcpy(dst, &st->remap_string[string_offset], add_len);
        dst += add_len;
        if (offsetmap != NULL) {
          // Equal lengths need no entry here; those bytes are reported by a
          // later Copy because copystart is left unchanged
          if (add_len > del_len) {
            offsetmap->Copy(src - copystart);
            offsetmap->Insert(add_len - del_len);
            copystart = src;
          } else if (add_len < del_len) {
            offsetmap->Copy(src - copystart + add_len - del_len);
            offsetmap->Delete(del_len - add_len);
            copystart = src;
          }
        }
        if (re->delete_bytes & kReplaceAndResumeFlag) {
          // There is a two-byte non-zero target state at the end of the
          // replacement string
          uint8 c1 = st->remap_string[string_offset + add_len];
          uint8 c2 = st->remap_string[string_offset + add_len + 1];
          e = (c1 << 8) | c2;
          Tbl = &Tbl_0[e << eshift];
          total_changed++;
          goto Do_state_table_newe_2;
        }
      }
      total_changed++;
      if (e == kExitRejectAlt_2) {break;}
      goto Do_state_table_2;

    case kExitSpecial_2:            // NO special fixups [read: hacks]
    case kExitIllegalStructure_2:   // structurally illegal byte; quit
    case kExitReject_2:             // NUL or illegal code encountered; quit
      // and all other exits
    default:
      break;
    }   // End switch (e)

    // Exit possibilities:
    //  Some other exit code, state0, back up one byte exactly
    //  Some other exit code, !state0, back up over last char

    // Back up over exactly one byte of rejected/illegal UTF-8 character
    src--;
    dst--;
    // Back up more if needed
    if (!InStateZero_2(st, Tbl)) {
      do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
    }
  } else if (!InStateZero_2(st, Tbl)) {
    // src >= srclimit, !state0
    // Back up over truncated UTF-8 character
    e = kExitIllegalStructure_2;

    do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
  } else {
    // src >= srclimit, state0
    // Normal termination, source fully consumed
    e = kExitOK_2;
  }

  // Report any input bytes not yet passed to offsetmap
  if (offsetmap != NULL) {
    if (src > copystart) {
      offsetmap->Copy(src - copystart);
      copystart = src;
    }
  }


  // Possible return values here:
  //  kExitDstSpaceFull_2         caller may realloc and retry from middle
  //  kExitIllegalStructure_2     caller may overwrite/truncate
  //  kExitOK_2                   all done and happy
  //  kExitReject_2               caller may overwrite/truncate
  //  kExitDoAgain_2              LOOP NOT DONE; caller must retry from middle
  //                            (may do fast ASCII loop first)
  //  kExitPlaceholder_2          -unused-
  //  kExitNone_2                 -unused-
  *bytes_consumed = src - isrc;
  *bytes_filled = dst - odst;
  *chars_changed = total_changed;
  return e;
}


// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece
//   and doing text replacements.
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
//
// Calls UTF8GenericReplaceInternal() repeatedly for as long as it returns
// kExitDoAgain. Each pass works on local copies of istr and ostr that are
// advanced past what the previous pass consumed and filled; the three output
// counts are the sums over all passes.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       bool is_plain_text,
                       int* bytes_consumed,
                       int* bytes_filled,
                       int* chars_changed,
                       OffsetMap* offsetmap) {
  StringPiece local_istr(istr.data(), istr.length());
  StringPiece local_ostr(ostr.data(), ostr.length());
  int total_consumed = 0;
  int total_filled = 0;
  int total_changed = 0;
  int local_bytes_consumed, local_bytes_filled, local_chars_changed;
  int e;
  do {
    e = UTF8GenericReplaceInternal(st,
                    local_istr, local_ostr, is_plain_text,
                    &local_bytes_consumed, &local_bytes_filled,
                    &local_chars_changed,
                    offsetmap);
    local_istr.remove_prefix(local_bytes_consumed);
    local_ostr.remove_prefix(local_bytes_filled);
    total_consumed += local_bytes_consumed;
    total_filled += local_bytes_filled;
    total_changed += local_chars_changed;
  } while ( e == kExitDoAgain );
  *bytes_consumed = total_consumed;
  *bytes_filled = total_filled;
  *chars_changed = total_changed;
  return e;
}

// Older version without offsetmap
// Forwards to the full overload with a NULL offsetmap.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       bool is_plain_text,
                       int* bytes_consumed,
                       int* bytes_filled,
                       int* chars_changed) {
  return UTF8GenericReplace(st,
                            istr,
                            ostr,
                            is_plain_text,
                            bytes_consumed,
                            bytes_filled,
                            chars_changed,
                            NULL);
}

// Older version without is_plain_text or offsetmap
// Forwards to the full overload with is_plain_text = false and a NULL
// offsetmap.
int UTF8GenericReplace(const UTF8ReplaceObj* st,
                       const StringPiece& istr,
                       StringPiece& ostr,
                       int* bytes_consumed,
                       int* bytes_filled,
                       int* chars_changed) {
  bool is_plain_text = false;
  return UTF8GenericReplace(st,
                            istr,
                            ostr,
                            is_plain_text,
                            bytes_consumed,
                            bytes_filled,
                            chars_changed,
                            NULL);
}

// Scan a UTF-8 stringpiece based on state table with two-byte entries,
//   copying to output stringpiece
//   and doing text replacements.
// Also writes an optional OffsetMap. Pass NULL to skip writing one.
// Always scan complete UTF-8 characters
// Set number of bytes consumed from input, number filled to output.
// Return reason for exiting
//
// Calls UTF8GenericReplaceInternalTwoByte() repeatedly for as long as it
// returns kExitDoAgain_2, advancing local copies of istr and ostr between
// passes and summing the three output counts.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                              const StringPiece& istr,
                              StringPiece& ostr,
                              bool is_plain_text,
                              int* bytes_consumed,
                              int* bytes_filled,
                              int* chars_changed,
                              OffsetMap* offsetmap) {
  StringPiece local_istr(istr.data(), istr.length());
  StringPiece local_ostr(ostr.data(), ostr.length());
  int total_consumed = 0;
  int total_filled = 0;
  int total_changed = 0;
  int local_bytes_consumed, local_bytes_filled, local_chars_changed;
  int e;
  do {
    e = UTF8GenericReplaceInternalTwoByte(st,
                    local_istr, local_ostr, is_plain_text,
                    &local_bytes_consumed,
                    &local_bytes_filled,
                    &local_chars_changed,
                    offsetmap);
    local_istr.remove_prefix(local_bytes_consumed);
    local_ostr.remove_prefix(local_bytes_filled);
    total_consumed += local_bytes_consumed;
    total_filled += local_bytes_filled;
    total_changed += local_chars_changed;
  } while ( e == kExitDoAgain_2 );
  *bytes_consumed = total_consumed;
  *bytes_filled = total_filled;
  *chars_changed = total_changed;

  // Rebase the exit code from the k..._2 numbering onto the one-byte-table
  // numbering, so the caller sees the same codes as from UTF8GenericReplace()
  return e - kExitOK_2 + kExitOK;
}

// Older version without offsetmap
// Forwards to the full overload with a NULL offsetmap.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                              const StringPiece& istr,
                              StringPiece& ostr,
                              bool is_plain_text,
                              int* bytes_consumed,
                              int* bytes_filled,
                              int* chars_changed) {
  return UTF8GenericReplaceTwoByte(st,
                                   istr,
                                   ostr,
                                   is_plain_text,
                                   bytes_consumed,
                                   bytes_filled,
                                   chars_changed,
                                   NULL);
}

// Older version without is_plain_text or offsetmap
// Forwards to the full overload with is_plain_text = false and a NULL
// offsetmap.
int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st,
                              const StringPiece& istr,
                              StringPiece& ostr,
                              int* bytes_consumed,
                              int* bytes_filled,
                              int* chars_changed) {
  bool is_plain_text = false;
  return UTF8GenericReplaceTwoByte(st,
                                   istr,
                                   ostr,
                                   is_plain_text,
                                   bytes_consumed,
                                   bytes_filled,
                                   chars_changed,
                                   NULL);
}



// Adjust a stringpiece to encompass complete UTF-8 characters.
// The data pointer will be increased by 0..3 bytes to get to a character
// boundary, and the length will then be decreased by 0..3 bytes
// to encompass the last complete character.
-void UTF8TrimToChars(StringPiece* istr) { - const char* src = istr->data(); - int len = istr->length(); - // Exit if empty string - if (len == 0) { - return; - } - - // Exit on simple, common case - if ( ((src[0] & 0xc0) != 0x80) && - (static_cast<signed char>(src[len - 1]) >= 0) ) { - // First byte is not a continuation and last byte is 7-bit ASCII -- done - return; - } - - // Adjust the back end, len > 0 - const char* srclimit = src + len; - // Backscan over any ending continuation bytes to find last char start - const char* s = srclimit - 1; // Last byte of the string - while ((src <= s) && ((*s & 0xc0) == 0x80)) { - s--; - } - // Include entire last char if it fits - if (src <= s) { - int last_char_len = UTF8OneCharLen(s); - if (s + last_char_len <= srclimit) { - // Last char fits, so include it, else exclude it - s += last_char_len; - } - } - if (s != srclimit) { - // s is one byte beyond the last full character, if any - istr->remove_suffix(srclimit - s); - // Exit if now empty string - if (istr->length() == 0) { - return; - } - } - - // Adjust the front end, len > 0 - len = istr->length(); - srclimit = src + len; - s = src; // First byte of the string - // Scan over any beginning continuation bytes to find first char start - while ((s < srclimit) && ((*s & 0xc0) == 0x80)) { - s++; - } - if (s != src) { - // s is at the first full character, if any - istr->remove_prefix(s - src); - } -} - -} // End namespace CLD2 |