diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/utf8statetable.cc')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/utf8statetable.cc | 1369 |
1 files changed, 1369 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/utf8statetable.cc b/application/basilisk/components/translation/cld2/internal/utf8statetable.cc new file mode 100644 index 000000000..8c97123d8 --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/utf8statetable.cc @@ -0,0 +1,1369 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// State Table follower for scanning UTF-8 strings without converting to +// 32- or 16-bit Unicode values. +// + +#ifdef COMPILER_MSVC +// MSVC warns: warning C4309: 'initializing' : truncation of constant value +// But the value is in fact not truncated. 0xFF still comes out 0xFF at +// runtime. +#pragma warning ( disable : 4309 ) +#endif + +#include "utf8statetable.h" + +#include <stdint.h> // for uintptr_t +#include <string.h> // for NULL, memcpy, memmove + +#include "integral_types.h" // for uint8, uint32, int8 +#include "stringpiece.h" +#include "offsetmap.h" + + +namespace CLD2 { + +static const int kReplaceAndResumeFlag = 0x80; // Bit in del byte to distinguish + // optional next-state field + // after replacement text +static const int kHtmlPlaintextFlag = 0x80; // Bit in add byte to distinguish + // HTML replacement vs. plaintext + + +/** + * This code implements a little interpreter for UTF8 state + * tables. There are three kinds of quite-similar state tables, + * property, scanning, and replacement. Each state in one of + * these tables consists of an array of 256 or 64 one-byte + * entries. The state is subscripted by an incoming source byte, + * and the entry either specifies the next state or specifies an + * action. Space-optimized tables have full 256-entry states for + * the first byte of a UTF-8 character, but only 64-entry states + * for continuation bytes. Space-optimized tables may only be + * used with source input that has been checked to be + * structurally- (or stronger interchange-) valid. + * + * A property state table has an unsigned one-byte property for + * each possible UTF-8 character. One-byte character properties + * are in the state[0] array, while for other lengths the + * state[0] array gives the next state, which contains the + * property value for two-byte characters or yet another state + * for longer ones. The code simply loads the right number of + * next-state values, then returns the final byte as property + * value. There are no actions specified in property tables. + * States are typically shared for multi-byte UTF-8 characters + * that all have the same property value. + * + * A scanning state table has entries that are either a + * next-state specifier for bytes that are accepted by the + * scanner, or an exit action for the last byte of each + * character that is rejected by the scanner. + * + * Scanning long strings involves a tight loop that picks up one + * byte at a time and follows next-state value back to state[0] + * for each accepted UTF-8 character. Scanning stops at the end + * of the string or at the first character encountered that has + * an exit action such as "reject". Timing information is given + * below. + * + * Since so much of Google's text is 7-bit-ASCII values + * (approximately 94% of the bytes of web documents), the + * scanning interpreter has two speed optimizations. One checks + * 8 bytes at a time to see if they are all in the range lo..hi, + * as specified in constants in the overall statetable object. + * The check involves ORing together four 4-byte values that + * overflow into the high bit of some byte when a byte is out of + * range. For seven-bit-ASCII, lo is 0x20 and hi is 0x7E. This + * loop is about 8x faster than the one-byte-at-a-time loop. + * + * If checking for exit bytes in the 0x00-0x1F and 7F range is + * unneeded, an even faster loop just looks at the high bits of + * 8 bytes at once, and is about 1.33x faster than the lo..hi + * loop. + * + * Exit from the scanning routines backs up to the first byte of + * the rejected character, so the text spanned is always a + * complete number of UTF-8 characters. The normal scanning exit + * is at the first rejected character, or at the end of the + * input text. Scanning also exits on any detected ill-formed + * character or at a special do-again action built into some + * exit-optimized tables. The do-again action gets back to the + * top of the scanning loop to retry eight-byte ASCII scans. It + * is typically put into state tables after four seven-bit-ASCII + * characters in a row are seen, to allow restarting the fast + * scan after some slower processing of multi-byte characters. + * + * A replacement state table is similar to a scanning state + * table but has more extensive actions. The default + * byte-at-a-time loop copies one byte from source to + * destination and goes to the next state. The replacement + * actions overwrite 1-3 bytes of the destination with different + * bytes, possibly shortening the output by 1 or 2 bytes. The + * replacement bytes come from within the state table, from + * dummy states inserted just after any state that contains a + * replacement action. This gives a quick address calculation for + * the replacement byte(s) and gives some cache locality. + * + * Additional replacement actions use one or two bytes from + * within dummy states to index a side table of more-extensive + * replacements. The side table specifies a length of 0..15 + * destination bytes to overwrite and a length of 0..127 bytes + * to overwrite them with, plus the actual replacement bytes. + * + * This side table uses one extra bit to specify a pair of + * replacements, the first to be used in an HTML context and the + * second to be used in a plaintext context. This allows + * replacements that are spelled with "<" in the former + * context and "<" in the latter. + * + * The side table also uses an extra bit to specify a non-zero + * next state after a replacement. This allows a combination + * replacement and state change, used to implement a limited + * version of the Boyer-Moore algorithm for multi-character + * replacement without backtracking. This is useful when there + * are overlapping replacements, such as ch => x and also c => + * y, the latter to be used only if the character after c is not + * h. in this case, the state[0] table's entry for c would + * change c to y and also have a next-state of say n, and the + * state[n] entry for h would specify a replacement of the two + * bytes yh by x. No backtracking is needed. + * + * A replacement table may also include the exit actions of a + * scanning state table, so some character sequences can + * terminate early. + * + * During replacement, an optional data structure called an + * offset map can be updated to reflect each change in length + * between source and destination. This offset map can later be + * used to map destination-string offsets to corresponding + * source-string offsets or vice versa. + * + * The routines below also have variants in which state-table + * entries are all two bytes instead of one byte. This allows + * tables with more than 240 total states, but takes up twice as + * much space per state. + * +**/ + +// Return true if current Tbl pointer is within state0 range +// Note that unsigned compare checks both ends of range simultaneously +static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { + const uint8* Tbl0 = &st->state_table[st->state0]; + return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); +} + +static inline bool InStateZero_2(const UTF8ReplaceObj_2* st, + const unsigned short int* Tbl) { + const unsigned short int* Tbl0 = &st->state_table[st->state0]; + // Word difference, not byte difference + return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); +} + +// UTF8PropObj, UTF8ScanObj, UTF8ReplaceObj are all typedefs of +// UTF8MachineObj. + +static bool IsPropObj(const UTF8StateMachineObj& obj) { + return obj.fast_state == NULL + && obj.max_expand == 0; +} + +static bool IsPropObj_2(const UTF8StateMachineObj_2& obj) { + return obj.fast_state == NULL + && obj.max_expand == 0; +} + +static bool IsScanObj(const UTF8StateMachineObj& obj) { + return obj.fast_state != NULL + && obj.max_expand == 0; +} + +static bool IsReplaceObj(const UTF8StateMachineObj& obj) { + // Normally, obj.fast_state != NULL, but the handwritten tables + // in utf8statetable_unittest don't handle fast_states. + return obj.max_expand > 0; +} + +static bool IsReplaceObj_2(const UTF8StateMachineObj_2& obj) { + return obj.max_expand > 0; +} + +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericProperty(const UTF8PropObj* st, + const uint8** src, + int* srclen) { + if (*srclen <= 0) { + return 0; + } + + const uint8* lsrc = *src; + const uint8* Tbl_0 = &st->state_table[st->state0]; + const uint8* Tbl = Tbl_0; + int e; + int eshift = st->entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + *src += 1; + *srclen -= 1; + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + *src += 2; + *srclen -= 2; + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + *src += 3; + *srclen -= 3; + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[3]]; + *src += 4; + *srclen -= 4; + } else { // Ill-formed + e = 0; + *src += 1; + *srclen -= 1; + } + return e; +} + +bool UTF8HasGenericProperty(const UTF8PropObj& st, const char* src) { + const uint8* lsrc = reinterpret_cast<const uint8*>(src); + const uint8* Tbl_0 = &st.state_table[st.state0]; + const uint8* Tbl = Tbl_0; + int e; + int eshift = st.entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + } else if ((c & 0xe0) == 0xc0) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + } else if ((c & 0xf0) == 0xe0) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + } else { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[3]]; + } + return e; +} + + +// BigOneByte versions are needed for tables > 240 states, but most +// won't need the TwoByte versions. +// Internally, to next-to-last offset is multiplied by 16 and the last +// offset is relative instead of absolute. +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, + const uint8** src, + int* srclen) { + if (*srclen <= 0) { + return 0; + } + + const uint8* lsrc = *src; + const uint8* Tbl_0 = &st->state_table[st->state0]; + const uint8* Tbl = Tbl_0; + int e; + int eshift = st->entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + *src += 1; + *srclen -= 1; + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + *src += 2; + *srclen -= 2; + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; + Tbl = &Tbl[e << eshift]; // Relative +/- + e = Tbl[lsrc[2]]; + *src += 3; + *srclen -= 3; + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; + Tbl = &Tbl[e << eshift]; // Relative +/- + e = Tbl[lsrc[3]]; + *src += 4; + *srclen -= 4; + } else { // Ill-formed + e = 0; + *src += 1; + *srclen -= 1; + } + return e; +} + +// BigOneByte versions are needed for tables > 240 states, but most +// won't need the TwoByte versions. +bool UTF8HasGenericPropertyBigOneByte(const UTF8PropObj& st, const char* src) { + const uint8* lsrc = reinterpret_cast<const uint8*>(src); + const uint8* Tbl_0 = &st.state_table[st.state0]; + const uint8* Tbl = Tbl_0; + int e; + int eshift = st.entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + } else if ((c & 0xe0) == 0xc0) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + } else if ((c & 0xf0) == 0xe0) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; + Tbl = &Tbl[e << eshift]; // Relative +/- + e = Tbl[lsrc[2]]; + } else { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range + e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; + Tbl = &Tbl[e << eshift]; // Relative +/- + e = Tbl[lsrc[3]]; + } + return e; +} + + +// TwoByte versions are needed for tables > 240 states +// Look up property of one UTF-8 character and advance over it +// Return 0 if input length is zero +// Return 0 and advance one byte if input is ill-formed +uint8 UTF8GenericPropertyTwoByte(const UTF8PropObj_2* st, + const uint8** src, + int* srclen) { + if (*srclen <= 0) { + return 0; + } + + const uint8* lsrc = *src; + const unsigned short* Tbl_0 = &st->state_table[st->state0]; + const unsigned short* Tbl = Tbl_0; + int e; + int eshift = st->entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + *src += 1; + *srclen -= 1; + } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + *src += 2; + *srclen -= 2; + } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + *src += 3; + *srclen -= 3; + }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[3]]; + *src += 4; + *srclen -= 4; + } else { // Ill-formed + e = 0; + *src += 1; + *srclen -= 1; + } + return e; +} + +// TwoByte versions are needed for tables > 240 states +bool UTF8HasGenericPropertyTwoByte(const UTF8PropObj_2& st, const char* src) { + const uint8* lsrc = reinterpret_cast<const uint8*>(src); + const unsigned short* Tbl_0 = &st.state_table[st.state0]; + const unsigned short* Tbl = Tbl_0; + int e; + int eshift = st.entry_shift; + + // Short series of tests faster than switch, optimizes 7-bit ASCII + unsigned char c = lsrc[0]; + if (static_cast<signed char>(c) >= 0) { // one byte + e = Tbl[c]; + } else if ((c & 0xe0) == 0xc0) { // two bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + } else if ((c & 0xf0) == 0xe0) { // three bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + } else { // four bytes + e = Tbl[c]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[1]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[2]]; + Tbl = &Tbl_0[e << eshift]; + e = Tbl[lsrc[3]]; + } + return e; +} + + +// Approximate speeds on 2.8 GHz Pentium 4: +// GenericScan 1-byte loop 300 MB/sec * +// GenericScan 4-byte loop 1200 MB/sec +// GenericScan 8-byte loop 2400 MB/sec * +// GenericScanFastAscii 4-byte loop 3000 MB/sec +// GenericScanFastAscii 8-byte loop 3200 MB/sec * +// +// * Implemented below. FastAscii loop is memory-bandwidth constrained. + +// Scan a UTF-8 stringpiece based on state table. +// Always scan complete UTF-8 characters +// Set number of bytes scanned. Return reason for exiting +int UTF8GenericScan(const UTF8ScanObj* st, + const StringPiece& str, + int* bytes_consumed) { + int eshift = st->entry_shift; // 6 (space optimized) or 8 + // int nEntries = (1 << eshift); // 64 or 256 entries per state + + const uint8* isrc = + reinterpret_cast<const uint8*>(str.data()); + const uint8* src = isrc; + const int len = str.length(); + const uint8* srclimit = isrc + len; + const uint8* srclimit8 = srclimit - 7; + *bytes_consumed = 0; + if (len == 0) return kExitOK; + + const uint8* Tbl_0 = &st->state_table[st->state0]; + +DoAgain: + // Do state-table scan + int e = 0; + uint8 c; + + // Do fast for groups of 8 identity bytes. + // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop, + // including slowing slightly on cr/lf/ht + //---------------------------- + const uint8* Tbl2 = &st->fast_state[0]; + uint32 losub = st->losub; + uint32 hiadd = st->hiadd; + while (src < srclimit8) { + uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; + uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; + src += 8; + // This is a fast range check for all bytes in [lowsub..0x80-hiadd) + uint32 temp = (s0123 - losub) | (s0123 + hiadd) | + (s4567 - losub) | (s4567 + hiadd); + if ((temp & 0x80808080) != 0) { + // We typically end up here on cr/lf/ht; src was incremented + int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | + (Tbl2[src[-6]] | Tbl2[src[-5]]); + if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange + e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | + (Tbl2[src[-2]] | Tbl2[src[-1]]); + if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange + // Else OK, go around again + } + } + //---------------------------- + + // Byte-at-a-time scan + //---------------------------- + const uint8* Tbl = Tbl_0; + while (src < srclimit) { + c = *src; + e = Tbl[c]; + src++; + if (e >= kExitIllegalStructure) {break;} + Tbl = &Tbl_0[e << eshift]; + } + //---------------------------- + + + // Exit possibilities: + // Some exit code, !state0, back up over last char + // Some exit code, state0, back up one byte exactly + // source consumed, !state0, back up over partial char + // source consumed, state0, exit OK + // For illegal byte in state0, avoid backup up over PREVIOUS char + // For truncated last char, back up to beginning of it + + if (e >= kExitIllegalStructure) { + // Back up over exactly one byte of rejected/illegal UTF-8 character + src--; + // Back up more if needed + if (!InStateZero(st, Tbl)) { + do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } + } else if (!InStateZero(st, Tbl)) { + // Back up over truncated UTF-8 character + e = kExitIllegalStructure; + do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } else { + // Normal termination, source fully consumed + e = kExitOK; + } + + if (e == kExitDoAgain) { + // Loop back up to the fast scan + goto DoAgain; + } + + *bytes_consumed = src - isrc; + return e; +} + +// Scan a UTF-8 stringpiece based on state table. +// Always scan complete UTF-8 characters +// Set number of bytes scanned. Return reason for exiting +// OPTIMIZED for case of 7-bit ASCII 0000..007f all valid +int UTF8GenericScanFastAscii(const UTF8ScanObj* st, + const StringPiece& str, + int* bytes_consumed) { + const uint8* isrc = + reinterpret_cast<const uint8*>(str.data()); + const uint8* src = isrc; + const int len = str.length(); + const uint8* srclimit = isrc + len; + const uint8* srclimit8 = srclimit - 7; + *bytes_consumed = 0; + if (len == 0) return kExitOK; + + int n; + int rest_consumed; + int exit_reason; + do { + // Skip 8 bytes of ASCII at a whack; no endianness issue + while ((src < srclimit8) && + (((reinterpret_cast<const uint32*>(src)[0] | + reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { + src += 8; + } + // Run state table on the rest + n = src - isrc; + StringPiece str2(str.data() + n, str.length() - n); + exit_reason = UTF8GenericScan(st, str2, &rest_consumed); + src += rest_consumed; + } while ( exit_reason == kExitDoAgain ); + + *bytes_consumed = src - isrc; + return exit_reason; +} + +// Hack to change halfwidth katakana to match an old UTF8CharToLower() + +// Return number of src bytes skipped +static int DoSpecialFixup(const unsigned char c, + const unsigned char** srcp, const unsigned char* srclimit, + unsigned char** dstp, unsigned char* dstlimit) { + return 0; +} + + +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece +// and doing text replacements. +// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below +// Needs caller to loop on kExitDoAgain +static int UTF8GenericReplaceInternal(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap) { + int eshift = st->entry_shift; + int nEntries = (1 << eshift); // 64 or 256 entries per state + const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); + const int ilen = istr.length(); + const uint8* copystart = isrc; + const uint8* src = isrc; + const uint8* srclimit = src + ilen; + *bytes_consumed = 0; + *bytes_filled = 0; + *chars_changed = 0; + + const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); + const int olen = ostr.length(); + uint8* dst = const_cast<uint8*>(odst); + uint8* dstlimit = dst + olen; + + int total_changed = 0; + + // Invariant condition during replacements: + // remaining dst size >= remaining src size + if ((dstlimit - dst) < (srclimit - src)) { + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart); + copystart = src; + } + return kExitDstSpaceFull; + } + const uint8* Tbl_0 = &st->state_table[st->state0]; + + Do_state_table: + // Do state-table scan, copying as we go + const uint8* Tbl = Tbl_0; + int e = 0; + uint8 c = 0; + + Do_state_table_newe: + + //---------------------------- + while (src < srclimit) { + c = *src; + e = Tbl[c]; + *dst = c; + src++; + dst++; + if (e >= kExitIllegalStructure) {break;} + Tbl = &Tbl_0[e << eshift]; + } + //---------------------------- + + // Exit possibilities: + // Replacement code, do the replacement and loop + // Some other exit code, state0, back up one byte exactly + // Some other exit code, !state0, back up over last char + // source consumed, state0, exit OK + // source consumed, !state0, back up over partial char + // For illegal byte in state0, avoid backup up over PREVIOUS char + // For truncated last char, back up to beginning of it + + if (e >= kExitIllegalStructure) { + // Switch on exit code; most loop back to top + int offset = 0; + switch (e) { + // These all make the output string the same size or shorter + // No checking needed + case kExitReplace31: // del 2, add 1 bytes to change + dst -= 2; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 2); + offsetmap->Delete(2); + copystart = src; + } + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; + total_changed++; + goto Do_state_table; + case kExitReplace32: // del 3, add 2 bytes to change + dst--; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 1); + offsetmap->Delete(1); + copystart = src; + } + dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; + total_changed++; + goto Do_state_table; + case kExitReplace21: // del 2, add 1 bytes to change + dst--; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 1); + offsetmap->Delete(1); + copystart = src; + } + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; + total_changed++; + goto Do_state_table; + case kExitReplace3: // update 3 bytes to change + dst[-3] = (unsigned char)Tbl[c + (nEntries * 3)]; + // Fall into next case + case kExitReplace2: // update 2 bytes to change + dst[-2] = (unsigned char)Tbl[c + (nEntries * 2)]; + // Fall into next case + case kExitReplace1: // update 1 byte to change + dst[-1] = (unsigned char)Tbl[c + (nEntries * 1)]; + total_changed++; + goto Do_state_table; + case kExitReplace1S0: // update 1 byte to change, 256-entry state + dst[-1] = (unsigned char)Tbl[c + (256 * 1)]; + total_changed++; + goto Do_state_table; + // These can make the output string longer than the input + case kExitReplaceOffset2: + if ((nEntries != 256) && InStateZero(st, Tbl)) { + // For space-optimized table, we need multiples of 256 bytes + // in state0 and multiples of nEntries in other states + offset += ((unsigned char)Tbl[c + (256 * 2)] << 8); + } else { + offset += ((unsigned char)Tbl[c + (nEntries * 2)] << 8); + } + // Fall into next case + case kExitSpecial: // Apply special fixups [read: hacks] + case kExitReplaceOffset1: + if ((nEntries != 256) && InStateZero(st, Tbl)) { + // For space-optimized table, we need multiples of 256 bytes + // in state0 and multiples of nEntries in other states + offset += (unsigned char)Tbl[c + (256 * 1)]; + } else { + offset += (unsigned char)Tbl[c + (nEntries * 1)]; + } + { + const RemapEntry* re = &st->remap_base[offset]; + int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; + int add_len = re->add_bytes & ~kHtmlPlaintextFlag; + + // Special-case non-HTML replacement of five sensitive entities + // " & ' < > + // 0022 0026 0027 003c 003e + // A replacement creating one of these is expressed as a pair of + // entries, one for HTML output and one for plaintext output. + // The first of the pair has the high bit of add_bytes set. + if (re->add_bytes & kHtmlPlaintextFlag) { + // Use this entry for plain text + if (!is_plain_text) { + // Use very next entry for HTML text (same back/delete length) + re = &st->remap_base[offset + 1]; + add_len = re->add_bytes & ~kHtmlPlaintextFlag; + } + } + + int string_offset = re->bytes_offset; + // After the replacement, need (dstlimit - newdst) >= (srclimit - src) + uint8* newdst = dst - del_len + add_len; + if ((dstlimit - newdst) < (srclimit - src)) { + // Won't fit; don't do the replacement. Caller may realloc and retry + e = kExitDstSpaceFull; + break; // exit, backing up over this char for later retry + } + dst -= del_len; + memcpy(dst, &st->remap_string[string_offset], add_len); + dst += add_len; + total_changed++; + if (offsetmap != NULL) { + if (add_len > del_len) { + offsetmap->Copy(src - copystart); + offsetmap->Insert(add_len - del_len); + copystart = src; + } else if (add_len < del_len) { + offsetmap->Copy(src - copystart + add_len - del_len); + offsetmap->Delete(del_len - add_len); + copystart = src; + } + } + if (re->delete_bytes & kReplaceAndResumeFlag) { + // There is a non-zero target state at the end of the + // replacement string + e = st->remap_string[string_offset + add_len]; + Tbl = &Tbl_0[e << eshift]; + goto Do_state_table_newe; + } + } + if (e == kExitRejectAlt) {break;} + if (e != kExitSpecial) {goto Do_state_table;} + + // case kExitSpecial: // Apply special fixups [read: hacks] + // In this routine, do either UTF8CharToLower() + // fullwidth/halfwidth mapping or + // voiced mapping or + // semi-voiced mapping + + // First, do EXIT_REPLACE_OFFSET1 action (above) + // Second: do additional code fixup + { + int srcdel = DoSpecialFixup(c, &src, srclimit, &dst, dstlimit); + if (offsetmap != NULL) { + if (srcdel != 0) { + offsetmap->Copy(src - copystart - srcdel); + offsetmap->Delete(srcdel); + copystart = src; + } + } + } + goto Do_state_table; + + case kExitIllegalStructure: // structurally illegal byte; quit + case kExitReject: // NUL or illegal code encountered; quit + case kExitRejectAlt: // Apply replacement, then exit + default: // and all other exits + break; + } // End switch (e) + + // Exit possibilities: + // Some other exit code, state0, back up one byte exactly + // Some other exit code, !state0, back up over last char + + // Back up over exactly one byte of rejected/illegal UTF-8 character + src--; + dst--; + // Back up more if needed + if (!InStateZero(st, Tbl)) { + do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } + } else if (!InStateZero(st, Tbl)) { + // src >= srclimit, !state0 + // Back up over truncated UTF-8 character + e = kExitIllegalStructure; + do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } else { + // src >= srclimit, state0 + // Normal termination, source fully consumed + e = kExitOK; + } + + if (offsetmap != NULL) { + if (src > copystart) { + offsetmap->Copy(src - copystart); + copystart = src; + } + } + + // Possible return values here: + // kExitDstSpaceFull caller may realloc and retry from middle + // kExitIllegalStructure caller my overwrite/truncate + // kExitOK all done and happy + // kExitReject caller may overwrite/truncate + // kExitDoAgain LOOP NOT DONE; caller must retry from middle + // (may do fast ASCII loop first) + // kExitPlaceholder -unused- + // kExitNone -unused- + *bytes_consumed = src - isrc; + *bytes_filled = dst - odst; + *chars_changed = total_changed; + return e; +} + +// TwoByte versions are needed for tables > 240 states, such +// as the table for full Unicode 4.1 canonical + compatibility mapping + +// Scan a UTF-8 stringpiece based on state table with two-byte entries, +// copying to output stringpiece +// and doing text replacements. +// DO NOT CALL DIRECTLY. Use UTF8GenericReplace() below +// Needs caller to loop on kExitDoAgain +static int UTF8GenericReplaceInternalTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap) { + int eshift = st->entry_shift; + int nEntries = (1 << eshift); // 64 or 256 entries per state + const uint8* isrc = reinterpret_cast<const uint8*>(istr.data()); + const int ilen = istr.length(); + const uint8* copystart = isrc; + const uint8* src = isrc; + const uint8* srclimit = src + ilen; + *bytes_consumed = 0; + *bytes_filled = 0; + *chars_changed = 0; + + const uint8* odst = reinterpret_cast<const uint8*>(ostr.data()); + const int olen = ostr.length(); + uint8* dst = const_cast<uint8*>(odst); + uint8* dstlimit = dst + olen; + + *chars_changed = 0; + + int total_changed = 0; + + int src_lll = srclimit - src; + int dst_lll = dstlimit - dst; + + + // Invariant condition during replacements: + // remaining dst size >= remaining src size + if ((dstlimit - dst) < (srclimit - src)) { + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart); + copystart = src; + } + return kExitDstSpaceFull_2; + } + const unsigned short* Tbl_0 = &st->state_table[st->state0]; + + Do_state_table_2: + // Do state-table scan, copying as we go + const unsigned short* Tbl = Tbl_0; + int e = 0; + uint8 c = 0; + + Do_state_table_newe_2: + + //---------------------------- + while (src < srclimit) { + c = *src; + e = Tbl[c]; + *dst = c; + src++; + dst++; + if (e >= kExitIllegalStructure_2) {break;} + Tbl = &Tbl_0[e << eshift]; + } + //---------------------------- + src_lll = src - isrc; + dst_lll = dst - odst; + + // Exit possibilities: + // Replacement code, do the replacement and loop + // Some other exit code, state0, back up one byte exactly + // Some other exit code, !state0, back up over last char + // source consumed, state0, exit OK + // source consumed, !state0, back up over partial char + // For illegal byte in state0, avoid backup up over PREVIOUS char + // For truncated last char, back up to beginning of it + + if (e >= kExitIllegalStructure_2) { + // Switch on exit code; most loop back to top + int offset = 0; + switch (e) { + // These all make the output string the same size or shorter + // No checking needed + case kExitReplace31_2: // del 2, add 1 bytes to change + dst -= 2; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 2); + offsetmap->Delete(2); + copystart = src; + } + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); + total_changed++; + goto Do_state_table_2; + case kExitReplace32_2: // del 3, add 2 bytes to change + dst--; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 1); + offsetmap->Delete(1); + copystart = src; + } + dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); + total_changed++; + goto Do_state_table_2; + case kExitReplace21_2: // del 2, add 1 bytes to change + dst--; + if (offsetmap != NULL) { + offsetmap->Copy(src - copystart - 1); + offsetmap->Delete(1); + copystart = src; + } + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); + total_changed++; + goto Do_state_table_2; + case kExitReplace3_2: // update 3 bytes to change + dst[-3] = (unsigned char)(Tbl[c + (nEntries * 2)] & 0xff); + // Fall into next case + case kExitReplace2_2: // update 2 bytes to change + dst[-2] = (unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff); + // Fall into next case + case kExitReplace1_2: // update 1 byte to change + dst[-1] = (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); + total_changed++; + goto Do_state_table_2; + case kExitReplace1S0_2: // update 1 byte to change, 256-entry state + dst[-1] = (unsigned char)(Tbl[c + (256 * 1)] & 0xff); + total_changed++; + goto Do_state_table_2; + // These can make the output string longer than the input + case kExitReplaceOffset2_2: + if ((nEntries != 256) && InStateZero_2(st, Tbl)) { + // For space-optimized table, we need multiples of 256 bytes + // in state0 and multiples of nEntries in other states + offset += ((unsigned char)(Tbl[c + (256 * 1)] >> 8 & 0xff) << 8); + } else { + offset += ((unsigned char)(Tbl[c + (nEntries * 1)] >> 8 & 0xff) << 8); + } + // Fall into next case + case kExitReplaceOffset1_2: + if ((nEntries != 256) && InStateZero_2(st, Tbl)) { + // For space-optimized table, we need multiples of 256 bytes + // in state0 and multiples of nEntries in other states + offset += (unsigned char)(Tbl[c + (256 * 1)] & 0xff); + } else { + offset += (unsigned char)(Tbl[c + (nEntries * 1)] & 0xff); + } + { + const RemapEntry* re = &st->remap_base[offset]; + int del_len = re->delete_bytes & ~kReplaceAndResumeFlag; + int add_len = re->add_bytes & ~kHtmlPlaintextFlag; + // Special-case non-HTML replacement of five sensitive entities + // " & ' < > + // 0022 0026 0027 003c 003e + // A replacement creating one of these is expressed as a pair of + // entries, one for HTML output and one for plaintext output. + // The first of the pair has the high bit of add_bytes set. + if (re->add_bytes & kHtmlPlaintextFlag) { + // Use this entry for plain text + if (!is_plain_text) { + // Use very next entry for HTML text (same back/delete length) + re = &st->remap_base[offset + 1]; + add_len = re->add_bytes & ~kHtmlPlaintextFlag; + } + } + + // After the replacement, need (dstlimit - dst) >= (srclimit - src) + int string_offset = re->bytes_offset; + // After the replacement, need (dstlimit - newdst) >= (srclimit - src) + uint8* newdst = dst - del_len + add_len; + if ((dstlimit - newdst) < (srclimit - src)) { + // Won't fit; don't do the replacement. Caller may realloc and retry + e = kExitDstSpaceFull_2; + break; // exit, backing up over this char for later retry + } + dst -= del_len; + memcpy(dst, &st->remap_string[string_offset], add_len); + dst += add_len; + if (offsetmap != NULL) { + if (add_len > del_len) { + offsetmap->Copy(src - copystart); + offsetmap->Insert(add_len - del_len); + copystart = src; + } else if (add_len < del_len) { + offsetmap->Copy(src - copystart + add_len - del_len); + offsetmap->Delete(del_len - add_len); + copystart = src; + } + } + if (re->delete_bytes & kReplaceAndResumeFlag) { + // There is a two-byte non-zero target state at the end of the + // replacement string + uint8 c1 = st->remap_string[string_offset + add_len]; + uint8 c2 = st->remap_string[string_offset + add_len + 1]; + e = (c1 << 8) | c2; + Tbl = &Tbl_0[e << eshift]; + total_changed++; + goto Do_state_table_newe_2; + } + } + total_changed++; + if (e == kExitRejectAlt_2) {break;} + goto Do_state_table_2; + + case kExitSpecial_2: // NO special fixups [read: hacks] + case kExitIllegalStructure_2: // structurally illegal byte; quit + case kExitReject_2: // NUL or illegal code encountered; quit + // and all other exits + default: + break; + } // End switch (e) + + // Exit possibilities: + // Some other exit code, state0, back up one byte exactly + // Some other exit code, !state0, back up over last char + + // Back up over exactly one byte of rejected/illegal UTF-8 character + src--; + dst--; + // Back up more if needed + if (!InStateZero_2(st, Tbl)) { + do {src--;dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } + } else if (!InStateZero_2(st, Tbl)) { + // src >= srclimit, !state0 + // Back up over truncated UTF-8 character + e = kExitIllegalStructure_2; + + do {src--; dst--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); + } else { + // src >= srclimit, state0 + // Normal termination, source fully consumed + e = kExitOK_2; + } + + if (offsetmap != NULL) { + if (src > copystart) { + offsetmap->Copy(src - copystart); + copystart = src; + } + } + + + // Possible return values here: + // kExitDstSpaceFull_2 caller may realloc and retry from middle + // kExitIllegalStructure_2 caller my overwrite/truncate + // kExitOK_2 all done and happy + // kExitReject_2 caller may overwrite/truncate + // kExitDoAgain_2 LOOP NOT DONE; caller must retry from middle + // (may do fast ASCII loop first) + // kExitPlaceholder_2 -unused- + // kExitNone_2 -unused- + *bytes_consumed = src - isrc; + *bytes_filled = dst - odst; + *chars_changed = total_changed; + return e; +} + + +// Scan a UTF-8 stringpiece based on state table, copying to output stringpiece +// and doing text replacements. +// Also writes an optional OffsetMap. Pass NULL to skip writing one. +// Always scan complete UTF-8 characters +// Set number of bytes consumed from input, number filled to output. +// Return reason for exiting +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap) { + StringPiece local_istr(istr.data(), istr.length()); + StringPiece local_ostr(ostr.data(), ostr.length()); + int total_consumed = 0; + int total_filled = 0; + int total_changed = 0; + int local_bytes_consumed, local_bytes_filled, local_chars_changed; + int e; + do { + e = UTF8GenericReplaceInternal(st, + local_istr, local_ostr, is_plain_text, + &local_bytes_consumed, &local_bytes_filled, + &local_chars_changed, + offsetmap); + local_istr.remove_prefix(local_bytes_consumed); + local_ostr.remove_prefix(local_bytes_filled); + total_consumed += local_bytes_consumed; + total_filled += local_bytes_filled; + total_changed += local_chars_changed; + } while ( e == kExitDoAgain ); + *bytes_consumed = total_consumed; + *bytes_filled = total_filled; + *chars_changed = total_changed; + return e; +} + +// Older version without offsetmap +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed) { + return UTF8GenericReplace(st, + istr, + ostr, + is_plain_text, + bytes_consumed, + bytes_filled, + chars_changed, + NULL); +} + +// Older version without is_plain_text or offsetmap +int UTF8GenericReplace(const UTF8ReplaceObj* st, + const StringPiece& istr, + StringPiece& ostr, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed) { + bool is_plain_text = false; + return UTF8GenericReplace(st, + istr, + ostr, + is_plain_text, + bytes_consumed, + bytes_filled, + chars_changed, + NULL); +} + +// Scan a UTF-8 stringpiece based on state table with two-byte entries, +// copying to output stringpiece +// and doing text replacements. +// Also writes an optional OffsetMap. Pass NULL to skip writing one. +// Always scan complete UTF-8 characters +// Set number of bytes consumed from input, number filled to output. +// Return reason for exiting +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed, + OffsetMap* offsetmap) { + StringPiece local_istr(istr.data(), istr.length()); + StringPiece local_ostr(ostr.data(), ostr.length()); + int total_consumed = 0; + int total_filled = 0; + int total_changed = 0; + int local_bytes_consumed, local_bytes_filled, local_chars_changed; + int e; + do { + e = UTF8GenericReplaceInternalTwoByte(st, + local_istr, local_ostr, is_plain_text, + &local_bytes_consumed, + &local_bytes_filled, + &local_chars_changed, + offsetmap); + local_istr.remove_prefix(local_bytes_consumed); + local_ostr.remove_prefix(local_bytes_filled); + total_consumed += local_bytes_consumed; + total_filled += local_bytes_filled; + total_changed += local_chars_changed; + } while ( e == kExitDoAgain_2 ); + *bytes_consumed = total_consumed; + *bytes_filled = total_filled; + *chars_changed = total_changed; + + return e - kExitOK_2 + kExitOK; +} + +// Older version without offsetmap +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + bool is_plain_text, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed) { + return UTF8GenericReplaceTwoByte(st, + istr, + ostr, + is_plain_text, + bytes_consumed, + bytes_filled, + chars_changed, + NULL); +} + +// Older version without is_plain_text or offsetmap +int UTF8GenericReplaceTwoByte(const UTF8ReplaceObj_2* st, + const StringPiece& istr, + StringPiece& ostr, + int* bytes_consumed, + int* bytes_filled, + int* chars_changed) { + bool is_plain_text = false; + return UTF8GenericReplaceTwoByte(st, + istr, + ostr, + is_plain_text, + bytes_consumed, + bytes_filled, + chars_changed, + NULL); +} + + + +// Adjust a stringpiece to encompass complete UTF-8 characters. +// The data pointer will be increased by 0..3 bytes to get to a character +// boundary, and the length will then be decreased by 0..3 bytes +// to encompass the last complete character. +void UTF8TrimToChars(StringPiece* istr) { + const char* src = istr->data(); + int len = istr->length(); + // Exit if empty string + if (len == 0) { + return; + } + + // Exit on simple, common case + if ( ((src[0] & 0xc0) != 0x80) && + (static_cast<signed char>(src[len - 1]) >= 0) ) { + // First byte is not a continuation and last byte is 7-bit ASCII -- done + return; + } + + // Adjust the back end, len > 0 + const char* srclimit = src + len; + // Backscan over any ending continuation bytes to find last char start + const char* s = srclimit - 1; // Last byte of the string + while ((src <= s) && ((*s & 0xc0) == 0x80)) { + s--; + } + // Include entire last char if it fits + if (src <= s) { + int last_char_len = UTF8OneCharLen(s); + if (s + last_char_len <= srclimit) { + // Last char fits, so include it, else exclude it + s += last_char_len; + } + } + if (s != srclimit) { + // s is one byte beyond the last full character, if any + istr->remove_suffix(srclimit - s); + // Exit if now empty string + if (istr->length() == 0) { + return; + } + } + + // Adjust the front end, len > 0 + len = istr->length(); + srclimit = src + len; + s = src; // First byte of the string + // Scan over any beginning continuation bytes to find first char start + while ((s < srclimit) && ((*s & 0xc0) == 0x80)) { + s++; + } + if (s != src) { + // s is at the first full character, if any + istr->remove_prefix(s - src); + } +} + +} // End namespace CLD2 |