diff options
Diffstat (limited to 'browser/components/translation/cld2/internal/getonescriptspan.cc')
-rw-r--r-- | browser/components/translation/cld2/internal/getonescriptspan.cc | 1086 |
1 files changed, 0 insertions, 1086 deletions
diff --git a/browser/components/translation/cld2/internal/getonescriptspan.cc b/browser/components/translation/cld2/internal/getonescriptspan.cc deleted file mode 100644 index 6bdd4871b..000000000 --- a/browser/components/translation/cld2/internal/getonescriptspan.cc +++ /dev/null @@ -1,1086 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - - -#include "getonescriptspan.h" -#include <string.h> - -#include "fixunicodevalue.h" -#include "lang_script.h" -#include "port.h" -#include "utf8statetable.h" - -#include "utf8prop_lettermarkscriptnum.h" -#include "utf8repl_lettermarklower.h" -#include "utf8scannot_lettermarkspecial.h" - - -namespace CLD2 { - -// Alphabetical order for binary search, from -// generated_entities.cc -extern const int kNameToEntitySize; -extern const CharIntPair kNameToEntity[]; - -static const int kMaxUpToWordBoundary = 50; // span < this make longer, - // else make shorter -static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes - // to round to word boundary, - // direction above - -static const char kSpecialSymbol[256] = { // true for < > & - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 
0,0,0,0,0,0,0,0, - - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, -}; - - - -#define LT 0 // < -#define GT 1 // > -#define EX 2 // ! -#define HY 3 // - -#define QU 4 // " -#define AP 5 // ' -#define SL 6 // / -#define S_ 7 -#define C_ 8 -#define R_ 9 -#define I_ 10 -#define P_ 11 -#define T_ 12 -#define Y_ 13 -#define L_ 14 -#define E_ 15 -#define CR 16 // <cr> or <lf> -#define NL 17 // non-letter: ASCII whitespace, digit, punctuation -#define PL 18 // possible letter, incl. & -#define xx 19 // <unused> - -// Map byte to one of ~20 interesting categories for cheap tag parsing -static const uint8 kCharToSub[256] = { - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, - NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, - - PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, - P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, - PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, - P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, - - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, - NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, - - PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, - PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, - PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, - PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, -}; - -#undef LT -#undef GT -#undef EX -#undef HY -#undef QU -#undef AP -#undef SL -#undef S_ -#undef C_ -#undef R_ -#undef I_ -#undef P_ -#undef T_ -#undef Y_ -#undef L_ -#undef E_ -#undef CR -#undef NL -#undef PL -#undef xx - - -#define OK 0 -#define X_ 1 - - -static 
const int kMaxExitStateLettersMarksOnly = 1; -static const int kMaxExitStateAllText = 2; - - -// State machine to do cheap parse of non-letter strings incl. tags -// advances <tag> -// | | -// advances <tag> ... </tag> for <script> <style> -// | | -// advances <!-- ... <tag> ... --> -// | | -// advances <tag -// || (0) -// advances <tag <tag2> -// || (0) -// -// We start in state [0] at a non-letter and make at least one transition -// When scanning for just letters, arriving back at state [0] or [1] exits -// the state machine. -// When scanning for any non-tag text, arriving at state [2] also exits -static const uint8 kTagParseTbl_0[] = { -// < > ! - " ' / S C R I P T Y L E CR NL PL xx - 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK exit state - X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state - 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* [exit state] - X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < - X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! - X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- - 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* - 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- - 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* - 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" - 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' - X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' - -// < > ! 
- " ' / S C R I P T Y L E CR NL PL xx - X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S - X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC - X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP - X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT - 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* - 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< - 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF - 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S - 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC - 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR - 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI - 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP - 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT - -// < > ! 
- " ' / S C R I P T Y L E CR NL PL xx - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY - X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL - X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE - 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* - 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< - 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF - 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S - 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST - 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY - 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL - 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE -}; - -#undef OK -#undef X_ - -enum -{ - UTFmax = 4, // maximum bytes per rune - Runesync = 0x80, // cannot represent part of a UTF sequence (<) - Runeself = 0x80, // rune and UTF sequences are the same (<) - Runeerror = 0xFFFD, // decoding error in UTF - Runemax = 0x10FFFF, // maximum rune value -}; - -// Debugging. Not thread safe. 
-static char gDisplayPiece[32]; -const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4}; -char* DisplayPiece(const char* next_byte_, int byte_length_) { - // Copy up to 8 UTF-8 chars to buffer - int k = 0; // byte count - int n = 0; // character count - for (int i = 0; i < byte_length_; ++i) { - char c = next_byte_[i]; - if ((c & 0xc0) != 0x80) { - // Beginning of a UTF-8 character - int charlen = gCharlen[static_cast<uint8>(c) >> 4]; - if (i + charlen > byte_length_) {break;} // Not enough room for full char - if (k >= (32 - 7)) {break;} // Not necessarily enough room - if (n >= 8) {break;} // Enough characters already - ++n; - } - if (c == '<') { - memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4; - } else if (c == '>') { - memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4; - } else if (c == '&') { - memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5; - } else if (c == '\'') { - memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6; - } else if (c == '"') { - memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6; - } else { - gDisplayPiece[k++] = c; - } - } - gDisplayPiece[k++] = '\0'; - return gDisplayPiece; -} - - - -// runetochar copies (encodes) one rune, pointed to by r, to at most -// UTFmax bytes starting at s and returns the number of bytes generated. -int runetochar(char *str, const char32 *rune) { - // Convert to unsigned for range check. 
- unsigned long c; - - // 1 char 00-7F - c = *rune; - if(c <= 0x7F) { - str[0] = c; - return 1; - } - - // 2 char 0080-07FF - if(c <= 0x07FF) { - str[0] = 0xC0 | (c >> 1*6); - str[1] = 0x80 | (c & 0x3F); - return 2; - } - - // Range check - if (c > Runemax) { - c = Runeerror; - } - - // 3 char 0800-FFFF - if (c <= 0xFFFF) { - str[0] = 0xE0 | (c >> 2*6); - str[1] = 0x80 | ((c >> 1*6) & 0x3F); - str[2] = 0x80 | (c & 0x3F); - return 3; - } - - // 4 char 10000-1FFFFF - str[0] = 0xF0 | (c >> 3*6); - str[1] = 0x80 | ((c >> 2*6) & 0x3F); - str[2] = 0x80 | ((c >> 1*6) & 0x3F); - str[3] = 0x80 | (c & 0x3F); - return 4; -} - - - -// Useful for converting an entity to an ascii value. -// RETURNS unicode value, or -1 if entity isn't valid. Don't include & or ; -int LookupEntity(const char* entity_name, int entity_len) { - // Make a C string - if (entity_len >= 16) {return -1;} // All real entities are shorter - char temp[16]; - memcpy(temp, entity_name, entity_len); - temp[entity_len] = '\0'; - int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity); - if (match >= 0) {return kNameToEntity[match].i;} - return -1; -} - -bool ascii_isdigit(char c) { - return ('0' <= c) && (c <= '9'); -} -bool ascii_isxdigit(char c) { - if (('0' <= c) && (c <= '9')) {return true;} - if (('a' <= c) && (c <= 'f')) {return true;} - if (('A' <= c) && (c <= 'F')) {return true;} - return false; -} -bool ascii_isalnum(char c) { - if (('0' <= c) && (c <= '9')) {return true;} - if (('a' <= c) && (c <= 'z')) {return true;} - if (('A' <= c) && (c <= 'Z')) {return true;} - return false; -} -int hex_digit_to_int(char c) { - if (('0' <= c) && (c <= '9')) {return c - '0';} - if (('a' <= c) && (c <= 'f')) {return c - 'a' + 10;} - if (('A' <= c) && (c <= 'F')) {return c - 'A' + 10;} - return 0; -} - -static int32 strto32_base10(const char* nptr, const char* limit, - const char **endptr) { - *endptr = nptr; - while (nptr < limit && *nptr == '0') { - ++nptr; - } - if (nptr == limit || 
!ascii_isdigit(*nptr)) - return -1; - const char* end_digits_run = nptr; - while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) { - ++end_digits_run; - } - *endptr = end_digits_run; - const int num_digits = end_digits_run - nptr; - // kint32max == 2147483647. - if (num_digits < 9 || - (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) { - int value = 0; - for (; nptr < end_digits_run; ++nptr) { - value *= 10; - value += *nptr - '0'; - } - // Overflow past the last valid unicode codepoint - // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). - return FixUnicodeValue(value); - } else { - // Overflow: can't fit in an int32; - // returns the replacement character 0xFFFD. - return 0xFFFD; - } -} - -static int32 strto32_base16(const char* nptr, const char* limit, - const char **endptr) { - *endptr = nptr; - while (nptr < limit && *nptr == '0') { - ++nptr; - } - if (nptr == limit || !ascii_isxdigit(*nptr)) { - return -1; - } - const char* end_xdigits_run = nptr; - while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) { - ++end_xdigits_run; - } - *endptr = end_xdigits_run; - const int num_xdigits = end_xdigits_run - nptr; - // kint32max == 0x7FFFFFFF. - if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) { - int value = 0; - for (; nptr < end_xdigits_run; ++nptr) { - value <<= 4; - value += hex_digit_to_int(*nptr); - } - // Overflow past the last valid unicode codepoint - // (0x10ffff) is converted to U+FFFD by FixUnicodeValue(). - return FixUnicodeValue(value); - } else { - // Overflow: can't fit in an int32; - // returns the replacement character 0xFFFD. - return 0xFFFD; - } -} - -// Unescape the current character pointed to by src. SETS the number -// of chars read for the conversion (in UTF8). If src isn't a valid entity, -// just consume the & and RETURN -1. If src doesn't point to & -- which it -// should -- set src_consumed to 0 and RETURN -1. 
-int ReadEntity(const char* src, int srcn, int* src_consumed) { - const char* const srcend = src + srcn; - - if (srcn == 0 || *src != '&') { // input should start with an ampersand - *src_consumed = 0; - return -1; - } - *src_consumed = 1; // we'll get the & at least - - // The standards are a bit unclear on when an entity ends. Certainly a ";" - // ends one, but spaces probably do too. We follow the lead of both IE and - // Netscape, which as far as we can tell end numeric entities (1st case below) - // at any non-digit, and end character entities (2nd case) at any non-alnum. - const char* entstart, *entend; // where the entity starts and ends - entstart = src + 1; // read past the & - int entval; // UCS2 value of the entity - if ( *entstart == '#' ) { // -- 1st case: numeric entity - if ( entstart + 2 >= srcend ) { - return -1; // no way a legitimate number could fit - } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) { // hex numeric - entval = strto32_base16(entstart + 2, srcend, &entend); - } else { // decimal numeric entity - entval = strto32_base10(entstart+1, srcend, &entend); - } - if (entval == -1 || entend > srcend) { - return -1; // not entirely correct, but close enough - } - } else { // -- 2nd case: character entity - for (entend = entstart; - entend < srcend && ascii_isalnum(*entend); - ++entend ) { - // entity consists of alphanumeric chars - } - entval = LookupEntity(entstart, entend - entstart); - if (entval < 0) { - return -1; // not a legal entity name - } - // Now we do a strange-seeming IE6-compatibility check: if entval is - // >= 256, it *must* be followed by a semicolon or it's not considered - // an entity. The problem is lots of the newfangled entity names, like - // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en". 
- // When these links are written in HTML, it would be really bad if the - // "&lang" were treated as an entity, which is what the spec says - // *should* happen (even when the HTML is inside an "A HREF" tag!) - // IE ignores the spec for these new, high-value entities, so we do too. - if ( entval >= 256 && !(entend < srcend && *entend == ';') ) { - return -1; // make non-;-terminated entity illegal - } - } - - // Finally, figure out how much src was consumed - if ( entend < srcend && *entend == ';' ) { - entend++; // standard says ; terminator is special - } - *src_consumed = entend - src; - return entval; -} - - -// Src points to '&' -// Writes entity value to dst. Returns take(src), put(dst) byte counts -void EntityToBuffer(const char* src, int len, char* dst, - int* tlen, int* plen) { - char32 entval = ReadEntity(src, len, tlen); - - // ReadEntity does this already: entval = FixUnicodeValue(entval); - - // Convert UTF-32 to UTF-8 - if (entval > 0) { - *plen = runetochar(dst, &entval); - } else { - // Illegal entity; ignore the '&' - *tlen = 1; - *plen = 0; - } -} - -// Returns true if character is < > or &, none of which are letters -bool inline IsSpecial(char c) { - if ((c & 0xe0) == 0x20) { - return kSpecialSymbol[static_cast<uint8>(c)]; - } - return false; -} - -// Quick Skip to next letter or < > & or to end of string (eos) -// Always return is_letter for eos -int ScanToLetterOrSpecial(const char* src, int len) { - int bytes_consumed; - StringPiece str(src, len); - UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed); - return bytes_consumed; -} - - - - -// src points to non-letter, such as tag-opening '<' -// Return length from here to next possible letter -// On another < before >, return 1 -// advances <tag> -// | | -// advances <tag> ... </tag> for <script> <style> -// | | -// advances <!-- ... <tag> ... 
--> -// | | -// advances <tag -// | | end of string -// advances <tag <tag2> -// || -int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) { - const uint8* src = reinterpret_cast<const uint8*>(isrc); - const uint8* srclimit = src + len; - const uint8* tagParseTbl = kTagParseTbl_0; - int e = 0; - while (src < srclimit) { - e = tagParseTbl[kCharToSub[*src++]]; - if (e <= max_exit_state) { - // We overshot by one byte - --src; - break; - } - tagParseTbl = &kTagParseTbl_0[e * 20]; - } - - if (src >= srclimit) { - // We fell off the end of the text. - // It looks like the most common case for this is a truncated file, not - // mismatched angle brackets. So we pretend that the last char was '>' - return len; - } - - // OK to be in state 0 or state 2 at exit - if ((e != 0) && (e != 2)) { - // Error, '<' followed by '<' - // We want to back up to first <, then advance by one byte past it - int offset = src - reinterpret_cast<const uint8*>(isrc); - - // Backscan to first '<' and return enough length to just get past it - --offset; // back up over the second '<', which caused us to stop - while ((0 < offset) && (isrc[offset] != '<')) { - // Find the first '<', which is unmatched - --offset; - } - // skip to just beyond first '<' - return offset + 1; - } - - return src - reinterpret_cast<const uint8*>(isrc); -} - - -ScriptScanner::ScriptScanner(const char* buffer, - int buffer_length, - bool is_plain_text) - : start_byte_(buffer), - next_byte_(buffer), - next_byte_limit_(buffer + buffer_length), - byte_length_(buffer_length), - is_plain_text_(is_plain_text), - letters_marks_only_(true), - one_script_only_(true), - exit_state_(kMaxExitStateLettersMarksOnly) { - script_buffer_ = new char[kMaxScriptBuffer]; - script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; - map2original_.Clear(); // map from script_buffer_ to buffer - map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ -} - -// Extended version to allow spans of any non-tag text and 
spans of mixed script -ScriptScanner::ScriptScanner(const char* buffer, - int buffer_length, - bool is_plain_text, - bool any_text, - bool any_script) - : start_byte_(buffer), - next_byte_(buffer), - next_byte_limit_(buffer + buffer_length), - byte_length_(buffer_length), - is_plain_text_(is_plain_text), - letters_marks_only_(!any_text), - one_script_only_(!any_script), - exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) { - script_buffer_ = new char[kMaxScriptBuffer]; - script_buffer_lower_ = new char[kMaxScriptLowerBuffer]; - map2original_.Clear(); // map from script_buffer_ to buffer - map2uplow_.Clear(); // map from script_buffer_lower_ to script_buffer_ -} - - -ScriptScanner::~ScriptScanner() { - delete[] script_buffer_; - delete[] script_buffer_lower_; -} - - - - -// Get to the first real non-tag letter or entity that is a letter -// Sets script of that letter -// Return len if no more letters -int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { - int sc = UNKNOWN_ULSCRIPT; - int skip = 0; - int tlen, plen; - - // Do run of non-letters (tag | &NL | NL)* - tlen = 0; - while (skip < len) { - // Do fast scan to next interesting byte - // int oldskip = skip; - skip += ScanToLetterOrSpecial(src + skip, len - skip); - - // Check for no more letters/specials - if (skip >= len) { - // All done - *script = sc; - return len; - } - - // We are at a letter, nonletter, tag, or entity - if (IsSpecial(src[skip]) && !is_plain_text_) { - if (src[skip] == '<') { - // Begining of tag; skip to end and go around again - tlen = ScanToPossibleLetter(src + skip, len - skip, - exit_state_); - sc = 0; - } else if (src[skip] == '>') { - // Unexpected end of tag; skip it and go around again - tlen = 1; // Over the > - sc = 0; - } else if (src[skip] == '&') { - // Expand entity, no advance - char temp[4]; - EntityToBuffer(src + skip, len - skip, - temp, &tlen, &plen); - sc = GetUTF8LetterScriptNum(temp); - } - } else { - // Update 1..4 
bytes - tlen = UTF8OneCharLen(src + skip); - sc = GetUTF8LetterScriptNum(src + skip); - } - if (sc != 0) {break;} // Letter found - skip += tlen; // Else advance - } - - *script = sc; - return skip; -} - - -// These are for ASCII-only tag names -// Compare one letter uplow to c, ignoring case of uplowp -inline bool EqCase(char uplow, char c) { - return (uplow | 0x20) == c; -} - -// These are for ASCII-only tag names -// Return true for space / < > etc. all less than 0x40 -inline bool NeqLetter(char c) { - return c < 0x40; -} - -// These are for ASCII-only tag names -// Return true for space \n false for \r -inline bool WS(char c) { - return (c == ' ') || (c == '\n'); -} - -// Canonical CR or LF -static const char LF = '\n'; - - -// The naive loop scans from next_byte_ to script_buffer_ until full. -// But this can leave an awkward hard-to-identify short fragment at the -// end of the input. We would prefer to make the next-to-last fragment -// shorter and the last fragment longer. - -// Copy next run of non-tag characters to buffer [NUL terminated] -// This just replaces tags with space or \n and removes entities. -// Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences -// including \r or \n are replaced by \n. All other tags and skipped text -// are replaced with ASCII space. 
-// -// Buffer ALWAYS has leading space and trailing space space space NUL -bool ScriptScanner::GetOneTextSpan(LangSpan* span) { - span->text = script_buffer_; - span->text_bytes = 0; - span->offset = next_byte_ - start_byte_; - span->ulscript = UNKNOWN_ULSCRIPT; - span->lang = UNKNOWN_LANGUAGE; - span->truncated = false; - - int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; - if ((kMaxScriptBytes <= byte_length_) && - (byte_length_ < (2 * kMaxScriptBytes))) { - // Try to split the last two fragments in half - put_soft_limit = byte_length_ / 2; - } - - script_buffer_[0] = ' '; // Always a space at front of output - script_buffer_[1] = '\0'; - int take = 0; - int put = 1; // Start after the initial space - int tlen, plen; - - if (byte_length_ <= 0) { - return false; // No more text to be found - } - - // Go over alternating spans of text and tags, - // copying letters to buffer with single spaces for each run of non-letters - bool last_byte_was_space = false; - while (take < byte_length_) { - char c = next_byte_[take]; - if (c == '\r') {c = LF;} // Canonical CR or LF - if (c == '\n') {c = LF;} // Canonical CR or LF - - if (IsSpecial(c) && !is_plain_text_) { - if (c == '<') { - // Replace tag with space - c = ' '; // for almost-full test below - // or if <p> <br> <tr>, replace with \n - if (take < (byte_length_ - 3)) { - if (EqCase(next_byte_[take + 1], 'p') && - NeqLetter(next_byte_[take + 2])) { - c = LF; - } - if (EqCase(next_byte_[take + 1], 'b') && - EqCase(next_byte_[take + 2], 'r') && - NeqLetter(next_byte_[take + 3])) { - c = LF; - } - if (EqCase(next_byte_[take + 1], 't') && - EqCase(next_byte_[take + 2], 'r') && - NeqLetter(next_byte_[take + 3])) { - c = LF; - } - } - // Begining of tag; skip to end and go around again - tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, - exit_state_); - // Copy one byte, compressing spaces - if (!last_byte_was_space || !WS(c)) { - script_buffer_[put++] = c; // Advance dest - 
last_byte_was_space = WS(c); - } - } else if (c == '>') { - // Unexpected end of tag; copy it and go around again - tlen = 1; // Over the > - script_buffer_[put++] = c; // Advance dest - } else if (c == '&') { - // Expand entity, no advance - EntityToBuffer(next_byte_ + take, byte_length_ - take, - script_buffer_ + put, &tlen, &plen); - put += plen; // Advance dest - } - take += tlen; // Advance source - } else { - // Copy one byte, compressing spaces - if (!last_byte_was_space || !WS(c)) { - script_buffer_[put++] = c; // Advance dest - last_byte_was_space = WS(c); - } - ++take; // Advance source - } - - if (WS(c) && - (put >= put_soft_limit)) { - // Buffer is almost full - span->truncated = true; - break; - } - if (put >= kMaxScriptBytes) { - // Buffer is completely full - span->truncated = true; - break; - } - } - - // Almost done. Back up to a character boundary if needed - while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) { - // Back up over continuation byte - --take; - --put; - } - - // Update input position - next_byte_ += take; - byte_length_ -= take; - - // Put four more spaces/NUL. 
Worst case is abcd _ _ _ \0 - // kMaxScriptBytes | | put - script_buffer_[put + 0] = ' '; - script_buffer_[put + 1] = ' '; - script_buffer_[put + 2] = ' '; - script_buffer_[put + 3] = '\0'; - - span->text_bytes = put; // Does not include the last four chars above - return true; -} - - -// Copy next run of same-script non-tag letters to buffer [NUL terminated] -// Buffer ALWAYS has leading space and trailing space space space NUL -bool ScriptScanner::GetOneScriptSpan(LangSpan* span) { - if (!letters_marks_only_) { - // Return non-tag text, including punctuation and digits - return GetOneTextSpan(span); - } - - span->text = script_buffer_; - span->text_bytes = 0; - span->offset = next_byte_ - start_byte_; - span->ulscript = UNKNOWN_ULSCRIPT; - span->lang = UNKNOWN_LANGUAGE; - span->truncated = false; - - // struct timeval script_start, script_mid, script_end; - - int put_soft_limit = kMaxScriptBytes - kWithinScriptTail; - if ((kMaxScriptBytes <= byte_length_) && - (byte_length_ < (2 * kMaxScriptBytes))) { - // Try to split the last two fragments in half - put_soft_limit = byte_length_ / 2; - } - - - int spanscript; // The script of this span - int sc = UNKNOWN_ULSCRIPT; // The script of next character - int tlen = 0; - int plen = 0; - - script_buffer_[0] = ' '; // Always a space at front of output - script_buffer_[1] = '\0'; - int take = 0; - int put = 1; // Start after the initial space - - // Build offsets from span->text back to start_byte_ + span->offset - // This mapping reflects deletion of non-letters, expansion of - // entities, etc. 
- map2original_.Clear(); - map2original_.Delete(span->offset); // So that MapBack(0) gives offset - - // Get to the first real non-tag letter or entity that is a letter - int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); - next_byte_ += skip; - byte_length_ -= skip; - - if (skip != 1) { - map2original_.Delete(skip); - map2original_.Insert(1); - } else { - map2original_.Copy(1); - } - if (byte_length_ <= 0) { - map2original_.Reset(); - return false; // No more letters to be found - } - - // There is at least one letter, so we know the script for this span - span->ulscript = (ULScript)spanscript; - - - // Go over alternating spans of same-script letters and non-letters, - // copying letters to buffer with single spaces for each run of non-letters - while (take < byte_length_) { - // Copy run of letters in same script (&LS | LS)* - int letter_count = 0; // Keep track of word length - bool need_break = false; - - while (take < byte_length_) { - // We are at a letter, nonletter, tag, or entity - if (IsSpecial(next_byte_[take]) && !is_plain_text_) { - if (next_byte_[take] == '<') { - // Begining of tag - sc = 0; - break; - } else if (next_byte_[take] == '>') { - // Unexpected end of tag - sc = 0; - break; - } else if (next_byte_[take] == '&') { - // Copy entity, no advance - EntityToBuffer(next_byte_ + take, byte_length_ - take, - script_buffer_ + put, &tlen, &plen); - sc = GetUTF8LetterScriptNum(script_buffer_ + put); - } - } else { - // Real letter, safely copy up to 4 bytes, increment by 1..4 - // Will update by 1..4 bytes at Advance, below - tlen = plen = UTF8OneCharLen(next_byte_ + take); - if (take < (byte_length_ - 3)) { - // X86 fast case, does unaligned load/store - UNALIGNED_STORE32(script_buffer_ + put, - UNALIGNED_LOAD32(next_byte_ + take)); - - } else { - // Slow case, happens 1-3 times per input document - memcpy(script_buffer_ + put, next_byte_ + take, plen); - } - sc = GetUTF8LetterScriptNum(next_byte_ + take); - } - - // Allow continue 
across a single letter in a different script: - // A B D = three scripts, c = common script, i = inherited script, - // - = don't care, ( = take position before the += below - // AAA(A- continue - // - // AAA(BA continue - // AAA(BB break - // AAA(Bc continue (breaks after B) - // AAA(BD break - // AAA(Bi break - // - // AAA(c- break - // - // AAA(i- continue - // - - if ((sc != spanscript) && (sc != ULScript_Inherited)) { - // Might need to break this script span - if (sc == ULScript_Common) { - need_break = true; - } else { - // Look at next following character, ignoring entity as Common - int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen); - if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { - // We found a non-trivial change of script - if (one_script_only_) { - need_break = true; - } - } - } - } - if (need_break) {break;} // Non-letter or letter in wrong script - - take += tlen; // Advance - put += plen; // Advance - - // Update the offset map to reflect take/put lengths - if (tlen == plen) { - map2original_.Copy(tlen); - } else if (tlen < plen) { - map2original_.Copy(tlen); - map2original_.Insert(plen - tlen); - } else { // plen < tlen - map2original_.Copy(plen); - map2original_.Delete(tlen - plen); - } - - ++letter_count; - if (put >= kMaxScriptBytes) { - // Buffer is full - span->truncated = true; - break; - } - } // End while letters - - // Do run of non-letters (tag | &NL | NL)* - while (take < byte_length_) { - // Do fast scan to next interesting byte - tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); - take += tlen; - map2original_.Delete(tlen); - if (take >= byte_length_) {break;} // Might have scanned to end - - // We are at a letter, nonletter, tag, or entity - if (IsSpecial(next_byte_[take]) && !is_plain_text_) { - if (next_byte_[take] == '<') { - // Begining of tag; skip to end and go around again - tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take, - exit_state_); - sc = 0; - } else if 
(next_byte_[take] == '>') { - // Unexpected end of tag; skip it and go around again - tlen = 1; // Over the > - sc = 0; - } else if (next_byte_[take] == '&') { - // Expand entity, no advance - EntityToBuffer(next_byte_ + take, byte_length_ - take, - script_buffer_ + put, &tlen, &plen); - sc = GetUTF8LetterScriptNum(script_buffer_ + put); - } - } else { - // Update 1..4 - tlen = UTF8OneCharLen(next_byte_ + take); - sc = GetUTF8LetterScriptNum(next_byte_ + take); - } - if (sc != 0) {break;} // Letter found - take += tlen; // Else advance - map2original_.Delete(tlen); - } // End while not-letters - - script_buffer_[put++] = ' '; - map2original_.Insert(1); - - // Letter in wrong script ? - if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;} - if (put >= put_soft_limit) { - // Buffer is almost full - span->truncated = true; - break; - } - } - - // Almost done. Back up to a character boundary if needed - while ((0 < take) && (take < byte_length_) && - ((next_byte_[take] & 0xc0) == 0x80)) { - // Back up over continuation byte - --take; - --put; - } - - // Update input position - next_byte_ += take; - byte_length_ -= take; - - // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 - // kMaxScriptBytes | | put - script_buffer_[put + 0] = ' '; - script_buffer_[put + 1] = ' '; - script_buffer_[put + 2] = ' '; - script_buffer_[put + 3] = '\0'; - map2original_.Insert(4); - map2original_.Reset(); - - span->text_bytes = put; // Does not include the last four chars above - return true; -} - -// Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase -// List changes with each version of Unicode, so just always lowercase -// Unicode 6.2.0: -// ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN -void ScriptScanner::LowerScriptSpan(LangSpan* span) { - // If needed, lowercase all the text. 
If we do it sooner, might miss - // lowercasing an entity such as &Aacute; - // We only need to do this for Latn and Cyrl scripts - map2uplow_.Clear(); - // Full Unicode lowercase of the entire buffer, including - // four pad bytes off the end. - // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad - // bytes and put the 0x00 in explicitly. - // Build an offset map from script_buffer_lower_ back to script_buffer_ - int consumed, filled, changed; - StringPiece istr(span->text, span->text_bytes + 3); - StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer); - - UTF8GenericReplace(&utf8repl_lettermarklower_obj, - istr, ostr, is_plain_text_, - &consumed, &filled, &changed, &map2uplow_); - script_buffer_lower_[filled] = '\0'; - span->text = script_buffer_lower_; - span->text_bytes = filled - 3; - map2uplow_.Reset(); -} - -// Copy next run of same-script non-tag letters to buffer [NUL terminated] -// Force Latin, Cyrillic, Greek scripts to be lowercase -// Buffer ALWAYS has leading space and trailing space space space NUL -bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) { - bool ok = GetOneScriptSpan(span); - LowerScriptSpan(span); - return ok; -} - - -// Maps byte offset in most recent GetOneScriptSpan/Lower -// span->text [0..text_bytes] into an additional byte offset from -// span->offset, to get back to corresponding text in the original -// input buffer. -// text_offset must be the first byte -// of a UTF-8 character, or just beyond the last character. Normally this -// routine is called with the first byte of an interesting range and -// again with the first byte of the following range. 
-int ScriptScanner::MapBack(int text_offset) { - return map2original_.MapBack(map2uplow_.MapBack(text_offset)); -} - - -// Gets lscript number for letters; always returns -// 0 (common script) for non-letters -int GetUTF8LetterScriptNum(const char* src) { - int srclen = UTF8OneCharLen(src); - const uint8* usrc = reinterpret_cast<const uint8*>(src); - return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj, - &usrc, &srclen); -} - -} // namespace CLD2 - - |