From a1be17c1cea81ebb1e8b131a662c698d78f3f7f2 Mon Sep 17 00:00:00 2001 From: wolfbeast Date: Mon, 4 Jun 2018 13:17:38 +0200 Subject: Issue #303 Part 1: Move basilisk files from /browser to /application/basilisk --- .../translation/cld2/internal/getonescriptspan.cc | 1086 ++++++++++++++++++++ 1 file changed, 1086 insertions(+) create mode 100644 application/basilisk/components/translation/cld2/internal/getonescriptspan.cc (limited to 'application/basilisk/components/translation/cld2/internal/getonescriptspan.cc') diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc b/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc new file mode 100644 index 000000000..6bdd4871b --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/getonescriptspan.cc @@ -0,0 +1,1086 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + + +#include "getonescriptspan.h" +#include + +#include "fixunicodevalue.h" +#include "lang_script.h" +#include "port.h" +#include "utf8statetable.h" + +#include "utf8prop_lettermarkscriptnum.h" +#include "utf8repl_lettermarklower.h" +#include "utf8scannot_lettermarkspecial.h" + + +namespace CLD2 { + +// Alphabetical order for binary search, from +// generated_entities.cc +extern const int kNameToEntitySize; +extern const CharIntPair kNameToEntity[]; + +static const int kMaxUpToWordBoundary = 50; // span < this make longer, + // else make shorter +static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes + // to round to word boundary, + // direction above + +static const char kSpecialSymbol[256] = { // true for < > & + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, +}; + + + +#define LT 0 // < +#define GT 1 // > +#define EX 2 // ! +#define HY 3 // - +#define QU 4 // " +#define AP 5 // ' +#define SL 6 // / +#define S_ 7 +#define C_ 8 +#define R_ 9 +#define I_ 10 +#define P_ 11 +#define T_ 12 +#define Y_ 13 +#define L_ 14 +#define E_ 15 +#define CR 16 // or +#define NL 17 // non-letter: ASCII whitespace, digit, punctuation +#define PL 18 // possible letter, incl. & +#define xx 19 // + +// Map byte to one of ~20 interesting categories for cheap tag parsing +static const uint8 kCharToSub[256] = { + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, + + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, +}; + +#undef LT +#undef GT +#undef EX +#undef HY +#undef QU +#undef AP +#undef SL +#undef S_ +#undef C_ +#undef R_ +#undef I_ +#undef P_ +#undef T_ +#undef Y_ +#undef L_ +#undef E_ +#undef CR +#undef NL +#undef PL +#undef xx + + +#define OK 0 +#define X_ 1 + + +static const int kMaxExitStateLettersMarksOnly = 1; +static const int kMaxExitStateAllText = 2; + + +// State machine to do cheap parse of non-letter strings incl. tags +// advances +// | | +// advances ... for