diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/getonescriptspan.h')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/getonescriptspan.h | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.h b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h new file mode 100644 index 000000000..a8999d069 --- /dev/null +++ b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h @@ -0,0 +1,110 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + + +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ +#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ + +#include "integral_types.h" +#include "langspan.h" +#include "offsetmap.h" + +namespace CLD2 { + +static const int kMaxScriptBuffer = 40960; +static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; +static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room +static const int kWithinScriptTail = 32; // Stop at word space in last + // N bytes of script buffer + + +static inline bool IsContinuationByte(char c) { + return static_cast<signed char>(c) < -64; +} + +// Gets lscript number for letters; always returns +// 0 (common script) for non-letters +int GetUTF8LetterScriptNum(const char* src); + +// Update src pointer to point to next quadgram, +2..+5 +// Looks at src[0..4] +const char* AdvanceQuad(const char* src); + + +class ScriptScanner { + public: + ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); + ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, + bool any_text, bool any_script); + ~ScriptScanner(); + + // Copy next run of same-script non-tag letters to buffer [NUL terminated] + bool GetOneScriptSpan(LangSpan* span); + + // Force Latin and Cyrillic scripts to be lowercase + void LowerScriptSpan(LangSpan* span); + + // Copy next run of same-script non-tag letters to buffer [NUL terminated] + // Force Latin and Cyrillic scripts to be lowercase + bool GetOneScriptSpanLower(LangSpan* span); + + // Copy next run of non-tag characters to buffer [NUL terminated] + // This just removes tags and removes entities + // Buffer has leading space + bool GetOneTextSpan(LangSpan* span); + + // Maps byte offset in most recent GetOneScriptSpan/Lower + // span->text [0..text_bytes] into an additional byte offset from + // span->offset, to get back to corresponding text in the original + // input buffer. + // text_offset must be the first byte + // of a UTF-8 character, or just beyond the last character. Normally this + // routine is called with the first byte of an interesting range and + // again with the first byte of the following range. + int MapBack(int text_offset); + + const char* GetBufferStart() {return start_byte_;}; + + private: + // Skip over tags and non-letters + int SkipToFrontOfSpan(const char* src, int len, int* script); + + const char* start_byte_; // Starting byte of buffer to scan + const char* next_byte_; // First unscanned byte + const char* next_byte_limit_; // Last byte + 1 + int byte_length_; // Bytes left: next_byte_limit_ - next_byte_ + + bool is_plain_text_; // true fo text, false for HTML + char* script_buffer_; // Holds text with expanded entities + char* script_buffer_lower_; // Holds lowercased text + bool letters_marks_only_; // To distinguish scriptspan of one + // letters/marks vs. any mixture of text + bool one_script_only_; // To distinguish scriptspan of one + // script vs. any mixture of scripts + int exit_state_; // For tag parser kTagParseTbl_0, based + // on letters_marks_only_ + public : + // Expose for debugging + OffsetMap map2original_; // map from script_buffer_ to buffer + OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ +}; + +} // namespace CLD2 + +#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ + |