diff options
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/getonescriptspan.h')
-rw-r--r-- | application/basilisk/components/translation/cld2/internal/getonescriptspan.h | 110 |
1 files changed, 0 insertions, 110 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.h b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h deleted file mode 100644 index a8999d069..000000000 --- a/application/basilisk/components/translation/cld2/internal/getonescriptspan.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// -// Author: dsites@google.com (Dick Sites) -// - - -#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ -#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ - -#include "integral_types.h" -#include "langspan.h" -#include "offsetmap.h" - -namespace CLD2 { - -static const int kMaxScriptBuffer = 40960; -static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; -static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room -static const int kWithinScriptTail = 32; // Stop at word space in last - // N bytes of script buffer - - -static inline bool IsContinuationByte(char c) { - return static_cast<signed char>(c) < -64; -} - -// Gets lscript number for letters; always returns -// 0 (common script) for non-letters -int GetUTF8LetterScriptNum(const char* src); - -// Update src pointer to point to next quadgram, +2..+5 -// Looks at src[0..4] -const char* AdvanceQuad(const char* src); - - -class ScriptScanner { - public: - ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); - 
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text, - bool any_text, bool any_script); - ~ScriptScanner(); - - // Copy next run of same-script non-tag letters to buffer [NUL terminated] - bool GetOneScriptSpan(LangSpan* span); - - // Force Latin and Cyrillic scripts to be lowercase - void LowerScriptSpan(LangSpan* span); - - // Copy next run of same-script non-tag letters to buffer [NUL terminated] - // Force Latin and Cyrillic scripts to be lowercase - bool GetOneScriptSpanLower(LangSpan* span); - - // Copy next run of non-tag characters to buffer [NUL terminated] - // This just removes tags and removes entities - // Buffer has leading space - bool GetOneTextSpan(LangSpan* span); - - // Maps byte offset in most recent GetOneScriptSpan/Lower - // span->text [0..text_bytes] into an additional byte offset from - // span->offset, to get back to corresponding text in the original - // input buffer. - // text_offset must be the first byte - // of a UTF-8 character, or just beyond the last character. Normally this - // routine is called with the first byte of an interesting range and - // again with the first byte of the following range. - int MapBack(int text_offset); - - const char* GetBufferStart() {return start_byte_;}; - - private: - // Skip over tags and non-letters - int SkipToFrontOfSpan(const char* src, int len, int* script); - - const char* start_byte_; // Starting byte of buffer to scan - const char* next_byte_; // First unscanned byte - const char* next_byte_limit_; // Last byte + 1 - int byte_length_; // Bytes left: next_byte_limit_ - next_byte_ - - bool is_plain_text_; // true for text, false for HTML - char* script_buffer_; // Holds text with expanded entities - char* script_buffer_lower_; // Holds lowercased text - bool letters_marks_only_; // To distinguish scriptspan of one - // letters/marks vs. any mixture of text - bool one_script_only_; // To distinguish scriptspan of one - // script vs. 
any mixture of scripts - int exit_state_; // For tag parser kTagParseTbl_0, based - // on letters_marks_only_ - public : - // Expose for debugging - OffsetMap map2original_; // map from script_buffer_ to buffer - OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_ -}; - -} // namespace CLD2 - -#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_ - |