summary | refs | log | tree | commit | diff | stats
path: root/application/basilisk/components/translation/cld2/internal/getonescriptspan.h
diff options
context:
space:
mode:
Diffstat (limited to 'application/basilisk/components/translation/cld2/internal/getonescriptspan.h')
-rw-r--r-- application/basilisk/components/translation/cld2/internal/getonescriptspan.h 110
1 files changed, 110 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.h b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h
new file mode 100644
index 000000000..a8999d069
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h
@@ -0,0 +1,110 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+
+#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+
+#include "integral_types.h"
+#include "langspan.h"
+#include "offsetmap.h"
+
+namespace CLD2 {
+
+static const int kMaxScriptBuffer = 40960; // Base size; limits below derive from it
+static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; // 1.5x kMaxScriptBuffer
+static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave 32 units of room
+static const int kWithinScriptTail = 32; // Stop at word space in last
+ // kWithinScriptTail bytes of script buffer
+
+
+static inline bool IsContinuationByte(char c) { // True for bytes 0x80..0xBF
+ return static_cast<signed char>(c) < -64; // As signed 8-bit: -128..-65, i.e. bit pattern 10xxxxxx
+}
+
+// Gets the lscript number for a letter; always returns
+// 0 (common script) for non-letters.
+int GetUTF8LetterScriptNum(const char* src);
+
+// Returns a pointer to the next quadgram, src+2..src+5 (src is passed by value).
+// Looks at src[0..4]
+const char* AdvanceQuad(const char* src);
+
+
+class ScriptScanner { // Scans an input buffer and hands back spans via LangSpan
+ public:
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
+ bool any_text, bool any_script); // NOTE(review): effect of any_text/any_script not visible here -- see .cc
+ ~ScriptScanner();
+
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+ bool GetOneScriptSpan(LangSpan* span);
+
+ // Force Latin and Cyrillic scripts to be lowercase
+ void LowerScriptSpan(LangSpan* span);
+
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+ // and force Latin and Cyrillic scripts to be lowercase
+ bool GetOneScriptSpanLower(LangSpan* span);
+
+ // Copy next run of non-tag characters to buffer [NUL terminated]
+ // This just removes tags and removes entities
+ // Buffer has leading space
+ bool GetOneTextSpan(LangSpan* span);
+
+ // Maps a byte offset in the most recent GetOneScriptSpan/Lower
+ // span->text [0..text_bytes] into an additional byte offset from
+ // span->offset, to get back to the corresponding text in the original
+ // input buffer.
+ // text_offset must be the first byte
+ // of a UTF-8 character, or just beyond the last character. Normally this
+ // routine is called with the first byte of an interesting range and
+ // again with the first byte of the following range.
+ int MapBack(int text_offset);
+
+ const char* GetBufferStart() {return start_byte_;}; // Starting byte of buffer to scan
+
+ private:
+ // Skip over tags and non-letters
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
+
+ const char* start_byte_; // Starting byte of buffer to scan
+ const char* next_byte_; // First unscanned byte
+ const char* next_byte_limit_; // Last byte + 1
+ int byte_length_; // Bytes left: next_byte_limit_ - next_byte_
+
+ bool is_plain_text_; // true for text, false for HTML
+ char* script_buffer_; // Holds text with expanded entities
+ char* script_buffer_lower_; // Holds lowercased text
+ bool letters_marks_only_; // Distinguishes a scriptspan of only
+ // letters/marks vs. any mixture of text
+ bool one_script_only_; // Distinguishes a scriptspan of one
+ // script vs. any mixture of scripts
+ int exit_state_; // For tag parser kTagParseTbl_0, based
+ // on letters_marks_only_
+ public :
+ // Exposed for debugging
+ OffsetMap map2original_; // map from script_buffer_ to buffer
+ OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
+};
+
+} // namespace CLD2
+
+#endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+