Merge branch 'move-basilisk'

author: wolfbeast <mcwerewolf@gmail.com> 2018-06-04 15:50:03 +0200
committer: wolfbeast <mcwerewolf@gmail.com> 2018-06-04 15:50:03 +0200
commit: e3b7744bee37c3d4a026d2193bed5e9439c40ff3 (patch)
tree: f3f7b07ca9bd78bf7ac2d76dd55b61b2a8bb549e /application/basilisk/components/translation/cld2/internal/getonescriptspan.h
parent: cbce4f0b6a337f8250b62cae028f1c6d4cce51df (diff)
parent: 031afcafe288bf0f46c0c5caae20dd3db8bd0297 (diff)
download: UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar
UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.gz
UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.lz
UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.xz
UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.zip
1 files changed, 110 insertions, 0 deletions
diff --git a/application/basilisk/components/translation/cld2/internal/getonescriptspan.h b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h
new file mode 100644
index 000000000..a8999d069
--- /dev/null
+++ b/application/basilisk/components/translation/cld2/internal/getonescriptspan.h
@@ -0,0 +1,110 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Author: dsites@google.com (Dick Sites)
+//
+
+
+#ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+#define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+
+#include "integral_types.h"
+#include "langspan.h"
+#include "offsetmap.h"
+
+namespace CLD2 {
+
+static const int kMaxScriptBuffer = 40960;
+static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
+static const int kMaxScriptBytes = kMaxScriptBuffer - 32;   // Leave some room
+static const int kWithinScriptTail = 32;    // Stop at word space in last
+                                            // N bytes of script buffer
+
+
+static inline bool IsContinuationByte(char c) {
+  return static_cast<signed char>(c) < -64;
+}
+
+// Gets lscript number for letters; always returns
+//   0 (common script) for non-letters
+int GetUTF8LetterScriptNum(const char* src);
+
+// Update src pointer to point to next quadgram, +2..+5
+// Looks at src[0..4]
+const char* AdvanceQuad(const char* src);
+
+
+class ScriptScanner {
+ public:
+  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
+  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
+                bool any_text, bool any_script);
+  ~ScriptScanner();
+
+  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+  bool GetOneScriptSpan(LangSpan* span);
+
+  // Force Latin and Cyrillic scripts to be lowercase
+  void LowerScriptSpan(LangSpan* span);
+
+  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
+  // Force Latin and Cyrillic scripts to be lowercase
+  bool GetOneScriptSpanLower(LangSpan* span);
+
+  // Copy next run of non-tag characters to buffer [NUL terminated]
+  // This just removes tags and removes entities
+  // Buffer has leading space
+  bool GetOneTextSpan(LangSpan* span);
+
+  // Maps byte offset in most recent GetOneScriptSpan/Lower
+  // span->text [0..text_bytes] into an additional byte offset from
+  // span->offset, to get back to corresponding text in the original
+  // input buffer.
+  // text_offset must be the first byte
+  // of a UTF-8 character, or just beyond the last character. Normally this
+  // routine is called with the first byte of an interesting range and
+  // again with the first byte of the following range.
+  int MapBack(int text_offset);
+
+  const char* GetBufferStart() {return start_byte_;};
+
+ private:
+  // Skip over tags and non-letters
+  int SkipToFrontOfSpan(const char* src, int len, int* script);
+
+  const char* start_byte_;        // Starting byte of buffer to scan
+  const char* next_byte_;         // First unscanned byte
+  const char* next_byte_limit_;   // Last byte + 1
+  int byte_length_;               // Bytes left: next_byte_limit_ - next_byte_
+
+  bool is_plain_text_;            // true fo text, false for HTML
+  char* script_buffer_;           // Holds text with expanded entities
+  char* script_buffer_lower_;     // Holds lowercased text
+  bool letters_marks_only_;       // To distinguish scriptspan of one
+                                  // letters/marks vs. any mixture of text
+  bool one_script_only_;          // To distinguish scriptspan of one
+                                  // script vs. any mixture of scripts
+  int exit_state_;                // For tag parser kTagParseTbl_0, based
+                                  // on letters_marks_only_
+ public :
+  // Expose for debugging
+  OffsetMap map2original_;    // map from script_buffer_ to buffer
+  OffsetMap map2uplow_;       // map from script_buffer_lower_ to script_buffer_
+};
+
+}  // namespace CLD2
+
+#endif  // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
+
author	wolfbeast <mcwerewolf@gmail.com>	2018-06-04 15:50:03 +0200
committer	wolfbeast <mcwerewolf@gmail.com>	2018-06-04 15:50:03 +0200
commit	e3b7744bee37c3d4a026d2193bed5e9439c40ff3 (patch)
tree	f3f7b07ca9bd78bf7ac2d76dd55b61b2a8bb549e /application/basilisk/components/translation/cld2/internal/getonescriptspan.h
parent	cbce4f0b6a337f8250b62cae028f1c6d4cce51df (diff)
parent	031afcafe288bf0f46c0c5caae20dd3db8bd0297 (diff)
download	UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.gz UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.lz UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.tar.xz UXP-e3b7744bee37c3d4a026d2193bed5e9439c40ff3.zip