summaryrefslogtreecommitdiffstats
path: root/js/src/regexp/gen-regexp-special-case.cc
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/regexp/gen-regexp-special-case.cc')
-rw-r--r--js/src/regexp/gen-regexp-special-case.cc124
1 files changed, 124 insertions, 0 deletions
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc
new file mode 100644
index 000000000..337743f53
--- /dev/null
+++ b/js/src/regexp/gen-regexp-special-case.cc
@@ -0,0 +1,124 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+
+namespace v8 {
+namespace internal {
+
+// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
+// functions into "src/regexp/special-case.cc".
+// See more details in http://shorturl.at/adfO5
+
+// Writes to |out| the C++ source of a zero-argument function named
+// |func_name| that rebuilds |set| one range at a time with
+// icu::UnicodeSet::add(), freezes the result and returns it.
+//
+// Only the code point ranges of |set| are emitted (getRangeCount() /
+// getRangeStart() / getRangeEnd()).
+//
+// Each code point is written after a literal "0x" prefix, so it must be
+// formatted in hexadecimal. The function switches |out| to hex itself rather
+// than depending on the caller having done so, and restores the caller's
+// format flags before returning.
+void PrintSet(std::ofstream& out, const char* func_name,
+              const icu::UnicodeSet& set) {
+  const std::ios_base::fmtflags saved_flags = out.flags();
+  out << std::hex;
+
+  out << "icu::UnicodeSet " << func_name << "() {\n"
+      << " icu::UnicodeSet set;\n";
+  const int32_t range_count = set.getRangeCount();
+  for (int32_t range = 0; range < range_count; range++) {
+    const UChar32 first = set.getRangeStart(range);
+    const UChar32 last = set.getRangeEnd(range);
+    if (first == last) {
+      // A one-element range is emitted with the single-argument add().
+      out << " set.add(0x" << first << ");\n";
+    } else {
+      out << " set.add(0x" << first << ", 0x" << last << ");\n";
+    }
+  }
+  out << " set.freeze();\n"
+      << " return set;\n"
+      << "}\n";
+
+  out.flags(saved_flags);
+}
+
+// Computes two code point sets and writes them to |out| as the generated
+// functions BuildIgnoreSet() and BuildSpecialAddSet(), in that order, using
+// PrintSet().
+//
+// For every code point in [0x80, 0xFFFF] that has not been seen yet (with
+// U+D800..U+DBFF pre-marked as seen), the case-insensitive closure of that
+// code point is computed. If the closure contains more than one character of
+// category Lu:
+//   - all non-Lu members of the closure go into the "special add" set;
+//   - u_toupper() of the first remaining non-Lu member (the "main" uppercase)
+//     also goes into the "special add" set;
+//   - every other Lu member of the closure goes into the "ignore" set.
+// ASCII is removed from the "special add" set before printing.
+void PrintSpecial(std::ofstream& out) {
+  icu::UnicodeSet current;
+  icu::UnicodeSet processed(0xd800, 0xdbff);  // Ignore surrogate range.
+  icu::UnicodeSet special_add;
+  icu::UnicodeSet ignore;
+  UErrorCode status = U_ZERO_ERROR;
+  icu::UnicodeSet upper("[\\p{Lu}]", status);
+  CHECK(U_SUCCESS(status));
+
+  // Iterate through all chars in BMP except ASCII and Surrogate.
+  for (UChar32 code_point = 0x80; code_point < 0x010000; code_point++) {
+    // Skip characters that were already covered by an earlier closure.
+    if (processed.contains(code_point)) continue;
+
+    current.set(code_point, code_point);
+    current.closeOver(USET_CASE_INSENSITIVE);
+
+    // Remember that every member of this closure has been handled.
+    processed.addAll(current);
+
+    // All uppercase characters in current.
+    icu::UnicodeSet keep_upper(current);
+    keep_upper.retainAll(upper);
+
+    // Only closures with more than one uppercase character are special and
+    // need to go into either the "Special Add" set or the "Ignore" set.
+    // |upper| holds code points only, so after retainAll() |keep_upper| has
+    // no strings and size() is exactly its number of code points.
+    if (keep_upper.size() <= 1) continue;
+
+    // Add all non uppercase characters (could be Ll or Mn) to the special
+    // add set.
+    current.removeAll(upper);
+    special_add.addAll(current);
+
+    // Add the uppercase mapping of the first non uppercase character to the
+    // special add set; this is the "main" uppercase of the closure.
+    CHECK_GT(current.getRangeCount(), 0);
+    const UChar32 main_upper = u_toupper(current.getRangeStart(0));
+    special_add.add(main_upper);
+
+    // Add all uppercase characters except the main one to the ignore set.
+    keep_upper.remove(main_upper);
+    ignore.addAll(keep_upper);
+  }
+
+  // Remove any ASCII
+  special_add.remove(0x0000, 0x007f);
+  PrintSet(out, "BuildIgnoreSet", ignore);
+  PrintSet(out, "BuildSpecialAddSet", special_add);
+}
+
+// Creates (or truncates) the file |header_filename| and writes the generated
+// source into it: a fixed preamble, the output of PrintSpecial(), and the
+// closing namespace / #endif lines. Everything is wrapped in
+// "#ifdef V8_INTL_SUPPORT".
+//
+// If the file cannot be opened, or the stream is in a failed state once it
+// has been closed, a message is printed to std::cerr and the process exits
+// with status 1 so that a build step using this tool does not silently
+// continue with a missing or truncated file.
+void WriteHeader(const char* header_filename) {
+  std::ofstream out(header_filename);
+  if (!out.is_open()) {
+    std::cerr << "Error: cannot open '" << header_filename
+              << "' for writing\n";
+    std::exit(1);
+  }
+
+  // Code points are written after a literal "0x" prefix, so integers must be
+  // rendered in hexadecimal. No field width is set here: std::setw only
+  // applies to the next insertion, so it could not pad values written later.
+  out << std::hex;
+
+  out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
+      << "// The following functions are used to build icu::UnicodeSet\n"
+      << "// for specical cases different between Unicode and ECMA262.\n"
+      << "#ifdef V8_INTL_SUPPORT\n"
+      << "#include \"src/regexp/special-case.h\"\n\n"
+      << "#include \"unicode/uniset.h\"\n"
+      << "namespace v8 {\n"
+      << "namespace internal {\n\n";
+
+  PrintSpecial(out);
+
+  out << "\n"
+      << "} // namespace internal\n"
+      << "} // namespace v8\n"
+      << "#endif // V8_INTL_SUPPORT\n";
+
+  // Flush and close explicitly so that write errors are detected here
+  // instead of being dropped by the destructor.
+  out.close();
+  if (out.fail()) {
+    std::cerr << "Error: failed while writing '" << header_filename << "'\n";
+    std::exit(1);
+  }
+}
+
+} // namespace internal
+} // namespace v8
+
+// Entry point of the generator. Expects exactly one command line argument,
+// the path of the file to generate; otherwise prints a usage line to
+// std::cerr and exits with status 1.
+int main(int argc, const char** argv) {
+  const bool has_single_argument = (argc == 2);
+  if (!has_single_argument) {
+    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
+    std::exit(1);
+  }
+
+  const char* output_filename = argv[1];
+  v8::internal::WriteHeader(output_filename);
+
+  return 0;
+}