diff options
Diffstat (limited to 'js/src/regexp/gen-regexp-special-case.cc')
-rw-r--r-- | js/src/regexp/gen-regexp-special-case.cc | 124 |
1 files changed, 124 insertions, 0 deletions
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc new file mode 100644 index 000000000..337743f53 --- /dev/null +++ b/js/src/regexp/gen-regexp-special-case.cc @@ -0,0 +1,124 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <fstream> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "unicode/uchar.h" +#include "unicode/uniset.h" + +namespace v8 { +namespace internal { + +// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() +// functions into "src/regexp/special-case.cc". +// See more details in http://shorturl.at/adfO5 +void PrintSet(std::ofstream& out, const char* func_name, + const icu::UnicodeSet& set) { + out << "icu::UnicodeSet " << func_name << "() {\n" + << " icu::UnicodeSet set;\n"; + for (int32_t i = 0; i < set.getRangeCount(); i++) { + if (set.getRangeStart(i) == set.getRangeEnd(i)) { + out << " set.add(0x" << set.getRangeStart(i) << ");\n"; + } else { + out << " set.add(0x" << set.getRangeStart(i) << ", 0x" + << set.getRangeEnd(i) << ");\n"; + } + } + out << " set.freeze();\n" + << " return set;\n" + << "}\n"; +} + +void PrintSpecial(std::ofstream& out) { + icu::UnicodeSet current; + icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. + icu::UnicodeSet special_add; + icu::UnicodeSet ignore; + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeSet upper("[\\p{Lu}]", status); + CHECK(U_SUCCESS(status)); + // Iterate through all chars in BMP except ASCII and Surrogate. + for (UChar32 i = 0x80; i < 0x010000; i++) { + // Ignore those characters which is already processed. + if (!processed.contains(i)) { + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Remember we already processed current. + processed.addAll(current); + + // All uppercase characters in current. + icu::UnicodeSet keep_upper(current); + keep_upper.retainAll(upper); + + // Check if we have more than one uppercase character in current. + // If there are more than one uppercase character, then it is a special + // set which need to be added into either "Special Add" set or "Ignore" + // set. + int32_t number_of_upper = 0; + for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { + number_of_upper += + keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; + } + if (number_of_upper > 1) { + // Add all non uppercase characters (could be Ll or Mn) to special add + // set. + current.removeAll(upper); + special_add.addAll(current); + + // Add the uppercase characters of non uppercase character to + // special add set. + CHECK_GT(current.getRangeCount(), 0); + UChar32 main_upper = u_toupper(current.getRangeStart(0)); + special_add.add(main_upper); + + // Add all uppercase except the main upper to ignore set. + keep_upper.remove(main_upper); + ignore.addAll(keep_upper); + } + } + } + + // Remove any ASCII + special_add.remove(0x0000, 0x007f); + PrintSet(out, "BuildIgnoreSet", ignore); + PrintSet(out, "BuildSpecialAddSet", special_add); +} + +void WriteHeader(const char* header_filename) { + std::ofstream out(header_filename); + out << std::hex << std::setfill('0') << std::setw(4); + + out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" + << "// The following functions are used to build icu::UnicodeSet\n" + << "// for specical cases different between Unicode and ECMA262.\n" + << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/regexp/special-case.h\"\n\n" + << "#include \"unicode/uniset.h\"\n" + << "namespace v8 {\n" + << "namespace internal {\n\n"; + + PrintSpecial(out); + + out << "\n" + << "} // namespace internal\n" + << "} // namespace v8\n" + << "#endif // V8_INTL_SUPPORT\n"; +} + +} // namespace internal +} // namespace v8 + +int main(int argc, const char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " <output filename>\n"; + std::exit(1); + } + v8::internal::WriteHeader(argv[1]); + + return 0; +} |