diff options
Diffstat (limited to 'js/src/regexp/gen-regexp-special-case.cc')
-rw-r--r-- | js/src/regexp/gen-regexp-special-case.cc | 149 |
1 files changed, 95 insertions, 54 deletions
diff --git a/js/src/regexp/gen-regexp-special-case.cc b/js/src/regexp/gen-regexp-special-case.cc index 337743f53..b4a8c3da4 100644 --- a/js/src/regexp/gen-regexp-special-case.cc +++ b/js/src/regexp/gen-regexp-special-case.cc @@ -1,4 +1,4 @@ -// Copyright 2019 the V8 project authors. All rights reserved. +// Copyright 2020 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -7,18 +7,19 @@ #include <iostream> #include <sstream> -#include "unicode/uchar.h" -#include "unicode/uniset.h" +#include "regexp/special-case.h" namespace v8 { namespace internal { -// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() -// functions into "src/regexp/special-case.cc". -// See more details in http://shorturl.at/adfO5 -void PrintSet(std::ofstream& out, const char* func_name, +static const uc32 kSurrogateStart = 0xd800; +static const uc32 kSurrogateEnd = 0xdfff; +static const uc32 kNonBmpStart = 0x10000; + +// The following code generates "src/regexp/special-case.cc". +void PrintSet(std::ofstream& out, const char* name, const icu::UnicodeSet& set) { - out << "icu::UnicodeSet " << func_name << "() {\n" + out << "icu::UnicodeSet Build" << name << "() {\n" << " icu::UnicodeSet set;\n"; for (int32_t i = 0; i < set.getRangeCount(); i++) { if (set.getRangeStart(i) == set.getRangeEnd(i)) { @@ -30,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name, } out << " set.freeze();\n" << " return set;\n" - << "}\n"; + << "}\n\n"; + + out << "struct " << name << "Data {\n" + << " " << name << "Data() : set(Build" << name << "()) {}\n" + << " const icu::UnicodeSet set;\n" + << "};\n\n"; + + out << "//static\n" + << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" + << " static base::LazyInstance<" << name << "Data>::type set =\n" + << " LAZY_INSTANCE_INITIALIZER;\n" + << " return set.Pointer()->set;\n" + << "}\n\n"; } void PrintSpecial(std::ofstream& out) { icu::UnicodeSet current; - icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. icu::UnicodeSet special_add; icu::UnicodeSet ignore; UErrorCode status = U_ZERO_ERROR; icu::UnicodeSet upper("[\\p{Lu}]", status); CHECK(U_SUCCESS(status)); - // Iterate through all chars in BMP except ASCII and Surrogate. - for (UChar32 i = 0x80; i < 0x010000; i++) { - // Ignore those characters which is already processed. - if (!processed.contains(i)) { - current.set(i, i); - current.closeOver(USET_CASE_INSENSITIVE); - // Remember we already processed current. - processed.addAll(current); - - // All uppercase characters in current. - icu::UnicodeSet keep_upper(current); - keep_upper.retainAll(upper); - - // Check if we have more than one uppercase character in current. - // If there are more than one uppercase character, then it is a special - // set which need to be added into either "Special Add" set or "Ignore" - // set. - int32_t number_of_upper = 0; - for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { - number_of_upper += - keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; + // Iterate through all chars in BMP except surrogates. + for (UChar32 i = 0; i < kNonBmpStart; i++) { + if (i >= kSurrogateStart && i <= kSurrogateEnd) { + continue; // Ignore surrogate range + } + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Check to see if all characters in the case-folding equivalence + // class as defined by UnicodeSet::closeOver all map to the same + // canonical value. + UChar32 canonical = RegExpCaseFolding::Canonicalize(i); + bool class_has_matching_canonical_char = false; + bool class_has_non_matching_canonical_char = false; + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); + c++) { + if (c == i) { + continue; + } + UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); + if (canonical == other_canonical) { + class_has_matching_canonical_char = true; + } else { + class_has_non_matching_canonical_char = true; + } + } + } + // If any other character in i's equivalence class has a + // different canonical value, then i needs special handling. If + // no other character shares a canonical value with i, we can + // ignore i when adding alternatives for case-independent + // comparison. If at least one other character shares a + // canonical value, then i needs special handling. + if (class_has_non_matching_canonical_char) { + if (class_has_matching_canonical_char) { + special_add.add(i); + } else { + ignore.add(i); } - if (number_of_upper > 1) { - // Add all non uppercase characters (could be Ll or Mn) to special add - // set. - current.removeAll(upper); - special_add.addAll(current); - - // Add the uppercase characters of non uppercase character to - // special add set. - CHECK_GT(current.getRangeCount(), 0); - UChar32 main_upper = u_toupper(current.getRangeStart(0)); - special_add.add(main_upper); - - // Add all uppercase except the main upper to ignore set. - keep_upper.remove(main_upper); - ignore.addAll(keep_upper); + } + } + + // Verify that no Unicode equivalence class contains two non-trivial + // JS equivalence classes. Every character in SpecialAddSet has the + // same canonical value as every other non-IgnoreSet character in + // its Unicode equivalence class. Therefore, if we call closeOver on + // a set containing no IgnoreSet characters, the only characters + // that must be removed from the result are in IgnoreSet. This fact + // is used in CharacterRange::AddCaseEquivalents. + for (int32_t i = 0; i < special_add.getRangeCount(); i++) { + for (UChar32 c = special_add.getRangeStart(i); + c <= special_add.getRangeEnd(i); c++) { + UChar32 canonical = RegExpCaseFolding::Canonicalize(c); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE); + current.removeAll(ignore); + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c2 = current.getRangeStart(j); + c2 <= current.getRangeEnd(j); c2++) { + CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); + } } } } - // Remove any ASCII - special_add.remove(0x0000, 0x007f); - PrintSet(out, "BuildIgnoreSet", ignore); - PrintSet(out, "BuildSpecialAddSet", special_add); + PrintSet(out, "IgnoreSet", ignore); + PrintSet(out, "SpecialAddSet", special_add); } void WriteHeader(const char* header_filename) { std::ofstream out(header_filename); out << std::hex << std::setfill('0') << std::setw(4); - - out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" - << "// The following functions are used to build icu::UnicodeSet\n" - << "// for specical cases different between Unicode and ECMA262.\n" + out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" + << "// Use of this source code is governed by a BSD-style license that\n" + << "// can be found in the LICENSE file.\n\n" + << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" + << "// The following functions are used to build UnicodeSets\n" + << "// for special cases where the case-folding algorithm used by\n" + << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" + << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" + << "// Semantics: Canonicalize) step 3.\n\n" << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/base/lazy-instance.h\"\n\n" << "#include \"src/regexp/special-case.h\"\n\n" << "#include \"unicode/uniset.h\"\n" << "namespace v8 {\n" |