diff options
Diffstat (limited to 'js/src/regexp/regexp-compiler-tonode.cc')
-rw-r--r-- | js/src/regexp/regexp-compiler-tonode.cc | 114 |
1 files changed, 14 insertions, 100 deletions
diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc index fc734ac7c..257030589 100644 --- a/js/src/regexp/regexp-compiler-tonode.cc +++ b/js/src/regexp/regexp-compiler-tonode.cc @@ -1137,39 +1137,6 @@ Vector<const int> CharacterRange::GetWordBounds() { return Vector<const int>(kWordRanges, kWordRangeCount - 1); } -#ifdef V8_INTL_SUPPORT -struct IgnoreSet { - IgnoreSet() : set(BuildIgnoreSet()) {} - const icu::UnicodeSet set; -}; - -struct SpecialAddSet { - SpecialAddSet() : set(BuildSpecialAddSet()) {} - const icu::UnicodeSet set; -}; - -icu::UnicodeSet BuildAsciiAToZSet() { - icu::UnicodeSet set('a', 'z'); - set.add('A', 'Z'); - set.freeze(); - return set; -} - -struct AsciiAToZSet { - AsciiAToZSet() : set(BuildAsciiAToZSet()) {} - const icu::UnicodeSet set; -}; - -static base::LazyInstance<IgnoreSet>::type ignore_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<SpecialAddSet>::type special_add_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set = - LAZY_INSTANCE_INITIALIZER; -#endif // V8_INTL_SUPPORT - // static void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, @@ -1192,75 +1159,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, others.add(from, to); } - // Set of characters already added to ranges that do not need to be added - // again. + // Compute the set of additional characters that should be added, + // using UnicodeSet::closeOver. ECMA 262 defines slightly different + // case-folding rules than Unicode, so some characters that are + // added by closeOver do not match anything other than themselves in + // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the + // same case-insensitive character as 's' or 'S' according to + // Unicode, but does not match any other character in JS. To handle + // this case, we add such characters to the IgnoreSet and filter + // them out. We filter twice: once before calling closeOver (to + // prevent 'ſ' from adding 's'), and once after calling closeOver + // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for + // more information. icu::UnicodeSet already_added(others); - - // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z]. - icu::UnicodeSet in_ascii_a_to_z(others); - in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set); - - // Remove all chars in [a-zA-Z] from others. - others.removeAll(in_ascii_a_to_z); - - // Set of characters in ranges that are overlapping with special add set. - icu::UnicodeSet in_special_add(others); - in_special_add.retainAll(special_add_set.Pointer()->set); - - others.removeAll(in_special_add); - - // Ignore all chars in ignore set. - others.removeAll(ignore_set.Pointer()->set); - - // For most of the chars in ranges that is still in others, find the case - // equivlant set by calling closeOver(USET_CASE_INSENSITIVE). + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.closeOver(USET_CASE_INSENSITIVE); - - // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others, - // but ECMA262 "i" mode won't consider that, remove them from others. - // Ex: U+017F add 'S' and 's' to others. - others.removeAll(ascii_a_to_z_set.Pointer()->set); - - // Special handling for in_ascii_a_to_z. - for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) { - UChar32 start = in_ascii_a_to_z.getRangeStart(i); - UChar32 end = in_ascii_a_to_z.getRangeEnd(i); - // Check if it is uppercase A-Z by checking bit 6. - if (start & 0x0020) { - // Add the lowercases - others.add(start & 0x005F, end & 0x005F); - } else { - // Add the uppercases - others.add(start | 0x0020, end | 0x0020); - } - } - - // Special handling for chars in "Special Add" set. - for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) { - UChar32 end = in_special_add.getRangeEnd(i); - for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) { - // Add the uppercase of this character if itself is not an uppercase - // character. - // Note: The if condiction cannot be u_islower(ch) because ch could be - // neither uppercase nor lowercase but Mn. - if (!u_isupper(ch)) { - others.add(u_toupper(ch)); - } - icu::UnicodeSet candidates(ch, ch); - candidates.closeOver(USET_CASE_INSENSITIVE); - for (int32_t j = 0; j < candidates.getRangeCount(); j++) { - UChar32 end2 = candidates.getRangeEnd(j); - for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) { - // Add character that is not uppercase to others. - if (!u_isupper(ch2)) { - others.add(ch2); - } - } - } - } - } - - // Remove all characters which already in the ranges. + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.removeAll(already_added); // Add others to the ranges |