summary | refs | log | tree | commit | diff | stats
path: root/js/src/regexp/regexp-compiler-tonode.cc
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/regexp/regexp-compiler-tonode.cc')
-rw-r--r-- js/src/regexp/regexp-compiler-tonode.cc 114
1 files changed, 14 insertions, 100 deletions
diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc
index fc734ac7c..257030589 100644
--- a/js/src/regexp/regexp-compiler-tonode.cc
+++ b/js/src/regexp/regexp-compiler-tonode.cc
@@ -1137,39 +1137,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
-#ifdef V8_INTL_SUPPORT
-struct IgnoreSet {
- IgnoreSet() : set(BuildIgnoreSet()) {}
- const icu::UnicodeSet set;
-};
-
-struct SpecialAddSet {
- SpecialAddSet() : set(BuildSpecialAddSet()) {}
- const icu::UnicodeSet set;
-};
-
-icu::UnicodeSet BuildAsciiAToZSet() {
- icu::UnicodeSet set('a', 'z');
- set.add('A', 'Z');
- set.freeze();
- return set;
-}
-
-struct AsciiAToZSet {
- AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
- const icu::UnicodeSet set;
-};
-
-static base::LazyInstance<IgnoreSet>::type ignore_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<SpecialAddSet>::type special_add_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
- LAZY_INSTANCE_INITIALIZER;
-#endif // V8_INTL_SUPPORT
-
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
@@ -1192,75 +1159,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
others.add(from, to);
}
- // Set of characters already added to ranges that do not need to be added
- // again.
+ // Compute the set of additional characters that should be added,
+ // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+ // case-folding rules than Unicode, so some characters that are
+ // added by closeOver do not match anything other than themselves in
+ // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+ // same case-insensitive character as 's' or 'S' according to
+ // Unicode, but does not match any other character in JS. To handle
+ // this case, we add such characters to the IgnoreSet and filter
+ // them out. We filter twice: once before calling closeOver (to
+ // prevent 'ſ' from adding 's'), and once after calling closeOver
+ // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+ // more information.
icu::UnicodeSet already_added(others);
-
- // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
- icu::UnicodeSet in_ascii_a_to_z(others);
- in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
-
- // Remove all chars in [a-zA-Z] from others.
- others.removeAll(in_ascii_a_to_z);
-
- // Set of characters in ranges that are overlapping with special add set.
- icu::UnicodeSet in_special_add(others);
- in_special_add.retainAll(special_add_set.Pointer()->set);
-
- others.removeAll(in_special_add);
-
- // Ignore all chars in ignore set.
- others.removeAll(ignore_set.Pointer()->set);
-
- // For most of the chars in ranges that is still in others, find the case
- // equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.closeOver(USET_CASE_INSENSITIVE);
-
- // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
- // but ECMA262 "i" mode won't consider that, remove them from others.
- // Ex: U+017F add 'S' and 's' to others.
- others.removeAll(ascii_a_to_z_set.Pointer()->set);
-
- // Special handling for in_ascii_a_to_z.
- for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
- UChar32 start = in_ascii_a_to_z.getRangeStart(i);
- UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
- // Check if it is uppercase A-Z by checking bit 6.
- if (start & 0x0020) {
- // Add the lowercases
- others.add(start & 0x005F, end & 0x005F);
- } else {
- // Add the uppercases
- others.add(start | 0x0020, end | 0x0020);
- }
- }
-
- // Special handling for chars in "Special Add" set.
- for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
- UChar32 end = in_special_add.getRangeEnd(i);
- for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
- // Add the uppercase of this character if itself is not an uppercase
- // character.
- // Note: The if condiction cannot be u_islower(ch) because ch could be
- // neither uppercase nor lowercase but Mn.
- if (!u_isupper(ch)) {
- others.add(u_toupper(ch));
- }
- icu::UnicodeSet candidates(ch, ch);
- candidates.closeOver(USET_CASE_INSENSITIVE);
- for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
- UChar32 end2 = candidates.getRangeEnd(j);
- for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
- // Add character that is not uppercase to others.
- if (!u_isupper(ch2)) {
- others.add(ch2);
- }
- }
- }
- }
- }
-
- // Remove all characters which already in the ranges.
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.removeAll(already_added);
// Add others to the ranges