diff options
Diffstat (limited to 'js/src/regexp/regexp-compiler-tonode.cc')
-rw-r--r-- | js/src/regexp/regexp-compiler-tonode.cc | 1675 |
1 files changed, 1675 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-compiler-tonode.cc b/js/src/regexp/regexp-compiler-tonode.cc new file mode 100644 index 000000000..fc734ac7c --- /dev/null +++ b/js/src/regexp/regexp-compiler-tonode.cc @@ -0,0 +1,1675 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "regexp/regexp-compiler.h" + +#include "regexp/regexp.h" +#ifdef V8_INTL_SUPPORT +#include "regexp/special-case.h" +#endif // V8_INTL_SUPPORT + +#ifdef V8_INTL_SUPPORT +#include "unicode/locid.h" +#include "unicode/uniset.h" +#include "unicode/utypes.h" +#endif // V8_INTL_SUPPORT + +namespace v8 { +namespace internal { + +using namespace regexp_compiler_constants; // NOLINT(build/namespaces) + +// ------------------------------------------------------------------- +// Tree to graph conversion + +RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + ZoneList<TextElement>* elms = + new (compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); + elms->Add(TextElement::Atom(this), compiler->zone()); + return new (compiler->zone()) + TextNode(elms, compiler->read_backward(), on_success); +} + +RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return new (compiler->zone()) + TextNode(elements(), compiler->read_backward(), on_success); +} + +static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, + const int* special_class, int length) { + length--; // Remove final marker. + DCHECK_EQ(kRangeEndMarker, special_class[length]); + DCHECK_NE(0, ranges->length()); + DCHECK_NE(0, length); + DCHECK_NE(0, special_class[0]); + if (ranges->length() != (length >> 1) + 1) { + return false; + } + CharacterRange range = ranges->at(0); + if (range.from() != 0) { + return false; + } + for (int i = 0; i < length; i += 2) { + if (special_class[i] != (range.to() + 1)) { + return false; + } + range = ranges->at((i >> 1) + 1); + if (special_class[i + 1] != range.from()) { + return false; + } + } + if (range.to() != String::kMaxCodePoint) { + return false; + } + return true; +} + +static bool CompareRanges(ZoneList<CharacterRange>* ranges, + const int* special_class, int length) { + length--; // Remove final marker. + DCHECK_EQ(kRangeEndMarker, special_class[length]); + if (ranges->length() * 2 != length) { + return false; + } + for (int i = 0; i < length; i += 2) { + CharacterRange range = ranges->at(i >> 1); + if (range.from() != special_class[i] || + range.to() != special_class[i + 1] - 1) { + return false; + } + } + return true; +} + +bool RegExpCharacterClass::is_standard(Zone* zone) { + // TODO(lrn): Remove need for this function, by not throwing away information + // along the way. + if (is_negated()) { + return false; + } + if (set_.is_standard()) { + return true; + } + if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { + set_.set_standard_set_type('s'); + return true; + } + if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { + set_.set_standard_set_type('S'); + return true; + } + if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, + kLineTerminatorRangeCount)) { + set_.set_standard_set_type('.'); + return true; + } + if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, + kLineTerminatorRangeCount)) { + set_.set_standard_set_type('n'); + return true; + } + if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { + set_.set_standard_set_type('w'); + return true; + } + if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { + set_.set_standard_set_type('W'); + return true; + } + return false; +} + +UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) { + // The unicode range splitter categorizes given character ranges into: + // - Code points from the BMP representable by one code unit. + // - Code points outside the BMP that need to be split into surrogate pairs. + // - Lone lead surrogates. + // - Lone trail surrogates. + // Lone surrogates are valid code points, even though no actual characters. + // They require special matching to make sure we do not split surrogate pairs. + + for (int i = 0; i < base->length(); i++) AddRange(base->at(i)); +} + +void UnicodeRangeSplitter::AddRange(CharacterRange range) { + static constexpr uc32 kBmp1Start = 0; + static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1; + static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1; + static constexpr uc32 kBmp2End = kNonBmpStart - 1; + + // Ends are all inclusive. + STATIC_ASSERT(kBmp1Start == 0); + STATIC_ASSERT(kBmp1Start < kBmp1End); + STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); + STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); + STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); + STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); + STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); + STATIC_ASSERT(kBmp2Start < kBmp2End); + STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); + STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); + + static constexpr uc32 kStarts[] = { + kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, + kBmp2Start, kNonBmpStart, + }; + + static constexpr uc32 kEnds[] = { + kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, + }; + + CharacterRangeVector* const kTargets[] = { + &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_, + }; + + static constexpr int kCount = arraysize(kStarts); + STATIC_ASSERT(kCount == arraysize(kEnds)); + STATIC_ASSERT(kCount == arraysize(kTargets)); + + for (int i = 0; i < kCount; i++) { + if (kStarts[i] > range.to()) break; + const uc32 from = std::max(kStarts[i], range.from()); + const uc32 to = std::min(kEnds[i], range.to()); + if (from > to) continue; + kTargets[i]->emplace_back(CharacterRange::Range(from, to)); + } +} + +namespace { + +// Translates between new and old V8-isms (SmallVector, ZoneList). +ZoneList<CharacterRange>* ToCanonicalZoneList( + const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) { + if (v->empty()) return nullptr; + + ZoneList<CharacterRange>* result = + new (zone) ZoneList<CharacterRange>(static_cast<int>(v->size()), zone); + for (size_t i = 0; i < v->size(); i++) { + result->Add(v->at(i), zone); + } + + CharacterRange::Canonicalize(result); + return result; +} + +void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, + RegExpNode* on_success, UnicodeRangeSplitter* splitter) { + ZoneList<CharacterRange>* bmp = + ToCanonicalZoneList(splitter->bmp(), compiler->zone()); + if (bmp == nullptr) return; + JSRegExp::Flags default_flags = JSRegExp::Flags(); + result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( + compiler->zone(), bmp, compiler->read_backward(), on_success, + default_flags))); +} + +void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + ZoneList<CharacterRange>* non_bmp = + ToCanonicalZoneList(splitter->non_bmp(), compiler->zone()); + if (non_bmp == nullptr) return; + DCHECK(!compiler->one_byte()); + Zone* zone = compiler->zone(); + JSRegExp::Flags default_flags = JSRegExp::Flags(); + CharacterRange::Canonicalize(non_bmp); + for (int i = 0; i < non_bmp->length(); i++) { + // Match surrogate pair. + // E.g. [\u10005-\u11005] becomes + // \ud800[\udc05-\udfff]| + // [\ud801-\ud803][\udc00-\udfff]| + // \ud804[\udc00-\udc05] + uc32 from = non_bmp->at(i).from(); + uc32 to = non_bmp->at(i).to(); + uc16 from_l = unibrow::Utf16::LeadSurrogate(from); + uc16 from_t = unibrow::Utf16::TrailSurrogate(from); + uc16 to_l = unibrow::Utf16::LeadSurrogate(to); + uc16 to_t = unibrow::Utf16::TrailSurrogate(to); + if (from_l == to_l) { + // The lead surrogate is the same. + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, CharacterRange::Singleton(from_l), + CharacterRange::Range(from_t, to_t), compiler->read_backward(), + on_success, default_flags))); + } else { + if (from_t != kTrailSurrogateStart) { + // Add [from_l][from_t-\udfff] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, CharacterRange::Singleton(from_l), + CharacterRange::Range(from_t, kTrailSurrogateEnd), + compiler->read_backward(), on_success, default_flags))); + from_l++; + } + if (to_t != kTrailSurrogateEnd) { + // Add [to_l][\udc00-to_t] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, CharacterRange::Singleton(to_l), + CharacterRange::Range(kTrailSurrogateStart, to_t), + compiler->read_backward(), on_success, default_flags))); + to_l--; + } + if (from_l <= to_l) { + // Add [from_l-to_l][\udc00-\udfff] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, CharacterRange::Range(from_l, to_l), + CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), + compiler->read_backward(), on_success, default_flags))); + } + } + } +} + +RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( + RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, + ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward, + JSRegExp::Flags flags) { + Zone* zone = compiler->zone(); + RegExpNode* match_node = TextNode::CreateForCharacterRanges( + zone, match, read_backward, on_success, flags); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpLookaround::Builder lookaround(false, match_node, stack_register, + position_register); + RegExpNode* negative_match = TextNode::CreateForCharacterRanges( + zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); + return lookaround.ForMatch(negative_match); +} + +RegExpNode* MatchAndNegativeLookaroundInReadDirection( + RegExpCompiler* compiler, ZoneList<CharacterRange>* match, + ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, + bool read_backward, JSRegExp::Flags flags) { + Zone* zone = compiler->zone(); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpLookaround::Builder lookaround(false, on_success, stack_register, + position_register); + RegExpNode* negative_match = TextNode::CreateForCharacterRanges( + zone, lookahead, read_backward, lookaround.on_match_success(), flags); + return TextNode::CreateForCharacterRanges( + zone, match, read_backward, lookaround.ForMatch(negative_match), flags); +} + +void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + JSRegExp::Flags default_flags = JSRegExp::Flags(); + ZoneList<CharacterRange>* lead_surrogates = + ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); + if (lead_surrogates == nullptr) return; + Zone* zone = compiler->zone(); + // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). + ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); + + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Assert that reading forward, there is no trail + // surrogate, and then backward match the lead surrogate. + match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, trail_surrogates, lead_surrogates, on_success, true, + default_flags); + } else { + // Reading forward. Forward match the lead surrogate and assert that + // no trail surrogate follows. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, lead_surrogates, trail_surrogates, on_success, false, + default_flags); + } + result->AddAlternative(GuardedAlternative(match)); +} + +void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + JSRegExp::Flags default_flags = JSRegExp::Flags(); + ZoneList<CharacterRange>* trail_surrogates = + ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); + if (trail_surrogates == nullptr) return; + Zone* zone = compiler->zone(); + // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 + ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( + zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); + + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Backward match the trail surrogate and assert that no + // lead surrogate precedes it. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, trail_surrogates, lead_surrogates, on_success, true, + default_flags); + } else { + // Reading forward. Assert that reading backward, there is no lead + // surrogate, and then forward match the trail surrogate. + match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, lead_surrogates, trail_surrogates, on_success, false, + default_flags); + } + result->AddAlternative(GuardedAlternative(match)); +} + +RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, + RegExpNode* on_success) { + // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. + DCHECK(!compiler->read_backward()); + Zone* zone = compiler->zone(); + // Advance any character. If the character happens to be a lead surrogate and + // we advanced into the middle of a surrogate pair, it will work out, as + // nothing will match from there. We will have to advance again, consuming + // the associated trail surrogate. + ZoneList<CharacterRange>* range = CharacterRange::List( + zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); + JSRegExp::Flags default_flags = JSRegExp::Flags(); + return TextNode::CreateForCharacterRanges(zone, range, false, on_success, + default_flags); +} + +void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { +#ifdef V8_INTL_SUPPORT + DCHECK(CharacterRange::IsCanonical(ranges)); + + // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. + // See also https://crbug.com/v8/6727. + // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range, + // which we use frequently internally. But large ranges can also easily be + // created by the user. We might want to have a more general caching mechanism + // for such ranges. + if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return; + + // Use ICU to compute the case fold closure over the ranges. + icu::UnicodeSet set; + for (int i = 0; i < ranges->length(); i++) { + set.add(ranges->at(i).from(), ranges->at(i).to()); + } + ranges->Clear(); + set.closeOver(USET_CASE_INSENSITIVE); + // Full case mapping map single characters to multiple characters. + // Those are represented as strings in the set. Remove them so that + // we end up with only simple and common case mappings. + set.removeAllStrings(); + for (int i = 0; i < set.getRangeCount(); i++) { + ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), + zone); + } + // No errors and everything we collected have been ranges. + CharacterRange::Canonicalize(ranges); +#endif // V8_INTL_SUPPORT +} + +} // namespace + +RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + set_.Canonicalize(); + Zone* zone = compiler->zone(); + ZoneList<CharacterRange>* ranges = this->ranges(zone); + if (NeedsUnicodeCaseEquivalents(flags_)) { + AddUnicodeCaseEquivalents(ranges, zone); + } + if (IsUnicode(flags_) && !compiler->one_byte() && + !contains_split_surrogate()) { + if (is_negated()) { + ZoneList<CharacterRange>* negated = + new (zone) ZoneList<CharacterRange>(2, zone); + CharacterRange::Negate(ranges, negated, zone); + ranges = negated; + } + if (ranges->length() == 0) { + JSRegExp::Flags default_flags; + RegExpCharacterClass* fail = + new (zone) RegExpCharacterClass(zone, ranges, default_flags); + return new (zone) TextNode(fail, compiler->read_backward(), on_success); + } + if (standard_type() == '*') { + return UnanchoredAdvance(compiler, on_success); + } else { + ChoiceNode* result = new (zone) ChoiceNode(2, zone); + UnicodeRangeSplitter splitter(ranges); + AddBmpCharacters(compiler, result, on_success, &splitter); + AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); + AddLoneLeadSurrogates(compiler, result, on_success, &splitter); + AddLoneTrailSurrogates(compiler, result, on_success, &splitter); + return result; + } + } else { + return new (zone) TextNode(this, compiler->read_backward(), on_success); + } +} + +int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + uc16 character1 = atom1->data().at(0); + uc16 character2 = atom2->data().at(0); + if (character1 < character2) return -1; + if (character1 > character2) return 1; + return 0; +} + +#ifdef V8_INTL_SUPPORT + +// Case Insensitve comparesion +int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + icu::UnicodeString character1(atom1->data().at(0)); + return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT); +} + +#else + +static unibrow::uchar Canonical( + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, + unibrow::uchar c) { + unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; + int length = canonicalize->get(c, '\0', chars); + DCHECK_LE(length, 1); + unibrow::uchar canonical = c; + if (length == 1) canonical = chars[0]; + return canonical; +} + +int CompareFirstCharCaseIndependent( + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, + RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + unibrow::uchar character1 = atom1->data().at(0); + unibrow::uchar character2 = atom2->data().at(0); + if (character1 == character2) return 0; + if (character1 >= 'a' || character2 >= 'a') { + character1 = Canonical(canonicalize, character1); + character2 = Canonical(canonicalize, character2); + } + return static_cast<int>(character1) - static_cast<int>(character2); +} +#endif // V8_INTL_SUPPORT + +// We can stable sort runs of atoms, since the order does not matter if they +// start with different characters. +// Returns true if any consecutive atoms were found. +bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + bool found_consecutive_atoms = false; + for (int i = 0; i < length; i++) { + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (alternative->IsAtom()) break; + i++; + } + // i is length or it is the index of an atom. + if (i == length) break; + int first_atom = i; + JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); + i++; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + if (alternative->AsAtom()->flags() != flags) break; + i++; + } + // Sort atoms to get ones with common prefixes together. + // This step is more tricky if we are in a case-independent regexp, + // because it would change /is|I/ to /I|is/, and order matters when + // the regexp parts don't match only disjoint starting points. To fix + // this we have a version of CompareFirstChar that uses case- + // independent character classes for comparison. + DCHECK_LT(first_atom, alternatives->length()); + DCHECK_LE(i, alternatives->length()); + DCHECK_LE(first_atom, i); + if (IgnoreCase(flags)) { +#ifdef V8_INTL_SUPPORT + alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom, + i - first_atom); +#else + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + auto compare_closure = [canonicalize](RegExpTree* const* a, + RegExpTree* const* b) { + return CompareFirstCharCaseIndependent(canonicalize, a, b); + }; + alternatives->StableSort(compare_closure, first_atom, i - first_atom); +#endif // V8_INTL_SUPPORT + } else { + alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); + } + if (i - first_atom > 1) found_consecutive_atoms = true; + } + return found_consecutive_atoms; +} + +// Optimizes ab|ac|az to a(?:b|c|d). +void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* const atom = alternative->AsAtom(); + JSRegExp::Flags flags = atom->flags(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeString common_prefix(atom->data().at(0)); +#else + unibrow::uchar common_prefix = atom->data().at(0); +#endif // V8_INTL_SUPPORT + int first_with_prefix = i; + int prefix_length = atom->length(); + i++; + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + RegExpAtom* const atom = alternative->AsAtom(); + if (atom->flags() != flags) break; +#ifdef V8_INTL_SUPPORT + icu::UnicodeString new_prefix(atom->data().at(0)); + if (new_prefix != common_prefix) { + if (!IgnoreCase(flags)) break; + if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0) + break; + } +#else + unibrow::uchar new_prefix = atom->data().at(0); + if (new_prefix != common_prefix) { + if (!IgnoreCase(flags)) break; + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); + new_prefix = Canonical(canonicalize, new_prefix); + common_prefix = Canonical(canonicalize, common_prefix); + if (new_prefix != common_prefix) break; + } +#endif // V8_INTL_SUPPORT + prefix_length = Min(prefix_length, atom->length()); + i++; + } + if (i > first_with_prefix + 2) { + // Found worthwhile run of alternatives with common prefix of at least one + // character. The sorting function above did not sort on more than one + // character for reasons of correctness, but there may still be a longer + // common prefix if the terms were similar or presorted in the input. + // Find out how long the common prefix is. + int run_length = i - first_with_prefix; + RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); + for (int j = 1; j < run_length && prefix_length > 1; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + for (int k = 1; k < prefix_length; k++) { + if (atom->data().at(k) != old_atom->data().at(k)) { + prefix_length = k; + break; + } + } + } + RegExpAtom* prefix = new (zone) + RegExpAtom(atom->data().SubVector(0, prefix_length), flags); + ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); + pair->Add(prefix, zone); + ZoneList<RegExpTree*>* suffixes = + new (zone) ZoneList<RegExpTree*>(run_length, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + int len = old_atom->length(); + if (len == prefix_length) { + suffixes->Add(new (zone) RegExpEmpty(), zone); + } else { + RegExpTree* suffix = new (zone) RegExpAtom( + old_atom->data().SubVector(prefix_length, old_atom->length()), + flags); + suffixes->Add(suffix, zone); + } + } + pair->Add(new (zone) RegExpDisjunction(suffixes), zone); + alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); + } else { + // Just copy any non-worthwhile alternatives. + for (int j = first_with_prefix; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + +// Optimizes b|c|z to [bcz]. +void RegExpDisjunction::FixSingleCharacterDisjunctions( + RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + int length = alternatives->length(); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* const atom = alternative->AsAtom(); + if (atom->length() != 1) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + JSRegExp::Flags flags = atom->flags(); + DCHECK_IMPLIES(IsUnicode(flags), + !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); + bool contains_trail_surrogate = + unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); + int first_in_run = i; + i++; + // Find a run of single-character atom alternatives that have identical + // flags (case independence and unicode-ness). + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + RegExpAtom* const atom = alternative->AsAtom(); + if (atom->length() != 1) break; + if (atom->flags() != flags) break; + DCHECK_IMPLIES(IsUnicode(flags), + !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); + contains_trail_surrogate |= + unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); + i++; + } + if (i > first_in_run + 1) { + // Found non-trivial run of single-character alternatives. + int run_length = i - first_in_run; + ZoneList<CharacterRange>* ranges = + new (zone) ZoneList<CharacterRange>(2, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); + DCHECK_EQ(old_atom->length(), 1); + ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); + } + RegExpCharacterClass::CharacterClassFlags character_class_flags; + if (IsUnicode(flags) && contains_trail_surrogate) { + character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; + } + alternatives->at(write_posn++) = new (zone) + RegExpCharacterClass(zone, ranges, flags, character_class_flags); + } else { + // Just copy any trivial alternatives. + for (int j = first_in_run; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + +RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + ZoneList<RegExpTree*>* alternatives = this->alternatives(); + + if (alternatives->length() > 2) { + bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); + if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); + FixSingleCharacterDisjunctions(compiler); + if (alternatives->length() == 1) { + return alternatives->at(0)->ToNode(compiler, on_success); + } + } + + int length = alternatives->length(); + + ChoiceNode* result = + new (compiler->zone()) ChoiceNode(length, compiler->zone()); + for (int i = 0; i < length; i++) { + GuardedAlternative alternative( + alternatives->at(i)->ToNode(compiler, on_success)); + result->AddAlternative(alternative); + } + return result; +} + +RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return ToNode(min(), max(), is_greedy(), body(), compiler, on_success); +} + +namespace { +// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and +// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) +RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpAssertion::AssertionType type, + JSRegExp::Flags flags) { + DCHECK(NeedsUnicodeCaseEquivalents(flags)); + Zone* zone = compiler->zone(); + ZoneList<CharacterRange>* word_range = + new (zone) ZoneList<CharacterRange>(2, zone); + CharacterRange::AddClassEscape('w', word_range, true, zone); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + ChoiceNode* result = new (zone) ChoiceNode(2, zone); + // Add two choices. The (non-)boundary could start with a word or + // a non-word-character. + for (int i = 0; i < 2; i++) { + bool lookbehind_for_word = i == 0; + bool lookahead_for_word = + (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; + // Look to the left. + RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, + stack_register, position_register); + RegExpNode* backward = TextNode::CreateForCharacterRanges( + zone, word_range, true, lookbehind.on_match_success(), flags); + // Look to the right. + RegExpLookaround::Builder lookahead(lookahead_for_word, + lookbehind.ForMatch(backward), + stack_register, position_register); + RegExpNode* forward = TextNode::CreateForCharacterRanges( + zone, word_range, false, lookahead.on_match_success(), flags); + result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); + } + return result; +} +} // anonymous namespace + +RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + NodeInfo info; + Zone* zone = compiler->zone(); + + switch (assertion_type()) { + case START_OF_LINE: + return AssertionNode::AfterNewline(on_success); + case START_OF_INPUT: + return AssertionNode::AtStart(on_success); + case BOUNDARY: + return NeedsUnicodeCaseEquivalents(flags_) + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, + flags_) + : AssertionNode::AtBoundary(on_success); + case NON_BOUNDARY: + return NeedsUnicodeCaseEquivalents(flags_) + ? BoundaryAssertionAsLookaround(compiler, on_success, + NON_BOUNDARY, flags_) + : AssertionNode::AtNonBoundary(on_success); + case END_OF_INPUT: + return AssertionNode::AtEnd(on_success); + case END_OF_LINE: { + // Compile $ in multiline regexps as an alternation with a positive + // lookahead in one side and an end-of-input on the other side. + // We need two registers for the lookahead. + int stack_pointer_register = compiler->AllocateRegister(); + int position_register = compiler->AllocateRegister(); + // The ChoiceNode to distinguish between a newline and end-of-input. + ChoiceNode* result = new (zone) ChoiceNode(2, zone); + // Create a newline atom. + ZoneList<CharacterRange>* newline_ranges = + new (zone) ZoneList<CharacterRange>(3, zone); + CharacterRange::AddClassEscape('n', newline_ranges, false, zone); + JSRegExp::Flags default_flags = JSRegExp::Flags(); + RegExpCharacterClass* newline_atom = + new (zone) RegExpCharacterClass('n', default_flags); + TextNode* newline_matcher = + new (zone) TextNode(newline_atom, false, + ActionNode::PositiveSubmatchSuccess( + stack_pointer_register, position_register, + 0, // No captures inside. + -1, // Ignored if no captures. + on_success)); + // Create an end-of-input matcher. + RegExpNode* end_of_line = ActionNode::BeginSubmatch( + stack_pointer_register, position_register, newline_matcher); + // Add the two alternatives to the ChoiceNode. + GuardedAlternative eol_alternative(end_of_line); + result->AddAlternative(eol_alternative); + GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); + result->AddAlternative(end_alternative); + return result; + } + default: + UNREACHABLE(); + } + return on_success; +} + +RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return new (compiler->zone()) + BackReferenceNode(RegExpCapture::StartRegister(index()), + RegExpCapture::EndRegister(index()), flags_, + compiler->read_backward(), on_success); +} + +RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return on_success; +} + +RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, + int stack_pointer_register, + int position_register, + int capture_register_count, + int capture_register_start) + : is_positive_(is_positive), + on_success_(on_success), + stack_pointer_register_(stack_pointer_register), + position_register_(position_register) { + if (is_positive_) { + on_match_success_ = ActionNode::PositiveSubmatchSuccess( + stack_pointer_register, position_register, capture_register_count, + capture_register_start, on_success_); + } else { + Zone* zone = on_success_->zone(); + on_match_success_ = new (zone) NegativeSubmatchSuccess( + stack_pointer_register, position_register, capture_register_count, + capture_register_start, zone); + } +} + +RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { + if (is_positive_) { + return ActionNode::BeginSubmatch(stack_pointer_register_, + position_register_, match); + } else { + Zone* zone = on_success_->zone(); + // We use a ChoiceNode to represent the negative lookaround. The first + // alternative is the negative match. On success, the end node backtracks. + // On failure, the second alternative is tried and leads to success. + // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the + // first exit when calculating quick checks. + ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( + GuardedAlternative(match), GuardedAlternative(on_success_), zone); + return ActionNode::BeginSubmatch(stack_pointer_register_, + position_register_, choice_node); + } +} + +RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + int stack_pointer_register = compiler->AllocateRegister(); + int position_register = compiler->AllocateRegister(); + + const int registers_per_capture = 2; + const int register_of_first_capture = 2; + int register_count = capture_count_ * registers_per_capture; + int register_start = + register_of_first_capture + capture_from_ * registers_per_capture; + + RegExpNode* result; + bool was_reading_backward = compiler->read_backward(); + compiler->set_read_backward(type() == LOOKBEHIND); + Builder builder(is_positive(), on_success, stack_pointer_register, + position_register, register_count, register_start); + RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); + result = builder.ForMatch(match); + compiler->set_read_backward(was_reading_backward); + return result; +} + +RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return ToNode(body(), index(), compiler, on_success); +} + +RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index, + RegExpCompiler* compiler, + RegExpNode* on_success) { + DCHECK_NOT_NULL(body); + int start_reg = RegExpCapture::StartRegister(index); + int end_reg = RegExpCapture::EndRegister(index); + if (compiler->read_backward()) std::swap(start_reg, end_reg); + RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); + RegExpNode* body_node = body->ToNode(compiler, store_end); + return ActionNode::StorePosition(start_reg, true, body_node); +} + +namespace { + +class AssertionSequenceRewriter final { + public: + // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass + // instead of sprinkling rewrites into the AST->Node conversion process. + static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) { + AssertionSequenceRewriter rewriter(terms, zone); + + static constexpr int kNoIndex = -1; + int from = kNoIndex; + + for (int i = 0; i < terms->length(); i++) { + RegExpTree* t = terms->at(i); + if (from == kNoIndex && t->IsAssertion()) { + from = i; // Start a sequence. + } else if (from != kNoIndex && !t->IsAssertion()) { + // Terminate and process the sequence. + if (i - from > 1) rewriter.Rewrite(from, i); + from = kNoIndex; + } + } + + if (from != kNoIndex && terms->length() - from > 1) { + rewriter.Rewrite(from, terms->length()); + } + } + + // All assertions are zero width. A consecutive sequence of assertions is + // order-independent. There's two ways we can optimize here: + // 1. fold all identical assertions. + // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire + // sequence fails. + void Rewrite(int from, int to) { + DCHECK_GT(to, from + 1); + + // Bitfield of all seen assertions. + uint32_t seen_assertions = 0; + STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte); + + // Flags must match for folding. + JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags(); + bool saw_mismatched_flags = false; + + for (int i = from; i < to; i++) { + RegExpAssertion* t = terms_->at(i)->AsAssertion(); + if (t->flags() != flags) saw_mismatched_flags = true; + const uint32_t bit = 1 << t->assertion_type(); + + if ((seen_assertions & bit) && !saw_mismatched_flags) { + // Fold duplicates. + terms_->Set(i, new (zone_) RegExpEmpty()); + } + + seen_assertions |= bit; + } + + // Collapse failures. + const uint32_t always_fails_mask = + 1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY; + if ((seen_assertions & always_fails_mask) == always_fails_mask) { + ReplaceSequenceWithFailure(from, to); + } + } + + void ReplaceSequenceWithFailure(int from, int to) { + // Replace the entire sequence with a single node that always fails. + // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the + // negated '*' (everything) range serves the purpose. + ZoneList<CharacterRange>* ranges = + new (zone_) ZoneList<CharacterRange>(0, zone_); + RegExpCharacterClass* cc = + new (zone_) RegExpCharacterClass(zone_, ranges, JSRegExp::Flags()); + terms_->Set(from, cc); + + // Zero out the rest. + RegExpEmpty* empty = new (zone_) RegExpEmpty(); + for (int i = from + 1; i < to; i++) terms_->Set(i, empty); + } + + private: + AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone) + : zone_(zone), terms_(terms) {} + + Zone* zone_; + ZoneList<RegExpTree*>* terms_; +}; + +} // namespace + +RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + ZoneList<RegExpTree*>* children = nodes(); + + AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); + + RegExpNode* current = on_success; + if (compiler->read_backward()) { + for (int i = 0; i < children->length(); i++) { + current = children->at(i)->ToNode(compiler, current); + } + } else { + for (int i = children->length() - 1; i >= 0; i--) { + current = children->at(i)->ToNode(compiler, current); + } + } + return current; +} + +static void AddClass(const int* elmv, int elmc, + ZoneList<CharacterRange>* ranges, Zone* zone) { + elmc--; + DCHECK_EQ(kRangeEndMarker, elmv[elmc]); + for (int i = 0; i < elmc; i += 2) { + DCHECK(elmv[i] < elmv[i + 1]); + ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); + } +} + +static void AddClassNegated(const int* elmv, int elmc, + ZoneList<CharacterRange>* ranges, Zone* zone) { + elmc--; + DCHECK_EQ(kRangeEndMarker, elmv[elmc]); + DCHECK_NE(0x0000, elmv[0]); + DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]); + uc16 last = 0x0000; + for (int i = 0; i < elmc; i += 2) { + DCHECK(last <= elmv[i] - 1); + DCHECK(elmv[i] < elmv[i + 1]); + ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); + last = elmv[i + 1]; + } + ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); +} + +void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges, + bool add_unicode_case_equivalents, + Zone* zone) { + if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { + // See #sec-runtime-semantics-wordcharacters-abstract-operation + // In case of unicode and ignore_case, we need to create the closure over + // case equivalent characters before negating. + ZoneList<CharacterRange>* new_ranges = + new (zone) ZoneList<CharacterRange>(2, zone); + AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); + AddUnicodeCaseEquivalents(new_ranges, zone); + if (type == 'W') { + ZoneList<CharacterRange>* negated = + new (zone) ZoneList<CharacterRange>(2, zone); + CharacterRange::Negate(new_ranges, negated, zone); + new_ranges = negated; + } + ranges->AddAll(*new_ranges, zone); + return; + } + AddClassEscape(type, ranges, zone); +} + +void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges, + Zone* zone) { + switch (type) { + case 's': + AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); + break; + case 'S': + AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); + break; + case 'w': + AddClass(kWordRanges, kWordRangeCount, ranges, zone); + break; + case 'W': + AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); + break; + case 'd': + AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); + break; + case 'D': + AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); + break; + case '.': + AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, + zone); + break; + // This is not a character range as defined by the spec but a + // convenient shorthand for a character class that matches any + // character. + case '*': + ranges->Add(CharacterRange::Everything(), zone); + break; + // This is the set of characters matched by the $ and ^ symbols + // in multiline mode. + case 'n': + AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); + break; + default: + UNREACHABLE(); + } +} + +Vector<const int> CharacterRange::GetWordBounds() { + return Vector<const int>(kWordRanges, kWordRangeCount - 1); +} + +#ifdef V8_INTL_SUPPORT +struct IgnoreSet { + IgnoreSet() : set(BuildIgnoreSet()) {} + const icu::UnicodeSet set; +}; + +struct SpecialAddSet { + SpecialAddSet() : set(BuildSpecialAddSet()) {} + const icu::UnicodeSet set; +}; + +icu::UnicodeSet BuildAsciiAToZSet() { + icu::UnicodeSet set('a', 'z'); + set.add('A', 'Z'); + set.freeze(); + return set; +} + +struct AsciiAToZSet { + AsciiAToZSet() : set(BuildAsciiAToZSet()) {} + const icu::UnicodeSet set; +}; + +static base::LazyInstance<IgnoreSet>::type ignore_set = + LAZY_INSTANCE_INITIALIZER; + +static base::LazyInstance<SpecialAddSet>::type special_add_set = + LAZY_INSTANCE_INITIALIZER; + +static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set = + LAZY_INSTANCE_INITIALIZER; +#endif // V8_INTL_SUPPORT + +// static +void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, + ZoneList<CharacterRange>* ranges, + bool is_one_byte) { + CharacterRange::Canonicalize(ranges); + int range_count = ranges->length(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeSet others; + for (int i = 0; i < range_count; i++) { + CharacterRange range = ranges->at(i); + uc32 from = range.from(); + if (from > String::kMaxUtf16CodeUnit) continue; + uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit); + // Nothing to be done for surrogates. + if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (from > String::kMaxOneByteCharCode) continue; + if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode; + } + others.add(from, to); + } + + // Set of characters already added to ranges that do not need to be added + // again. + icu::UnicodeSet already_added(others); + + // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z]. + icu::UnicodeSet in_ascii_a_to_z(others); + in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set); + + // Remove all chars in [a-zA-Z] from others. + others.removeAll(in_ascii_a_to_z); + + // Set of characters in ranges that are overlapping with special add set. + icu::UnicodeSet in_special_add(others); + in_special_add.retainAll(special_add_set.Pointer()->set); + + others.removeAll(in_special_add); + + // Ignore all chars in ignore set. + others.removeAll(ignore_set.Pointer()->set); + + // For most of the chars in ranges that is still in others, find the case + // equivlant set by calling closeOver(USET_CASE_INSENSITIVE). + others.closeOver(USET_CASE_INSENSITIVE); + + // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others, + // but ECMA262 "i" mode won't consider that, remove them from others. + // Ex: U+017F add 'S' and 's' to others. + others.removeAll(ascii_a_to_z_set.Pointer()->set); + + // Special handling for in_ascii_a_to_z. + for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) { + UChar32 start = in_ascii_a_to_z.getRangeStart(i); + UChar32 end = in_ascii_a_to_z.getRangeEnd(i); + // Check if it is uppercase A-Z by checking bit 6. + if (start & 0x0020) { + // Add the lowercases + others.add(start & 0x005F, end & 0x005F); + } else { + // Add the uppercases + others.add(start | 0x0020, end | 0x0020); + } + } + + // Special handling for chars in "Special Add" set. + for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) { + UChar32 end = in_special_add.getRangeEnd(i); + for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) { + // Add the uppercase of this character if itself is not an uppercase + // character. + // Note: The if condiction cannot be u_islower(ch) because ch could be + // neither uppercase nor lowercase but Mn. + if (!u_isupper(ch)) { + others.add(u_toupper(ch)); + } + icu::UnicodeSet candidates(ch, ch); + candidates.closeOver(USET_CASE_INSENSITIVE); + for (int32_t j = 0; j < candidates.getRangeCount(); j++) { + UChar32 end2 = candidates.getRangeEnd(j); + for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) { + // Add character that is not uppercase to others. + if (!u_isupper(ch2)) { + others.add(ch2); + } + } + } + } + } + + // Remove all characters which already in the ranges. + others.removeAll(already_added); + + // Add others to the ranges + for (int32_t i = 0; i < others.getRangeCount(); i++) { + UChar32 from = others.getRangeStart(i); + UChar32 to = others.getRangeEnd(i); + if (from == to) { + ranges->Add(CharacterRange::Singleton(from), zone); + } else { + ranges->Add(CharacterRange::Range(from, to), zone); + } + } +#else + for (int i = 0; i < range_count; i++) { + CharacterRange range = ranges->at(i); + uc32 bottom = range.from(); + if (bottom > String::kMaxUtf16CodeUnit) continue; + uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); + // Nothing to be done for surrogates. + if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (bottom > String::kMaxOneByteCharCode) continue; + if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; + } + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + if (top == bottom) { + // If this is a singleton we just expand the one character. + int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); + for (int i = 0; i < length; i++) { + uc32 chr = chars[i]; + if (chr != bottom) { + ranges->Add(CharacterRange::Singleton(chars[i]), zone); + } + } + } else { + // If this is a range we expand the characters block by block, expanding + // contiguous subranges (blocks) one at a time. The approach is as + // follows. For a given start character we look up the remainder of the + // block that contains it (represented by the end point), for instance we + // find 'z' if the character is 'c'. A block is characterized by the + // property that all characters uncanonicalize in the same way, except + // that each entry in the result is incremented by the distance from the + // first element. So a-z is a block because 'a' uncanonicalizes to ['a', + // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once + // we've found the end point we look up its uncanonicalization and + // produce a range for each element. For instance for [c-f] we look up + // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if + // it is not already contained in the input, so [c-f] will be skipped but + // [C-F] will be added. If this range is not completely contained in a + // block we do this for all the blocks covered by the range (handling + // characters that is not in a block as a "singleton block"). + unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + int pos = bottom; + while (pos <= top) { + int length = + isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); + uc32 block_end; + if (length == 0) { + block_end = pos; + } else { + DCHECK_EQ(1, length); + block_end = equivalents[0]; + } + int end = (block_end > top) ? top : block_end; + length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', + equivalents); + for (int i = 0; i < length; i++) { + uc32 c = equivalents[i]; + uc32 range_from = c - (block_end - pos); + uc32 range_to = c - (block_end - end); + if (!(bottom <= range_from && range_to <= top)) { + ranges->Add(CharacterRange::Range(range_from, range_to), zone); + } + } + pos = end + 1; + } + } + } +#endif // V8_INTL_SUPPORT +} + +bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { + DCHECK_NOT_NULL(ranges); + int n = ranges->length(); + if (n <= 1) return true; + int max = ranges->at(0).to(); + for (int i = 1; i < n; i++) { + CharacterRange next_range = ranges->at(i); + if (next_range.from() <= max + 1) return false; + max = next_range.to(); + } + return true; +} + +ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { + if (ranges_ == nullptr) { + ranges_ = new (zone) ZoneList<CharacterRange>(2, zone); + CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); + } + return ranges_; +} + +// Move a number of elements in a zonelist to another position +// in the same list. Handles overlapping source and target areas. +static void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, + int count) { + // Ranges are potentially overlapping. + if (from < to) { + for (int i = count - 1; i >= 0; i--) { + list->at(to + i) = list->at(from + i); + } + } else { + for (int i = 0; i < count; i++) { + list->at(to + i) = list->at(from + i); + } + } +} + +static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count, + CharacterRange insert) { + // Inserts a range into list[0..count[, which must be sorted + // by from value and non-overlapping and non-adjacent, using at most + // list[0..count] for the result. Returns the number of resulting + // canonicalized ranges. Inserting a range may collapse existing ranges into + // fewer ranges, so the return value can be anything in the range 1..count+1. + uc32 from = insert.from(); + uc32 to = insert.to(); + int start_pos = 0; + int end_pos = count; + for (int i = count - 1; i >= 0; i--) { + CharacterRange current = list->at(i); + if (current.from() > to + 1) { + end_pos = i; + } else if (current.to() + 1 < from) { + start_pos = i + 1; + break; + } + } + + // Inserted range overlaps, or is adjacent to, ranges at positions + // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are + // not affected by the insertion. + // If start_pos == end_pos, the range must be inserted before start_pos. + // if start_pos < end_pos, the entire range from start_pos to end_pos + // must be merged with the insert range. + + if (start_pos == end_pos) { + // Insert between existing ranges at position start_pos. + if (start_pos < count) { + MoveRanges(list, start_pos, start_pos + 1, count - start_pos); + } + list->at(start_pos) = insert; + return count + 1; + } + if (start_pos + 1 == end_pos) { + // Replace single existing range at position start_pos. + CharacterRange to_replace = list->at(start_pos); + int new_from = Min(to_replace.from(), from); + int new_to = Max(to_replace.to(), to); + list->at(start_pos) = CharacterRange::Range(new_from, new_to); + return count; + } + // Replace a number of existing ranges from start_pos to end_pos - 1. + // Move the remaining ranges down. + + int new_from = Min(list->at(start_pos).from(), from); + int new_to = Max(list->at(end_pos - 1).to(), to); + if (end_pos < count) { + MoveRanges(list, end_pos, start_pos + 1, count - end_pos); + } + list->at(start_pos) = CharacterRange::Range(new_from, new_to); + return count - (end_pos - start_pos) + 1; +} + +void CharacterSet::Canonicalize() { + // Special/default classes are always considered canonical. The result + // of calling ranges() will be sorted. + if (ranges_ == nullptr) return; + CharacterRange::Canonicalize(ranges_); +} + +void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { + if (character_ranges->length() <= 1) return; + // Check whether ranges are already canonical (increasing, non-overlapping, + // non-adjacent). + int n = character_ranges->length(); + int max = character_ranges->at(0).to(); + int i = 1; + while (i < n) { + CharacterRange current = character_ranges->at(i); + if (current.from() <= max + 1) { + break; + } + max = current.to(); + i++; + } + // Canonical until the i'th range. If that's all of them, we are done. + if (i == n) return; + + // The ranges at index i and forward are not canonicalized. Make them so by + // doing the equivalent of insertion sort (inserting each into the previous + // list, in order). + // Notice that inserting a range can reduce the number of ranges in the + // result due to combining of adjacent and overlapping ranges. + int read = i; // Range to insert. + int num_canonical = i; // Length of canonicalized part of list. + do { + num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical, + character_ranges->at(read)); + read++; + } while (read < n); + character_ranges->Rewind(num_canonical); + + DCHECK(CharacterRange::IsCanonical(character_ranges)); +} + +void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, + ZoneList<CharacterRange>* negated_ranges, + Zone* zone) { + DCHECK(CharacterRange::IsCanonical(ranges)); + DCHECK_EQ(0, negated_ranges->length()); + int range_count = ranges->length(); + uc32 from = 0; + int i = 0; + if (range_count > 0 && ranges->at(0).from() == 0) { + from = ranges->at(0).to() + 1; + i = 1; + } + while (i < range_count) { + CharacterRange range = ranges->at(i); + negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); + from = range.to() + 1; + i++; + } + if (from < String::kMaxCodePoint) { + negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), + zone); + } +} + +// Scoped object to keep track of how much we unroll quantifier loops in the +// regexp graph generator. +class RegExpExpansionLimiter { + public: + static const int kMaxExpansionFactor = 6; + RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) + : compiler_(compiler), + saved_expansion_factor_(compiler->current_expansion_factor()), + ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { + DCHECK_LT(0, factor); + if (ok_to_expand_) { + if (factor > kMaxExpansionFactor) { + // Avoid integer overflow of the current expansion factor. + ok_to_expand_ = false; + compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); + } else { + int new_factor = saved_expansion_factor_ * factor; + ok_to_expand_ = (new_factor <= kMaxExpansionFactor); + compiler->set_current_expansion_factor(new_factor); + } + } + } + + ~RegExpExpansionLimiter() { + compiler_->set_current_expansion_factor(saved_expansion_factor_); + } + + bool ok_to_expand() { return ok_to_expand_; } + + private: + RegExpCompiler* compiler_; + int saved_expansion_factor_; + bool ok_to_expand_; + + DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); +}; + +RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, + RegExpTree* body, RegExpCompiler* compiler, + RegExpNode* on_success, + bool not_at_start) { + // x{f, t} becomes this: + // + // (r++)<-. + // | ` + // | (x) + // v ^ + // (r=0)-->(?)---/ [if r < t] + // | + // [if r >= f] \----> ... + // + + // 15.10.2.5 RepeatMatcher algorithm. + // The parser has already eliminated the case where max is 0. In the case + // where max_match is zero the parser has removed the quantifier if min was + // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. + + // If we know that we cannot match zero length then things are a little + // simpler since we don't need to make the special zero length match check + // from step 2.1. If the min and max are small we can unroll a little in + // this case. + static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} + static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} + if (max == 0) return on_success; // This can happen due to recursion. + bool body_can_be_empty = (body->min_match() == 0); + int body_start_reg = RegExpCompiler::kNoRegister; + Interval capture_registers = body->CaptureRegisters(); + bool needs_capture_clearing = !capture_registers.is_empty(); + Zone* zone = compiler->zone(); + + if (body_can_be_empty) { + body_start_reg = compiler->AllocateRegister(); + } else if (compiler->optimize() && !needs_capture_clearing) { + // Only unroll if there are no captures and the body can't be + // empty. + { + RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0)); + if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { + int new_max = (max == kInfinity) ? max : max - min; + // Recurse once to get the loop or optional matches after the fixed + // ones. + RegExpNode* answer = + ToNode(0, new_max, is_greedy, body, compiler, on_success, true); + // Unroll the forced matches from 0 to min. This can cause chains of + // TextNodes (which the parser does not generate). These should be + // combined if it turns out they hinder good code generation. + for (int i = 0; i < min; i++) { + answer = body->ToNode(compiler, answer); + } + return answer; + } + } + if (max <= kMaxUnrolledMaxMatches && min == 0) { + DCHECK_LT(0, max); // Due to the 'if' above. + RegExpExpansionLimiter limiter(compiler, max); + if (limiter.ok_to_expand()) { + // Unroll the optional matches up to max. + RegExpNode* answer = on_success; + for (int i = 0; i < max; i++) { + ChoiceNode* alternation = new (zone) ChoiceNode(2, zone); + if (is_greedy) { + alternation->AddAlternative( + GuardedAlternative(body->ToNode(compiler, answer))); + alternation->AddAlternative(GuardedAlternative(on_success)); + } else { + alternation->AddAlternative(GuardedAlternative(on_success)); + alternation->AddAlternative( + GuardedAlternative(body->ToNode(compiler, answer))); + } + answer = alternation; + if (not_at_start && !compiler->read_backward()) { + alternation->set_not_at_start(); + } + } + return answer; + } + } + } + bool has_min = min > 0; + bool has_max = max < RegExpTree::kInfinity; + bool needs_counter = has_min || has_max; + int reg_ctr = needs_counter ? compiler->AllocateRegister() + : RegExpCompiler::kNoRegister; + LoopChoiceNode* center = new (zone) LoopChoiceNode( + body->min_match() == 0, compiler->read_backward(), min, zone); + if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); + RegExpNode* loop_return = + needs_counter ? static_cast<RegExpNode*>( + ActionNode::IncrementRegister(reg_ctr, center)) + : static_cast<RegExpNode*>(center); + if (body_can_be_empty) { + // If the body can be empty we need to check if it was and then + // backtrack. + loop_return = + ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return); + } + RegExpNode* body_node = body->ToNode(compiler, loop_return); + if (body_can_be_empty) { + // If the body can be empty we need to store the start position + // so we can bail out if it was empty. + body_node = ActionNode::StorePosition(body_start_reg, false, body_node); + } + if (needs_capture_clearing) { + // Before entering the body of this loop we need to clear captures. + body_node = ActionNode::ClearCaptures(capture_registers, body_node); + } + GuardedAlternative body_alt(body_node); + if (has_max) { + Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max); + body_alt.AddGuard(body_guard, zone); + } + GuardedAlternative rest_alt(on_success); + if (has_min) { + Guard* rest_guard = new (compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); + rest_alt.AddGuard(rest_guard, zone); + } + if (is_greedy) { + center->AddLoopAlternative(body_alt); + center->AddContinueAlternative(rest_alt); + } else { + center->AddContinueAlternative(rest_alt); + center->AddLoopAlternative(body_alt); + } + if (needs_counter) { + return ActionNode::SetRegisterForLoop(reg_ctr, 0, center); + } else { + return center; + } +} + +} // namespace internal +} // namespace v8 |