From f31b04a303607cd82757e7c4f60bb536658c8a30 Mon Sep 17 00:00:00 2001
From: wolfbeast
Date: Mon, 18 Nov 2019 12:20:44 +0100
Subject: Issue #1284 - Implement /s (dotAll) for regular expressions.

Resolves #1284.
---
 js/src/irregexp/RegExpParser.cpp | 52 ++++++++++++++++++++++++++++++++++++----
 js/src/irregexp/RegExpParser.h   |  1 +
 2 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'js/src/irregexp')

diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 9ef9fe3e2..28abdb0b4 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -1384,7 +1384,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
 {
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
 
-    // everything except \x0a, \x0d, \u2028 and \u2029
+    // Everything except \x0a, \x0d, \u2028 and \u2029
 
     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
     ranges->append(CharacterRange::Range(0x0, 0x09));
@@ -1414,6 +1414,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
     return builder->ToRegExp();
 }
 
+static inline RegExpTree*
+UnicodeDotAllAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // With /s, match every non-surrogate code unit here; surrogates are handled below.
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 RegExpTree*
 UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
 {
@@ -1541,13 +1573,25 @@ RegExpParser::ParseDisjunction()
           }
           case '.': {
             Advance();
-            // everything except \x0a, \x0d, \u2028 and \u2029
+
             if (unicode_) {
-                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                if (dotall_) {
+                    // Everything, line terminators included (/s)
+                    builder->AddAtom(UnicodeDotAllAtom(alloc));
+                } else {
+                    // Everything except \x0a, \x0d, \u2028 and \u2029
+                    builder->AddAtom(UnicodeEverythingAtom(alloc));
+                }
                 break;
             }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
-            CharacterRange::AddClassEscape(alloc, '.', ranges);
+            if (dotall_) {
+                // Everything, line terminators included (/s)
+                CharacterRange::AddClassEscape(alloc, '*', ranges);
+            } else {
+                // Everything except \x0a, \x0d, \u2028 and \u2029
+                CharacterRange::AddClassEscape(alloc, '.', ranges);
+            }
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
             builder->AddAtom(atom);
             break;
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
index 2f02625b5..36c24cd7c 100644
--- a/js/src/irregexp/RegExpParser.h
+++ b/js/src/irregexp/RegExpParser.h
@@ -313,6 +313,7 @@ class RegExpParser
     bool multiline_;
     bool unicode_;
     bool ignore_case_;
+    bool dotall_; // NOTE(review): no initializer is added in these hunks; confirm the constructor sets it.
     bool simple_;
     bool contains_anchor_;
     bool is_scanned_for_captures_;
-- 
cgit v1.2.3