diff options
author | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-18 16:50:40 +0100 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-18 16:50:40 +0100 |
commit | 122e1ee6cd24b5de80a1702313db732c8961202a (patch) | |
tree | 4c79ac5ab8d11c15375c42867f9c47f575c5e54f /js/src/irregexp | |
parent | 62a72e3d281ea48e7b311a1c153a0e5ae7586da8 (diff) | |
download | UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.gz UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.lz UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.xz UXP-122e1ee6cd24b5de80a1702313db732c8961202a.zip |
Issue #1284 - Implement /s (dotAll) for regular expressions, v2.
Resolves #1284.
Diffstat (limited to 'js/src/irregexp')
-rw-r--r-- | js/src/irregexp/RegExpParser.cpp | 75 | ||||
-rw-r--r-- | js/src/irregexp/RegExpParser.h | 7 |
2 files changed, 64 insertions, 18 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 9ef9fe3e2..1ad044e8e 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -222,7 +222,7 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max,
 template <typename CharT>
 RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
                                   const CharT* chars, const CharT* end, bool multiline_mode,
-                                  bool unicode, bool ignore_case)
+                                  bool unicode, bool ignore_case, bool dotall)
   : ts(ts),
     alloc(alloc),
     captures_(nullptr),
@@ -235,6 +235,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
     multiline_(multiline_mode),
     unicode_(unicode),
     ignore_case_(ignore_case),
+    dotall_(dotall),
     simple_(false),
     contains_anchor_(false),
     is_scanned_for_captures_(false)
@@ -1384,7 +1385,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
 {
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
 
-    // everything except \x0a, \x0d, \u2028 and \u2029
+    // Everything except \x0a, \x0d, \u2028 and \u2029
 
     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
     ranges->append(CharacterRange::Range(0x0, 0x09));
@@ -1414,6 +1415,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
     return builder->ToRegExp();
 }
 
+static inline RegExpTree*
+UnicodeDotAllAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // Everything outside the surrogate range; with /s this includes line terminators
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+    // Lone lead surrogate: a lead that is not followed by a trail
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+    // Lone trail surrogate: a trail that is not preceded by a lead
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+    // Surrogate pair: a lead immediately followed by a trail
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 RegExpTree*
 UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
 {
@@ -1541,13 +1574,25 @@ RegExpParser<CharT>::ParseDisjunction()
           }
           case '.': {
             Advance();
-            // everything except \x0a, \x0d, \u2028 and \u2029
+
             if (unicode_) {
-                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                if (dotall_) {
+                    // Everything
+                    builder->AddAtom(UnicodeDotAllAtom(alloc));
+                } else {
+                    // Everything except \x0a, \x0d, \u2028 and \u2029
+                    builder->AddAtom(UnicodeEverythingAtom(alloc));
+                }
                 break;
             }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
-            CharacterRange::AddClassEscape(alloc, '.', ranges);
+            if (dotall_) {
+                // Everything
+                CharacterRange::AddClassEscape(alloc, '*', ranges);
+            } else {
+                // Everything except \x0a, \x0d, \u2028 and \u2029
+                CharacterRange::AddClassEscape(alloc, '.', ranges);
+            }
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
             builder->AddAtom(atom);
             break;
@@ -1880,7 +1925,7 @@ template <typename CharT>
 static bool
 ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
              bool multiline, bool match_only, bool unicode, bool ignore_case,
-             bool global, bool sticky, RegExpCompileData* data)
+             bool global, bool sticky, bool dotall, RegExpCompileData* data)
 {
     if (match_only) {
         // Try to strip a leading '.*' from the RegExp, but only if it is not
@@ -1907,7 +1952,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
         }
     }
 
-    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case, dotall);
     data->tree = parser.ParsePattern();
     if (!data->tree)
         return false;
@@ -1921,33 +1966,33 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
 bool
 irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
                        bool multiline, bool match_only, bool unicode, bool ignore_case,
-                       bool global, bool sticky, RegExpCompileData* data)
+                       bool global, bool sticky, bool dotall, RegExpCompileData* data)
 {
     JS::AutoCheckCannotGC nogc;
     return str->hasLatin1Chars()
            ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
-                            multiline, match_only, unicode, ignore_case, global, sticky, data)
+                            multiline, match_only, unicode, ignore_case, global, sticky, dotall, data)
            : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
-                            multiline, match_only, unicode, ignore_case, global, sticky, data);
+                            multiline, match_only, unicode, ignore_case, global, sticky, dotall, data);
 }
 
 template <typename CharT>
 static bool
 ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
-                   bool unicode)
+                   bool unicode, bool dotall)
 {
     LifoAllocScope scope(&alloc);
 
-    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false, dotall);
     return parser.ParsePattern() != nullptr;
 }
 
 bool
 irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
-                             bool unicode)
+                             bool unicode, bool dotall)
 {
     JS::AutoCheckCannotGC nogc;
     return str->hasLatin1Chars()
-           ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
-           : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
+           ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode, dotall)
+           : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode, dotall);
 }
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
index 2f02625b5..ee57f0436 100644
--- a/js/src/irregexp/RegExpParser.h
+++ b/js/src/irregexp/RegExpParser.h
@@ -44,11 +44,11 @@ namespace irregexp {
 bool
 ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
              bool multiline, bool match_only, bool unicode, bool ignore_case,
-             bool global, bool sticky, RegExpCompileData* data);
+             bool global, bool sticky, bool dotall, RegExpCompileData* data);
 
 bool
 ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
-                   bool unicode);
+                   bool unicode, bool dotall);
 
 // A BufferedVector is an automatically growing list, just like (and backed
 // by) a Vector, that is optimized for the case of adding and removing
@@ -178,7 +178,7 @@ class RegExpParser
   public:
     RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
                  const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
-                 bool ignore_case);
+                 bool ignore_case, bool dotall);
 
     RegExpTree* ParsePattern();
     RegExpTree* ParseDisjunction();
@@ -313,6 +313,7 @@ class RegExpParser
     bool multiline_;
     bool unicode_;
     bool ignore_case_;
+    bool dotall_;
     bool simple_;
     bool contains_anchor_;
     bool is_scanned_for_captures_;