From f31b04a303607cd82757e7c4f60bb536658c8a30 Mon Sep 17 00:00:00 2001
From: wolfbeast
Date: Mon, 18 Nov 2019 12:20:44 +0100
Subject: Issue #1284 - Implement /s (dotAll) for regular expressions.

Add the ES2018 `s` (dotAll) regular expression flag:

 * TokenStream accepts `s` as a regexp literal flag and sets DotAllFlag.
 * RegExpFlag gains DotAllFlag (0x20), mirrored for self-hosted code as
   REGEXP_DOTALL_FLAG; AllFlags grows from 0x1f to 0x3f.
 * RegExp.prototype gets a `dotAll` accessor and the self-hosted flags
   getter reports "s" when it is set.
 * The irregexp parser builds `.` as a match-everything atom when dotall_
   is set, in both the unicode and the non-unicode case.

Resolves #1284.
---
Review notes (not part of the commit message):

 * The accessor is spelled `dotAll`, as in ES2018; the C++ names keep the
   lower-case `dotall` spelling.
 * RegExpFlagsGetter appends "s" after "y", while ES2018 orders the flags
   string "gimsuy". Moving the check needs lines outside this hunk's context.
 * No hunk in this patch assigns RegExpParser::dotall_, so it is given a
   `false` default here; the flag still has to be passed in to the parser
   before /s changes how `.` is compiled.

 js/src/builtin/RegExp.cpp           | 24 +++++++++++++++++
 js/src/builtin/RegExp.h             |  2 ++
 js/src/builtin/RegExp.js            |  5 ++++
 js/src/builtin/SelfHostingDefines.h |  1 +
 js/src/frontend/TokenStream.cpp     |  2 ++
 js/src/irregexp/RegExpParser.cpp    | 52 ++++++++++++++++++++++++++++++++++---
 js/src/irregexp/RegExpParser.h      |  1 +
 js/src/vm/RegExpObject.h            |  7 +++--
 8 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp
index 7cf20d23c..93a7f2b79 100644
--- a/js/src/builtin/RegExp.cpp
+++ b/js/src/builtin/RegExp.cpp
@@ -664,6 +664,29 @@ js::regexp_multiline(JSContext* cx, unsigned argc, JS::Value* vp)
     return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_multiline_impl>(cx, args);
 }
 
+// ES 2018 dotAll
+MOZ_ALWAYS_INLINE bool
+regexp_dotall_impl(JSContext* cx, const CallArgs& args)
+{
+    MOZ_ASSERT(IsRegExpInstanceOrPrototype(args.thisv()));
+
+    if (!IsRegExpObject(args.thisv())) {
+        args.rval().setUndefined();
+        return true;
+    }
+
+    Rooted<RegExpObject*> reObj(cx, &args.thisv().toObject().as<RegExpObject>());
+    args.rval().setBoolean(reObj->dotall());
+    return true;
+}
+
+bool
+js::regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp)
+{
+    CallArgs args = CallArgsFromVp(argc, vp);
+    return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_dotall_impl>(cx, args);
+}
+
 // ES 2017 draft rev32 21.2.5.10.
 MOZ_ALWAYS_INLINE bool
 regexp_source_impl(JSContext* cx, const CallArgs& args)
@@ -759,6 +782,7 @@ const JSPropertySpec js::regexp_properties[] = {
     JS_PSG("source", regexp_source, 0),
     JS_PSG("sticky", regexp_sticky, 0),
     JS_PSG("unicode", regexp_unicode, 0),
+    JS_PSG("dotAll", regexp_dotall, 0),
     JS_PS_END
 };
 
diff --git a/js/src/builtin/RegExp.h b/js/src/builtin/RegExp.h
index 4e0ff6948..f808f5146 100644
--- a/js/src/builtin/RegExp.h
+++ b/js/src/builtin/RegExp.h
@@ -153,6 +153,8 @@ extern MOZ_MUST_USE bool
 regexp_sticky(JSContext* cx, unsigned argc, JS::Value* vp);
 extern MOZ_MUST_USE bool
 regexp_unicode(JSContext* cx, unsigned argc, JS::Value* vp);
+extern MOZ_MUST_USE bool
+regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp);
 
 } /* namespace js */
 
diff --git a/js/src/builtin/RegExp.js b/js/src/builtin/RegExp.js
index 0b849292c..1a2276594 100644
--- a/js/src/builtin/RegExp.js
+++ b/js/src/builtin/RegExp.js
@@ -3,6 +3,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // ES6 draft rev34 (2015/02/20) 21.2.5.3 get RegExp.prototype.flags
+// Updated for ES2018 /s (dotAll)
 function RegExpFlagsGetter() {
     // Steps 1-2.
     var R = this;
@@ -31,6 +32,10 @@ function RegExpFlagsGetter() {
     // Steps 16-18.
     if (R.sticky)
         result += "y";
+
+    // ES2018
+    if (R.dotAll)
+        result += "s";
 
     // Step 19.
     return result;
diff --git a/js/src/builtin/SelfHostingDefines.h b/js/src/builtin/SelfHostingDefines.h
index d676270a1..6512810ca 100644
--- a/js/src/builtin/SelfHostingDefines.h
+++ b/js/src/builtin/SelfHostingDefines.h
@@ -90,6 +90,7 @@
 #define REGEXP_MULTILINE_FLAG   0x04
 #define REGEXP_STICKY_FLAG      0x08
 #define REGEXP_UNICODE_FLAG     0x10
+#define REGEXP_DOTALL_FLAG      0x20
 
 #define MODULE_OBJECT_ENVIRONMENT_SLOT 2
 
diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp
index b8623d545..e07f8df8a 100644
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -1843,6 +1843,8 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier)
                     reflags = RegExpFlag(reflags | StickyFlag);
                 else if (c == 'u' && !(reflags & UnicodeFlag))
                     reflags = RegExpFlag(reflags | UnicodeFlag);
+                else if (c == 's' && !(reflags & DotAllFlag))
+                    reflags = RegExpFlag(reflags | DotAllFlag);
                 else
                     break;
                 getChar();
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 9ef9fe3e2..28abdb0b4 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -1384,7 +1384,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
 {
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
 
-    // everything except \x0a, \x0d, \u2028 and \u2029
+    // Everything except \x0a, \x0d, \u2028 and \u2029
 
     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
     ranges->append(CharacterRange::Range(0x0, 0x09));
@@ -1414,6 +1414,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
     return builder->ToRegExp();
 }
 
+static inline RegExpTree*
+UnicodeDotAllAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // Full range excluding surrogates because /s was specified
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 RegExpTree*
 UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
 {
@@ -1541,13 +1573,25 @@ RegExpParser<CharT>::ParseDisjunction()
           }
           case '.': {
             Advance();
-            // everything except \x0a, \x0d, \u2028 and \u2029
+
             if (unicode_) {
-                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                if (dotall_) {
+                    // Everything
+                    builder->AddAtom(UnicodeDotAllAtom(alloc));
+                } else {
+                    // Everything except \x0a, \x0d, \u2028 and \u2029
+                    builder->AddAtom(UnicodeEverythingAtom(alloc));
+                }
                 break;
             }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
-            CharacterRange::AddClassEscape(alloc, '.', ranges);
+            if (dotall_) {
+                // Everything
+                CharacterRange::AddClassEscape(alloc, '*', ranges);
+            } else {
+                // Everything except \x0a, \x0d, \u2028 and \u2029
+                CharacterRange::AddClassEscape(alloc, '.', ranges);
+            }
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
             builder->AddAtom(atom);
             break;
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
index 2f02625b5..36c24cd7c 100644
--- a/js/src/irregexp/RegExpParser.h
+++ b/js/src/irregexp/RegExpParser.h
@@ -313,6 +313,7 @@ class RegExpParser
     bool multiline_;
     bool unicode_;
     bool ignore_case_;
+    bool dotall_ = false; // Defaulted: no constructor change in this patch sets it.
     bool simple_;
     bool contains_anchor_;
     bool is_scanned_for_captures_;
diff --git a/js/src/vm/RegExpObject.h b/js/src/vm/RegExpObject.h
index f1ea101ed..4548521df 100644
--- a/js/src/vm/RegExpObject.h
+++ b/js/src/vm/RegExpObject.h
@@ -53,16 +53,18 @@ enum RegExpFlag
     MultilineFlag   = 0x04,
     StickyFlag      = 0x08,
     UnicodeFlag     = 0x10,
+    DotAllFlag      = 0x20,
 
     NoFlags         = 0x00,
-    AllFlags        = 0x1f
+    AllFlags        = 0x3f
 };
 
 static_assert(IgnoreCaseFlag == REGEXP_IGNORECASE_FLAG &&
               GlobalFlag == REGEXP_GLOBAL_FLAG &&
               MultilineFlag == REGEXP_MULTILINE_FLAG &&
               StickyFlag == REGEXP_STICKY_FLAG &&
-              UnicodeFlag == REGEXP_UNICODE_FLAG,
+              UnicodeFlag == REGEXP_UNICODE_FLAG &&
+              DotAllFlag == REGEXP_DOTALL_FLAG,
               "Flag values should be in sync with self-hosted JS");
 
 enum RegExpRunStatus
@@ -480,6 +482,7 @@ class RegExpObject : public NativeObject
     bool multiline() const  { return getFlags() & MultilineFlag; }
     bool sticky() const     { return getFlags() & StickyFlag; }
     bool unicode() const    { return getFlags() & UnicodeFlag; }
+    bool dotall() const     { return getFlags() & DotAllFlag; }
 
     static bool isOriginalFlagGetter(JSNative native, RegExpFlag* mask);
 
-- 
cgit v1.2.3