From 122e1ee6cd24b5de80a1702313db732c8961202a Mon Sep 17 00:00:00 2001
From: wolfbeast
Date: Mon, 18 Nov 2019 16:50:40 +0100
Subject: Issue #1284 - Implement /s (dotAll) for regular expressions, v2.

Resolves #1284.
---
 js/src/builtin/RegExp.cpp           | 33 +++++++++++++++-
 js/src/builtin/RegExp.h             |  2 +
 js/src/builtin/RegExp.js            |  5 +++
 js/src/builtin/SelfHostingDefines.h |  1 +
 js/src/frontend/TokenStream.cpp     |  2 +
 js/src/irregexp/RegExpParser.cpp    | 75 +++++++++++++++++++++++++++++--------
 js/src/irregexp/RegExpParser.h      |  7 ++--
 js/src/jsapi.h                      |  1 +
 js/src/vm/CommonPropertyNames.h     |  1 +
 js/src/vm/RegExpObject.cpp          |  5 ++-
 js/src/vm/RegExpObject.h            |  8 +++-
 11 files changed, 117 insertions(+), 23 deletions(-)

diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp
index 7cf20d23c..b7853d533 100644
--- a/js/src/builtin/RegExp.cpp
+++ b/js/src/builtin/RegExp.cpp
@@ -178,7 +178,7 @@ CheckPatternSyntax(JSContext* cx, HandleAtom pattern, RegExpFlag flags)
     CompileOptions options(cx);
     frontend::TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
     return irregexp::ParsePatternSyntax(dummyTokenStream, cx->tempLifoAlloc(), pattern,
-                                        flags & UnicodeFlag);
+                                        flags & UnicodeFlag, flags & DotAllFlag);
 }
 
 enum RegExpSharedUse {
@@ -664,6 +664,29 @@ js::regexp_multiline(JSContext* cx, unsigned argc, JS::Value* vp)
     return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_multiline_impl>(cx, args);
 }
 
+// ES2018 get RegExp.prototype.dotAll: /s flag state; undefined when |this| is not a RegExpObject.
+MOZ_ALWAYS_INLINE bool
+regexp_dotall_impl(JSContext* cx, const CallArgs& args)
+{
+    MOZ_ASSERT(IsRegExpInstanceOrPrototype(args.thisv()));
+
+    if (!IsRegExpObject(args.thisv())) {
+        args.rval().setUndefined();
+        return true;
+    }
+
+    Rooted<RegExpObject*> reObj(cx, &args.thisv().toObject().as<RegExpObject>());
+    args.rval().setBoolean(reObj->dotall());
+    return true;
+}
+
+bool
+js::regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp)
+{
+    CallArgs args = CallArgsFromVp(argc, vp);
+    return CallNonGenericMethod<IsRegExpInstanceOrPrototype, regexp_dotall_impl>(cx, args);
+}
+
 // ES 2017 draft rev32 21.2.5.10.
 MOZ_ALWAYS_INLINE bool
 regexp_source_impl(JSContext* cx, const CallArgs& args)
@@ -759,6 +782,7 @@ const JSPropertySpec js::regexp_properties[] = {
     JS_PSG("source", regexp_source, 0),
     JS_PSG("sticky", regexp_sticky, 0),
     JS_PSG("unicode", regexp_unicode, 0),
+    JS_PSG("dotAll", regexp_dotall, 0),
     JS_PS_END
 };
 
@@ -1642,6 +1666,13 @@ js::RegExpPrototypeOptimizableRaw(JSContext* cx, JSObject* proto)
     if (unicodeGetter != regexp_unicode)
         return false;
 
+    JSNative dotAllGetter;
+    if (!GetOwnNativeGetterPure(cx, proto, NameToId(cx->names().dotAll), &dotAllGetter))
+        return false;
+
+    if (dotAllGetter != regexp_dotall)
+        return false;
+
     // Check if @@match, @@search, and exec are own data properties,
     // those values should be tested in selfhosted JS.
     bool has = false;
diff --git a/js/src/builtin/RegExp.h b/js/src/builtin/RegExp.h
index 4e0ff6948..f808f5146 100644
--- a/js/src/builtin/RegExp.h
+++ b/js/src/builtin/RegExp.h
@@ -153,6 +153,8 @@ extern MOZ_MUST_USE bool
 regexp_sticky(JSContext* cx, unsigned argc, JS::Value* vp);
 extern MOZ_MUST_USE bool
 regexp_unicode(JSContext* cx, unsigned argc, JS::Value* vp);
+extern MOZ_MUST_USE bool
+regexp_dotall(JSContext* cx, unsigned argc, JS::Value* vp);
 
 } /* namespace js */
 
diff --git a/js/src/builtin/RegExp.js b/js/src/builtin/RegExp.js
index 0b849292c..1a2276594 100644
--- a/js/src/builtin/RegExp.js
+++ b/js/src/builtin/RegExp.js
@@ -3,6 +3,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 // ES6 draft rev34 (2015/02/20) 21.2.5.3 get RegExp.prototype.flags
+// Updated for ES2018 /s (dotAll)
 function RegExpFlagsGetter() {
     // Steps 1-2.
     var R = this;
@@ -31,6 +32,10 @@ function RegExpFlagsGetter() {
     // Steps 16-18.
     if (R.sticky)
         result += "y";
+
+    // ES2018 dotAll. NOTE(review): spec flag order is "gimsuy", so "s" belongs before "u" and "y".
+    if (R.dotAll)
+        result += "s";
 
     // Step 19.
     return result;
diff --git a/js/src/builtin/SelfHostingDefines.h b/js/src/builtin/SelfHostingDefines.h
index d676270a1..6512810ca 100644
--- a/js/src/builtin/SelfHostingDefines.h
+++ b/js/src/builtin/SelfHostingDefines.h
@@ -90,6 +90,7 @@
 #define REGEXP_MULTILINE_FLAG   0x04
 #define REGEXP_STICKY_FLAG      0x08
 #define REGEXP_UNICODE_FLAG     0x10
+#define REGEXP_DOTALL_FLAG      0x20
 
 #define MODULE_OBJECT_ENVIRONMENT_SLOT 2
 
diff --git a/js/src/frontend/TokenStream.cpp b/js/src/frontend/TokenStream.cpp
index b8623d545..e07f8df8a 100644
--- a/js/src/frontend/TokenStream.cpp
+++ b/js/src/frontend/TokenStream.cpp
@@ -1843,6 +1843,8 @@ TokenStream::getTokenInternal(TokenKind* ttp, Modifier modifier)
                     reflags = RegExpFlag(reflags | StickyFlag);
                 else if (c == 'u' && !(reflags & UnicodeFlag))
                     reflags = RegExpFlag(reflags | UnicodeFlag);
+                else if (c == 's' && !(reflags & DotAllFlag))
+                    reflags = RegExpFlag(reflags | DotAllFlag);
                 else
                     break;
                 getChar();
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 9ef9fe3e2..1ad044e8e 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -222,7 +222,7 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max,
 template <typename CharT>
 RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
                                   const CharT* chars, const CharT* end, bool multiline_mode,
-                                  bool unicode, bool ignore_case)
+                                  bool unicode, bool ignore_case, bool dotall)
   : ts(ts),
     alloc(alloc),
     captures_(nullptr),
@@ -235,6 +235,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
     multiline_(multiline_mode),
     unicode_(unicode),
     ignore_case_(ignore_case),
+    dotall_(dotall),
     simple_(false),
     contains_anchor_(false),
     is_scanned_for_captures_(false)
@@ -1384,7 +1385,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
 {
     RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
 
-    // everything except \x0a, \x0d, \u2028 and \u2029
+    // Everything except \x0a, \x0d, \u2028 and \u2029
 
     CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
     ranges->append(CharacterRange::Range(0x0, 0x09));
@@ -1414,6 +1415,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
     return builder->ToRegExp();
 }
 
+static inline RegExpTree*
+UnicodeDotAllAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // Full non-surrogate range (no line-terminator holes, because /s was specified)
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 RegExpTree*
 UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
 {
@@ -1541,13 +1574,25 @@ RegExpParser<CharT>::ParseDisjunction()
           }
           case '.': {
             Advance();
-            // everything except \x0a, \x0d, \u2028 and \u2029
+
             if (unicode_) {
-                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                if (dotall_) {
+                    // /s (dotAll): everything, line terminators included
+                    builder->AddAtom(UnicodeDotAllAtom(alloc));
+                } else {
+                    // Everything except \x0a, \x0d, \u2028 and \u2029
+                    builder->AddAtom(UnicodeEverythingAtom(alloc));
+                }
                 break;
             }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
-            CharacterRange::AddClassEscape(alloc, '.', ranges);
+            if (dotall_) {
+                // /s (dotAll): everything, line terminators included
+                CharacterRange::AddClassEscape(alloc, '*', ranges);
+            } else {
+                // Everything except \x0a, \x0d, \u2028 and \u2029
+                CharacterRange::AddClassEscape(alloc, '.', ranges);
+            }
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
             builder->AddAtom(atom);
             break;
@@ -1880,7 +1925,7 @@ template <typename CharT>
 static bool
 ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
              bool multiline, bool match_only, bool unicode, bool ignore_case,
-             bool global, bool sticky, RegExpCompileData* data)
+             bool global, bool sticky, bool dotall, RegExpCompileData* data)
 {
     if (match_only) {
         // Try to strip a leading '.*' from the RegExp, but only if it is not
@@ -1907,7 +1952,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
         }
     }
 
-    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case, dotall);
     data->tree = parser.ParsePattern();
     if (!data->tree)
         return false;
@@ -1921,33 +1966,33 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
 bool
 irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
                        bool multiline, bool match_only, bool unicode, bool ignore_case,
-                       bool global, bool sticky, RegExpCompileData* data)
+                       bool global, bool sticky, bool dotall, RegExpCompileData* data)
 {
     JS::AutoCheckCannotGC nogc;
     return str->hasLatin1Chars()
            ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
-                            multiline, match_only, unicode, ignore_case, global, sticky, data)
+                            multiline, match_only, unicode, ignore_case, global, sticky, dotall, data)
            : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
-                            multiline, match_only, unicode, ignore_case, global, sticky, data);
+                            multiline, match_only, unicode, ignore_case, global, sticky, dotall, data);
 }
 
 template <typename CharT>
 static bool
 ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
-                   bool unicode)
+                   bool unicode, bool dotall)
 {
     LifoAllocScope scope(&alloc);
 
-    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
+    RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, /* ignore_case = */ false, dotall);
     return parser.ParsePattern() != nullptr;
 }
 
 bool
 irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
-                             bool unicode)
+                             bool unicode, bool dotall)
 {
     JS::AutoCheckCannotGC nogc;
     return str->hasLatin1Chars()
-           ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
-           : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
+           ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode, dotall)
+           : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode, dotall);
 }
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
index 2f02625b5..ee57f0436 100644
--- a/js/src/irregexp/RegExpParser.h
+++ b/js/src/irregexp/RegExpParser.h
@@ -44,11 +44,11 @@ namespace irregexp {
 bool
 ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
              bool multiline, bool match_only, bool unicode, bool ignore_case,
-             bool global, bool sticky, RegExpCompileData* data);
+             bool global, bool sticky, bool dotall, RegExpCompileData* data);
 
 bool
 ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
-                   bool unicode);
+                   bool unicode, bool dotall);
 
 // A BufferedVector is an automatically growing list, just like (and backed
 // by) a Vector, that is optimized for the case of adding and removing
@@ -178,7 +178,7 @@ class RegExpParser
   public:
     RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
                  const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
-                 bool ignore_case);
+                 bool ignore_case, bool dotall);
 
     RegExpTree* ParsePattern();
     RegExpTree* ParseDisjunction();
@@ -313,6 +313,7 @@ class RegExpParser
     bool multiline_;
     bool unicode_;
     bool ignore_case_;
+    bool dotall_;
     bool simple_;
     bool contains_anchor_;
     bool is_scanned_for_captures_;
diff --git a/js/src/jsapi.h b/js/src/jsapi.h
index dc00c650d..1a69b1513 100644
--- a/js/src/jsapi.h
+++ b/js/src/jsapi.h
@@ -5704,6 +5704,7 @@ JS_ObjectIsDate(JSContext* cx, JS::HandleObject obj, bool* isDate);
 #define JSREG_MULTILINE 0x04u   /* treat ^ and $ as begin and end of line */
 #define JSREG_STICKY    0x08u   /* only match starting at lastIndex */
 #define JSREG_UNICODE   0x10u   /* unicode */
+#define JSREG_DOTALL    0x20u   /* match . to everything including newlines */
 
 extern JS_PUBLIC_API(JSObject*)
 JS_NewRegExpObject(JSContext* cx, const char* bytes, size_t length, unsigned flags);
diff --git a/js/src/vm/CommonPropertyNames.h b/js/src/vm/CommonPropertyNames.h
index fd1c9f5e6..4ae49d577 100644
--- a/js/src/vm/CommonPropertyNames.h
+++ b/js/src/vm/CommonPropertyNames.h
@@ -97,6 +97,7 @@
     macro(displayURL, displayURL, "displayURL") \
     macro(do, do_, "do") \
     macro(done, done, "done") \
+    macro(dotAll, dotAll, "dotAll") \
     macro(dotGenerator, dotGenerator, ".generator") \
     macro(dotThis, dotThis, ".this") \
     macro(each, each, "each") \
diff --git a/js/src/vm/RegExpObject.cpp b/js/src/vm/RegExpObject.cpp
index ef97ed816..cd0b54c9d 100644
--- a/js/src/vm/RegExpObject.cpp
+++ b/js/src/vm/RegExpObject.cpp
@@ -49,6 +49,7 @@ JS_STATIC_ASSERT(GlobalFlag == JSREG_GLOB);
 JS_STATIC_ASSERT(MultilineFlag == JSREG_MULTILINE);
 JS_STATIC_ASSERT(StickyFlag == JSREG_STICKY);
 JS_STATIC_ASSERT(UnicodeFlag == JSREG_UNICODE);
+JS_STATIC_ASSERT(DotAllFlag == JSREG_DOTALL);
 
 RegExpObject*
 js::RegExpAlloc(ExclusiveContext* cx, HandleObject proto /* = nullptr */)
@@ -267,7 +268,7 @@ RegExpObject::create(ExclusiveContext* cx, HandleAtom source, RegExpFlag flags,
         tokenStream = dummyTokenStream.ptr();
     }
 
-    if (!irregexp::ParsePatternSyntax(*tokenStream, alloc, source, flags & UnicodeFlag))
+    if (!irregexp::ParsePatternSyntax(*tokenStream, alloc, source, flags & UnicodeFlag, flags & DotAllFlag))
         return nullptr;
 
     Rooted<RegExpObject*> regexp(cx, RegExpAlloc(cx));
@@ -1017,7 +1018,7 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu
     irregexp::RegExpCompileData data;
     if (!irregexp::ParsePattern(dummyTokenStream, cx->tempLifoAlloc(), pattern,
                                 multiline(), mode == MatchOnly, unicode(), ignoreCase(),
-                                global(), sticky(), &data))
+                                global(), sticky(), dotall(), &data))
     {
         return false;
     }
diff --git a/js/src/vm/RegExpObject.h b/js/src/vm/RegExpObject.h
index f1ea101ed..95c64fa67 100644
--- a/js/src/vm/RegExpObject.h
+++ b/js/src/vm/RegExpObject.h
@@ -53,16 +53,18 @@ enum RegExpFlag
     MultilineFlag   = 0x04,
     StickyFlag      = 0x08,
     UnicodeFlag     = 0x10,
+    DotAllFlag      = 0x20,
 
     NoFlags         = 0x00,
-    AllFlags        = 0x1f
+    AllFlags        = 0x3f
 };
 
 static_assert(IgnoreCaseFlag == REGEXP_IGNORECASE_FLAG &&
               GlobalFlag == REGEXP_GLOBAL_FLAG &&
               MultilineFlag == REGEXP_MULTILINE_FLAG &&
               StickyFlag == REGEXP_STICKY_FLAG &&
-              UnicodeFlag == REGEXP_UNICODE_FLAG,
+              UnicodeFlag == REGEXP_UNICODE_FLAG &&
+              DotAllFlag == REGEXP_DOTALL_FLAG,
               "Flag values should be in sync with self-hosted JS");
 
 enum RegExpRunStatus
@@ -193,6 +195,7 @@ class RegExpShared
     bool multiline() const      { return flags & MultilineFlag; }
     bool sticky() const         { return flags & StickyFlag; }
     bool unicode() const        { return flags & UnicodeFlag; }
+    bool dotall() const         { return flags & DotAllFlag; }
 
     bool isCompiled(CompilationMode mode, bool latin1,
                     ForceByteCodeEnum force = DontForceByteCode) const {
@@ -480,6 +483,7 @@ class RegExpObject : public NativeObject
     bool multiline() const  { return getFlags() & MultilineFlag; }
     bool sticky() const     { return getFlags() & StickyFlag; }
     bool unicode() const    { return getFlags() & UnicodeFlag; }
+    bool dotall() const     { return getFlags() & DotAllFlag; }
 
     static bool isOriginalFlagGetter(JSNative native, RegExpFlag* mask);
 
-- 
cgit v1.2.3