summaryrefslogtreecommitdiffstats
path: root/js/src/irregexp/RegExpParser.cpp
diff options
context:
space:
mode:
author wolfbeast <mcwerewolf@wolfbeast.com> 2019-11-18 16:50:40 +0100
committer wolfbeast <mcwerewolf@wolfbeast.com> 2019-11-18 16:50:40 +0100
commit 122e1ee6cd24b5de80a1702313db732c8961202a (patch)
tree 4c79ac5ab8d11c15375c42867f9c47f575c5e54f /js/src/irregexp/RegExpParser.cpp
parent 62a72e3d281ea48e7b311a1c153a0e5ae7586da8 (diff)
downloadUXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar
UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.gz
UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.lz
UXP-122e1ee6cd24b5de80a1702313db732c8961202a.tar.xz
UXP-122e1ee6cd24b5de80a1702313db732c8961202a.zip
Issue #1284 - Implement /s (dotAll) for regular expressions, v2.
Resolves #1284.
Diffstat (limited to 'js/src/irregexp/RegExpParser.cpp')
-rw-r--r-- js/src/irregexp/RegExpParser.cpp 75
1 files changed, 60 insertions, 15 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 9ef9fe3e2..1ad044e8e 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -222,7 +222,7 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max,
template <typename CharT>
RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
const CharT* chars, const CharT* end, bool multiline_mode,
- bool unicode, bool ignore_case)
+ bool unicode, bool ignore_case, bool dotall)
: ts(ts),
alloc(alloc),
captures_(nullptr),
@@ -235,6 +235,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
multiline_(multiline_mode),
unicode_(unicode),
ignore_case_(ignore_case),
+ dotall_(dotall),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false)
@@ -1384,7 +1385,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
{
RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
- // everything except \x0a, \x0d, \u2028 and \u2029
+ // Everything except \x0a, \x0d, \u2028 and \u2029
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
ranges->append(CharacterRange::Range(0x0, 0x09));
@@ -1414,6 +1415,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
return builder->ToRegExp();
}
+static inline RegExpTree*
+UnicodeDotAllAtom(LifoAlloc* alloc)
+{
+ RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+ // Atom used for '.' when both the unicode and dotAll (/s) flags are set; built as four alternatives.
+ // 1) Any value from 0 to unicode::UTF16Max that lies outside the lead..trail surrogate range.
+ CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+ ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1));
+ ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+ builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+ builder->NewAlternative();
+ // 2) A lead surrogate followed by a negative lookahead for a trail surrogate (lone lead).
+ builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+ builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+ unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+ // 3) A trail surrogate guarded by the NOT_AFTER_LEAD_SURROGATE assertion (lone trail).
+ builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+ RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+ builder->NewAlternative();
+ // 4) A lead surrogate immediately followed by a trail surrogate (a surrogate pair).
+ builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+ builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+ return builder->ToRegExp();
+}
+
RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
{
@@ -1541,13 +1574,25 @@ RegExpParser<CharT>::ParseDisjunction()
}
case '.': {
Advance();
- // everything except \x0a, \x0d, \u2028 and \u2029
+
if (unicode_) {
- builder->AddAtom(UnicodeEverythingAtom(alloc));
+ if (dotall_) {
+ // Everything
+ builder->AddAtom(UnicodeDotAllAtom(alloc));
+ } else {
+ // Everything except \x0a, \x0d, \u2028 and \u2029
+ builder->AddAtom(UnicodeEverythingAtom(alloc));
+ }
break;
}
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
- CharacterRange::AddClassEscape(alloc, '.', ranges);
+ if (dotall_) {
+ // Everything
+ CharacterRange::AddClassEscape(alloc, '*', ranges);
+ } else {
+ // Everything except \x0a, \x0d, \u2028 and \u2029
+ CharacterRange::AddClassEscape(alloc, '.', ranges);
+ }
RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
builder->AddAtom(atom);
break;
@@ -1880,7 +1925,7 @@ template <typename CharT>
static bool
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
bool multiline, bool match_only, bool unicode, bool ignore_case,
- bool global, bool sticky, RegExpCompileData* data)
+ bool global, bool sticky, bool dotall, RegExpCompileData* data)
{
if (match_only) {
// Try to strip a leading '.*' from the RegExp, but only if it is not
@@ -1907,7 +1952,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
}
}
- RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
+ RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case, dotall);
data->tree = parser.ParsePattern();
if (!data->tree)
return false;
@@ -1921,33 +1966,33 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
bool
irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool multiline, bool match_only, bool unicode, bool ignore_case,
- bool global, bool sticky, RegExpCompileData* data)
+ bool global, bool sticky, bool dotall, RegExpCompileData* data)
{
JS::AutoCheckCannotGC nogc;
return str->hasLatin1Chars()
? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
- multiline, match_only, unicode, ignore_case, global, sticky, data)
+ multiline, match_only, unicode, ignore_case, global, sticky, dotall, data)
: ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
- multiline, match_only, unicode, ignore_case, global, sticky, data);
+ multiline, match_only, unicode, ignore_case, global, sticky, dotall, data);
}
template <typename CharT>
static bool
ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
- bool unicode)
+ bool unicode, bool dotall)
{
LifoAllocScope scope(&alloc);
- RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
+ RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, /* multiline_mode = */ false, unicode, /* ignore_case = */ false, dotall);
return parser.ParsePattern() != nullptr;
}
bool
irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
- bool unicode)
+ bool unicode, bool dotall)
{
JS::AutoCheckCannotGC nogc;
return str->hasLatin1Chars()
- ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode)
- : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode);
+ ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode, dotall)
+ : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode, dotall);
}