diff options
Diffstat (limited to 'js/src/irregexp/RegExpParser.cpp')
-rw-r--r-- | js/src/irregexp/RegExpParser.cpp | 205 |
1 files changed, 149 insertions, 56 deletions
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 8bd88047a..1ad044e8e 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -222,11 +222,12 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max, template <typename CharT> RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, const CharT* chars, const CharT* end, bool multiline_mode, - bool unicode, bool ignore_case) + bool unicode, bool ignore_case, bool dotall) : ts(ts), alloc(alloc), captures_(nullptr), next_pos_(chars), + captures_started_(0), end_(end), current_(kEndMarker), capture_count_(0), @@ -234,6 +235,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, multiline_(multiline_mode), unicode_(unicode), ignore_case_(ignore_case), + dotall_(dotall), simple_(false), contains_anchor_(false), is_scanned_for_captures_(false) @@ -418,7 +420,8 @@ RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) static inline RegExpTree* NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) { - return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0); + return alloc->newInfallible<RegExpLookaround>(RangeAtom(alloc, from, to), false, + 0, 0, RegExpLookaround::LOOKAHEAD); } static bool @@ -1213,6 +1216,38 @@ RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out) return true; } +template <typename CharT> +RegExpCapture* +RegExpParser<CharT>::GetCapture(int index) { + // The index for the capture groups are one-based. Its index in the list is + // zero-based. + int known_captures = + is_scanned_for_captures_ ? capture_count_ : captures_started_; + MOZ_ASSERT(index <= known_captures); + if (captures_ == NULL) { + captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc); + } + while ((int)captures_->length() < known_captures) { + RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(nullptr, captures_->length() + 1); + captures_->append(capture); + } + return (*captures_)[index - 1]; +} + + +template <typename CharT> +bool +RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) { + for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + // Return true if we found the matching capture index. + if (index == s->capture_index()) return true; + // Abort if index is larger than what has been parsed up till this state. + if (index > s->capture_index()) return false; + } + return false; +} + // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1350,7 +1385,7 @@ UnicodeEverythingAtom(LifoAlloc* alloc) { RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); - // everything except \x0a, \x0d, \u2028 and \u2029 + // Everything except \x0a, \x0d, \u2028 and \u2029 CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); ranges->append(CharacterRange::Range(0x0, 0x09)); @@ -1380,6 +1415,38 @@ UnicodeEverythingAtom(LifoAlloc* alloc) return builder->ToRegExp(); } +static inline RegExpTree* +UnicodeDotAllAtom(LifoAlloc* alloc) +{ + RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc); + + // Full range excluding surrogates because /s was specified + + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + ranges->append(CharacterRange::Range(0x0, unicode::LeadSurrogateMin - 1)); + ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max)); + builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false)); + + builder->NewAlternative(); + + builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); + builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddAssertion(alloc->newInfallible<RegExpAssertion>( + RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); + + builder->NewAlternative(); + + builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax)); + builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax)); + + return builder->ToRegExp(); +} + RegExpTree* UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case) { @@ -1423,24 +1490,24 @@ RegExpTree* RegExpParser<CharT>::ParseDisjunction() { // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(alloc, nullptr, INITIAL, 0); - RegExpParserState* stored_state = &initial_state; + RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0); + RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); while (true) { switch (current()) { case kEndMarker: - if (stored_state->IsSubexpression()) { + if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. return ReportError(JSMSG_MISSING_PAREN); } - MOZ_ASSERT(INITIAL == stored_state->group_type()); + MOZ_ASSERT(INITIAL == state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { - if (!stored_state->IsSubexpression()) + if (!state->IsSubexpression()) return ReportError(JSMSG_UNMATCHED_RIGHT_PAREN); - MOZ_ASSERT(INITIAL != stored_state->group_type()); + MOZ_ASSERT(INITIAL != state->group_type()); Advance(); // End disjunction parsing and convert builder content to new single @@ -1449,29 +1516,30 @@ RegExpParser<CharT>::ParseDisjunction() int end_capture_index = captures_started(); - int capture_index = stored_state->capture_index(); - SubexpressionType group_type = stored_state->group_type(); - - // Restore previous state. - stored_state = stored_state->previous_state(); - builder = stored_state->builder(); + int capture_index = state->capture_index(); + SubexpressionType group_type = state->group_type(); // Build result of subexpression. if (group_type == CAPTURE) { - RegExpCapture* capture = alloc->newInfallible<RegExpCapture>(body, capture_index); - (*captures_)[capture_index - 1] = capture; + RegExpCapture* capture = GetCapture(capture_index); + capture->set_body(body); body = capture; } else if (group_type != GROUPING) { - MOZ_ASSERT(group_type == POSITIVE_LOOKAHEAD || - group_type == NEGATIVE_LOOKAHEAD); - bool is_positive = (group_type == POSITIVE_LOOKAHEAD); - body = alloc->newInfallible<RegExpLookahead>(body, + MOZ_ASSERT(group_type == POSITIVE_LOOKAROUND || + group_type == NEGATIVE_LOOKAROUND); + bool is_positive = (group_type == POSITIVE_LOOKAROUND); + body = alloc->newInfallible<RegExpLookaround>(body, is_positive, end_capture_index - capture_index, - capture_index); + capture_index, + state->lookaround_type()); } + + // Restore previous state. + state = state->previous_state(); + builder = state->builder(); builder->AddAtom(body); - if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) + if (unicode_ && (group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND)) continue; // For compatability with JSC and ES3, we allow quantifiers after // lookaheads, and break in all cases. @@ -1506,19 +1574,32 @@ RegExpParser<CharT>::ParseDisjunction() } case '.': { Advance(); - // everything except \x0a, \x0d, \u2028 and \u2029 + if (unicode_) { - builder->AddAtom(UnicodeEverythingAtom(alloc)); + if (dotall_) { + // Everything + builder->AddAtom(UnicodeDotAllAtom(alloc)); + } else { + // Everything except \x0a, \x0d, \u2028 and \u2029 + builder->AddAtom(UnicodeEverythingAtom(alloc)); + } break; } CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); - CharacterRange::AddClassEscape(alloc, '.', ranges); + if (dotall_) { + // Everything + CharacterRange::AddClassEscape(alloc, '*', ranges); + } else { + // Everything except \x0a, \x0d, \u2028 and \u2029 + CharacterRange::AddClassEscape(alloc, '.', ranges); + } RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false); builder->AddAtom(atom); break; } case '(': { SubexpressionType subexpr_type = CAPTURE; + RegExpLookaround::Type lookaround_type = state->lookaround_type(); Advance(); if (current() == '?') { switch (Next()) { @@ -1526,26 +1607,39 @@ RegExpParser<CharT>::ParseDisjunction() subexpr_type = GROUPING; break; case '=': - subexpr_type = POSITIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = POSITIVE_LOOKAROUND; break; case '!': - subexpr_type = NEGATIVE_LOOKAHEAD; + lookaround_type = RegExpLookaround::LOOKAHEAD; + subexpr_type = NEGATIVE_LOOKAROUND; break; + case '<': + Advance(); + lookaround_type = RegExpLookaround::LOOKBEHIND; + if (Next() == '=') { + subexpr_type = POSITIVE_LOOKAROUND; + break; + } else if (Next() == '!') { + subexpr_type = NEGATIVE_LOOKAROUND; + break; + } + // We didn't get a positive or negative after '<'. + // That's an error. + return ReportError(JSMSG_INVALID_GROUP); default: return ReportError(JSMSG_INVALID_GROUP); } Advance(2); } else { - if (captures_ == nullptr) - captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc); if (captures_started() >= kMaxCaptures) return ReportError(JSMSG_TOO_MANY_PARENS); - captures_->append((RegExpCapture*) nullptr); + captures_started_++; } // Store current state and begin new disjunction parsing. - stored_state = alloc->newInfallible<RegExpParserState>(alloc, stored_state, subexpr_type, - captures_started()); - builder = stored_state->builder(); + state = alloc->newInfallible<RegExpParserState>(alloc, state, subexpr_type, + lookaround_type, captures_started_); + builder = state->builder(); continue; } case '[': { @@ -1600,19 +1694,18 @@ RegExpParser<CharT>::ParseDisjunction() case '7': case '8': case '9': { int index = 0; if (ParseBackReferenceIndex(&index)) { - RegExpCapture* capture = nullptr; - if (captures_ != nullptr && index <= (int) captures_->length()) { - capture = (*captures_)[index - 1]; - } - if (capture == nullptr) { - builder->AddEmpty(); - break; + if (state->IsInsideCaptureGroup(index)) { + // The backreference is inside the capture group it refers to. + // Nothing can possibly have been captured yet. + builder->AddEmpty(); + } else { + RegExpCapture* capture = GetCapture(index); + RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture); + if (unicode_) + builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); + else + builder->AddAtom(atom); } - RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture); - if (unicode_) - builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom)); - else - builder->AddAtom(atom); break; } if (unicode_) @@ -1832,7 +1925,7 @@ template <typename CharT> static bool ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, bool multiline, bool match_only, bool unicode, bool ignore_case, - bool global, bool sticky, RegExpCompileData* data) + bool global, bool sticky, bool dotall, RegExpCompileData* data) { if (match_only) { // Try to strip a leading '.*' from the RegExp, but only if it is not @@ -1859,7 +1952,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si } } - RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case); + RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case, dotall); data->tree = parser.ParsePattern(); if (!data->tree) return false; @@ -1873,33 +1966,33 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si bool irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, bool multiline, bool match_only, bool unicode, bool ignore_case, - bool global, bool sticky, RegExpCompileData* data) + bool global, bool sticky, bool dotall, RegExpCompileData* data) { JS::AutoCheckCannotGC nogc; return str->hasLatin1Chars() ? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(), - multiline, match_only, unicode, ignore_case, global, sticky, data) + multiline, match_only, unicode, ignore_case, global, sticky, dotall, data) : ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(), - multiline, match_only, unicode, ignore_case, global, sticky, data); + multiline, match_only, unicode, ignore_case, global, sticky, dotall, data); } template <typename CharT> static bool ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length, - bool unicode) + bool unicode, bool dotall) { LifoAllocScope scope(&alloc); - RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false); + RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, dotall, false); return parser.ParsePattern() != nullptr; } bool irregexp::ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str, - bool unicode) + bool unicode, bool dotall) { JS::AutoCheckCannotGC nogc; return str->hasLatin1Chars() - ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode) - : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode); + ? ::ParsePatternSyntax(ts, alloc, str->latin1Chars(nogc), str->length(), unicode, dotall) + : ::ParsePatternSyntax(ts, alloc, str->twoByteChars(nogc), str->length(), unicode, dotall); } |