diff options
author | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-11 23:37:35 +0100 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2019-11-11 23:37:35 +0100 |
commit | fa473930f424bf17a9e545b601c84dd2e61364e3 (patch) | |
tree | 41a29cb3ad0ebaf93f6b6248e92073fe4d8788ff /js/src/irregexp/RegExpEngine.cpp | |
parent | b00601953bade944cd6df9cde6fcdd1f10d76feb (diff) | |
download | UXP-fa473930f424bf17a9e545b601c84dd2e61364e3.tar UXP-fa473930f424bf17a9e545b601c84dd2e61364e3.tar.gz UXP-fa473930f424bf17a9e545b601c84dd2e61364e3.tar.lz UXP-fa473930f424bf17a9e545b601c84dd2e61364e3.tar.xz UXP-fa473930f424bf17a9e545b601c84dd2e61364e3.zip |
Issue #1279 - Implement regular expression lookbehind
Based on Tom Schuster's work, with extra minters for unicode.
Diffstat (limited to 'js/src/irregexp/RegExpEngine.cpp')
-rw-r--r-- | js/src/irregexp/RegExpEngine.cpp | 151 |
1 files changed, 95 insertions, 56 deletions
diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 4d691a5dc..62f94c3e7 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -721,6 +721,8 @@ ActionNode::EmptyMatchCheck(int start_register, int TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) { + if (read_backward()) + return 0; int answer = Length(); if (answer >= still_to_find) return answer; @@ -736,8 +738,7 @@ TextNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) int TextNode::GreedyLoopTextLength() { - TextElement elm = elements()[elements().length() - 1]; - return elm.cp_offset() + elm.length(); + return Length(); } RegExpNode* @@ -887,6 +888,8 @@ AssertionNode::FillInBMInfo(int offset, int budget, BoyerMooreLookahead* bm, boo int BackReferenceNode::EatsAtLeast(int still_to_find, int budget, bool not_at_start) { + if (read_backward()) + return 0; if (budget <= 0) return 0; return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start); @@ -1578,6 +1581,9 @@ class irregexp::RegExpCompiler current_expansion_factor_ = value; } + bool read_backward() { return read_backward_; } + void set_read_backward(bool value) { read_backward_ = value; } + JSContext* cx() const { return cx_; } LifoAlloc* alloc() const { return alloc_; } @@ -1595,6 +1601,7 @@ class irregexp::RegExpCompiler bool unicode_; bool reg_exp_too_big_; int current_expansion_factor_; + bool read_backward_; FrequencyCollator frequency_collator_; JSContext* cx_; LifoAlloc* alloc_; @@ -1624,6 +1631,7 @@ RegExpCompiler::RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_coun unicode_(unicode), reg_exp_too_big_(false), current_expansion_factor_(1), + read_backward_(false), frequency_collator_(), cx_(cx), alloc_(alloc) @@ -1747,7 +1755,7 @@ irregexp::CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData* // at the start of input. ChoiceNode* first_step_node = alloc.newInfallible<ChoiceNode>(&alloc, 2); RegExpNode* char_class = - alloc.newInfallible<TextNode>(alloc.newInfallible<RegExpCharacterClass>('*'), loop_node); + alloc.newInfallible<TextNode>(alloc.newInfallible<RegExpCharacterClass>('*'), false, loop_node); first_step_node->AddAlternative(GuardedAlternative(captured_body)); first_step_node->AddAlternative(GuardedAlternative(char_class)); node = first_step_node; @@ -1850,19 +1858,19 @@ RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) TextElementVector* elms = compiler->alloc()->newInfallible<TextElementVector>(*compiler->alloc()); elms->append(TextElement::Atom(this)); - return compiler->alloc()->newInfallible<TextNode>(elms, on_success); + return compiler->alloc()->newInfallible<TextNode>(elms, compiler->read_backward(), on_success); } RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return compiler->alloc()->newInfallible<TextNode>(&elements_, on_success); + return compiler->alloc()->newInfallible<TextNode>(&elements_, compiler->read_backward(), on_success); } RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - return compiler->alloc()->newInfallible<TextNode>(this, on_success); + return compiler->alloc()->newInfallible<TextNode>(this, compiler->read_backward(), on_success); } RegExpNode* @@ -2003,7 +2011,8 @@ RegExpQuantifier::ToNode(int min, alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler, answer))); } answer = alternation; - if (not_at_start) alternation->set_not_at_start(); + if (not_at_start && !compiler->read_backward()) + alternation->set_not_at_start(); } return answer; } @@ -2015,8 +2024,9 @@ RegExpQuantifier::ToNode(int min, int reg_ctr = needs_counter ? compiler->AllocateRegister() : RegExpCompiler::kNoRegister; - LoopChoiceNode* center = alloc->newInfallible<LoopChoiceNode>(alloc, body->min_match() == 0); - if (not_at_start) + LoopChoiceNode* center = alloc->newInfallible<LoopChoiceNode>(alloc, body->min_match() == 0, + compiler->read_backward()); + if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); RegExpNode* loop_return = needs_counter ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) @@ -2092,7 +2102,7 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler, CharacterRange::AddClassEscape(alloc, 'n', newline_ranges); RegExpCharacterClass* newline_atom = alloc->newInfallible<RegExpCharacterClass>('n'); TextNode* newline_matcher = - alloc->newInfallible<TextNode>(newline_atom, + alloc->newInfallible<TextNode>(newline_atom, false, ActionNode::PositiveSubmatchSuccess(stack_pointer_register, position_register, 0, // No captures inside. @@ -2124,6 +2134,7 @@ RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return compiler->alloc()->newInfallible<BackReferenceNode>(RegExpCapture::StartRegister(index()), RegExpCapture::EndRegister(index()), + compiler->read_backward(), on_success); } @@ -2134,7 +2145,7 @@ RegExpEmpty::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) } RegExpNode* -RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) +RegExpLookaround::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { int stack_pointer_register = compiler->AllocateRegister(); int position_register = compiler->AllocateRegister(); @@ -2145,6 +2156,10 @@ RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) int register_start = register_of_first_capture + capture_from_ * registers_per_capture; + RegExpNode* result; + bool was_reading_backward = compiler->read_backward(); + compiler->set_read_backward(type() == LOOKBEHIND); + if (is_positive()) { RegExpNode* bodyNode = body()->ToNode(compiler, @@ -2153,37 +2168,39 @@ RegExpLookahead::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) register_count, register_start, on_success)); - return ActionNode::BeginSubmatch(stack_pointer_register, + result = ActionNode::BeginSubmatch(stack_pointer_register, + position_register, + bodyNode); + } else { + // We use a ChoiceNode for a negative lookahead because it has most of + // the characteristics we need. It has the body of the lookahead as its + // first alternative and the expression after the lookahead of the second + // alternative. If the first alternative succeeds then the + // NegativeSubmatchSuccess will unwind the stack including everything the + // choice node set up and backtrack. If the first alternative fails then + // the second alternative is tried, which is exactly the desired result + // for a negative lookahead. The NegativeLookaheadChoiceNode is a special + // ChoiceNode that knows to ignore the first exit when calculating quick + // checks. + LifoAlloc* alloc = compiler->alloc(); + + RegExpNode* success = + alloc->newInfallible<NegativeSubmatchSuccess>(alloc, + stack_pointer_register, + position_register, + register_count, + register_start); + GuardedAlternative body_alt(body()->ToNode(compiler, success)); + + ChoiceNode* choice_node = + alloc->newInfallible<NegativeLookaheadChoiceNode>(alloc, body_alt, GuardedAlternative(on_success)); + + result = ActionNode::BeginSubmatch(stack_pointer_register, position_register, - bodyNode); - } - - // We use a ChoiceNode for a negative lookahead because it has most of - // the characteristics we need. It has the body of the lookahead as its - // first alternative and the expression after the lookahead of the second - // alternative. If the first alternative succeeds then the - // NegativeSubmatchSuccess will unwind the stack including everything the - // choice node set up and backtrack. If the first alternative fails then - // the second alternative is tried, which is exactly the desired result - // for a negative lookahead. The NegativeLookaheadChoiceNode is a special - // ChoiceNode that knows to ignore the first exit when calculating quick - // checks. - LifoAlloc* alloc = compiler->alloc(); - - RegExpNode* success = - alloc->newInfallible<NegativeSubmatchSuccess>(alloc, - stack_pointer_register, - position_register, - register_count, - register_start); - GuardedAlternative body_alt(body()->ToNode(compiler, success)); - - ChoiceNode* choice_node = - alloc->newInfallible<NegativeLookaheadChoiceNode>(alloc, body_alt, GuardedAlternative(on_success)); - - return ActionNode::BeginSubmatch(stack_pointer_register, - position_register, - choice_node); + choice_node); + } + compiler->set_read_backward(was_reading_backward); + return result; } RegExpNode* @@ -2198,8 +2215,14 @@ RegExpCapture::ToNode(RegExpTree* body, RegExpCompiler* compiler, RegExpNode* on_success) { + MOZ_ASSERT(body); int start_reg = RegExpCapture::StartRegister(index); int end_reg = RegExpCapture::EndRegister(index); + if (compiler->read_backward()) { + // std::swap(start_reg, end_reg); + start_reg = RegExpCapture::EndRegister(index); + end_reg = RegExpCapture::StartRegister(index); + } RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); RegExpNode* body_node = body->ToNode(compiler, store_end); return ActionNode::StorePosition(start_reg, true, body_node); @@ -2210,8 +2233,15 @@ RegExpAlternative::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { const RegExpTreeVector& children = nodes(); RegExpNode* current = on_success; - for (int i = children.length() - 1; i >= 0; i--) - current = children[i]->ToNode(compiler, current); + if (compiler->read_backward()) { + for (int i = 0; i < children.length(); i++) { + current = children[i]->ToNode(compiler, current); + } + } else { + for (int i = children.length() - 1; i >= 0; i--) { + current = children[i]->ToNode(compiler, current); + } + } return current; } @@ -2764,7 +2794,6 @@ Trace::InvalidateCurrentCharacter() void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { - MOZ_ASSERT(by > 0); // We don't have an instruction for shifting the current character register // down or for using a shifted value for anything so lets just forget that // we preloaded any characters into it. @@ -3109,9 +3138,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) return; } if (trace->at_start() == Trace::UNKNOWN) { - assembler->CheckNotAtStart(trace->backtrack()); + assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); Trace at_start_trace = *trace; - at_start_trace.set_at_start(true); + at_start_trace.set_at_start(Trace::TRUE_VALUE); on_success()->Emit(compiler, &at_start_trace); return; } @@ -3814,9 +3843,10 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, jit::Label* backtrack = trace->backtrack(); QuickCheckDetails* quick_check = trace->quick_check_performed(); int element_count = elements().length(); + int backward_offset = read_backward() ? -Length() : 0; for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { TextElement elm = elements()[i]; - int cp_offset = trace->cp_offset() + elm.cp_offset(); + int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; if (elm.text_type() == TextElement::ATOM) { const CharacterVector& quarks = elm.atom()->data(); for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { @@ -3844,11 +3874,12 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, break; } if (emit_function != nullptr) { + bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); bool bound_checked = emit_function(compiler, quarks[j], backtrack, cp_offset + j, - *checked_up_to < cp_offset + j, + bounds_check, preloaded); if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } @@ -3859,13 +3890,14 @@ TextNode::TextEmitPass(RegExpCompiler* compiler, if (first_element_checked && i == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset())) continue; RegExpCharacterClass* cc = elm.char_class(); + bool bounds_check = *checked_up_to < cp_offset || read_backward(); EmitCharClass(alloc(), assembler, cc, ascii, backtrack, cp_offset, - *checked_up_to < cp_offset, + bounds_check, preloaded); UpdateBoundsCheck(cp_offset, checked_up_to); } @@ -3945,8 +3977,11 @@ TextNode::Emit(RegExpCompiler* compiler, Trace* trace) } Trace successor_trace(*trace); - successor_trace.set_at_start(false); - successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler); + // If we advance backward, we may end up at the start. + successor_trace.AdvanceCurrentPositionInTrace( + read_backward() ? -Length() : Length(), compiler); + successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN + : Trace::FALSE_VALUE); RecursionCheck rc(compiler); on_success()->Emit(compiler, &successor_trace); } @@ -4118,6 +4153,8 @@ ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_lea RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(RegExpCompiler* compiler) { + if (read_backward()) return NULL; + if (elements().length() != 1) return nullptr; @@ -4165,7 +4202,7 @@ ChoiceNode::GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative) SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); node = seq_node->on_success(); } - return length; + return read_backward() ? -length : length; } // Creates a list of AlternativeGenerations. If the list has a reasonable @@ -4240,7 +4277,7 @@ ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) jit::Label greedy_loop_label; Trace counter_backtrack_trace; counter_backtrack_trace.set_backtrack(&greedy_loop_label); - if (not_at_start()) counter_backtrack_trace.set_at_start(false); + if (not_at_start()) counter_backtrack_trace.set_at_start(Trace::FALSE_VALUE); if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { // Here we have special handling for greedy loops containing only text nodes @@ -4256,7 +4293,7 @@ ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) current_trace = &counter_backtrack_trace; jit::Label greedy_match_failed; Trace greedy_match_trace; - if (not_at_start()) greedy_match_trace.set_at_start(false); + if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); greedy_match_trace.set_backtrack(&greedy_match_failed); jit::Label loop_label; macro_assembler->Bind(&loop_label); @@ -4605,11 +4642,14 @@ BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) MOZ_ASSERT(start_reg_ + 1 == end_reg_); if (compiler->ignore_case()) { assembler->CheckNotBackReferenceIgnoreCase(start_reg_, + read_backward(), trace->backtrack(), compiler->unicode()); } else { - assembler->CheckNotBackReference(start_reg_, trace->backtrack()); + assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); } + // We are going to advance backward, so we may end up at the start. + if (read_backward()) trace->set_at_start(Trace::UNKNOWN); on_success()->Emit(compiler, trace); } @@ -4977,7 +5017,6 @@ QuickCheckDetails::Clear() void QuickCheckDetails::Advance(int by, bool ascii) { - MOZ_ASSERT(by >= 0); if (by >= characters_) { Clear(); return; |