diff options
Diffstat (limited to 'js/src/regexp/regexp-compiler.cc')
-rw-r--r-- | js/src/regexp/regexp-compiler.cc | 115 |
1 files changed, 56 insertions, 59 deletions
diff --git a/js/src/regexp/regexp-compiler.cc b/js/src/regexp/regexp-compiler.cc index 9a2aa30dc..c0070061f 100644 --- a/js/src/regexp/regexp-compiler.cc +++ b/js/src/regexp/regexp-compiler.cc @@ -5,7 +5,9 @@ #include "regexp/regexp-compiler.h" #include "regexp/regexp-macro-assembler-arch.h" -#include "regexp/regexp-macro-assembler-tracer.h" +#ifdef V8_INTL_SUPPORT +#include "regexp/special-case.h" +#endif // V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT #include "unicode/locid.h" @@ -237,20 +239,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, RegExpCompiler::CompilationResult RegExpCompiler::Assemble( Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, int capture_count, Handle<String> pattern) { -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) - macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler); - else -#endif - macro_assembler_ = macro_assembler; + macro_assembler_ = macro_assembler; - std::vector<RegExpNode*> work_list; + ZoneVector<RegExpNode*> work_list(zone()); work_list_ = &work_list; Label fail; macro_assembler_->PushBacktrack(&fail); Trace new_trace; start->Emit(this, &new_trace); - macro_assembler_->Bind(&fail); + macro_assembler_->BindJumpTarget(&fail); macro_assembler_->Fail(); while (!work_list.empty()) { RegExpNode* node = work_list.back(); @@ -264,14 +261,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble( } Handle<HeapObject> code = macro_assembler_->GetCode(pattern); - isolate->IncreaseTotalRegexpCodeGenerated(code->Size()); + isolate->IncreaseTotalRegexpCodeGenerated(code); work_list_ = nullptr; -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) { - delete macro_assembler_; - } -#endif return {*code, next_register_}; } @@ -557,7 +549,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { } // On backtrack we need to restore state. - assembler->Bind(&undo); + assembler->BindJumpTarget(&undo); RestoreAffectedRegisters(assembler, max_register, registers_to_pop, registers_to_clear); if (backtrack() == nullptr) { @@ -720,32 +712,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, unibrow::uchar* letters, int letter_length) { #ifdef V8_INTL_SUPPORT - // Special case for U+017F which has upper case in ASCII range. - if (character == 0x017f) { + if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; return 1; } + bool in_special_add_set = + RegExpCaseFolding::SpecialAddSet().contains(character); + icu::UnicodeSet set; set.add(character); set = set.closeOver(USET_CASE_INSENSITIVE); + + UChar32 canon = 0; + if (in_special_add_set) { + canon = RegExpCaseFolding::Canonicalize(character); + } + int32_t range_count = set.getRangeCount(); int items = 0; for (int32_t i = 0; i < range_count; i++) { UChar32 start = set.getRangeStart(i); UChar32 end = set.getRangeEnd(i); CHECK(end - start + items <= letter_length); - // Only add to the output if character is not in ASCII range - // or the case equivalent character is in ASCII range. - // #sec-runtime-semantics-canonicalize-ch - // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128, - // return ch. - if (!((start >= 128) && (character < 128))) { - // No range have start and end span across code point 128. - DCHECK((start >= 128) == (end >= 128)); - for (UChar32 cu = start; cu <= end; cu++) { - if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; - letters[items++] = (unibrow::uchar)(cu); + for (UChar32 cu = start; cu <= end; cu++) { + if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; + if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { + continue; } + letters[items++] = (unibrow::uchar)(cu); } } return items; @@ -852,10 +846,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, return false; } -using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded); - // Only emits letters (things that have case). Only used for case independent // matches. static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, @@ -1843,13 +1833,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (elm.text_type() == TextElement::ATOM) { Vector<const uc16> quarks = elm.atom()->data(); for (int j = 0; j < quarks.length(); j++) { - uint16_t c = quarks[j]; + uc16 c = quarks[j]; if (elm.atom()->ignore_case()) { c = unibrow::Latin1::TryConvertToLatin1(c); } if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); // Replace quark in case we converted to Latin-1. - uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin()); + uc16* writable_quarks = const_cast<uc16*>(quarks.begin()); writable_quarks[j] = c; } } else { @@ -2304,7 +2294,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; - EmitCharacterFunction* emit_function = nullptr; uc16 quark = quarks[j]; if (elm.atom()->ignore_case()) { // Everywhere else we assume that a non-Latin-1 character cannot match @@ -2312,6 +2301,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, // invalid by using the Latin1 equivalent instead. quark = unibrow::Latin1::TryConvertToLatin1(quark); } + bool needs_bounds_check = + *checked_up_to < cp_offset + j || read_backward(); + bool bounds_checked = false; switch (pass) { case NON_LATIN1_MATCH: DCHECK(one_byte); @@ -2321,24 +2313,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, } break; case NON_LETTER_CHARACTER_MATCH: - emit_function = &EmitAtomNonLetter; + bounds_checked = + EmitAtomNonLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; case SIMPLE_CHARACTER_MATCH: - emit_function = &EmitSimpleCharacter; + bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, + backtrack, cp_offset + j, + needs_bounds_check, preloaded); break; case CASE_CHARACTER_MATCH: - emit_function = &EmitAtomLetter; + bounds_checked = + EmitAtomLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; default: break; } - if (emit_function != nullptr) { - bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); - bool bound_checked = - emit_function(isolate, compiler, quark, backtrack, cp_offset + j, - bounds_check, preloaded); - if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); - } + if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } } else { DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); @@ -3424,8 +3416,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { DCHECK_EQ(start_reg_ + 1, end_reg_); if (IgnoreCase(flags_)) { - assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); + assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), + trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3597,12 +3589,17 @@ template <typename... Propagators> class Analysis : public NodeVisitor { public: Analysis(Isolate* isolate, bool is_one_byte) - : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} + : isolate_(isolate), + is_one_byte_(is_one_byte), + error_(RegExpError::kNone) {} void EnsureAnalyzed(RegExpNode* that) { StackLimitCheck check(isolate()); if (check.HasOverflowed()) { - fail("Stack overflow"); + if (FLAG_correctness_fuzzer_suppressions) { + FATAL("Analysis: Aborting on stack overflow"); + } + fail(RegExpError::kAnalysisStackOverflow); return; } if (that->info()->been_analyzed || that->info()->being_analyzed) return; @@ -3612,12 +3609,12 @@ class Analysis : public NodeVisitor { that->info()->been_analyzed = true; } - bool has_failed() { return error_message_ != nullptr; } - const char* error_message() { - DCHECK(error_message_ != nullptr); - return error_message_; + bool has_failed() { return error_ != RegExpError::kNone; } + RegExpError error() { + DCHECK(error_ != RegExpError::kNone); + return error_; } - void fail(const char* error_message) { error_message_ = error_message; } + void fail(RegExpError error) { error_ = error; } Isolate* isolate() const { return isolate_; } @@ -3702,19 +3699,19 @@ class Analysis : public NodeVisitor { private: Isolate* isolate_; bool is_one_byte_; - const char* error_message_; + RegExpError error_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); }; -const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node) { Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate, is_one_byte); DCHECK_EQ(node->info()->been_analyzed, false); analysis.EnsureAnalyzed(node); - DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr); - return analysis.has_failed() ? analysis.error_message() : nullptr; + DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); + return analysis.has_failed() ? analysis.error() : RegExpError::kNone; } void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, |