/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set ts=8 sts=4 et sw=4 tw=99: */ // Copyright 2011 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // A simple interpreter for the Irregexp byte code. #include "irregexp/RegExpBytecode.h" #include "irregexp/RegExpMacroAssembler.h" #include "vm/MatchPairs.h" using namespace js; using namespace js::irregexp; static const size_t kBitsPerByte = 8; static const size_t kBitsPerByteLog2 = 3; class MOZ_STACK_CLASS RegExpStackCursor { public: explicit RegExpStackCursor(JSContext* cx) : cx(cx), cursor(nullptr) {} bool init() { if (!stack.init()) { ReportOutOfMemory(cx); return false; } cursor = base(); return true; } bool push(int32_t value) { *cursor++ = value; if (cursor >= stack.limit()) { int32_t pos = position(); if (!stack.grow()) { ReportOverRecursed(cx); return false; } setPosition(pos); } return true; } int32_t pop() { MOZ_ASSERT(cursor > base()); return *--cursor; } int32_t peek() { MOZ_ASSERT(cursor > base()); return *(cursor - 1); } int32_t position() { MOZ_ASSERT(cursor >= base()); return cursor - base(); } void setPosition(int32_t position) { cursor = base() + position; MOZ_ASSERT(cursor < stack.limit()); } private: JSContext* cx; RegExpStack stack; int32_t* cursor; int32_t* base() { return (int32_t*) stack.base(); } }; static int32_t Load32Aligned(const uint8_t* pc) { MOZ_ASSERT((reinterpret_cast<uintptr_t>(pc) & 3) == 0); return *reinterpret_cast<const int32_t*>(pc); } static int32_t Load16Aligned(const uint8_t* pc) { MOZ_ASSERT((reinterpret_cast<uintptr_t>(pc) & 1) == 0); return *reinterpret_cast<const uint16_t*>(pc); } #define BYTECODE(name) case BC_##name: template <typename CharT> RegExpRunStatus irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* chars, size_t current, size_t length, MatchPairs* matches, size_t* endIndex) { const uint8_t* pc = byteCode; uint32_t current_char = current ? chars[current - 1] : '\n'; RegExpStackCursor stack(cx); if (!stack.init()) return RegExpRunStatus_Error; int32_t numRegisters = Load32Aligned(pc); pc += 4; Vector<int32_t, 0, SystemAllocPolicy> registers; if (!registers.growByUninitialized(numRegisters)) return RegExpRunStatus_Error; for (size_t i = 0; i < (size_t) numRegisters; i++) registers[i] = -1; while (true) { int32_t insn = Load32Aligned(pc); switch (insn & BYTECODE_MASK) { BYTECODE(BREAK) MOZ_CRASH("Bad bytecode: BREAK"); BYTECODE(PUSH_CP) if (!stack.push(current)) return RegExpRunStatus_Error; pc += BC_PUSH_CP_LENGTH; break; BYTECODE(PUSH_BT) if (!stack.push(Load32Aligned(pc + 4))) return RegExpRunStatus_Error; pc += BC_PUSH_BT_LENGTH; break; BYTECODE(PUSH_REGISTER) if (!stack.push(registers[insn >> BYTECODE_SHIFT])) return RegExpRunStatus_Error; pc += BC_PUSH_REGISTER_LENGTH; break; BYTECODE(SET_REGISTER) registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4); pc += BC_SET_REGISTER_LENGTH; break; BYTECODE(ADVANCE_REGISTER) registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4); pc += BC_ADVANCE_REGISTER_LENGTH; break; BYTECODE(SET_REGISTER_TO_CP) registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4); pc += BC_SET_REGISTER_TO_CP_LENGTH; break; BYTECODE(SET_CP_TO_REGISTER) current = registers[insn >> BYTECODE_SHIFT]; pc += BC_SET_CP_TO_REGISTER_LENGTH; break; BYTECODE(SET_REGISTER_TO_SP) registers[insn >> BYTECODE_SHIFT] = stack.position(); pc += BC_SET_REGISTER_TO_SP_LENGTH; break; BYTECODE(SET_SP_TO_REGISTER) stack.setPosition(registers[insn >> BYTECODE_SHIFT]); pc += BC_SET_SP_TO_REGISTER_LENGTH; break; BYTECODE(POP_CP) current = stack.pop(); pc += BC_POP_CP_LENGTH; break; BYTECODE(POP_BT) if (!CheckForInterrupt(cx)) return RegExpRunStatus_Error; pc = byteCode + stack.pop(); break; BYTECODE(POP_REGISTER) registers[insn >> BYTECODE_SHIFT] = stack.pop(); pc += BC_POP_REGISTER_LENGTH; break; BYTECODE(FAIL) return RegExpRunStatus_Success_NotFound; BYTECODE(SUCCEED) if (matches) memcpy(matches->pairsRaw(), registers.begin(), matches->length() * 2 * sizeof(int32_t)); else if (endIndex) *endIndex = registers[1]; return RegExpRunStatus_Success; BYTECODE(ADVANCE_CP) current += insn >> BYTECODE_SHIFT; pc += BC_ADVANCE_CP_LENGTH; break; BYTECODE(GOTO) pc = byteCode + Load32Aligned(pc + 4); break; BYTECODE(ADVANCE_CP_AND_GOTO) current += insn >> BYTECODE_SHIFT; pc = byteCode + Load32Aligned(pc + 4); break; BYTECODE(CHECK_GREEDY) if ((int32_t)current == stack.peek()) { stack.pop(); pc = byteCode + Load32Aligned(pc + 4); } else { pc += BC_CHECK_GREEDY_LENGTH; } break; BYTECODE(LOAD_CURRENT_CHAR) { size_t pos = current + (insn >> BYTECODE_SHIFT); if (pos >= length) { pc = byteCode + Load32Aligned(pc + 4); } else { current_char = chars[pos]; pc += BC_LOAD_CURRENT_CHAR_LENGTH; } break; } BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { int pos = current + (insn >> BYTECODE_SHIFT); current_char = chars[pos]; pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH; break; } BYTECODE(LOAD_2_CURRENT_CHARS) { size_t pos = current + (insn >> BYTECODE_SHIFT); if (pos + 2 > length) { pc = byteCode + Load32Aligned(pc + 4); } else { CharT next = chars[pos + 1]; current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(CharT)))); pc += BC_LOAD_2_CURRENT_CHARS_LENGTH; } break; } BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { int pos = current + (insn >> BYTECODE_SHIFT); char16_t next = chars[pos + 1]; current_char = (chars[pos] | (next << (kBitsPerByte * sizeof(char16_t)))); pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH; break; } BYTECODE(LOAD_4_CURRENT_CHARS) MOZ_CRASH("ASCII handling implemented"); BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) MOZ_CRASH("ASCII handling implemented"); BYTECODE(CHECK_4_CHARS) { uint32_t c = Load32Aligned(pc + 4); if (c == current_char) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_4_CHARS_LENGTH; break; } BYTECODE(CHECK_CHAR) { uint32_t c = (insn >> BYTECODE_SHIFT); if (c == current_char) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_CHAR_LENGTH; break; } BYTECODE(CHECK_NOT_4_CHARS) { uint32_t c = Load32Aligned(pc + 4); if (c != current_char) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_NOT_4_CHARS_LENGTH; break; } BYTECODE(CHECK_NOT_CHAR) { uint32_t c = (insn >> BYTECODE_SHIFT); if (c != current_char) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_NOT_CHAR_LENGTH; break; } BYTECODE(AND_CHECK_4_CHARS) { uint32_t c = Load32Aligned(pc + 4); if (c == (current_char & Load32Aligned(pc + 8))) pc = byteCode + Load32Aligned(pc + 12); else pc += BC_AND_CHECK_4_CHARS_LENGTH; break; } BYTECODE(AND_CHECK_CHAR) { uint32_t c = (insn >> BYTECODE_SHIFT); if (c == (current_char & Load32Aligned(pc + 4))) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_AND_CHECK_CHAR_LENGTH; break; } BYTECODE(AND_CHECK_NOT_4_CHARS) { uint32_t c = Load32Aligned(pc + 4); if (c != (current_char & Load32Aligned(pc + 8))) pc = byteCode + Load32Aligned(pc + 12); else pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH; break; } BYTECODE(AND_CHECK_NOT_CHAR) { uint32_t c = (insn >> BYTECODE_SHIFT); if (c != (current_char & Load32Aligned(pc + 4))) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_AND_CHECK_NOT_CHAR_LENGTH; break; } BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { uint32_t c = (insn >> BYTECODE_SHIFT); uint32_t minus = Load16Aligned(pc + 4); uint32_t mask = Load16Aligned(pc + 6); if (c != ((current_char - minus) & mask)) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH; break; } BYTECODE(CHECK_CHAR_IN_RANGE) { uint32_t from = Load16Aligned(pc + 4); uint32_t to = Load16Aligned(pc + 6); if (from <= current_char && current_char <= to) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_CHAR_IN_RANGE_LENGTH; break; } BYTECODE(CHECK_CHAR_NOT_IN_RANGE) { uint32_t from = Load16Aligned(pc + 4); uint32_t to = Load16Aligned(pc + 6); if (from > current_char || current_char > to) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH; break; } BYTECODE(CHECK_BIT_IN_TABLE) { int mask = RegExpMacroAssembler::kTableMask; uint8_t b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)]; int bit = (current_char & (kBitsPerByte - 1)); if ((b & (1 << bit)) != 0) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_BIT_IN_TABLE_LENGTH; break; } BYTECODE(CHECK_LT) { uint32_t limit = (insn >> BYTECODE_SHIFT); if (current_char < limit) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_LT_LENGTH; break; } BYTECODE(CHECK_GT) { uint32_t limit = (insn >> BYTECODE_SHIFT); if (current_char > limit) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_GT_LENGTH; break; } BYTECODE(CHECK_REGISTER_LT) if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_REGISTER_LT_LENGTH; break; BYTECODE(CHECK_REGISTER_GE) if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) pc = byteCode + Load32Aligned(pc + 8); else pc += BC_CHECK_REGISTER_GE_LENGTH; break; BYTECODE(CHECK_REGISTER_EQ_POS) if (registers[insn >> BYTECODE_SHIFT] == (int32_t) current) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_REGISTER_EQ_POS_LENGTH; break; BYTECODE(CHECK_NOT_REGS_EQUAL) if (registers[insn >> BYTECODE_SHIFT] == registers[Load32Aligned(pc + 4)]) pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH; else pc = byteCode + Load32Aligned(pc + 8); break; BYTECODE(CHECK_NOT_BACK_REF) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { pc += BC_CHECK_NOT_BACK_REF_LENGTH; break; } if (current + len > length) { pc = byteCode + Load32Aligned(pc + 4); break; } else { int i; for (i = 0; i < len; i++) { if (chars[from + i] != chars[current + i]) { pc = byteCode + Load32Aligned(pc + 4); break; } } if (i < len) break; current += len; } pc += BC_CHECK_NOT_BACK_REF_LENGTH; break; } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; break; } if (current + len > length) { pc = byteCode + Load32Aligned(pc + 4); break; } if (CaseInsensitiveCompareStrings(chars + from, chars + current, len * sizeof(CharT))) { current += len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; } else { pc = byteCode + Load32Aligned(pc + 4); } break; } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { pc += BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_LENGTH; break; } if (current + len > length) { pc = byteCode + Load32Aligned(pc + 4); break; } if (CaseInsensitiveCompareUCStrings(chars + from, chars + current, len * sizeof(CharT))) { current += len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_LENGTH; } else { pc = byteCode + Load32Aligned(pc + 4); } break; } BYTECODE(CHECK_AT_START) if (current == 0) pc = byteCode + Load32Aligned(pc + 4); else pc += BC_CHECK_AT_START_LENGTH; break; BYTECODE(CHECK_NOT_AT_START) if (current == 0) pc += BC_CHECK_NOT_AT_START_LENGTH; else pc = byteCode + Load32Aligned(pc + 4); break; BYTECODE(SET_CURRENT_POSITION_FROM_END) { size_t by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT; if (length - current > by) { current = length - by; current_char = chars[current - 1]; } pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH; break; } default: MOZ_CRASH("Bad bytecode"); } } } template RegExpRunStatus irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const Latin1Char* chars, size_t current, size_t length, MatchPairs* matches, size_t* endIndex); template RegExpRunStatus irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const char16_t* chars, size_t current, size_t length, MatchPairs* matches, size_t* endIndex);