diff options
Diffstat (limited to 'js/src/irregexp/RegExpEngine.cpp')
-rw-r--r-- | js/src/irregexp/RegExpEngine.cpp | 140 |
1 files changed, 25 insertions, 115 deletions
diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 2e19065fd..4d691a5dc 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -31,10 +31,14 @@ #include "irregexp/RegExpEngine.h" #include "irregexp/NativeRegExpMacroAssembler.h" +#include "irregexp/RegExpCharacters.h" #include "irregexp/RegExpMacroAssembler.h" #include "jit/ExecutableAllocator.h" #include "jit/JitCommon.h" +// Generated table +#include "irregexp/RegExpCharacters-inl.h" + using namespace js; using namespace js::irregexp; @@ -61,61 +65,6 @@ RegExpNode::RegExpNode(LifoAlloc* alloc) bm_info_[0] = bm_info_[1] = nullptr; } -// ------------------------------------------------------------------- -// CharacterRange - -// The '2' variant has inclusive from and exclusive to. -// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, -// which include WhiteSpace (7.2) or LineTerminator (7.3) values. -static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, - 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, - 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, - 0xFEFF, 0xFF00, 0x10000 }; -static const int kSpaceRangeCount = ArrayLength(kSpaceRanges); - -static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, - 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, - 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0xFEFF, 0xFF00, 0x10000 }; -static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges); -static const int kWordRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; -static const int kWordRangeCount = ArrayLength(kWordRanges); -static const int kIgnoreCaseWordRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, - 0x017F, 0x017F + 1, 0x212A, 0x212A + 1, - 0x10000 }; -static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges); -static const int kWordAndSurrogateRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges); -static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = { - 0, '0', '9' + 1, 'A', - 'Z' + 1, '_', '_' + 1, 'a', - 'z' + 1, 0x017F, - 0x017F + 1, 0x212A, - 0x212A + 1, unicode::LeadSurrogateMin, - unicode::TrailSurrogateMax + 1, 0x10000, - 0x10000 }; -static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount = - ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges); -static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; -static const int kDigitRangeCount = ArrayLength(kDigitRanges); -static const int kDigitAndSurrogateRanges[] = { - '0', '9' + 1, - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges); -static const int kSurrogateRanges[] = { - unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, - 0x10000 }; -static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges); -static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, - 0x2028, 0x202A, 0x10000 }; -static const int kLineTerminatorRangeCount = ArrayLength(kLineTerminatorRanges); static const int kMaxOneByteCharCode = 0xff; static const int kMaxUtf16CodeUnit = 0xffff; @@ -213,7 +162,7 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, break; case 'w': if (ignore_case) - AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges); + AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges); else AddClassEscape(alloc, type, ranges); break; @@ -233,33 +182,6 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, } } -#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro) \ - /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ \ - macro(0x0178, 0x00FF) \ - /* LATIN SMALL LETTER LONG S */ \ - macro(0x017F, 0x0073) \ - /* LATIN CAPITAL LETTER SHARP S */ \ - macro(0x1E9E, 0x00DF) \ - /* KELVIN SIGN */ \ - macro(0x212A, 0x006B) \ - /* ANGSTROM SIGN */ \ - macro(0x212B, 0x00E5) - -// We need to check for the following characters: 0x39c 0x3bc 0x178. -static inline bool -RangeContainsLatin1Equivalents(CharacterRange range, bool unicode) -{ - /* TODO(dcarney): this could be a lot more efficient. */ - if (unicode) { -#define CHECK_RANGE(C, F) \ - if (range.Contains(C)) return true; -FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE) -#undef CHECK_RANGE - } - - return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178); -} - static bool RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode) { @@ -336,7 +258,7 @@ GetCaseIndependentLetters(char16_t character, // step 3.g. // The standard requires that non-ASCII characters cannot have ASCII // character codes in their equivalence class, even though this - // situation occurs multiple times in the unicode tables. + // situation occurs multiple times in the Unicode tables. static const unsigned kMaxAsciiCharCode = 127; if (upper <= kMaxAsciiCharCode) { if (character > kMaxAsciiCharCode) { @@ -365,31 +287,6 @@ GetCaseIndependentLetters(char16_t character, choices, ArrayLength(choices), letters); } -static char16_t -ConvertNonLatin1ToLatin1(char16_t c, bool unicode) -{ - MOZ_ASSERT(c > kMaxOneByteCharCode); - if (unicode) { - switch (c) { -#define CONVERT(C, F) case C: return F; -FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT) -#undef CONVERT - } - } - - switch (c) { - // This are equivalent characters in unicode. - case 0x39c: - case 0x3bc: - return 0xb5; - // This is an uppercase of a Latin-1 character - // outside of Latin-1. - case 0x178: - return 0xff; - } - return 0; -} - void CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges) { @@ -2358,7 +2255,10 @@ void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); - w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); + if (unicode_ignore_case_) + w_ = AddRange(w_, kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, interval); + else + w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); surrogate_ = AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); @@ -2395,11 +2295,12 @@ BoyerMoorePositionInfo::SetAll() BoyerMooreLookahead::BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler) : length_(length), compiler_(compiler), bitmaps_(*alloc) { + bool unicode_ignore_case = compiler->unicode() && compiler->ignore_case(); max_char_ = MaximumCharacter(compiler->ascii()); bitmaps_.reserve(length); for (size_t i = 0; i < length; i++) - bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc)); + bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc, unicode_ignore_case)); } // Find the longest range of lookahead that has the fewest number of different @@ -3065,15 +2966,22 @@ EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* // Check for [0-9A-Z_a-z]. static void EmitWordCheck(RegExpMacroAssembler* assembler, - jit::Label* word, jit::Label* non_word, bool fall_through_on_word) + jit::Label* word, jit::Label* non_word, bool fall_through_on_word, + bool unicode_ignore_case) { - if (assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W', + if (!unicode_ignore_case && + assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W', fall_through_on_word ? non_word : word)) { // Optimized implementation available. return; } + if (unicode_ignore_case) { + assembler->CheckCharacter(0x017F, word); + assembler->CheckCharacter(0x212A, word); + } + assembler->CheckCharacterGT('z', non_word); assembler->CheckCharacterLT('0', non_word); assembler->CheckCharacterGT('a' - 1, word); @@ -3122,7 +3030,8 @@ AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); } // Fall through on non-word. - EmitWordCheck(assembler, &before_word, &before_non_word, false); + EmitWordCheck(assembler, &before_word, &before_non_word, false, + compiler->unicode() && compiler->ignore_case()); // Next character is not a word character. assembler->Bind(&before_non_word); jit::Label ok; @@ -3162,7 +3071,8 @@ AssertionNode::BacktrackIfPrevious(RegExpCompiler* compiler, // We already checked that we are not at the start of input so it must be // OK to load the previous character. assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); - EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); + EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord, + compiler->unicode() && compiler->ignore_case()); assembler->Bind(&fall_through); on_success()->Emit(compiler, &new_trace); |