JS - RegExp - match updated spec for `/\b/iu` and `/\B/iu`

author: janekptacijarabaci <janekptacijarabaci@seznam.cz> 2017-08-17 20:37:46 +0200
committer: wolfbeast <mcwerewolf@gmail.com> 2018-03-12 09:52:39 +0100
commit: 93f8e06bb8d8656e868679d584c7c8771ff8e42f (patch)
tree: 63d3edb15082c664a91e2cc311e678d8fbe76fd2 /js
parent: 1df7811e8e7943bf94ca7164c7c173a5c622db7b (diff)
download: UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar
UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.gz
UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.lz
UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.xz
UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.zip
3 files changed, 50 insertions, 8 deletions
diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp
index 2e19065fd..7116ff1e5 100644
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -2358,7 +2358,10 @@ void
 BoyerMoorePositionInfo::SetInterval(const Interval& interval)
 {
     s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
-    w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
+    if (unicode_ignore_case_)
+        w_ = AddRange(w_, kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, interval);
+    else
+        w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
     d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
     surrogate_ =
         AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
@@ -2395,11 +2398,12 @@ BoyerMoorePositionInfo::SetAll()
 BoyerMooreLookahead::BoyerMooreLookahead(LifoAlloc* alloc, size_t length, RegExpCompiler* compiler)
   : length_(length), compiler_(compiler), bitmaps_(*alloc)
 {
+    bool unicode_ignore_case = compiler->unicode() && compiler->ignore_case();
     max_char_ = MaximumCharacter(compiler->ascii());
 
     bitmaps_.reserve(length);
     for (size_t i = 0; i < length; i++)
-        bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc));
+        bitmaps_.append(alloc->newInfallible<BoyerMoorePositionInfo>(alloc, unicode_ignore_case));
 }
 
 // Find the longest range of lookahead that has the fewest number of different
@@ -3065,15 +3069,22 @@ EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace*
 // Check for [0-9A-Z_a-z].
 static void
 EmitWordCheck(RegExpMacroAssembler* assembler,
-              jit::Label* word, jit::Label* non_word, bool fall_through_on_word)
+              jit::Label* word, jit::Label* non_word, bool fall_through_on_word,
+              bool unicode_ignore_case)
 {
-    if (assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
+    if (!unicode_ignore_case &&
+        assembler->CheckSpecialCharacterClass(fall_through_on_word ? 'w' : 'W',
                                               fall_through_on_word ? non_word : word))
     {
         // Optimized implementation available.
         return;
     }
 
+    if (unicode_ignore_case) {
+        assembler->CheckCharacter(0x017F, word);
+        assembler->CheckCharacter(0x212A, word);
+    }
+
     assembler->CheckCharacterGT('z', non_word);
     assembler->CheckCharacterLT('0', non_word);
     assembler->CheckCharacterGT('a' - 1, word);
@@ -3122,7 +3133,8 @@ AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace)
             assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
         }
         // Fall through on non-word.
-        EmitWordCheck(assembler, &before_word, &before_non_word, false);
+        EmitWordCheck(assembler, &before_word, &before_non_word, false,
+                      compiler->unicode() && compiler->ignore_case());
         // Next character is not a word character.
         assembler->Bind(&before_non_word);
         jit::Label ok;
@@ -3162,7 +3174,8 @@ AssertionNode::BacktrackIfPrevious(RegExpCompiler* compiler,
     // We already checked that we are not at the start of input so it must be
     // OK to load the previous character.
     assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
-    EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
+    EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord,
+                  compiler->unicode() && compiler->ignore_case());
 
     assembler->Bind(&fall_through);
     on_success()->Emit(compiler, &new_trace);
diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h
index 78c784aaf..1a8fd4b22 100644
--- a/js/src/irregexp/RegExpEngine.h
+++ b/js/src/irregexp/RegExpEngine.h
@@ -1195,13 +1195,14 @@ AddRange(ContainedInLattice a,
 class BoyerMoorePositionInfo
 {
   public:
-    explicit BoyerMoorePositionInfo(LifoAlloc* alloc)
+    explicit BoyerMoorePositionInfo(LifoAlloc* alloc, bool unicode_ignore_case)
       : map_(*alloc),
         map_count_(0),
         w_(kNotYet),
         s_(kNotYet),
         d_(kNotYet),
-        surrogate_(kNotYet)
+        surrogate_(kNotYet),
+        unicode_ignore_case_(unicode_ignore_case)
     {
         map_.reserve(kMapSize);
         for (int i = 0; i < kMapSize; i++)
@@ -1228,6 +1229,9 @@ class BoyerMoorePositionInfo
     ContainedInLattice s_;  // The \s character class.
     ContainedInLattice d_;  // The \d character class.
     ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
+
+    // True if the RegExp has unicode and ignoreCase flags.
+    bool unicode_ignore_case_;
 };
 
 typedef InfallibleVector<BoyerMoorePositionInfo*, 1> BoyerMoorePositionInfoVector;
diff --git a/js/src/tests/ecma_6/RegExp/unicode-ignoreCase-word-boundary.js b/js/src/tests/ecma_6/RegExp/unicode-ignoreCase-word-boundary.js
new file mode 100644
index 000000000..c1a04bd3d
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-ignoreCase-word-boundary.js
@@ -0,0 +1,25 @@
+var BUGNUMBER = 1338373;
+var summary = "Word boundary should match U+017F and U+212A in unicode+ignoreCase.";
+
+assertEq(/\b/iu.test('\u017F'), true);
+assertEq(/\b/i.test('\u017F'), false);
+assertEq(/\b/u.test('\u017F'), false);
+assertEq(/\b/.test('\u017F'), false);
+
+assertEq(/\b/iu.test('\u212A'), true);
+assertEq(/\b/i.test('\u212A'), false);
+assertEq(/\b/u.test('\u212A'), false);
+assertEq(/\b/.test('\u212A'), false);
+
+assertEq(/\B/iu.test('\u017F'), false);
+assertEq(/\B/i.test('\u017F'), true);
+assertEq(/\B/u.test('\u017F'), true);
+assertEq(/\B/.test('\u017F'), true);
+
+assertEq(/\B/iu.test('\u212A'), false);
+assertEq(/\B/i.test('\u212A'), true);
+assertEq(/\B/u.test('\u212A'), true);
+assertEq(/\B/.test('\u212A'), true);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
author	janekptacijarabaci <janekptacijarabaci@seznam.cz>	2017-08-17 20:37:46 +0200
committer	wolfbeast <mcwerewolf@gmail.com>	2018-03-12 09:52:39 +0100
commit	93f8e06bb8d8656e868679d584c7c8771ff8e42f (patch)
tree	63d3edb15082c664a91e2cc311e678d8fbe76fd2 /js
parent	1df7811e8e7943bf94ca7164c7c173a5c622db7b (diff)
download	UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.gz UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.lz UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.tar.xz UXP-93f8e06bb8d8656e868679d584c7c8771ff8e42f.zip