Generate irregexp character tables with make_unicode.py.

author: wolfbeast <mcwerewolf@gmail.com> 2018-03-12 14:15:56 +0100
committer: wolfbeast <mcwerewolf@gmail.com> 2018-03-12 14:15:56 +0100
commit: b392e6d2ea60191615771900690b37b52b47bcd3 (patch)
tree: 64904a45624d866ff0232dc2f5b50478b17d69ba /js/src/irregexp/RegExpEngine.cpp
parent: ba74a4174d2cd6ccbabbc5aa6f4ffdf74b48f45c (diff)
download: UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar
UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.gz
UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.lz
UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.xz
UXP-b392e6d2ea60191615771900690b37b52b47bcd3.zip
1 files changed, 6 insertions, 109 deletions
diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp
index 2e19065fd..0011e976f 100644
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -31,10 +31,14 @@
 #include "irregexp/RegExpEngine.h"
 
 #include "irregexp/NativeRegExpMacroAssembler.h"
+#include "irregexp/RegExpCharacters.h" 
 #include "irregexp/RegExpMacroAssembler.h"
 #include "jit/ExecutableAllocator.h"
 #include "jit/JitCommon.h"
 
+// Generated table
+#include "irregexp/RegExpCharacters-inl.h"
+
 using namespace js;
 using namespace js::irregexp;
 
@@ -61,61 +65,6 @@ RegExpNode::RegExpNode(LifoAlloc* alloc)
     bm_info_[0] = bm_info_[1] = nullptr;
 }
 
-// -------------------------------------------------------------------
-// CharacterRange
-
-// The '2' variant has inclusive from and exclusive to.
-// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
-// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
-    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
-    0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceRangeCount = ArrayLength(kSpaceRanges);
-
-static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
-    0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
-    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges);
-static const int kWordRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
-static const int kWordRangeCount = ArrayLength(kWordRanges);
-static const int kIgnoreCaseWordRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
-    0x017F, 0x017F + 1, 0x212A, 0x212A + 1,
-    0x10000 };
-static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges);
-static const int kWordAndSurrogateRanges[] = {
-    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
-static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
-    0, '0', '9' + 1, 'A',
-    'Z' + 1, '_', '_' + 1, 'a',
-    'z' + 1, 0x017F,
-    0x017F + 1, 0x212A,
-    0x212A + 1, unicode::LeadSurrogateMin,
-    unicode::TrailSurrogateMax + 1, 0x10000,
-    0x10000 };
-static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount =
-    ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges);
-static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
-static const int kDigitRangeCount = ArrayLength(kDigitRanges);
-static const int kDigitAndSurrogateRanges[] = {
-    '0', '9' + 1,
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges);
-static const int kSurrogateRanges[] = {
-    unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
-    0x10000 };
-static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges);
-static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
-    0x2028, 0x202A, 0x10000 };
-static const int kLineTerminatorRangeCount = ArrayLength(kLineTerminatorRanges);
 static const int kMaxOneByteCharCode = 0xff;
 static const int kMaxUtf16CodeUnit = 0xffff;
 
@@ -213,7 +162,7 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
         break;
       case 'w':
         if (ignore_case)
-            AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges);
+            AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges);
         else
             AddClassEscape(alloc, type, ranges);
         break;
@@ -233,33 +182,6 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
     }
 }
 
-#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro)      \
-    /* LATIN CAPITAL LETTER Y WITH DIAERESIS */         \
-    macro(0x0178, 0x00FF)                               \
-    /* LATIN SMALL LETTER LONG S */                     \
-    macro(0x017F, 0x0073)                               \
-    /* LATIN CAPITAL LETTER SHARP S */                  \
-    macro(0x1E9E, 0x00DF)                               \
-    /* KELVIN SIGN */                                   \
-    macro(0x212A, 0x006B)                               \
-    /* ANGSTROM SIGN */                                 \
-    macro(0x212B, 0x00E5)
-
-// We need to check for the following characters: 0x39c 0x3bc 0x178.
-static inline bool
-RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
-{
-    /* TODO(dcarney): this could be a lot more efficient. */
-    if (unicode) {
-#define CHECK_RANGE(C, F) \
-        if (range.Contains(C)) return true;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE)
-#undef CHECK_RANGE
-    }
-
-    return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178);
-}
-
 static bool
 RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
 {
@@ -336,7 +258,7 @@ GetCaseIndependentLetters(char16_t character,
     // step 3.g.
     // The standard requires that non-ASCII characters cannot have ASCII
     // character codes in their equivalence class, even though this
-    // situation occurs multiple times in the unicode tables.
+    // situation occurs multiple times in the Unicode tables.
     static const unsigned kMaxAsciiCharCode = 127;
     if (upper <= kMaxAsciiCharCode) {
         if (character > kMaxAsciiCharCode) {
@@ -365,31 +287,6 @@ GetCaseIndependentLetters(char16_t character,
                                      choices, ArrayLength(choices), letters);
 }
 
-static char16_t
-ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
-{
-    MOZ_ASSERT(c > kMaxOneByteCharCode);
-    if (unicode) {
-        switch (c) {
-#define CONVERT(C, F) case C: return F;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT)
-#undef CONVERT
-        }
-    }
-
-    switch (c) {
-      // This are equivalent characters in unicode.
-      case 0x39c:
-      case 0x3bc:
-        return 0xb5;
-      // This is an uppercase of a Latin-1 character
-      // outside of Latin-1.
-      case 0x178:
-        return 0xff;
-    }
-    return 0;
-}
-
 void
 CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges)
 {
author	wolfbeast <mcwerewolf@gmail.com>	2018-03-12 14:15:56 +0100
committer	wolfbeast <mcwerewolf@gmail.com>	2018-03-12 14:15:56 +0100
commit	b392e6d2ea60191615771900690b37b52b47bcd3 (patch)
tree	64904a45624d866ff0232dc2f5b50478b17d69ba /js/src/irregexp/RegExpEngine.cpp
parent	ba74a4174d2cd6ccbabbc5aa6f4ffdf74b48f45c (diff)
download	UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.gz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.lz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.tar.xz UXP-b392e6d2ea60191615771900690b37b52b47bcd3.zip