summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--js/src/irregexp/RegExpCharacters-inl.h40
-rw-r--r--js/src/irregexp/RegExpCharacters.cpp135
-rw-r--r--js/src/irregexp/RegExpCharacters.h90
-rw-r--r--js/src/irregexp/RegExpEngine.cpp115
-rw-r--r--js/src/moz.build1
-rwxr-xr-xjs/src/vm/make_unicode.py213
6 files changed, 485 insertions, 109 deletions
diff --git a/js/src/irregexp/RegExpCharacters-inl.h b/js/src/irregexp/RegExpCharacters-inl.h
new file mode 100644
index 000000000..d001819fc
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters-inl.h
@@ -0,0 +1,40 @@
+/* Generated by make_unicode.py DO NOT MODIFY */
+/* Unicode version: 9.0.0 */
+#ifndef V8_JSREGEXPCHARACTERS_INL_H_
+#define V8_JSREGEXPCHARACTERS_INL_H_
+
+namespace js {
+
+namespace irregexp {
+
+static inline bool
+RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
+{
+ if (unicode) {
+ // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S".
+ if (range.Contains(0x017F))
+ return true;
+ // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S".
+ if (range.Contains(0x1E9E))
+ return true;
+ // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K".
+ if (range.Contains(0x212A))
+ return true;
+ // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE".
+ if (range.Contains(0x212B))
+ return true;
+ }
+
+ // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN".
+ // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN".
+ if (range.Contains(0x039C) || range.Contains(0x03BC))
+ return true;
+ // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS".
+ if (range.Contains(0x0178))
+ return true;
+ return false;
+}
+
+} } // namespace js::irregexp
+
+#endif // V8_JSREGEXPCHARACTERS_INL_H_
diff --git a/js/src/irregexp/RegExpCharacters.cpp b/js/src/irregexp/RegExpCharacters.cpp
new file mode 100644
index 000000000..096c02760
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters.cpp
@@ -0,0 +1,135 @@
+/* Generated by make_unicode.py DO NOT MODIFY */
+/* Unicode version: 9.0.0 */
+#include "irregexp/RegExpCharacters.h"
+
+#include "mozilla/Assertions.h"
+
+char16_t
+js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
+{
+ MOZ_ASSERT(c > 0xFF, "Character mustn't be Latin1");
+ if (unicode) {
+ // "LATIN SMALL LETTER LONG S" case folds to "LATIN SMALL LETTER S".
+ if (c == 0x017F)
+ return 0x73;
+ // "LATIN CAPITAL LETTER SHARP S" case folds to "LATIN SMALL LETTER SHARP S".
+ if (c == 0x1E9E)
+ return 0xDF;
+ // "KELVIN SIGN" case folds to "LATIN SMALL LETTER K".
+ if (c == 0x212A)
+ return 0x6B;
+ // "ANGSTROM SIGN" case folds to "LATIN SMALL LETTER A WITH RING ABOVE".
+ if (c == 0x212B)
+ return 0xE5;
+ }
+
+ // "GREEK CAPITAL LETTER MU" case maps to "MICRO SIGN".
+ // "GREEK SMALL LETTER MU" case maps to "MICRO SIGN".
+ if (c == 0x039C || c == 0x03BC)
+ return 0xB5;
+ // "LATIN CAPITAL LETTER Y WITH DIAERESIS" case maps to "LATIN SMALL LETTER Y WITH DIAERESIS".
+ if (c == 0x0178)
+ return 0xFF;
+ return 0;
+}
+
+const int js::irregexp::kSpaceRanges[] = {
+ 0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR)
+ 0x0020, 0x0020 + 1, // SPACE
+ 0x00A0, 0x00A0 + 1, // NO-BREAK SPACE
+ 0x1680, 0x1680 + 1, // OGHAM SPACE MARK
+ 0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE
+ 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+ 0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE
+ 0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE
+ 0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE
+ 0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE
+ 0xFFFF + 1
+};
+const int js::irregexp::kSpaceRangeCount = 21;
+
+const int js::irregexp::kSpaceAndSurrogateRanges[] = {
+ 0x0009, 0x000D + 1, // CHARACTER TABULATION..CARRIAGE RETURN (CR)
+ 0x0020, 0x0020 + 1, // SPACE
+ 0x00A0, 0x00A0 + 1, // NO-BREAK SPACE
+ 0x1680, 0x1680 + 1, // OGHAM SPACE MARK
+ 0x2000, 0x200A + 1, // EN QUAD..HAIR SPACE
+ 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+ 0x202F, 0x202F + 1, // NARROW NO-BREAK SPACE
+ 0x205F, 0x205F + 1, // MEDIUM MATHEMATICAL SPACE
+ 0x3000, 0x3000 + 1, // IDEOGRAPHIC SPACE
+ 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+ 0xFEFF, 0xFEFF + 1, // ZERO WIDTH NO-BREAK SPACE
+ 0xFFFF + 1
+};
+const int js::irregexp::kSpaceAndSurrogateRangeCount = 23;
+
+const int js::irregexp::kWordRanges[] = {
+ 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+ 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+ 0x005F, 0x005F + 1, // LOW LINE
+ 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+ 0xFFFF + 1
+};
+const int js::irregexp::kWordRangeCount = 9;
+
+const int js::irregexp::kIgnoreCaseWordRanges[] = {
+ 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+ 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+ 0x005F, 0x005F + 1, // LOW LINE
+ 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+ 0x017F, 0x017F + 1, // LATIN SMALL LETTER LONG S
+ 0x212A, 0x212A + 1, // KELVIN SIGN
+ 0xFFFF + 1
+};
+const int js::irregexp::kIgnoreCaseWordRangeCount = 13;
+
+const int js::irregexp::kWordAndSurrogateRanges[] = {
+ 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+ 0x0041, 0x005A + 1, // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
+ 0x005F, 0x005F + 1, // LOW LINE
+ 0x0061, 0x007A + 1, // LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+ 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+ 0xFFFF + 1
+};
+const int js::irregexp::kWordAndSurrogateRangeCount = 11;
+
+const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
+ 0x0000, 0x002F + 1, // NULL..SOLIDUS
+ 0x003A, 0x0040 + 1, // COLON..COMMERCIAL AT
+ 0x005B, 0x005E + 1, // LEFT SQUARE BRACKET..CIRCUMFLEX ACCENT
+ 0x0060, 0x0060 + 1, // GRAVE ACCENT
+ 0x007B, 0x017E + 1, // LEFT CURLY BRACKET..LATIN SMALL LETTER Z WITH CARON
+ 0x0180, 0x2129 + 1, // LATIN SMALL LETTER B WITH STROKE..TURNED GREEK SMALL LETTER IOTA
+ 0x212B, 0xD7FF + 1, // ANGSTROM SIGN..<Unused>
+ 0xE000, 0xFFFF + 1, // Private Use..<Unused>
+ 0xFFFF + 1
+};
+const int js::irregexp::kNegatedIgnoreCaseWordAndSurrogateRangeCount = 17;
+
+const int js::irregexp::kDigitRanges[] = {
+ 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+ 0xFFFF + 1
+};
+const int js::irregexp::kDigitRangeCount = 3;
+
+const int js::irregexp::kDigitAndSurrogateRanges[] = {
+ 0x0030, 0x0039 + 1, // DIGIT ZERO..DIGIT NINE
+ 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+ 0xFFFF + 1
+};
+const int js::irregexp::kDigitAndSurrogateRangeCount = 5;
+
+const int js::irregexp::kSurrogateRanges[] = {
+ 0xD800, 0xDFFF + 1, // <Lead Surrogate Min>..<Trail Surrogate Max>
+ 0xFFFF + 1
+};
+const int js::irregexp::kSurrogateRangeCount = 3;
+
+const int js::irregexp::kLineTerminatorRanges[] = {
+ 0x000A, 0x000A + 1, // LINE FEED (LF)
+ 0x000D, 0x000D + 1, // CARRIAGE RETURN (CR)
+ 0x2028, 0x2029 + 1, // LINE SEPARATOR..PARAGRAPH SEPARATOR
+ 0xFFFF + 1
+};
+const int js::irregexp::kLineTerminatorRangeCount = 7;
diff --git a/js/src/irregexp/RegExpCharacters.h b/js/src/irregexp/RegExpCharacters.h
new file mode 100644
index 000000000..0d3cf096f
--- /dev/null
+++ b/js/src/irregexp/RegExpCharacters.h
@@ -0,0 +1,90 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99: */
+
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_JSREGEXPCHARACTERS_H_
+#define V8_JSREGEXPCHARACTERS_H_
+
+namespace js {
+
+namespace irregexp {
+// Maps non-Latin1 |c| to its Latin1 case equivalent (0 if none); |unicode| adds case folding.
+char16_t
+ConvertNonLatin1ToLatin1(char16_t c, bool unicode);
+
+// -------------------------------------------------------------------
+// CharacterRange
+// Each table is a flat list of (from, to) pairs, inclusive from and exclusive to, followed by
+// a single trailing 0xFFFF + 1 entry; k<Name>RangeCount is its length including that entry.
+
+// This covers \s as defined in ES2016, 21.2.2.12 CharacterClassEscape,
+// which includes WhiteSpace (11.2) and LineTerminator (11.3) values.
+extern const int kSpaceRanges[];
+extern const int kSpaceRangeCount;
+
+// Characters in \s and additionally all surrogate characters.
+extern const int kSpaceAndSurrogateRanges[];
+extern const int kSpaceAndSurrogateRangeCount;
+
+// This covers \w as defined in ES2016, 21.2.2.12 CharacterClassEscape.
+extern const int kWordRanges[];
+extern const int kWordRangeCount;
+
+// Characters in \w and additionally all characters which case-fold to characters in \w.
+extern const int kIgnoreCaseWordRanges[];
+extern const int kIgnoreCaseWordRangeCount;
+
+// Characters in \w and additionally all surrogate characters.
+extern const int kWordAndSurrogateRanges[];
+extern const int kWordAndSurrogateRangeCount;
+
+// All characters in 0x0000..0xFFFF excluding those in kIgnoreCaseWordRanges and excluding all
+// surrogate characters.
+extern const int kNegatedIgnoreCaseWordAndSurrogateRanges[];
+extern const int kNegatedIgnoreCaseWordAndSurrogateRangeCount;
+
+// This covers \d as defined in ES2016, 21.2.2.12 CharacterClassEscape.
+extern const int kDigitRanges[];
+extern const int kDigitRangeCount;
+
+// Characters in \d and additionally all surrogate characters.
+extern const int kDigitAndSurrogateRanges[];
+extern const int kDigitAndSurrogateRangeCount;
+
+// The range of all surrogate characters (0xD800..0xDFFF).
+extern const int kSurrogateRanges[];
+extern const int kSurrogateRangeCount;
+
+// Line terminators as defined in ES2016, 11.3 LineTerminator.
+extern const int kLineTerminatorRanges[];
+extern const int kLineTerminatorRangeCount;
+
+} } // namespace js::irregexp
+
+#endif // V8_JSREGEXPCHARACTERS_H_
diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp
index 2e19065fd..0011e976f 100644
--- a/js/src/irregexp/RegExpEngine.cpp
+++ b/js/src/irregexp/RegExpEngine.cpp
@@ -31,10 +31,14 @@
#include "irregexp/RegExpEngine.h"
#include "irregexp/NativeRegExpMacroAssembler.h"
+#include "irregexp/RegExpCharacters.h"
#include "irregexp/RegExpMacroAssembler.h"
#include "jit/ExecutableAllocator.h"
#include "jit/JitCommon.h"
+// Generated table
+#include "irregexp/RegExpCharacters-inl.h"
+
using namespace js;
using namespace js::irregexp;
@@ -61,61 +65,6 @@ RegExpNode::RegExpNode(LifoAlloc* alloc)
bm_info_[0] = bm_info_[1] = nullptr;
}
-// -------------------------------------------------------------------
-// CharacterRange
-
-// The '2' variant has inclusive from and exclusive to.
-// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
-// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
- 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
- 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
- 0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceRangeCount = ArrayLength(kSpaceRanges);
-
-static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
- 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
- 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
- unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
- 0xFEFF, 0xFF00, 0x10000 };
-static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges);
-static const int kWordRanges[] = {
- '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
-static const int kWordRangeCount = ArrayLength(kWordRanges);
-static const int kIgnoreCaseWordRanges[] = {
- '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
- 0x017F, 0x017F + 1, 0x212A, 0x212A + 1,
- 0x10000 };
-static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges);
-static const int kWordAndSurrogateRanges[] = {
- '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
- unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
- 0x10000 };
-static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
-static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
- 0, '0', '9' + 1, 'A',
- 'Z' + 1, '_', '_' + 1, 'a',
- 'z' + 1, 0x017F,
- 0x017F + 1, 0x212A,
- 0x212A + 1, unicode::LeadSurrogateMin,
- unicode::TrailSurrogateMax + 1, 0x10000,
- 0x10000 };
-static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount =
- ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges);
-static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
-static const int kDigitRangeCount = ArrayLength(kDigitRanges);
-static const int kDigitAndSurrogateRanges[] = {
- '0', '9' + 1,
- unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
- 0x10000 };
-static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges);
-static const int kSurrogateRanges[] = {
- unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
- 0x10000 };
-static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges);
-static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
- 0x2028, 0x202A, 0x10000 };
-static const int kLineTerminatorRangeCount = ArrayLength(kLineTerminatorRanges);
static const int kMaxOneByteCharCode = 0xff;
static const int kMaxUtf16CodeUnit = 0xffff;
@@ -213,7 +162,7 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
break;
case 'w':
if (ignore_case)
- AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges);
+ AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordRangeCount, ranges);
else
AddClassEscape(alloc, type, ranges);
break;
@@ -233,33 +182,6 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
}
}
-#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro) \
- /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ \
- macro(0x0178, 0x00FF) \
- /* LATIN SMALL LETTER LONG S */ \
- macro(0x017F, 0x0073) \
- /* LATIN CAPITAL LETTER SHARP S */ \
- macro(0x1E9E, 0x00DF) \
- /* KELVIN SIGN */ \
- macro(0x212A, 0x006B) \
- /* ANGSTROM SIGN */ \
- macro(0x212B, 0x00E5)
-
-// We need to check for the following characters: 0x39c 0x3bc 0x178.
-static inline bool
-RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
-{
- /* TODO(dcarney): this could be a lot more efficient. */
- if (unicode) {
-#define CHECK_RANGE(C, F) \
- if (range.Contains(C)) return true;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE)
-#undef CHECK_RANGE
- }
-
- return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178);
-}
-
static bool
RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
{
@@ -336,7 +258,7 @@ GetCaseIndependentLetters(char16_t character,
// step 3.g.
// The standard requires that non-ASCII characters cannot have ASCII
// character codes in their equivalence class, even though this
- // situation occurs multiple times in the unicode tables.
+ // situation occurs multiple times in the Unicode tables.
static const unsigned kMaxAsciiCharCode = 127;
if (upper <= kMaxAsciiCharCode) {
if (character > kMaxAsciiCharCode) {
@@ -365,31 +287,6 @@ GetCaseIndependentLetters(char16_t character,
choices, ArrayLength(choices), letters);
}
-static char16_t
-ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
-{
- MOZ_ASSERT(c > kMaxOneByteCharCode);
- if (unicode) {
- switch (c) {
-#define CONVERT(C, F) case C: return F;
-FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT)
-#undef CONVERT
- }
- }
-
- switch (c) {
- // This are equivalent characters in unicode.
- case 0x39c:
- case 0x3bc:
- return 0xb5;
- // This is an uppercase of a Latin-1 character
- // outside of Latin-1.
- case 0x178:
- return 0xff;
- }
- return 0;
-}
-
void
CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges)
{
diff --git a/js/src/moz.build b/js/src/moz.build
index 77acb10b9..a18170a75 100644
--- a/js/src/moz.build
+++ b/js/src/moz.build
@@ -196,6 +196,7 @@ UNIFIED_SOURCES += [
'gc/Zone.cpp',
'irregexp/NativeRegExpMacroAssembler.cpp',
'irregexp/RegExpAST.cpp',
+ 'irregexp/RegExpCharacters.cpp',
'irregexp/RegExpEngine.cpp',
'irregexp/RegExpInterpreter.cpp',
'irregexp/RegExpMacroAssembler.cpp',
diff --git a/js/src/vm/make_unicode.py b/js/src/vm/make_unicode.py
index 5565d7d14..73c090ac9 100755
--- a/js/src/vm/make_unicode.py
+++ b/js/src/vm/make_unicode.py
@@ -133,6 +133,17 @@ def read_derived_core_properties(derived_core_properties):
for char in range(int(start, 16), int(end, 16) + 1):
yield (char, char_property)
+def int_ranges(ints):
+    """ Yields consecutive ranges (inclusive) from integer values.
+
+    The input may be any iterable of integers, in any order; duplicates are
+    ignored. Each yielded (start, end) tuple covers a maximal run of
+    consecutive values, and the tuples are produced in ascending order.
+    An empty input yields nothing.
+    """
+    # De-duplicate first: a repeated value would otherwise close the current
+    # run early and produce overlapping ranges.
+    values = sorted(set(ints))
+    if not values:
+        return
+
+    start = prev = values[0]
+    for curr in values[1:]:
+        if curr != prev + 1:
+            # A gap closes the current run and opens the next one.
+            yield (start, prev)
+            start = curr
+        prev = curr
+    # The final run is never closed by a gap, so emit it explicitly.
+    yield (start, prev)
+
def utf16_encode(code):
NonBMPMin = 0x10000
LeadSurrogateMin = 0xD800
@@ -740,6 +751,204 @@ def splitbins(t):
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
return best
+def make_irregexp_tables(version,
+                         table, index,
+                         folding_table, folding_index,
+                         test_table):
+    """ Writes the generated irregexp character sources.
+
+    Two files are produced, relative to the current working directory:
+    ../irregexp/RegExpCharacters-inl.h, defining
+    RangeContainsLatin1Equivalents(), and ../irregexp/RegExpCharacters.cpp,
+    defining ConvertNonLatin1ToLatin1() and the k<Name>Ranges /
+    k<Name>RangeCount character class tables.
+
+    version -- Unicode version, formatted into the header of both files.
+    table, index -- case info lookup; table[index[code]] is an
+        (upper delta, lower delta, flags) tuple.
+    folding_table, folding_index -- case folding lookup; the first element
+        of folding_table[folding_index[code]] is the folding delta.
+    test_table -- maps a code point to a 4-tuple whose last two elements
+        are the character's name and alias.
+    """
+    import string
+    from functools import partial
+    from itertools import ifilter, imap
+
+    MAX_ASCII = 0x7F
+    MAX_LATIN1 = 0xFF
+    LEAD_SURROGATE_MIN = 0xD800
+    TRAIL_SURROGATE_MAX = 0xDFFF
+
+    def hex2(n):
+        # Two-digit C hex literal.
+        assert 0 <= n and n < 16**2
+        return '0x{:02X}'.format(n)
+
+    def hex4(n):
+        # Four-digit C hex literal.
+        assert 0 <= n and n < 16**4
+        return '0x{:04X}'.format(n)
+
+    def case_info(code):
+        # The table stores deltas; apply them modulo 2**16.
+        assert 0 <= code and code <= MAX_BMP
+        (upper, lower, flags) = table[index[code]]
+        return ((code + upper) & 0xffff, (code + lower) & 0xffff, flags)
+
+    def is_space(code):
+        (_, _, flags) = case_info(code)
+        return bool(flags & FLAG_SPACE)
+
+    def to_upper(code):
+        (upper, _, _) = case_info(code)
+        return upper
+
+    def casefold(code):
+        # The folding table stores deltas as well.
+        assert 0 <= code and code <= MAX_BMP
+        (folding, _, _, _) = folding_table[folding_index[code]]
+        return (code + folding) & 0xffff
+
+    def casefolds_to_ascii(code):
+        return casefold(code) <= MAX_ASCII
+
+    def casefolds_to_latin1(code):
+        return casefold(code) <= MAX_LATIN1
+
+    def casemaps_to_nonlatin1(code):
+        upper = to_upper(code)
+        return upper > MAX_LATIN1
+
+    def char_name(code):
+        # Human readable name used in the comments of the generated tables.
+        assert 0 <= code and code <= MAX_BMP
+        if code not in test_table:
+            return '<Unused>'
+        if code == LEAD_SURROGATE_MIN:
+            return '<Lead Surrogate Min>'
+        if code == TRAIL_SURROGATE_MAX:
+            return '<Trail Surrogate Max>'
+        (_, _, name, alias) = test_table[code]
+        return name if not name.startswith('<') else alias
+
+    def write_character_range(println, name, characters):
+        # Emits the k<name>Ranges table: one "from, to + 1" pair per
+        # consecutive run, then the trailing MAX_BMP + 1 entry, followed by
+        # k<name>RangeCount (two entries per run plus the trailing one).
+        char_ranges = list(int_ranges(characters))
+        println('')
+        println('const int js::irregexp::k{}Ranges[] = {{'.format(name))
+        for (start, end) in char_ranges:
+            s_name = char_name(start)
+            e_name = char_name(end)
+            println(' {}, {} + 1, // {}'.format(hex4(start), hex4(end),
+                                                '{}..{}'.format(s_name, e_name)
+                                                if start != end else s_name))
+        println(' {} + 1'.format(hex4(MAX_BMP)))
+        println('};')
+        println('const int js::irregexp::k{}RangeCount = {};'.format(name,
+                                                                     len(char_ranges) * 2 + 1))
+
+    def write_character_test(println, test, consequent, default):
+        # Emits the body of a function which tests for non-Latin1 characters
+        # with a Latin1 equivalent. |test| renders the condition for a
+        # non-Latin1 character, |consequent| renders the value returned for
+        # the matching Latin1 character and |default| is returned otherwise.
+
+        # Latin1 characters which, when case-mapped through
+        # String.prototype.toUpperCase(), canonicalize to a non-Latin1 character.
+        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
+        casemapped_to_nonlatin1 = ifilter(casemaps_to_nonlatin1, xrange(0, MAX_LATIN1 + 1))
+
+        def casemap_closure(ch):
+            upper = to_upper(ch)
+            return (ch, [c for c in xrange(MAX_LATIN1 + 1, MAX_BMP + 1) if upper == to_upper(c)])
+
+        # Mapping from Latin1 characters to the list of case map equivalent
+        # non-Latin1 characters.
+        casemap_for_latin1 = dict(imap(casemap_closure, casemapped_to_nonlatin1))
+
+        # Non-latin1 characters which, when Unicode case-folded, canonicalize to
+        # a Latin1 character.
+        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
+        casefolded_to_latin1 = ifilter(casefolds_to_latin1, xrange(MAX_LATIN1 + 1, MAX_BMP + 1))
+
+        println(' if (unicode) {')
+        for ch in casefolded_to_latin1:
+            casefolded = casefold(ch)
+            # Skip if also handled below for case mapping.
+            if casefolded in casemap_for_latin1 and ch in casemap_for_latin1[casefolded]:
+                continue
+            println(' // "{}" case folds to "{}".'.format(char_name(ch),
+                                                          char_name(casefolded)))
+            println(' if ({})'.format(test(ch)))
+            println(' return {};'.format(consequent(casefolded)))
+        println(' }')
+        println('')
+        # Iterate in code point order so that the generated output does not
+        # depend on dict ordering.
+        for (ch, casemapped_chars) in sorted(casemap_for_latin1.iteritems()):
+            for casemapped in casemapped_chars:
+                println(' // "{}" case maps to "{}".'.format(char_name(casemapped),
+                                                             char_name(ch)))
+            println(' if ({})'.format(' || '.join(imap(test, casemapped_chars))))
+            println(' return {};'.format(consequent(ch)))
+        println(' return {};'.format(default))
+
+    with io.open('../irregexp/RegExpCharacters-inl.h', 'wb') as chars_file:
+        write = partial(print, file=chars_file, sep='', end='')
+        println = partial(write, end='\n')
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+
+        println('#ifndef V8_JSREGEXPCHARACTERS_INL_H_')
+        println('#define V8_JSREGEXPCHARACTERS_INL_H_')
+        println('')
+        println('namespace js {')
+        println('')
+        println('namespace irregexp {')
+        println('')
+
+        println('static inline bool')
+        println('RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)')
+        println('{')
+        write_character_test(println, lambda ch: 'range.Contains({})'.format(hex4(ch)),
+                             lambda _: 'true', 'false')
+        println('}')
+
+        println('')
+        println('} } // namespace js::irregexp')
+        println('')
+        println('#endif // V8_JSREGEXPCHARACTERS_INL_H_')
+
+    with io.open('../irregexp/RegExpCharacters.cpp', 'wb') as chars_file:
+        write = partial(print, file=chars_file, sep='', end='')
+        println = partial(write, end='\n')
+        character_range = partial(write_character_range, println)
+
+        # Characters in \s, 21.2.2.12 CharacterClassEscape.
+        space_chars = filter(is_space, xrange(0, MAX_BMP + 1))
+
+        # Characters in \d, 21.2.2.12 CharacterClassEscape.
+        digit_chars = map(ord, string.digits)
+        assert all(ch <= MAX_ASCII for ch in digit_chars)
+
+        # Characters in \w, 21.2.2.12 CharacterClassEscape.
+        word_chars = map(ord, string.digits + string.ascii_letters + '_')
+        assert all(ch <= MAX_ASCII for ch in word_chars)
+
+        # Characters which case-fold to characters in \w.
+        ignorecase_word_chars = (word_chars +
+                                 filter(casefolds_to_ascii, xrange(MAX_ASCII + 1, MAX_BMP + 1)))
+
+        # Surrogate characters.
+        surrogate_chars = range(LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX + 1)
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        println('#include "irregexp/RegExpCharacters.h"')
+        println('')
+        println('#include "mozilla/Assertions.h"')
+        println('')
+
+        println('char16_t')
+        println('js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)')
+        println('{')
+        println(' MOZ_ASSERT(c > {}, "Character mustn\'t be Latin1");'.format(hex2(MAX_LATIN1)))
+        write_character_test(println, lambda ch: 'c == {}'.format(hex4(ch)), hex2, '0')
+        println('}')
+
+        character_range('Space', space_chars)
+        character_range('SpaceAndSurrogate', space_chars + surrogate_chars)
+
+        character_range('Word', word_chars)
+        character_range('IgnoreCaseWord', ignorecase_word_chars)
+        character_range('WordAndSurrogate', word_chars + surrogate_chars)
+        character_range('NegatedIgnoreCaseWordAndSurrogate',
+                        set(xrange(0, MAX_BMP + 1)) - set(ignorecase_word_chars + surrogate_chars))
+
+        character_range('Digit', digit_chars)
+        character_range('DigitAndSurrogate', digit_chars + surrogate_chars)
+
+        character_range('Surrogate', surrogate_chars)
+
+        character_range('LineTerminator', line_terminator)
+
def update_unicode(args):
import urllib2
@@ -807,6 +1016,10 @@ def update_unicode(args):
make_non_bmp_file(unicode_version,
non_bmp_lower_map, non_bmp_upper_map,
non_bmp_folding_map, non_bmp_rev_folding_map)
+ make_irregexp_tables(unicode_version,
+ table, index,
+ folding_table, folding_index,
+ test_table)
make_bmp_mapping_test(unicode_version, test_table)
make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)