Issue #1677 - Part 1: Import new V8 regexp code with Mozilla's header modifications

author: Matt A. Tobin <email@mattatobin.com> 2020-11-04 19:46:11 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2020-11-04 20:27:57 -0500
commit: 78b3a722b4b91c2482fed60d7e970a3f57645456 (patch)
tree: 717c2e8f2e1a110295f525e9cca666469dbe8049 /js/src/regexp/special-case.h
parent: 2e07199197e94ed02926c77bd3bd10d187b352b0 (diff)
download: UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.gz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.lz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.xz
UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.zip
1 files changed, 79 insertions, 0 deletions
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h
new file mode 100644
index 000000000..1ccec5d31
--- /dev/null
+++ b/js/src/regexp/special-case.h
@@ -0,0 +1,79 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_SPECIAL_CASE_H_
+#define V8_REGEXP_SPECIAL_CASE_H_
+
+#ifdef V8_INTL_SUPPORT
+#include "unicode/uversion.h"
+namespace U_ICU_NAMESPACE {
+class UnicodeSet;
+}  //  namespace U_ICU_NAMESPACE
+
+namespace v8 {
+namespace internal {
+
+// Functions to build special sets of Unicode characters that need special
+// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+//
+// For the characters in the "ignore set", the process should not treat other
+// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
+// equivalent under the ECMA262 RegExp "i" mode, because these characters are
+// themselves uppercase and no other character in the set uppercases to them.
+//
+// For the characters in the "special add set", the process should add only
+// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are
+// not uppercase characters as case equivalent under the ECMA262 RegExp "i"
+// mode, and also the ONE uppercase character that the other non-uppercase
+// characters uppercase into. Other uppercase characters in the result of
+// closeOver(USET_CASE_INSENSITIVE) should not be considered, because ECMA262
+// RegExp "i" mode considers two characters as "case equivalent" if both
+// characters uppercase to the same character.
+//
+// For example, consider the following case equivalent set defined by the
+// Unicode standard. Notice there is more than one uppercase character in it:
+//  U+212B Å Angstrom Sign - an uppercase character.
+//  U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
+//  U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
+//    uppercases to U+00C5.
+// This case equivalent set is a special set and needs special handling when
+// considering "case equivalent" under the ECMA262 RegExp "i" mode, which is
+// different from the Unicode Standard:
+//  * U+212B should be included in the "ignore" set because no other
+//    characters are considered "case equivalent" to it under the ECMA262 "i"
+//    mode: U+212B is itself uppercase, but neither U+00C5 nor U+00E5
+//    uppercases to U+212B.
+//  * U+00C5 and U+00E5 will both be included in the "special add" set. While
+//    calculating the "equivalent set" under ECMA262 "i" mode, the process will
+//    add U+00E5, because it is not an uppercase character in the set. The
+//    process will also add U+00C5, because it is the uppercase character which
+//    the other non-uppercase character, U+00E5, uppercases into.
+//
+// For characters not included in "ignore set" and "special add set", the
+// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is
+// much faster.
+//
+// Under Unicode 12.0, there are only 7 characters in the "special add set" and
+// 4 characters in "ignore set", so even though the special add process is
+// slower, it is limited to a small set of cases only.
+//
+// The implementations of these two functions are generated at build time,
+// using ICU's icu::UnicodeSet, into gen/src/regexp/special-case.cc by the
+// code in src/regexp/gen-regexp-special-case.cc.
+//
+// These two functions will be used with the LazyInstance<> template to create
+// globally shareable sets, reducing memory usage and improving performance.
+
+// Function to build and return the Ignore set.
+icu::UnicodeSet BuildIgnoreSet();
+
+// Function to build and return the Special Add set.
+icu::UnicodeSet BuildSpecialAddSet();
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_INTL_SUPPORT
+
+#endif  // V8_REGEXP_SPECIAL_CASE_H_
author	Matt A. Tobin <email@mattatobin.com>	2020-11-04 19:46:11 -0500
committer	Matt A. Tobin <email@mattatobin.com>	2020-11-04 20:27:57 -0500
commit	78b3a722b4b91c2482fed60d7e970a3f57645456 (patch)
tree	717c2e8f2e1a110295f525e9cca666469dbe8049 /js/src/regexp/special-case.h
parent	2e07199197e94ed02926c77bd3bd10d187b352b0 (diff)
download	UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.gz UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.lz UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.tar.xz UXP-78b3a722b4b91c2482fed60d7e970a3f57645456.zip