1 files changed, 2362 insertions, 0 deletions
diff --git a/intl/icu/source/common/uniset.cpp b/intl/icu/source/common/uniset.cpp
new file mode 100644
index 000000000..7866719a1
--- /dev/null
+++ b/intl/icu/source/common/uniset.cpp
@@ -0,0 +1,2362 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+**********************************************************************
+*   Copyright (C) 1999-2015, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/20/99    alan        Creation.
+**********************************************************************
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/parsepos.h"
+#include "unicode/symtable.h"
+#include "unicode/uniset.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
+#include "ruleiter.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "patternprops.h"
+#include "uelement.h"
+#include "util.h"
+#include "uvector.h"
+#include "charstr.h"
+#include "ustrfmt.h"
+#include "uassert.h"
+#include "bmpset.h"
+#include "unisetspan.h"
+
+// Pattern-syntax characters as UChar constants, all written in hex for EBCDIC
+// compatibility. #define is used to reduce private static exports and memory access time.
+#define SET_OPEN        ((UChar)0x005B) /*[*/
+#define SET_CLOSE       ((UChar)0x005D) /*]*/
+#define HYPHEN          ((UChar)0x002D) /*-*/
+#define COMPLEMENT      ((UChar)0x005E) /*^*/
+#define COLON           ((UChar)0x003A) /*:*/
+#define BACKSLASH       ((UChar)0x005C) /*\*/
+#define INTERSECTION    ((UChar)0x0026) /*&*/
+#define UPPER_U         ((UChar)0x0055) /*U*/
+#define LOWER_U         ((UChar)0x0075) /*u*/
+#define OPEN_BRACE      ((UChar)0x007B) /*{*/
+#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
+#define UPPER_P         ((UChar)0x0050) /*P*/
+#define LOWER_P         ((UChar)0x0070) /*p*/
+#define UPPER_N         ((UChar)0x004E) /*N*/
+#define EQUALS          ((UChar)0x003D) /*=*/
+
+// HIGH is greater than every valid value: 0x110000 for code points.
+#define UNICODESET_HIGH 0x0110000
+
+// LOW is less than or equal to every valid value: zero for code points.
+#define UNICODESET_LOW 0x000000
+
+// Number of spare list entries allocated up front. Must be >= 0
+#define START_EXTRA 16
+
+// Number of spare list entries added on growth. Must be >= 0
+#define GROW_EXTRA START_EXTRA
+
+U_NAMESPACE_BEGIN
+
+SymbolTable::~SymbolTable() {} // empty destructor, defined out of line in this file
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) // ICU macro; presumably emits the class-ID (RTTI) functions for UnicodeSet — TODO confirm
+
+/**
+ * Clamps a UChar32 into the range of valid code points, in place.
+ * Values below UNICODESET_LOW are raised to UNICODESET_LOW and
+ * values above UNICODESET_HIGH-1 are lowered to UNICODESET_HIGH-1;
+ * anything already in range is left untouched.
+ * @param c the value to clamp; overwritten with the result
+ * @return the clamped value, identical to c after the call
+ */
+static inline UChar32 pinCodePoint(UChar32& c) {
+    const UChar32 lowest = UNICODESET_LOW;
+    const UChar32 highest = UNICODESET_HIGH - 1;
+    c = (c < lowest) ? lowest : ((c > highest) ? highest : c);
+    return c;
+}
+
+//----------------------------------------------------------------
+// Debugging
+//----------------------------------------------------------------
+
+// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
+// To enable the debugging, define the symbol DEBUG_MEM in the line
+// below.  This will result in text being sent to stdout that looks
+// like this:
+//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
+//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
+// Each line lists a construction (ct) or destruction (dt) event, the
+// object address, the number of outstanding objects after the event,
+// and the pattern of the object in question.
+
+// #define DEBUG_MEM
+
+#ifdef DEBUG_MEM
+#include <stdio.h>
+static int32_t _dbgCount = 0; // outstanding objects: incremented by _dbgct, decremented by _dbgdt
+
+static inline void _dbgct(UnicodeSet* set) { // prints one "ct" (construction) line for the given set
+    UnicodeString str;
+    set->toPattern(str, TRUE);
+    char buf[40];
+    str.extract(0, 39, buf, ""); // presumably copies at most 39 chars so buf keeps room for a terminator — TODO confirm
+    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); // NOTE(review): a pointer is passed for %X; %p with a void* cast would be the portable form
+}
+
+static inline void _dbgdt(UnicodeSet* set) { // prints one "dt" (destruction) line for the given set
+    UnicodeString str;
+    set->toPattern(str, TRUE); // reads the set, so the caller must invoke this before tearing the set down
+    char buf[40];
+    str.extract(0, 39, buf, ""); // same truncation as in _dbgct
+    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); // NOTE(review): same pointer/%X mismatch as in _dbgct
+}
+
+#else // DEBUG_MEM is off: both hooks expand to nothing
+
+#define _dbgct(set)
+#define _dbgdt(set)
+
+#endif
+
+//----------------------------------------------------------------
+// UnicodeString in UVector support
+//----------------------------------------------------------------
+
+static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { // element-copy callback handed to strings->assign()
+    dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); // stores a new heap copy in dst; NOTE(review): the result of new is not checked here
+}
+
+static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { // ordering callback handed to strings->sortedInsert()
+    const UnicodeString *lhs = (const UnicodeString*)t1.pointer;
+    const UnicodeString *rhs = (const UnicodeString*)t2.pointer;
+    return lhs->compare(*rhs); // the result of UnicodeString::compare is passed through unchanged
+}
+
+//----------------------------------------------------------------
+// Constructors &c
+//----------------------------------------------------------------
+
+/**
+ * Constructs an empty set.  If an allocation fails, the set is bogus.
+ */
+UnicodeSet::UnicodeSet() :
+    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
+    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
+    fFlags(0)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    allocateStrings(status);
+    if (!U_FAILURE(status)) {
+        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
+    }
+    if (list == NULL) {
+        // The strings or the list could not be allocated: mark bogus.
+        setToBogus();
+        return;
+    }
+    // A list holding only the HIGH terminator represents the empty set.
+    list[0] = UNICODESET_HIGH;
+    _dbgct(this);
+}
+
+/**
+ * Constructs a set containing the given range. If <code>start >
+ * end</code> then an empty set is created.  If an allocation fails,
+ * the set is bogus.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
+ */
+UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
+    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
+    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
+    fFlags(0)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    allocateStrings(status);
+    if (!U_FAILURE(status)) {
+        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
+    }
+    if (list == NULL) {
+        // The strings or the list could not be allocated: mark bogus.
+        setToBogus();
+        return;
+    }
+    // Start from the empty set, then toggle [start, end] into it.
+    list[0] = UNICODESET_HIGH;
+    complement(start, end);
+    _dbgct(this);
+}
+
+/**
+ * Constructs a set that is identical to the given UnicodeSet.
+ * If an allocation fails, the new set is bogus.
+ */
+UnicodeSet::UnicodeSet(const UnicodeSet& o) :
+    UnicodeFilter(o),
+    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
+    bmpSet(0),
+    buffer(0), bufferCapacity(0),
+    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
+    fFlags(0)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    allocateStrings(status);
+    if (!U_FAILURE(status)) {
+        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
+    }
+    if (list == NULL) {
+        // The strings or the list could not be allocated: mark bogus.
+        setToBogus();
+        return;
+    }
+    *this = o; // the assignment operator performs the actual copy
+    _dbgct(this);
+}
+
+// Copy-construct as thawed: the ranges, strings and pattern of o are
+// copied, but not its bmpSet or stringSpan.  If an allocation fails,
+// the new set is bogus.
+UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
+    UnicodeFilter(o),
+    len(0), capacity(o.len + GROW_EXTRA), list(0),
+    bmpSet(0),
+    buffer(0), bufferCapacity(0),
+    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
+    fFlags(0)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    allocateStrings(status);
+    if (!U_FAILURE(status)) {
+        list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
+    }
+    if (list == NULL || strings == NULL || o.strings == NULL) {
+        // An allocation failed, or one of the string lists is invalid.
+        setToBogus();
+        return;
+    }
+    // *this = o except for bmpSet and stringSpan
+    len = o.len;
+    uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
+    strings->assign(*o.strings, cloneUnicodeString, status);
+    if (U_FAILURE(status)) { // The strings could not all be copied.
+        setToBogus();
+        return;
+    }
+    if (o.pat) {
+        setPattern(UnicodeString(o.pat, o.patLen));
+    }
+    _dbgct(this);
+}
+
+/**
+ * Destructs the set, releasing the buffers and helper objects it holds.
+ */
+UnicodeSet::~UnicodeSet() {
+    _dbgdt(this); // first!  With DEBUG_MEM on, the hook still reads the set's pattern.
+    uprv_free(list);
+    delete bmpSet;
+    if (buffer) {
+        uprv_free(buffer);
+    }
+    delete strings;
+    delete stringSpan;
+    releasePattern(); // presumably frees pat — TODO confirm
+}
+
+/**
+ * Assigns this object to be a copy of another.  A frozen set is left
+ * unchanged.  Errors cannot be reported; where noted the set turns bogus.
+ */
+UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
+    if (this == &o || isFrozen()) {
+        return *this; // self-assignment or a frozen target: nothing to do
+    }
+    if (o.isBogus()) {
+        setToBogus();
+        return *this;
+    }
+    UErrorCode ec = U_ZERO_ERROR;
+    ensureCapacity(o.len, ec);
+    if (U_FAILURE(ec)) {
+        return *this; // There is no way to report this error :-(
+    }
+    len = o.len;
+    uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
+    if (o.bmpSet == NULL) {
+        bmpSet = NULL;
+    } else {
+        bmpSet = new BMPSet(*o.bmpSet, list, len);
+        if (bmpSet == NULL) { // Check for memory allocation error.
+            setToBogus();
+            return *this;
+        }
+    }
+    if (strings == NULL || o.strings == NULL) { // Invalid strings.
+        setToBogus();
+        return *this;
+    }
+    strings->assign(*o.strings, cloneUnicodeString, ec);
+    if (U_FAILURE(ec)) { // The strings could not all be copied.
+        setToBogus();
+        return *this;
+    }
+    if (o.stringSpan == NULL) {
+        stringSpan = NULL;
+    } else {
+        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
+        if (stringSpan == NULL) { // Check for memory allocation error.
+            setToBogus();
+            return *this;
+        }
+    }
+    releasePattern();
+    if (o.pat) {
+        setPattern(UnicodeString(o.pat, o.patLen));
+    }
+    return *this;
+}
+
+/**
+ * Returns a heap-allocated copy of this object.  UnicodeMatchers must be
+ * cloneable so that users such as Transliterator can implement cloning.
+ */
+UnicodeFunctor* UnicodeSet::clone() const {
+    UnicodeSet* copy = new UnicodeSet(*this);
+    return copy;
+}
+
+UnicodeFunctor *UnicodeSet::cloneAsThawed() const { // clones through the "as thawed" copy constructor
+    return new UnicodeSet(*this, TRUE); // the UBool only selects that overload; the constructor leaves it unnamed
+}
+
+/**
+ * Compares the specified object with this set for equality.  Two sets
+ * are equal when their inversion lists have the same length and the
+ * same entries, and their string collections compare equal; that is,
+ * every member of one set is contained in the other.
+ * The code point lists are compared first, entry by entry, and the
+ * strings are only consulted once the lists are known to match.
+ * @param o set to be compared for equality with this set.
+ * @return <tt>true</tt> if the specified set is equal to this set.
+ */
+UBool UnicodeSet::operator==(const UnicodeSet& o) const {
+    UBool sameRanges = (len == o.len);
+    for (int32_t k = 0; sameRanges && k < len; ++k) {
+        sameRanges = (list[k] == o.list[k]);
+    }
+    if (!sameRanges) return FALSE;
+    return (*strings != *o.strings) ? FALSE : TRUE;
+}
+
+/**
+ * Returns the hash code value for this set, computed from the length
+ * and entries of the inversion list only (strings do not contribute).
+ * The arithmetic is unsigned so that wrap-around is well defined.
+ * @return the hash code value for this set.
+ */
+int32_t UnicodeSet::hashCode(void) const {
+    uint32_t result = (uint32_t)len;
+    for (int32_t i = 0; i < len; ++i) {
+        result *= 1000003u; // wraps modulo 2^32 instead of overflowing a signed int
+        result += (uint32_t)list[i];
+    }
+    return (int32_t)result;
+}
+
+//----------------------------------------------------------------
+// Public API
+//----------------------------------------------------------------
+
+/**
+ * Returns the number of elements in this set (its cardinality).
+ * Note that the elements of a set may include both individual
+ * code points and strings: every code point of every range counts
+ * once, and so does every string.
+ *
+ * @return the number of elements in this set (its cardinality).
+ */
+UBool_placeholder_removed
+
+/**
+ * Returns <tt>true</tt> if this set contains no elements.
+ * @return <tt>true</tt> if there are neither code points nor strings.
+ */
+UBool UnicodeSet::isEmpty(void) const {
+    if (len != 1) return FALSE; // more than the terminator entry: at least one range
+    return strings->size() == 0;
+}
+
+/**
+ * Returns true if this set contains the given character.
+ * When a bmpSet or stringSpan helper is present the answer is
+ * delegated to it; otherwise the inversion list is searched.
+ * @param c character to be checked for containment
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::contains(UChar32 c) const {
+    // Prefer the auxiliary lookup structures whenever one is present.
+    if (bmpSet != NULL) {
+        return bmpSet->contains(c);
+    } else if (stringSpan != NULL) {
+        return stringSpan->contains(c);
+    }
+    // Values at or above HIGH are never members.  Values below LOW need
+    // no test: findCodePoint() yields 0 (even, i.e. "out") for anything
+    // smaller than list[0].
+    if (c >= UNICODESET_HIGH) {
+        return FALSE;
+    }
+    // findCodePoint() returns the index of the first boundary above c;
+    // an odd index means c lies inside a range.
+    const int32_t boundary = findCodePoint(c);
+    return (UBool)(boundary & 1);
+}
+
+/**
+ * Returns the smallest value i such that c < list[i].  Caller
+ * must ensure that c is a legal value or this method will enter
+ * an infinite loop.  This method performs a binary search.
+ * An even result means c is outside the set, an odd one inside.
+ * @param c a character in the range MIN_VALUE..MAX_VALUE inclusive
+ * @return the smallest integer i in the range 0..len-1,
+ * inclusive, such that c < list[i]
+ */
+int32_t UnicodeSet::findCodePoint(UChar32 c) const {
+    /* Examples:
+                                       findCodePoint(c)
+       set              list[]         c=0 1 3 4 7 8
+       ===              ==============   ===========
+       []               [110000]         0 0 0 0 0 0
+       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
+       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
+       [:Any:]          [0, 110000]      1 1 1 1 1 1
+     */
+
+    // Return the smallest i such that c < list[i].  Assume
+    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
+    if (c < list[0])
+        return 0;
+    // High runner test.  c is often after the last range, so an
+    // initial check for this condition pays off.
+    int32_t lo = 0;
+    int32_t hi = len - 1;
+    if (lo >= hi || c >= list[hi-1]) // lo >= hi: the list holds only the terminator
+        return hi;
+    // invariant: c >= list[lo]
+    // invariant: c < list[hi]
+    for (;;) {
+        int32_t i = (lo + hi) >> 1; // midpoint, rounded down
+        if (i == lo) {
+            break; // Found!  lo and hi are adjacent, so hi is the answer.
+        } else if (c < list[i]) {
+            hi = i;
+        } else {
+            lo = i;
+        }
+    }
+    return hi;
+}
+
+/**
+ * Returns true if this set contains every character of the given range.
+ * @param start first character, inclusive, of the range
+ * @param end last character, inclusive, of the range
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
+    // list[i] is the first boundary above start.  An odd i means start
+    // lies inside a range, which then extends up to list[i]-1.
+    const int32_t i = findCodePoint(start);
+    if ((i & 1) == 0) {
+        return FALSE;
+    }
+    return end < list[i];
+}
+
+/**
+ * Returns <tt>true</tt> if this set contains the given string.
+ * @param s string to be checked for containment
+ * @return <tt>true</tt> if this set contains the specified string
+ */
+UBool UnicodeSet::contains(const UnicodeString& s) const {
+    if (s.length() == 0) {
+        return FALSE; // the empty string is never reported as a member
+    }
+    const int32_t single = getSingleCP(s);
+    if (single >= 0) {
+        return contains((UChar32) single); // s is exactly one code point
+    }
+    return strings->contains((void*) &s); // otherwise look it up among the strings
+}
+
+/**
+ * Returns true if this set contains all the characters and strings
+ * of the given set.
+ * @param c set to be checked for containment
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
+    // c is a subset when each of its ranges lies wholly inside this set.
+    // Working on the inversion lists directly could be faster if needed.
+    const int32_t rangeCount = c.getRangeCount();
+    for (int32_t r = 0; r < rangeCount; ++r) {
+        const UChar32 first = c.getRangeStart(r);
+        const UChar32 last = c.getRangeEnd(r);
+        if (!contains(first, last)) {
+            return FALSE;
+        }
+    }
+    return strings->containsAll(*c.strings) ? TRUE : FALSE;
+}
+
+/**
+ * Returns true if this set contains all the characters
+ * of the given string, i.e. a USET_SPAN_CONTAINED span covers all of s.
+ * @param s string containing characters to be checked for containment
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::containsAll(const UnicodeString& s) const {
+    const int32_t length = s.length();
+    return (UBool)(span(s.getBuffer(), length, USET_SPAN_CONTAINED) == length);
+}
+
+/**
+ * Returns true if this set contains none of the characters of the range.
+ * @param start first character, inclusive, of the range
+ * @param end last character, inclusive, of the range
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
+    // list[i] is the first boundary above start.  An even i means start
+    // lies in a gap, and the next range does not begin before list[i].
+    const int32_t i = findCodePoint(start);
+    if ((i & 1) != 0) {
+        return FALSE;
+    }
+    return end < list[i];
+}
+
+/**
+ * Returns true if this set contains none of the characters and strings
+ * of the given set.
+ * @param c set to be checked for containment
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
+    // The sets are disjoint when every range of c misses this set entirely.
+    // Working on the inversion lists directly could be faster if needed.
+    const int32_t rangeCount = c.getRangeCount();
+    for (int32_t r = 0; r < rangeCount; ++r) {
+        const UChar32 first = c.getRangeStart(r);
+        const UChar32 last = c.getRangeEnd(r);
+        if (!containsNone(first, last)) {
+            return FALSE;
+        }
+    }
+    return strings->containsNone(*c.strings) ? TRUE : FALSE;
+}
+
+/**
+ * Returns true if this set contains none of the characters
+ * of the given string, i.e. a USET_SPAN_NOT_CONTAINED span covers all of s.
+ * @param s string containing characters to be checked for containment
+ * @return true if the test condition is met
+ */
+UBool UnicodeSet::containsNone(const UnicodeString& s) const {
+    const int32_t length = s.length();
+    return (UBool)(span(s.getBuffer(), length, USET_SPAN_NOT_CONTAINED) == length);
+}
+
+/**
+ * Returns <tt>true</tt> if this set contains any character whose low byte
+ * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
+ * indexing.
+ */
+UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
+    /* The index value v, in the range [0,255], is contained in this set if
+     * it is contained in any pair of this set.  Pairs either have the high
+     * bytes equal, or unequal.  If the high bytes are equal, then we have
+     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
+     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
+     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
+     * time zone month containment logic.)
+     */
+    const int32_t rangeCount = getRangeCount();
+    for (int32_t r = 0; r < rangeCount; ++r) {
+        const UChar32 low = getRangeStart(r);
+        const UChar32 high = getRangeEnd(r);
+        const int32_t lowByte = low & 0xFF;
+        const int32_t highByte = high & 0xFF;
+        UBool hit;
+        if ((low & ~0xFF) == (high & ~0xFF)) {
+            // Same high bytes: an ordinary closed interval of low bytes.
+            hit = (lowByte <= v && v <= highByte);
+        } else {
+            // Different high bytes: the low bytes wrap around.
+            hit = (lowByte <= v || v <= highByte);
+        }
+        if (hit) {
+            return TRUE;
+        }
+    }
+    // A string is matched on the low byte of its first code point.
+    // Strings are expected to be non-empty (add() rejects empty ones).
+    const int32_t stringCount = strings->size();
+    for (int32_t k = 0; k < stringCount; ++k) {
+        const UnicodeString& s = *(const UnicodeString*)strings->elementAt(k);
+        if ((s.char32At(0) & 0xFF) == v) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+/**
+ * Implementation of UnicodeMatcher::matches().  Always matches the
+ * longest possible multichar string; on a string match, offset is advanced.
+ */
+UMatchDegree UnicodeSet::matches(const Replaceable& text,
+                                 int32_t& offset,
+                                 int32_t limit,
+                                 UBool incremental) {
+    if (offset == limit) {
+        // Strings, if any, have length != 0, so we don't worry
+        // about them here.  If we ever allow zero-length strings
+        // we must check for them here.
+        if (contains(U_ETHER)) {
+            return incremental ? U_PARTIAL_MATCH : U_MATCH;
+        } else {
+            return U_MISMATCH;
+        }
+    } else {
+        if (strings->size() != 0) { // try strings first
+
+            // might separate forward and backward loops later
+            // for now they are combined
+
+            // TODO Improve efficiency of this, at least in the forward
+            // direction, if not in both.  In the forward direction we
+            // can assume the strings are sorted.
+
+            int32_t i;
+            UBool forward = offset < limit; // offset > limit selects a backward match
+
+            // firstChar is the leftmost char to match in the
+            // forward direction or the rightmost char to match in
+            // the reverse direction.
+            UChar firstChar = text.charAt(offset);
+
+            // If there are multiple strings that can match we
+            // return the longest match.
+            int32_t highWaterLength = 0;
+
+            for (i=0; i<strings->size(); ++i) {
+                const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
+
+                //if (trial.length() == 0) {
+                //    return U_MATCH; // null-string always matches
+                //}
+                // assert(trial.length() != 0); // We ensure this elsewhere
+
+                UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
+
+                // Strings are sorted, so we can optimize in the
+                // forward direction.
+                if (forward && c > firstChar) break;
+                if (c != firstChar) continue;
+
+                int32_t matchLen = matchRest(text, offset, limit, trial);
+
+                if (incremental) {
+                    int32_t maxLen = forward ? limit-offset : offset-limit;
+                    if (matchLen == maxLen) {
+                        // We have successfully matched but only up to limit.
+                        return U_PARTIAL_MATCH;
+                    }
+                }
+
+                if (matchLen == trial.length()) {
+                    // We have successfully matched the whole string.
+                    if (matchLen > highWaterLength) {
+                        highWaterLength = matchLen;
+                    }
+                    // In the forward direction we know strings
+                    // are sorted so we can bail early.
+                    if (forward && matchLen < highWaterLength) {
+                        break;
+                    }
+                    continue;
+                }
+            }
+
+            // We've checked all strings without a partial match.
+            // If we have full matches, return the longest one.
+            if (highWaterLength != 0) {
+                offset += forward ? highWaterLength : -highWaterLength;
+                return U_MATCH;
+            }
+        }
+        return UnicodeFilter::matches(text, offset, limit, incremental); // no string matched: defer to the base class
+    }
+}
+
+/**
+ * Returns the longest match for s in text at the given position.
+ * If limit > start then match forward from start+1 to limit
+ * matching all characters except s.charAt(0).  If limit < start,
+ * go backward starting from start-1 matching all characters
+ * except s.charAt(s.length()-1).  This method assumes that the
+ * first character, text.charAt(start), matches s, so it does not
+ * check it.
+ * @param text the text to match
+ * @param start the first character to match.  In the forward
+ * direction, text.charAt(start) is matched against s.charAt(0).
+ * In the reverse direction, it is matched against
+ * s.charAt(s.length()-1).
+ * @param limit the limit offset for matching, either last+1 in
+ * the forward direction, or last-1 in the reverse direction,
+ * where last is the index of the last character to match.
+ * @param s the candidate string to compare against the text
+ * @return If part of s matches up to the limit, return |limit -
+ * start|.  If all of s matches before reaching the limit, return
+ * s.length().  If there is a mismatch between s and text, return 0
+ */
+int32_t UnicodeSet::matchRest(const Replaceable& text,
+                              int32_t start, int32_t limit,
+                              const UnicodeString& s) {
+    int32_t i;
+    int32_t maxLen;
+    int32_t slen = s.length();
+    if (start < limit) {
+        maxLen = limit - start;
+        if (maxLen > slen) maxLen = slen; // compare no more than the shorter of text span and s
+        for (i = 1; i < maxLen; ++i) { // i starts at 1: the first character is assumed to match
+            if (text.charAt(start + i) != s.charAt(i)) return 0;
+        }
+    } else {
+        maxLen = start - limit;
+        if (maxLen > slen) maxLen = slen;
+        --slen; // <=> slen = s.length() - 1;
+        for (i = 1; i < maxLen; ++i) { // walk text and s backwards in step
+            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
+        }
+    }
+    return maxLen;
+}
+
+/**
+ * Implementation of UnicodeMatcher: unions this set into toUnionTo.
+ */
+void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
+    toUnionTo.addAll(*this);
+}
+
+/**
+ * Returns the index of the given character within this set, where
+ * the set is ordered by ascending code point.  If the character
+ * is not in this set, return -1.  The inverse of this method is
+ * <code>charAt()</code>.
+ * @return an index from 0..size()-1, or -1
+ */
+int32_t UnicodeSet::indexOf(UChar32 c) const {
+    if (c < MIN_VALUE || c > MAX_VALUE) {
+        return -1;
+    }
+    // No length test is needed: a legal c is below the final HIGH entry.
+    int32_t preceding = 0; // members in the ranges already passed
+    for (int32_t i = 0; ; i += 2) {
+        const UChar32 start = list[i];
+        if (c < start) {
+            return -1; // c falls in the gap before this range
+        }
+        const UChar32 limit = list[i + 1];
+        if (c < limit) {
+            return preceding + (c - start);
+        }
+        preceding += limit - start;
+    }
+}
+
+/**
+ * Returns the character at the given index within this set, where
+ * the set is ordered by ascending code point.  If the index is
+ * out of range, return (UChar32)-1.  The inverse is indexOf().
+ * @param index an index from 0..size()-1
+ * @return the character at the given index, or (UChar32)-1.
+ */
+UChar32 UnicodeSet::charAt(int32_t index) const {
+    if (index < 0) {
+        return (UChar32)-1;
+    }
+    // Only complete [start, limit) pairs are ranges; for an odd len the
+    // final entry is the UNICODESET_HIGH terminator, so it is masked off.
+    const int32_t pairedLen = len & ~1;
+    int32_t remaining = index;
+    for (int32_t i = 0; i < pairedLen; i += 2) {
+        const UChar32 start = list[i];
+        const int32_t count = list[i + 1] - start;
+        if (remaining < count) {
+            return (UChar32)(start + remaining);
+        }
+        remaining -= count;
+    }
+    return (UChar32)-1;
+}
+
+/**
+ * Make this object represent the range <code>start - end</code>.
+ * If <code>start > end</code> then this object is set to an
+ * empty range.
+ *
+ * @param start first character in the set, inclusive
+ * @param end last character in the set, inclusive
+ */
+UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
+    clear(); // presumably resets to the empty set — TODO confirm
+    complement(start, end); // then the requested range is toggled into it
+    return *this;
+}
+
+/**
+ * Adds the specified range to this set if it is not already
+ * present.  If this set already contains the specified range,
+ * the call leaves this set unchanged.  If <code>start > end</code>
+ * nothing is added.  Both bounds are pinned to valid code points.
+ * @param start first character, inclusive, of range to be added
+ * @param end last character, inclusive, of range to be added
+ */
+UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
+    pinCodePoint(start);
+    pinCodePoint(end);
+    if (start == end) {
+        add(start);
+    } else if (start < end) {
+        // Inversion list for [start, end]: the range limit is exclusive.
+        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
+        add(range, 2, 0);
+    }
+    return *this;
+}
+
+// #define DEBUG_US_ADD  (uncomment to make add(UChar32) trace its work on stdout)
+
+#ifdef DEBUG_US_ADD
+#include <stdio.h>
+void dump(UChar32 c) { // prints one code point
+    if (c <= 0xFF) {
+        printf("%c", (char)c); // values up to 0xFF are printed as a raw character
+    } else {
+        printf("U+%04X", c); // everything else as U+ followed by hex digits
+    }
+}
+void dump(const UChar32* list, int32_t len) { // prints len entries as "[a, b, ...]"
+    printf("[");
+    for (int32_t i=0; i<len; ++i) {
+        if (i != 0) printf(", ");
+        dump(list[i]);
+    }
+    printf("]");
+}
+#endif
+
+/**
+ * Adds the specified character to this set if it is not already
+ * present.  If this set already contains the specified character,
+ * the call leaves this set unchanged.  Frozen and bogus sets are not modified.
+ */
+UnicodeSet& UnicodeSet::add(UChar32 c) {
+    // find smallest i such that c < list[i]
+    // if odd, then it is IN the set
+    // if even, then it is OUT of the set
+    int32_t i = findCodePoint(pinCodePoint(c));
+
+    // already in set?
+    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
+
+    // HIGH is 0x110000
+    // assert(list[len-1] == HIGH);
+
+    // empty = [HIGH]
+    // [start_0, limit_0, start_1, limit_1, HIGH]
+
+    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
+    //                             ^
+    //                             list[i]
+
+    // i == 0 means c is before the first range
+
+#ifdef DEBUG_US_ADD
+    printf("Add of ");
+    dump(c);
+    printf(" found at %d", i);
+    printf(": ");
+    dump(list, len);
+    printf(" => ");
+#endif
+
+    if (c == list[i]-1) {
+        // c is before start of next range
+        list[i] = c;
+        // if we touched the HIGH mark, then add a new one
+        if (c == (UNICODESET_HIGH - 1)) {
+            UErrorCode status = U_ZERO_ERROR;
+            ensureCapacity(len+1, status);
+            if (U_FAILURE(status)) {
+                return *this; // There is no way to report this error :-(  NOTE(review): list[i] was already overwritten above
+            }
+            list[len++] = UNICODESET_HIGH;
+        }
+        if (i > 0 && c == list[i-1]) {
+            // collapse adjacent ranges
+
+            // [..., start_k-1, c, c, limit_k, ..., HIGH]
+            //                     ^
+            //                     list[i]
+
+            //for (int32_t k=i-1; k<len-2; ++k) {
+            //    list[k] = list[k+2];
+            //}
+            UChar32* dst = list + i - 1;
+            UChar32* src = dst + 2;
+            UChar32* srclimit = list + len;
+            while (src < srclimit) *(dst++) = *(src++); // shift the tail down by two entries
+
+            len -= 2;
+        }
+    }
+
+    else if (i > 0 && c == list[i-1]) {
+        // c is after end of prior range
+        list[i-1]++;
+        // no need to check for collapse here
+    }
+
+    else {
+        // At this point we know the new char is not adjacent to
+        // any existing ranges, and it is not 10FFFF.
+
+
+        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
+        //                             ^
+        //                             list[i]
+
+        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
+        //                             ^
+        //                             list[i]
+
+        UErrorCode status = U_ZERO_ERROR;
+        ensureCapacity(len+2, status);
+        if (U_FAILURE(status)) {
+            return *this; // There is no way to report this error :-(
+        }
+
+        //for (int32_t k=len-1; k>=i; --k) {
+        //    list[k+2] = list[k];
+        //}
+        UChar32* src = list + len;
+        UChar32* dst = src + 2;
+        UChar32* srclimit = list + i;
+        while (src > srclimit) *(--dst) = *(--src); // shift the tail up by two, copying from the back
+
+        list[i] = c;
+        list[i+1] = c+1;
+        len += 2;
+    }
+
+#ifdef DEBUG_US_ADD
+    dump(list, len);
+    printf("\n");
+
+    for (i=1; i<len; ++i) {
+        if (list[i] <= list[i-1]) {
+            // Corrupt array!
+            printf("ERROR: list has been corrupted\n");
+            exit(1);
+        }
+    }
+#endif
+
+    releasePattern(); // the set changed; presumably this drops the now-stale pattern — TODO confirm
+    return *this;
+}
+
+/**
+ * Adds a multicharacter string to this set as a single element, unless the
+ * set already contains it.  Thus "ch" => {"ch"}.
+ * A string that getSingleCP() reports as a single code point is added as
+ * that code point instead of as a string.
+ * An empty string, a frozen set, or a bogus set leaves this set unchanged.
+ * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
+    if (isFrozen() || isBogus() || s.length() == 0) {
+        return *this;
+    }
+
+    const int32_t single = getSingleCP(s);
+    if (single >= 0) {
+        // Exactly one code point: it belongs in the range list.
+        add((UChar32)single);
+        return *this;
+    }
+
+    if (strings->contains((void*) &s)) {
+        return *this; // already present, nothing to do
+    }
+    _add(s);
+    releasePattern();
+    return *this;
+}
+
+/**
+ * Inserts a heap-allocated copy of the given string into 'strings' at its
+ * sorted position.  The caller must already have checked that the string
+ * is not empty and is not already in 'strings'.
+ * Does nothing when this set is frozen or bogus.  If the copy cannot be
+ * allocated, or the insertion reports a failure, this set becomes bogus.
+ */
+void UnicodeSet::_add(const UnicodeString& s) {
+    if (isFrozen() || isBogus()) {
+        return;
+    }
+
+    UnicodeString* copy = new UnicodeString(s);
+    if (copy != NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+        strings->sortedInsert(copy, compareUnicodeString, status);
+        if (U_FAILURE(status)) {
+            // The copy is released here after the failed insertion.
+            setToBogus();
+            delete copy;
+        }
+    } else {
+        // Memory allocation error.
+        setToBogus();
+    }
+}
+
+/**
+ * Returns the code point of a string that consists of exactly one code
+ * point, and -1 for any other string.
+ * A one-unit string yields that unit; a longer string qualifies only if
+ * it is two units long and char32At(0) is above U+FFFF.
+ * @param s the string to test
+ * @return the single code point, or -1
+ */
+int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
+    const int32_t units = s.length();
+    if (units == 1) {
+        return s.charAt(0);
+    }
+    if (units > 2) {
+        return -1;
+    }
+
+    // Remaining lengths: 2, or 0 when the caller did not filter out the
+    // empty string.  Only a supplementary code point counts as "single".
+    const UChar32 first = s.char32At(0);
+    return (first > 0xFFFF) ? first : -1;
+}
+
+/**
+ * Adds each code point of the string to this set.  Thus "ch" => {"c", "h"}.
+ * A code point that this set already contains is unaffected.
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
+    int32_t pos = 0;
+    while (pos < s.length()) {
+        const UChar32 c = s.char32At(pos);
+        add(c);
+        // Step over one or two code units, depending on the code point.
+        pos += U16_LENGTH(c);
+    }
+    return *this;
+}
+
+/**
+ * Retains only the elements that are also code points of the given
+ * string.  Note: "ch" == {"c", "h"}.
+ * Implemented by building a temporary set from the string's code points
+ * and intersecting with it.
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
+    UnicodeSet codePoints;
+    codePoints.addAll(s);
+    return retainAll(codePoints);
+}
+
+/**
+ * Complements EACH of the code points of the given string in this set.
+ * Note: "ch" == {"c", "h"}.
+ * Implemented by building a temporary set from the string's code points
+ * and xor-ing this set with it.
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
+    UnicodeSet codePoints;
+    codePoints.addAll(s);
+    return complementAll(codePoints);
+}
+
+/**
+ * Removes EACH of the code points of the given string from this set.
+ * Note: "ch" == {"c", "h"}.
+ * Implemented by building a temporary set from the string's code points
+ * and subtracting it from this set.
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
+    UnicodeSet codePoints;
+    codePoints.addAll(s);
+    return removeAll(codePoints);
+}
+
+/**
+ * Removes every multicharacter string from this set; the code points in
+ * the range list are not touched.
+ * A frozen or bogus set is left unchanged, as is a set without a strings
+ * vector.  When strings are actually removed the cached pattern is
+ * released, because it no longer describes the set.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::removeAllStrings() {
+    if (isFrozen() || isBogus() || strings == NULL) {
+        return *this;
+    }
+    if (strings->size() > 0) {
+        strings->removeAllElements();
+        releasePattern();
+    }
+    return *this;
+}
+
+
+/**
+ * Creates a new heap-allocated set and adds the given string to it with
+ * add(const UnicodeString&).  Thus "ch" => {"ch"}.
+ * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ * @param s the source string
+ * @return the newly created set, or NULL if it could not be allocated
+ */
+UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
+    UnicodeSet* result = new UnicodeSet();
+    if (result == NULL) {
+        return NULL; // Memory allocation error.
+    }
+    result->add(s);
+    return result;
+}
+
+
+/**
+ * Creates a new heap-allocated set and adds each code point of the given
+ * string to it with addAll(const UnicodeString&).
+ * Thus "ch" => {"c", "h"}.
+ * @param s the source string
+ * @return the newly created set, or NULL if it could not be allocated
+ */
+UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
+    UnicodeSet* result = new UnicodeSet();
+    if (result == NULL) {
+        return NULL; // Memory allocation error.
+    }
+    result->addAll(s);
+    return result;
+}
+
+/**
+ * Retains only the elements of this set that lie in the specified range.
+ * Both endpoints are passed through pinCodePoint() first.  If the
+ * resulting <code>start > end</code>, the range is empty and the set is
+ * cleared.
+ *
+ * @param start first character, inclusive, of the range to be retained
+ * @param end last character, inclusive, of the range to be retained
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
+    if (pinCodePoint(start) > pinCodePoint(end)) {
+        // Empty range: nothing survives the intersection.
+        clear();
+        return *this;
+    }
+    // Inversion-list form of [start, end]: start, exclusive limit, terminator.
+    UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
+    retain(range, 2, 0);
+    return *this;
+}
+
+/**
+ * Retains only the given character; equivalent to retain(c, c).
+ * @param c the character to keep
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::retain(UChar32 c) {
+    retain(c, c);
+    return *this;
+}
+
+/**
+ * Removes the specified range from this set if it is present.
+ * Both endpoints are passed through pinCodePoint() first.  If the
+ * resulting <code>start > end</code>, the range is empty and the set is
+ * left unchanged.
+ *
+ * @param start first character, inclusive, of the range to be removed
+ * @param end last character, inclusive, of the range to be removed
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
+    if (pinCodePoint(start) > pinCodePoint(end)) {
+        return *this; // empty range
+    }
+    // Inversion-list form of [start, end]; polarity 2 intersects this set
+    // with the complement of that range, i.e. performs set-minus.
+    UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
+    retain(range, 2, 2);
+    return *this;
+}
+
+/**
+ * Removes the specified character from this set if it is present;
+ * equivalent to remove(c, c).
+ * @param c the character to remove
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::remove(UChar32 c) {
+    remove(c, c);
+    return *this;
+}
+
+/**
+ * Removes the specified string from this set if it is present.
+ * A string that getSingleCP() reports as a single code point is removed
+ * from the range list; any other string is removed from 'strings'.
+ * An empty string, a frozen set, or a bogus set leaves this set unchanged.
+ * @param s the source string
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
+    if (isFrozen() || isBogus() || s.length() == 0) {
+        return *this;
+    }
+
+    const int32_t single = getSingleCP(s);
+    if (single >= 0) {
+        remove((UChar32)single, (UChar32)single);
+    } else {
+        strings->removeElement((void*) &s);
+        releasePattern();
+    }
+    return *this;
+}
+
+/**
+ * Complements the specified range in this set: every character of the
+ * range is removed if it is in this set and added if it is not.
+ * Both endpoints are passed through pinCodePoint() first.  If the
+ * resulting <code>start > end</code>, the range is empty and the set
+ * contents are left unchanged.  A frozen or bogus set is not modified.
+ *
+ * @param start first character, inclusive, of the range to be complemented
+ * @param end last character, inclusive, of the range to be complemented
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
+    if (!isFrozen() && !isBogus()) {
+        if (pinCodePoint(start) <= pinCodePoint(end)) {
+            // Inversion-list form of [start, end].
+            UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
+            exclusiveOr(range, 2, 0);
+        }
+        releasePattern();
+    }
+    return *this;
+}
+
+/**
+ * Complements the given character in this set; equivalent to
+ * complement(c, c).
+ * @param c the character to toggle
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::complement(UChar32 c) {
+    complement(c, c);
+    return *this;
+}
+
+/**
+ * Inverts this set.  This is equivalent to
+ * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
+ * The inverted list is built in 'buffer' and then swapped in: a leading
+ * UNICODESET_LOW is dropped if the list starts with one, otherwise one is
+ * inserted in front.  A frozen or bogus set is not modified, and the
+ * method returns early if the buffer cannot be grown.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::complement(void) {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+
+    UErrorCode ec = U_ZERO_ERROR;
+    const UBool startsAtLow = (list[0] == UNICODESET_LOW);
+    const int32_t invertedLen = startsAtLow ? len-1 : len+1;
+
+    ensureBufferCapacity(invertedLen, ec);
+    if (U_FAILURE(ec)) {
+        return *this;
+    }
+
+    if (startsAtLow) {
+        // Drop the leading LOW boundary.
+        uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
+    } else {
+        // Shift everything up by one and insert a leading LOW boundary.
+        uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
+        buffer[0] = UNICODESET_LOW;
+    }
+    len = invertedLen;
+
+    swapBuffers();
+    releasePattern();
+    return *this;
+}
+
+/**
+ * Complements the specified string in this set: the string is removed if
+ * the set contains it and added if it does not.
+ * A string that getSingleCP() reports as a single code point is
+ * complemented in the range list instead.
+ * An empty string, a frozen set, or a bogus set leaves this set unchanged.
+ * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ * @param s the string to complement
+ * @return this object, for chaining
+ */
+UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
+    if (isFrozen() || isBogus() || s.length() == 0) {
+        return *this;
+    }
+
+    const int32_t single = getSingleCP(s);
+    if (single >= 0) {
+        complement((UChar32)single, (UChar32)single);
+        return *this;
+    }
+
+    if (!strings->contains((void*) &s)) {
+        _add(s);
+    } else {
+        strings->removeElement((void*) &s);
+    }
+    releasePattern();
+    return *this;
+}
+
+/**
+ * Adds all of the elements in the specified set to this set if
+ * they're not already present.  This operation effectively
+ * modifies this set so that its value is the <i>union</i> of the two
+ * sets.  The behavior of this operation is unspecified if the specified
+ * collection is modified while the operation is in progress.
+ *
+ * A frozen or bogus set is left unchanged.  The check is made up front,
+ * like in retainAll/removeAll/complementAll, because a bogus set may have
+ * no strings vector at all.
+ *
+ * @param c set whose elements are to be added to this set.
+ * @return the modified set, for chaining
+ * @see #add(char, char)
+ */
+UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+
+    // Union of the code point ranges.
+    if ( c.len>0 && c.list!=NULL ) {
+        add(c.list, c.len, 0);
+    }
+
+    // Add strings in order; _add() keeps 'strings' sorted and becomes a
+    // no-op if an earlier step turned this set bogus.
+    if ( c.strings!=NULL ) {
+        for (int32_t i=0; i<c.strings->size(); ++i) {
+            const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
+            if (!strings->contains((void*) s)) {
+                _add(*s);
+            }
+        }
+    }
+    return *this;
+}
+
+/**
+ * Retains only the elements in this set that are contained in the
+ * specified set.  In other words, removes from this set all of
+ * its elements that are not contained in the specified set.  This
+ * operation effectively modifies this set so that its value is
+ * the <i>intersection</i> of the two sets.
+ *
+ * A frozen or bogus set is left unchanged.  An argument without a range
+ * list or without a strings vector (possible for a set whose allocation
+ * failed) is treated as containing no code points or no strings.
+ *
+ * @param c set that defines which elements this set will retain.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+    if (c.list == NULL) {
+        // Nothing to intersect with: the result is the empty set.
+        clear();
+        return *this;
+    }
+    retain(c.list, c.len, 0);
+    if (c.strings != NULL) {
+        strings->retainAll(*c.strings);
+    } else {
+        // c holds no strings, so none of ours can survive.
+        strings->removeAllElements();
+    }
+    return *this;
+}
+
+/**
+ * Removes from this set all of its elements that are contained in the
+ * specified set.  This operation effectively modifies this
+ * set so that its value is the <i>asymmetric set difference</i> of
+ * the two sets.
+ *
+ * A frozen or bogus set is left unchanged.  An argument without a range
+ * list or without a strings vector (possible for a set whose allocation
+ * failed) contributes nothing to remove.
+ *
+ * @param c set that defines which elements will be removed from
+ *          this set.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+    if (c.list != NULL) {
+        // polarity 2: this intersect ~c, i.e. set-minus
+        retain(c.list, c.len, 2);
+    }
+    if (c.strings != NULL) {
+        strings->removeAll(*c.strings);
+    }
+    return *this;
+}
+
+/**
+ * Complements in this set all elements contained in the specified
+ * set.  Any character in the other set will be removed if it is
+ * in this set, or will be added if it is not in this set.  Strings
+ * are toggled the same way.
+ *
+ * A frozen or bogus set is left unchanged.  An argument without a range
+ * list or without a strings vector (possible for a set whose allocation
+ * failed) contributes nothing to toggle.
+ *
+ * @param c set that defines which elements will be xor'ed from
+ *          this set.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+    if (c.list != NULL) {
+        exclusiveOr(c.list, c.len, 0);
+    }
+
+    if (c.strings != NULL) {
+        for (int32_t i=0; i<c.strings->size(); ++i) {
+            void* e = c.strings->elementAt(i);
+            // Present here: drop it.  Absent: add a copy.
+            if (!strings->removeElement(e)) {
+                _add(*(const UnicodeString*)e);
+            }
+        }
+    }
+    return *this;
+}
+
+/**
+ * Removes all of the elements from this set.  This set will be
+ * empty after this call returns, unless it is frozen, in which case
+ * nothing happens.
+ * The bogus flag (all of fFlags) is reset only when both the range list
+ * and the strings vector exist.
+ * @return the modified set, for chaining
+ */
+UnicodeSet& UnicodeSet::clear(void) {
+    if (isFrozen()) {
+        return *this;
+    }
+
+    // An empty inversion list is just the terminator.
+    if (list != NULL) {
+        list[0] = UNICODESET_HIGH;
+    }
+    len = 1;
+    releasePattern();
+
+    if (strings == NULL) {
+        return *this; // no strings vector: flags stay as they are
+    }
+    strings->removeAllElements();
+    if (list != NULL) {
+        // Remove bogus
+        fFlags = 0;
+    }
+    return *this;
+}
+
+/**
+ * Iteration method that returns the number of ranges contained in
+ * this set, computed as half the list length (each range is a pair of
+ * consecutive list entries).
+ * @see #getRangeStart
+ * @see #getRangeEnd
+ */
+int32_t UnicodeSet::getRangeCount() const {
+    const int32_t pairs = len / 2;
+    return pairs;
+}
+
+/**
+ * Iteration method that returns the first character in the
+ * specified range of this set.  The index is not range-checked here.
+ * @param index range index; the start is read from list slot 2*index
+ * @see #getRangeCount
+ * @see #getRangeEnd
+ */
+UChar32 UnicodeSet::getRangeStart(int32_t index) const {
+    const int32_t slot = index*2;
+    return list[slot];
+}
+
+/**
+ * Iteration method that returns the last character in the
+ * specified range of this set.  The index is not range-checked here.
+ * @param index range index; list slot 2*index+1 holds the exclusive
+ *              limit, so the inclusive end is that value minus one
+ * @see #getRangeStart
+ * @see #getRangeCount
+ */
+UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
+    const UChar32 limit = list[index*2 + 1];
+    return limit - 1;
+}
+
+/**
+ * Returns the number of multicharacter strings in this set.
+ * A set without a strings vector (its allocation failed and the set is
+ * bogus) reports 0 instead of dereferencing a NULL pointer.
+ */
+int32_t UnicodeSet::getStringCount() const {
+    return (strings != NULL) ? strings->size() : 0;
+}
+
+/**
+ * Returns the string stored at the given position of the strings vector.
+ * A set without a strings vector (its allocation failed and the set is
+ * bogus) yields NULL instead of dereferencing a NULL pointer.
+ * @param index position in 'strings'
+ * @return the element at that position as returned by the vector, or NULL
+ */
+const UnicodeString* UnicodeSet::getString(int32_t index) const {
+    if (strings == NULL) {
+        return NULL;
+    }
+    return (const UnicodeString*) strings->elementAt(index);
+}
+
+/**
+ * Reallocates this object's internal structures to take up the least
+ * possible space, without changing this object's value.
+ * The scratch buffer is freed and the range list is shrunk to 'len'
+ * entries (at least one).  A frozen or bogus set is not touched.
+ * @return this set, for chaining
+ */
+UnicodeSet& UnicodeSet::compact() {
+    if (isFrozen() || isBogus()) {
+        return *this;
+    }
+
+    // Release the scratch buffer first so that less memory is in use
+    // while the list is reallocated.
+    if (buffer != NULL) {
+        uprv_free(buffer);
+        buffer = NULL;
+    }
+
+    if (len >= capacity) {
+        return *this; // already as small as it can be
+    }
+
+    // Never realloc to size 0: keep at least one slot.
+    const int32_t shrunk = (len == 0) ? 1 : len;
+    UChar32* resized = (UChar32*) uprv_realloc(list, sizeof(UChar32) * shrunk);
+    if (resized != NULL) {
+        list = resized;
+        capacity = shrunk;
+    }
+    // On failure the original, larger array stays in place.
+    return *this;
+}
+
+#ifdef DEBUG_SERIALIZE
+#include <stdio.h>
+#endif
+
+/**
+ * Deserialize constructor.
+ *
+ * Layout read from 'data' (16-bit units):
+ *   data[0] bits 0..14  number of array units that follow the header
+ *   data[0] bit 15      set if a second header unit is present
+ *   data[1]             (only with bit 15) number of BMP units
+ *   BMP values          one unit each
+ *   remaining values    two units each, high half first
+ *
+ * The header is validated against dataLen before anything is copied: the
+ * BMP count must not exceed the total, the remaining units must form
+ * whole pairs, and the header plus array must fit into dataLen.
+ *
+ * On any failure the set is made bogus.  'ec' is set to
+ * U_ILLEGAL_ARGUMENT_ERROR for bad arguments or malformed data and to
+ * U_MEMORY_ALLOCATION_ERROR if the list cannot be allocated.
+ *
+ * @param data serialized set
+ * @param dataLen number of 16-bit units available in data
+ * @param serialization must be kSerialized
+ * @param ec error code, in/out
+ */
+UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
+  : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
+    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
+    fFlags(0) {
+
+  if(U_FAILURE(ec)) {
+    setToBogus();
+    return;
+  }
+
+  if( (serialization != kSerialized)
+      || (data==NULL)
+      || (dataLen < 1)) {
+    ec = U_ILLEGAL_ARGUMENT_ERROR;
+    setToBogus();
+    return;
+  }
+
+  allocateStrings(ec);
+  if (U_FAILURE(ec)) {
+    setToBogus();
+    return;
+  }
+
+  // Header: one unit, or two when the high bit of data[0] is set.
+  int32_t headerSize = ((data[0]&0x8000)) ?2:1;
+  if (dataLen < headerSize) {
+    // data[1] would be read past the end of the input.
+    ec = U_ILLEGAL_ARGUMENT_ERROR;
+    setToBogus();
+    return;
+  }
+  int32_t totalUnits = data[0]&0x7FFF;
+  int32_t bmpLength = (headerSize==1)?data[0]:data[1];
+  int32_t suppUnits = totalUnits-bmpLength;
+
+  if (suppUnits < 0                            // more BMP units than units overall
+      || (suppUnits & 1) != 0                  // truncated 32-bit value
+      || (dataLen-headerSize) < totalUnits) {  // array does not fit in the input
+    ec = U_ILLEGAL_ARGUMENT_ERROR;
+    setToBogus();
+    return;
+  }
+
+  int32_t newLen = (suppUnits/2)+bmpLength;
+#ifdef DEBUG_SERIALIZE
+  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X\n", dataLen,headerSize,bmpLength,newLen, data[0]);
+#endif
+  // One extra slot for the UNICODESET_HIGH terminator.
+  UChar32 *newList = (UChar32*) uprv_malloc(sizeof(UChar32) * (newLen+1));
+  if(newList == NULL) {
+    ec = U_MEMORY_ALLOCATION_ERROR;
+    setToBogus();
+    return;
+  }
+  list = newList;
+  capacity = newLen+1;
+  len = newLen;
+
+  // copy bmp
+  int32_t i;
+  for(i = 0; i< bmpLength;i++) {
+    list[i] = data[i+headerSize];
+#ifdef DEBUG_SERIALIZE
+    printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
+#endif
+  }
+  // copy smp: each value is stored as two units, high half first
+  for(i=bmpLength;i<len;i++) {
+    list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
+              ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
+#ifdef DEBUG_SERIALIZE
+    printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
+#endif
+  }
+  // terminator
+  list[len++]=UNICODESET_HIGH;
+}
+
+
+/**
+ * Writes the range list of this set to 'dest' as 16-bit units.
+ *
+ * Output layout: a length unit (number of array units, with bit 15 set
+ * when supplementary values follow), then, only in that case, a unit
+ * with the number of BMP units, then the BMP values (one unit each) and
+ * the supplementary values (two units each, high half first).  An empty
+ * set is written as the single unit 0.
+ *
+ * @param dest output array; may be NULL only if destCapacity is 0
+ * @param destCapacity capacity of dest in 16-bit units
+ * @param ec error code, in/out.  Set to U_ILLEGAL_ARGUMENT_ERROR for bad
+ *           arguments, U_INDEX_OUTOFBOUNDS_ERROR if the array needs more
+ *           than 0x7fff units, U_BUFFER_OVERFLOW_ERROR if dest is too small.
+ * @return the total number of units required, or 0 on an argument or
+ *         index error
+ */
+int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
+    if (U_FAILURE(ec)) {
+        return 0;
+    }
+    if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
+        ec=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    // Number of list entries, ignoring the final UNICODESET_HIGH.
+    const int32_t entryCount=this->len-1;
+    if (entryCount==0) {
+        /* empty set */
+        if (destCapacity>0) {
+            *dest=0;
+        } else {
+            ec=U_BUFFER_OVERFLOW_ERROR;
+        }
+        return 1;
+    }
+
+    // How many leading entries fit into one 16-bit unit?
+    int32_t bmpLength;
+    if (this->list[entryCount-1]<=0xffff) {
+        /* all BMP */
+        bmpLength=entryCount;
+    } else if (this->list[0]>=0x10000) {
+        /* all supplementary */
+        bmpLength=0;
+    } else {
+        /* some BMP, some supplementary */
+        bmpLength=0;
+        while (bmpLength<entryCount && this->list[bmpLength]<=0xffff) {
+            ++bmpLength;
+        }
+    }
+    /* length: number of 16-bit array units */
+    const int32_t length=bmpLength+2*(entryCount-bmpLength);
+#ifdef DEBUG_SERIALIZE
+    printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
+#endif
+    if (length>0x7fff) {
+        /* there are only 15 bits for the length in the first serialized word */
+        ec=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    // Header: the length unit, plus a bmpLength unit when there are
+    // supplementary values.
+    const UBool hasSupplementary=(length>bmpLength);
+    const int32_t destLength=length+(hasSupplementary?2:1);
+    if (destLength>destCapacity) {
+        ec=U_BUFFER_OVERFLOW_ERROR;
+        return destLength;
+    }
+
+#ifdef DEBUG_SERIALIZE
+    printf("writeHdr\n");
+#endif
+    uint16_t *out=dest;
+    if (hasSupplementary) {
+        *out++=(uint16_t)(length|0x8000);
+        *out++=(uint16_t)bmpLength;
+    } else {
+        *out++=(uint16_t)length;
+    }
+
+    /* write the BMP part of the array */
+    int32_t idx=0;
+    for (; idx<bmpLength; ++idx) {
+#ifdef DEBUG_SERIALIZE
+        printf("writebmp: %x\n", (int)this->list[idx]);
+#endif
+        *out++=(uint16_t)this->list[idx];
+    }
+
+    /* write the supplementary part of the array */
+    for (; idx<entryCount; ++idx) {
+        const UChar32 value=this->list[idx];
+#ifdef DEBUG_SERIALIZE
+        printf("write32: %x\n", (int)value);
+#endif
+        *out++=(uint16_t)(value>>16);
+        *out++=(uint16_t)value;
+    }
+    return destLength;
+}
+
+//----------------------------------------------------------------
+// Implementation: Utility methods
+//----------------------------------------------------------------
+
+/**
+ * Allocates the strings vector.
+ * @param status error code, in/out; set to U_MEMORY_ALLOCATION_ERROR if
+ *               the vector object itself cannot be allocated
+ * @return TRUE if 'strings' now points to a usable vector; FALSE if
+ *         status was already a failure on entry or the allocation or
+ *         construction failed, in which case 'strings' is NULL after a
+ *         failed construction
+ */
+UBool UnicodeSet::allocateStrings(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+
+    strings = new UVector(uprv_deleteUObject,
+                          uhash_compareUnicodeString, 1, status);
+    if (strings == NULL) {
+        // Memory allocation error.
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return FALSE;
+    }
+    if (!U_FAILURE(status)) {
+        return TRUE;
+    }
+
+    // The vector was created but reported a failure: discard it.
+    delete strings;
+    strings = NULL;
+    return FALSE;
+}
+
+/**
+ * Makes sure 'list' can hold at least newLen entries, growing it to
+ * newLen + GROW_EXTRA entries when it is too small.
+ * If the reallocation fails, ec is set to U_MEMORY_ALLOCATION_ERROR and
+ * this set is made bogus.
+ * @param newLen required number of entries
+ * @param ec error code, written only on failure
+ */
+void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
+    if (newLen > capacity) {
+        UChar32* grown = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
+        if (grown != NULL) {
+            list = grown;
+            capacity = newLen + GROW_EXTRA;
+        } else {
+            ec = U_MEMORY_ALLOCATION_ERROR;
+            setToBogus();
+        }
+    }
+}
+
+/**
+ * Makes sure the scratch 'buffer' exists and can hold at least newLen
+ * entries, (re)allocating it to newLen + GROW_EXTRA entries otherwise.
+ * If the reallocation fails, ec is set to U_MEMORY_ALLOCATION_ERROR and
+ * this set is made bogus.
+ * @param newLen required number of entries
+ * @param ec error code, written only on failure
+ */
+void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
+    const UBool bigEnough = (buffer != NULL && newLen <= bufferCapacity);
+    if (!bigEnough) {
+        UChar32* grown = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
+        if (grown != NULL) {
+            buffer = grown;
+            bufferCapacity = newLen + GROW_EXTRA;
+        } else {
+            ec = U_MEMORY_ALLOCATION_ERROR;
+            setToBogus();
+        }
+    }
+}
+
+/**
+ * Exchanges 'list' with 'buffer' and, along with them, 'capacity' with
+ * 'bufferCapacity'.  'len' is not changed here.
+ */
+void UnicodeSet::swapBuffers(void) {
+    UChar32* const oldList = list;
+    const int32_t oldCapacity = capacity;
+
+    list = buffer;
+    capacity = bufferCapacity;
+
+    buffer = oldList;
+    bufferCapacity = oldCapacity;
+}
+
+/**
+ * Empties this set and marks it as bogus.
+ * The flag is written after clear(), because clear() may reset fFlags.
+ */
+void UnicodeSet::setToBogus() {
+    // Remove everything in the set first ...
+    clear();
+    // ... then raise the flag.
+    fFlags = kIsBogus;
+}
+
+//----------------------------------------------------------------
+// Implementation: Fundamental operators
+//----------------------------------------------------------------
+
+/** Returns the larger of two code point values. */
+static inline UChar32 max(UChar32 a, UChar32 b) {
+    if (a > b) {
+        return a;
+    }
+    return b;
+}
+
+/* Replaces this set's range list with the symmetric difference (xor) of
+ * 'list' and 'other'.
+ *
+ * Both arrays are walked in ascending order and merged into 'buffer': a
+ * boundary value that occurs in only one of them is kept, a value that
+ * occurs in both is dropped.  The loop ends only when both cursors hold
+ * UNICODESET_HIGH, so both arrays must be ascending and terminated by
+ * that value.  The result is swapped in with swapBuffers() and the cached
+ * pattern is released.
+ *
+ * Nothing happens if this set is frozen or bogus, or if the buffer cannot
+ * be grown.
+ *
+ * other     boundary array to xor with; must not be NULL (not checked)
+ * otherLen  length of 'other'; used only to size the buffer
+ * polarity  see below
+ *
+ * polarity = 0, 3 is normal: x xor y
+ *
+ * NOTE(review): on the polarity 1/2 path with other[0] == UNICODESET_LOW,
+ * b is loaded from other[1] but j stays at 1, so the first other[j++] in
+ * the loop yields other[1] a second time.  No call in the visible code
+ * passes polarity 1 or 2 - confirm before relying on that path.
+ */
+// polarity = 1, 2: x xor ~y == x === y
+
+void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
+    if (isFrozen() || isBogus()) {
+        return;
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    ensureBufferCapacity(len + otherLen, status); // the merge emits at most len + otherLen values
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    int32_t i = 0, j = 0, k = 0; // cursors into list, other and buffer
+    UChar32 a = list[i++];
+    UChar32 b;
+    if (polarity == 1 || polarity == 2) {
+        b = UNICODESET_LOW; // complementing 'other' means starting it with an extra LOW boundary
+        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
+            ++j;
+            b = other[j];
+        }
+    } else {
+        b = other[j++];
+    }
+    // simplest of all the routines
+    // sort the values, discarding identicals!
+    for (;;) {
+        if (a < b) {
+            buffer[k++] = a;
+            a = list[i++];
+        } else if (b < a) {
+            buffer[k++] = b;
+            b = other[j++];
+        } else if (a != UNICODESET_HIGH) { // at this point, a == b
+            // discard both values!
+            a = list[i++];
+            b = other[j++];
+        } else { // DONE!
+            buffer[k++] = UNICODESET_HIGH;
+            len = k;
+            break;
+        }
+    }
+    swapBuffers(); // the merged result in 'buffer' becomes the new 'list'
+    releasePattern();
+}
+
+/* Replaces this set's range list with the union of 'list' and 'other'
+ * (either side optionally complemented, see polarity).
+ *
+ * Both arrays are walked in ascending order with cursors a (this list)
+ * and b (other); the result is built in 'buffer', swapped in with
+ * swapBuffers(), and the cached pattern is released.  The loop ends only
+ * when a boundary equal to UNICODESET_HIGH is reached in one of the
+ * "goto loop_end" tests, so both arrays must be terminated by that value.
+ *
+ * 'polarity' doubles as the loop state: bit 1 is flipped every time a
+ * advances and bit 2 every time b advances, so the four cases of the
+ * switch correspond to the four combinations of "a/b is the first or the
+ * second value of its pair".
+ *
+ * Nothing happens if this set is frozen or bogus, if other is NULL, or if
+ * the buffer cannot be grown.
+ *
+ * other     boundary array to merge in
+ * otherLen  length of 'other'; used only to size the buffer
+ * polarity  initial state, one of the values listed here:
+ *
+ * polarity = 0 is normal: x union y
+ */
+// polarity = 2: x union ~y
+// polarity = 1: ~x union y
+// polarity = 3: ~x union ~y
+
+void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
+    if (isFrozen() || isBogus() || other==NULL) {
+        return;
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    ensureBufferCapacity(len + otherLen, status); // the merge emits at most len + otherLen values
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    int32_t i = 0, j = 0, k = 0; // cursors into list, other and buffer
+    UChar32 a = list[i++];
+    UChar32 b = other[j++];
+    // change from xor is that we have to check overlapping pairs
+    // polarity bit 1 means a is second, bit 2 means b is.
+    for (;;) {
+        switch (polarity) {
+          case 0: // both first; take lower if unequal
+            if (a < b) { // take a
+                // Back up over overlapping ranges in buffer[]
+                if (k > 0 && a <= buffer[k-1]) {
+                    // Pick latter end value in buffer[] vs. list[]
+                    a = max(list[i], buffer[--k]);
+                } else {
+                    // No overlap
+                    buffer[k++] = a;
+                    a = list[i];
+                }
+                i++; // Common if/else code factored out
+                polarity ^= 1;
+            } else if (b < a) { // take b
+                if (k > 0 && b <= buffer[k-1]) { // same back-up step as above, for b
+                    b = max(other[j], buffer[--k]);
+                } else {
+                    buffer[k++] = b;
+                    b = other[j];
+                }
+                j++;
+                polarity ^= 2;
+            } else { // a == b, take a, drop b
+                if (a == UNICODESET_HIGH) goto loop_end;
+                // This is symmetrical; it doesn't matter if
+                // we backtrack with a or b. - liu
+                if (k > 0 && a <= buffer[k-1]) {
+                    a = max(list[i], buffer[--k]);
+                } else {
+                    // No overlap
+                    buffer[k++] = a;
+                    a = list[i];
+                }
+                i++;
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+          case 3: // both second; take higher if unequal, and drop other
+            if (b <= a) { // take a
+                if (a == UNICODESET_HIGH) goto loop_end;
+                buffer[k++] = a;
+            } else { // take b
+                if (b == UNICODESET_HIGH) goto loop_end;
+                buffer[k++] = b;
+            }
+            a = list[i++];
+            polarity ^= 1;   // factored common code
+            b = other[j++];
+            polarity ^= 2;
+            break;
+          case 1: // a second, b first; if b < a, overlap
+            if (a < b) { // no overlap, take a
+                buffer[k++] = a; a = list[i++]; polarity ^= 1;
+            } else if (b < a) { // OVERLAP, drop b
+                b = other[j++];
+                polarity ^= 2;
+            } else { // a == b, drop both!
+                if (a == UNICODESET_HIGH) goto loop_end;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+          case 2: // a first, b second; if a < b, overlap
+            if (b < a) { // no overlap, take b
+                buffer[k++] = b;
+                b = other[j++];
+                polarity ^= 2;
+            } else  if (a < b) { // OVERLAP, drop a
+                a = list[i++];
+                polarity ^= 1;
+            } else { // a == b, drop both!
+                if (a == UNICODESET_HIGH) goto loop_end;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+        }
+    }
+ loop_end:
+    buffer[k++] = UNICODESET_HIGH;    // terminate
+    len = k;
+    swapBuffers(); // the merged result in 'buffer' becomes the new 'list'
+    releasePattern();
+}
+
+/* Replaces this set's range list with the intersection of 'list' and
+ * 'other' (either side optionally complemented, see polarity).
+ *
+ * Both arrays are walked in ascending order with cursors a (this list)
+ * and b (other); the result is built in 'buffer', swapped in with
+ * swapBuffers(), and the cached pattern is released.  The loop ends only
+ * when a == b == UNICODESET_HIGH, so both arrays must be terminated by
+ * that value.
+ *
+ * 'polarity' doubles as the loop state: bit 1 is flipped every time a
+ * advances and bit 2 every time b advances, so the four cases of the
+ * switch correspond to the four combinations of "a/b is the first or the
+ * second value of its pair".
+ *
+ * Nothing happens if this set is frozen or bogus, or if the buffer cannot
+ * be grown.
+ *
+ * other     boundary array to intersect with; must not be NULL (not checked)
+ * otherLen  length of 'other'; used only to size the buffer
+ * polarity  initial state, one of the values listed here:
+ *
+ * polarity = 0 is normal: x intersect y
+ */
+// polarity = 2: x intersect ~y == set-minus
+// polarity = 1: ~x intersect y
+// polarity = 3: ~x intersect ~y
+
+void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
+    if (isFrozen() || isBogus()) {
+        return;
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    ensureBufferCapacity(len + otherLen, status); // the merge emits at most len + otherLen values
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    int32_t i = 0, j = 0, k = 0; // cursors into list, other and buffer
+    UChar32 a = list[i++];
+    UChar32 b = other[j++];
+    // change from xor is that we have to check overlapping pairs
+    // polarity bit 1 means a is second, bit 2 means b is.
+    for (;;) {
+        switch (polarity) {
+          case 0: // both first; drop the smaller
+            if (a < b) { // drop a
+                a = list[i++];
+                polarity ^= 1;
+            } else if (b < a) { // drop b
+                b = other[j++];
+                polarity ^= 2;
+            } else { // a == b, take one, drop other
+                if (a == UNICODESET_HIGH) goto loop_end;
+                buffer[k++] = a;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+          case 3: // both second; take lower if unequal
+            if (a < b) { // take a
+                buffer[k++] = a;
+                a = list[i++];
+                polarity ^= 1;
+            } else if (b < a) { // take b
+                buffer[k++] = b;
+                b = other[j++];
+                polarity ^= 2;
+            } else { // a == b, take one, drop other
+                if (a == UNICODESET_HIGH) goto loop_end;
+                buffer[k++] = a;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+          case 1: // a second, b first;
+            if (a < b) { // NO OVERLAP, drop a
+                a = list[i++];
+                polarity ^= 1;
+            } else if (b < a) { // OVERLAP, take b
+                buffer[k++] = b;
+                b = other[j++];
+                polarity ^= 2;
+            } else { // a == b, drop both!
+                if (a == UNICODESET_HIGH) goto loop_end;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+          case 2: // a first, b second; if a < b, overlap
+            if (b < a) { // no overlap, drop b
+                b = other[j++];
+                polarity ^= 2;
+            } else  if (a < b) { // OVERLAP, take a
+                buffer[k++] = a;
+                a = list[i++];
+                polarity ^= 1;
+            } else { // a == b, drop both!
+                if (a == UNICODESET_HIGH) goto loop_end;
+                a = list[i++];
+                polarity ^= 1;
+                b = other[j++];
+                polarity ^= 2;
+            }
+            break;
+        }
+    }
+ loop_end:
+    buffer[k++] = UNICODESET_HIGH;    // terminate
+    len = k;
+    swapBuffers(); // the merged result in 'buffer' becomes the new 'list'
+    releasePattern();
+}
+
+/**
+ * Appends the <code>toPattern()</code> representation of a string to
+ * the given buffer, one code point at a time.
+ * @param buf destination
+ * @param s string to append
+ * @param escapeUnprintable forwarded to the per-character overload
+ */
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s,
+                              UBool escapeUnprintable) {
+    int32_t pos = 0;
+    while (pos < s.length()) {
+        const UChar32 c = s.char32At(pos);
+        _appendToPat(buf, c, escapeUnprintable);
+        // Step over one or two code units, depending on the code point.
+        pos += U16_LENGTH(c);
+    }
+}
+
+/**
+ * Appends the <code>toPattern()</code> representation of a character to
+ * the given buffer.
+ * If escapeUnprintable is set and the character is unprintable, it is
+ * handed to ICU_Utility::escapeUnprintable(); when that call reports
+ * success nothing more is appended.  Otherwise the character is appended
+ * as is, preceded by a backslash if it is one of the pattern syntax
+ * characters tested below (this includes ':') or pattern white space.
+ * @param buf destination
+ * @param c character to append
+ * @param escapeUnprintable whether unprintable characters are escaped
+ */
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c,
+                              UBool escapeUnprintable) {
+    // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
+    // unprintable
+    if (escapeUnprintable && ICU_Utility::isUnprintable(c) &&
+        ICU_Utility::escapeUnprintable(buf, c)) {
+        return;
+    }
+
+    const UBool isSyntax =
+        c == SET_OPEN || c == SET_CLOSE || c == HYPHEN ||
+        c == COMPLEMENT || c == INTERSECTION || c == BACKSLASH ||
+        c == OPEN_BRACE || c == CLOSE_BRACE || c == COLON ||
+        c == SymbolTable::SYMBOL_REF;
+
+    // Syntax characters and white space need a backslash in front.
+    if (isSyntax || PatternProps::isWhiteSpace(c)) {
+        buf.append(BACKSLASH);
+    }
+    buf.append(c);
+}
+
+/**
+ * Appends a string representation of this set to result.
+ * If a pattern string is stored in 'pat', it is copied, with unprintable
+ * characters replaced by escapes when escapeUnprintable is set.
+ * Without a stored pattern the representation is produced by
+ * _generatePattern().
+ * @param result buffer to append to
+ * @param escapeUnprintable whether unprintable characters are escaped
+ * @return result when the stored pattern is used, otherwise whatever
+ *         _generatePattern() returns
+ */
+UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
+                                      UBool escapeUnprintable) const
+{
+    if (pat == NULL) {
+        return _generatePattern(result, escapeUnprintable);
+    }
+
+    // Length of the run of backslashes that ends at the current position.
+    int32_t trailingBackslashes = 0;
+    int32_t pos = 0;
+    while (pos < patLen) {
+        UChar32 c;
+        U16_NEXT(pat, pos, patLen, c);
+
+        if (!(escapeUnprintable && ICU_Utility::isUnprintable(c))) {
+            result.append(c);
+            trailingBackslashes = (c == BACKSLASH) ? trailingBackslashes + 1 : 0;
+            continue;
+        }
+
+        // An odd number of preceding backslashes means the unprintable
+        // character was itself escaped; delete that final backslash
+        // before writing the hex escape.
+        if ((trailingBackslashes % 2) == 1) {
+            result.truncate(result.length() - 1);
+        }
+        ICU_Utility::escapeUnprintable(result, c);
+        trailingBackslashes = 0;
+    }
+    return result;
+}
+
+/**
+ * Writes a string representation of this set into result, replacing
+ * whatever result held before.  The intent is that passing the
+ * returned pattern to a UnicodeSet constructor produces another set
+ * that is equal to this one.
+ *
+ * @param result            receives the pattern; previous contents are
+ *                          discarded
+ * @param escapeUnprintable forwarded unchanged to _toPattern()
+ * @return a reference to result
+ */
+UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable) const
+{
+    // _toPattern() only appends, so start from an empty string.
+    result.truncate(0);
+    _toPattern(result, escapeUnprintable);
+    return result;
+}
+
+/**
+ * Builds a pattern for this set from its code point ranges and its
+ * strings and appends it to result.  The cached pattern (pat) is
+ * deliberately not consulted here.
+ *
+ * The output has the form "[" ranges {string}... "]".  When the set
+ * has at least two ranges and spans MIN_VALUE..MAX_VALUE, the gaps
+ * between the ranges are written after a leading '^' instead, which
+ * is the shorter form.
+ *
+ * @param result            string to which the pattern is appended
+ * @param escapeUnprintable forwarded to _appendToPat() for every code
+ *                          point and string that is emitted
+ * @return a reference to result
+ */
+UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
+                                            UBool escapeUnprintable) const
+{
+    result.append(SET_OPEN);
+
+//  // Check against the predefined categories.  We implicitly build
+//  // up ALL category sets the first time toPattern() is called.
+//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
+//      if (*this == getCategorySet(cat)) {
+//          result.append(COLON);
+//          result.append(CATEGORY_NAMES, cat*2, 2);
+//          return result.append(CATEGORY_CLOSE);
+//      }
+//  }
+
+    const int32_t rangeCount = getRangeCount();
+
+    // With two or more ranges that together touch both ends of the
+    // code point space, describing the gaps is more economical than
+    // describing the ranges themselves.
+    const UBool emitInverse = rangeCount > 1 &&
+        getRangeStart(0) == MIN_VALUE &&
+        getRangeEnd(rangeCount-1) == MAX_VALUE;
+    if (emitInverse) {
+        result.append(COMPLEMENT);
+    }
+
+    // In inverse mode, iteration r covers the gap between range r-1
+    // and range r, so it starts at 1; otherwise it covers range r.
+    for (int32_t r = emitInverse ? 1 : 0; r < rangeCount; ++r) {
+        UChar32 first;
+        UChar32 last;
+        if (emitInverse) {
+            first = getRangeEnd(r-1)+1;
+            last = getRangeStart(r)-1;
+        } else {
+            first = getRangeStart(r);
+            last = getRangeEnd(r);
+        }
+
+        _appendToPat(result, first, escapeUnprintable);
+        if (first == last) {
+            // Single code point: nothing more to write.
+            continue;
+        }
+        // Two adjacent code points are written back to back; only a
+        // longer run gets the '-' range notation.
+        if ((first+1) != last) {
+            result.append(HYPHEN);
+        }
+        _appendToPat(result, last, escapeUnprintable);
+    }
+
+    // Each string of the set is emitted as {string}.
+    const int32_t stringCount = strings->size();
+    for (int32_t s = 0; s < stringCount; ++s) {
+        const UnicodeString& str = *(const UnicodeString*) strings->elementAt(s);
+        result.append(OPEN_BRACE);
+        _appendToPat(result, str, escapeUnprintable);
+        result.append(CLOSE_BRACE);
+    }
+    return result.append(SET_CLOSE);
+}
+
+/**
+ * Frees the cached pattern string, if there is one, and resets the
+ * pat/patLen pair to the "no cached pattern" state.
+ */
+void UnicodeSet::releasePattern() {
+    if (pat == NULL) {
+        // Nothing cached; leave patLen untouched.
+        return;
+    }
+    uprv_free(pat);
+    pat = NULL;
+    patLen = 0;
+}
+
+/**
+ * Replaces the cached pattern with a NUL-terminated copy of newPat.
+ * Any previously cached pattern is released first.  If the copy cannot
+ * be allocated, the set simply ends up with no cached pattern.
+ *
+ * @param newPat the pattern text to cache
+ */
+void UnicodeSet::setPattern(const UnicodeString& newPat) {
+    releasePattern();
+
+    const int32_t newPatLen = newPat.length();
+    // One extra unit for the terminating NUL.
+    UChar *copy = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
+    pat = copy;
+    if (copy == NULL) {
+        // We don't care if malloc failed. This was just a nice cache.
+        // We can regenerate an equivalent pattern later when requested.
+        return;
+    }
+
+    patLen = newPatLen;
+    newPat.extractBetween(0, newPatLen, copy);
+    copy[newPatLen] = 0;
+}
+
+/**
+ * Freezes this set: trims its storage and builds the helper object
+ * (a UnicodeSetStringSpan or a BMPSet) used by contains(), span() and
+ * similar functions.  Does nothing if the set is already frozen or is
+ * bogus.
+ *
+ * If a helper object cannot be allocated, the set is marked bogus.
+ * A failure to shrink the range list is not an error; the existing,
+ * larger array is simply kept.
+ *
+ * @return this
+ */
+UnicodeFunctor *UnicodeSet::freeze() {
+    if(!isFrozen() && !isBogus()) {
+        // Do most of what compact() does before freezing because
+        // compact() will not work when the set is frozen.
+        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
+
+        // Delete buffer first to defragment memory less.
+        if (buffer != NULL) {
+            uprv_free(buffer);
+            buffer = NULL;
+        }
+        if (capacity > (len + GROW_EXTRA)) {
+            // Make the capacity equal to len or 1.
+            // We don't want to realloc of 0 size.
+            int32_t newCapacity = len + (len == 0);
+            // Reallocate through a temporary: as with standard realloc,
+            // a failed call leaves the original block allocated, so
+            // overwriting list with NULL would leak it.
+            UChar32* shrunk = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
+            if (shrunk != NULL) {
+                list = shrunk;
+                capacity = newCapacity;
+            }
+            // else: shrinking is only an optimization. Keep the original
+            // array and capacity, which are still valid.
+        }
+
+        // Optimize contains() and span() and similar functions.
+        if (!strings->isEmpty()) {
+            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
+            if (stringSpan == NULL) {
+                // Memory allocation error. Do not fall through to the
+                // BMPSet below: we could not determine whether the
+                // strings are span-relevant, and a BMPSet-only frozen
+                // set would ignore them in span() and friends.
+                setToBogus();
+                return this;
+            }
+            if (!stringSpan->needsStringSpanUTF16()) {
+                // All strings are irrelevant for span() etc. because
+                // all of each string's code points are contained in this set.
+                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
+                // many relevant strings as UTF-16.
+                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
+                delete stringSpan;
+                stringSpan = NULL;
+            }
+        }
+        if (stringSpan == NULL) {
+            // No span-relevant strings: Optimize for code point spans.
+            bmpSet=new BMPSet(list, len);
+            if (bmpSet == NULL) { // Check for memory allocation error.
+                setToBogus();
+            }
+        }
+    }
+    return this;
+}
+
+/**
+ * Scans the UTF-16 string s forward from index 0 and returns the index
+ * at which the scan stops for the given spanCondition.
+ *
+ * @param s             UTF-16 text
+ * @param length        number of UChars in s; a negative value means s
+ *                      is NUL-terminated and is measured with u_strlen()
+ * @param spanCondition selects whether contained or not-contained text
+ *                      is spanned
+ * @return the end index of the span; 0 for an empty string
+ */
+int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+    // Fast path: delegate to the BMPSet when there is one and the
+    // length is known.
+    if(bmpSet!=NULL && length>0) {
+        return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
+    }
+    if(length<0) {
+        length=u_strlen(s);
+    }
+    if(length==0) {
+        return 0;
+    }
+    if(stringSpan!=NULL) {
+        return stringSpan->span(s, length, spanCondition);
+    }
+    if(!strings->isEmpty()) {
+        // No prebuilt string span helper: build a temporary one that
+        // supports only the direction and condition needed here.
+        uint32_t which;
+        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+            which=UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED;
+        } else {
+            which=UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
+        }
+        UnicodeSetStringSpan strSpan(*this, *strings, which);
+        if(strSpan.needsStringSpanUTF16()) {
+            return strSpan.span(s, length, spanCondition);
+        }
+    }
+
+    // Code-point-only scan.  Collapse every condition other than
+    // NOT_CONTAINED to CONTAINED so that it can be compared directly
+    // with the result of contains().
+    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
+    }
+
+    int32_t next=0, spanEnd=0;
+    for(;;) {
+        UChar32 c;
+        U16_NEXT(s, next, length, c);
+        if(spanCondition!=contains(c)) {
+            // c does not belong to the span; stop before it.
+            break;
+        }
+        spanEnd=next;
+        if(spanEnd>=length) {
+            break;
+        }
+    }
+    return spanEnd;
+}
+
+/**
+ * Scans the UTF-16 string s backward from its end and returns the
+ * index at which the scan stops for the given spanCondition.
+ *
+ * @param s             UTF-16 text
+ * @param length        number of UChars in s; a negative value means s
+ *                      is NUL-terminated and is measured with u_strlen()
+ * @param spanCondition selects whether contained or not-contained text
+ *                      is spanned
+ * @return the start index of the trailing span; 0 for an empty string
+ */
+int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
+    // Fast path: delegate to the BMPSet when there is one and the
+    // length is known.
+    if(bmpSet!=NULL && length>0) {
+        return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
+    }
+    if(length<0) {
+        length=u_strlen(s);
+    }
+    if(length==0) {
+        return 0;
+    }
+    if(stringSpan!=NULL) {
+        return stringSpan->spanBack(s, length, spanCondition);
+    }
+    if(!strings->isEmpty()) {
+        // No prebuilt string span helper: build a temporary one that
+        // supports only the direction and condition needed here.
+        uint32_t which;
+        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+            which=UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED;
+        } else {
+            which=UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
+        }
+        UnicodeSetStringSpan strSpan(*this, *strings, which);
+        if(strSpan.needsStringSpanUTF16()) {
+            return strSpan.spanBack(s, length, spanCondition);
+        }
+    }
+
+    // Code-point-only scan.  Collapse every condition other than
+    // NOT_CONTAINED to CONTAINED so that it can be compared directly
+    // with the result of contains().
+    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
+    }
+
+    // U16_PREV moves length back over the code point it reads, so
+    // length doubles as the scan position.
+    int32_t spanStart=length;
+    for(;;) {
+        UChar32 c;
+        U16_PREV(s, 0, length, c);
+        if(spanCondition!=contains(c)) {
+            // c does not belong to the span; stop after it.
+            break;
+        }
+        spanStart=length;
+        if(spanStart<=0) {
+            break;
+        }
+    }
+    return spanStart;
+}
+
+/**
+ * UTF-8 counterpart of span(): scans s forward from index 0 and
+ * returns the byte index at which the scan stops for the given
+ * spanCondition.
+ *
+ * @param s             UTF-8 text
+ * @param length        number of bytes in s; a negative value means s
+ *                      is NUL-terminated and is measured with
+ *                      uprv_strlen()
+ * @param spanCondition selects whether contained or not-contained text
+ *                      is spanned
+ * @return the end byte index of the span; 0 for an empty string
+ */
+int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
+    // Fast path: delegate to the BMPSet when there is one and the
+    // length is known.
+    if(bmpSet!=NULL && length>0) {
+        const uint8_t *s0=(const uint8_t *)s;
+        return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
+    }
+    if(length<0) {
+        length=(int32_t)uprv_strlen(s);
+    }
+    if(length==0) {
+        return 0;
+    }
+    if(stringSpan!=NULL) {
+        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
+    }
+    if(!strings->isEmpty()) {
+        // No prebuilt string span helper: build a temporary one that
+        // supports only the direction and condition needed here.
+        uint32_t which;
+        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+            which=UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED;
+        } else {
+            which=UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
+        }
+        UnicodeSetStringSpan strSpan(*this, *strings, which);
+        if(strSpan.needsStringSpanUTF8()) {
+            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
+        }
+    }
+
+    // Code-point-only scan.  Collapse every condition other than
+    // NOT_CONTAINED to CONTAINED so that it can be compared directly
+    // with the result of contains().
+    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
+    }
+
+    int32_t next=0, spanEnd=0;
+    for(;;) {
+        UChar32 c;
+        U8_NEXT_OR_FFFD(s, next, length, c);
+        if(spanCondition!=contains(c)) {
+            // c does not belong to the span; stop before it.
+            break;
+        }
+        spanEnd=next;
+        if(spanEnd>=length) {
+            break;
+        }
+    }
+    return spanEnd;
+}
+
+/**
+ * UTF-8 counterpart of spanBack(): scans s backward from its end and
+ * returns the byte index at which the scan stops for the given
+ * spanCondition.
+ *
+ * @param s             UTF-8 text
+ * @param length        number of bytes in s; a negative value means s
+ *                      is NUL-terminated and is measured with
+ *                      uprv_strlen()
+ * @param spanCondition selects whether contained or not-contained text
+ *                      is spanned
+ * @return the start byte index of the trailing span; 0 for an empty
+ *         string
+ */
+int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
+    // Fast path: delegate to the BMPSet when there is one and the
+    // length is known.
+    if(bmpSet!=NULL && length>0) {
+        const uint8_t *s0=(const uint8_t *)s;
+        return bmpSet->spanBackUTF8(s0, length, spanCondition);
+    }
+    if(length<0) {
+        length=(int32_t)uprv_strlen(s);
+    }
+    if(length==0) {
+        return 0;
+    }
+    if(stringSpan!=NULL) {
+        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
+    }
+    if(!strings->isEmpty()) {
+        // No prebuilt string span helper: build a temporary one that
+        // supports only the direction and condition needed here.
+        uint32_t which;
+        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
+            which=UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED;
+        } else {
+            which=UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
+        }
+        UnicodeSetStringSpan strSpan(*this, *strings, which);
+        if(strSpan.needsStringSpanUTF8()) {
+            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
+        }
+    }
+
+    // Code-point-only scan.  Collapse every condition other than
+    // NOT_CONTAINED to CONTAINED so that it can be compared directly
+    // with the result of contains().
+    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
+    }
+
+    // U8_PREV_OR_FFFD moves length back over the code point it reads,
+    // so length doubles as the scan position.
+    int32_t spanStart=length;
+    for(;;) {
+        UChar32 c;
+        U8_PREV_OR_FFFD(s, 0, length, c);
+        if(spanCondition!=contains(c)) {
+            // c does not belong to the span; stop after it.
+            break;
+        }
+        spanStart=length;
+        if(spanStart<=0) {
+            break;
+        }
+    }
+    return spanStart;
+}
+
+U_NAMESPACE_END