diff --git a/README.chromium b/README.chromium index 9b947618ea..2ff4f2a322 100644 --- a/README.chromium +++ b/README.chromium @@ -265,3 +265,4 @@ D. Local Modifications https://unicode-org.atlassian.net/browse/ICU-20250 - Fix: https://github.com/unicode-org/icu/pull/265 + https://github.com/unicode-org/icu/pull/278 diff --git a/patches/uniset_perf2.patch b/patches/uniset_perf2.patch new file mode 100644 index 0000000000..621b3e5d45 --- /dev/null +++ b/patches/uniset_perf2.patch @@ -0,0 +1,1607 @@ +diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp +index 3aff85b3..b416ef52 100644 +--- a/source/common/characterproperties.cpp ++++ b/source/common/characterproperties.cpp +@@ -23,6 +23,9 @@ + #include "umutex.h" + #include "uprops.h" + ++using icu::LocalPointer; ++using icu::Normalizer2Factory; ++using icu::Normalizer2Impl; + using icu::UInitOnce; + using icu::UnicodeSet; + +@@ -30,11 +33,13 @@ namespace { + + UBool U_CALLCONV characterproperties_cleanup(); + ++constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; ++ + struct Inclusion { + UnicodeSet *fSet; + UInitOnce fInitOnce; + }; +-Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() ++Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() + + UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; + +@@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() { + return TRUE; + } + +-} // namespace +- +-U_NAMESPACE_BEGIN +- +-/* +-Reduce excessive reallocation, and make it easier to detect initialization problems. +-Usually you don't see smaller sets than this for Unicode 5.0. +-*/ +-constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; +- +-void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { ++void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). +- // This function is a friend of class UnicodeSet. +- + U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); + if (src == UPROPS_SRC_NONE) { + errorCode = U_INTERNAL_PROGRAM_ERROR; + return; + } +- UnicodeSet * &incl = gInclusions[src].fSet; +- U_ASSERT(incl == nullptr); ++ U_ASSERT(gInclusions[src].fSet == nullptr); + +- incl = new UnicodeSet(); +- if (incl == nullptr) { ++ LocalPointer incl(new UnicodeSet()); ++ if (incl.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + USetAdder sa = { +- (USet *)incl, ++ (USet *)incl.getAlias(), + _set_add, + _set_addRange, + _set_addString, +@@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo + nullptr // don't need removeRange() + }; + +- incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); + switch(src) { + case UPROPS_SRC_CHAR: + uchar_addPropertyStarts(&sa, &errorCode); +@@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo + } + + if (U_FAILURE(errorCode)) { +- delete incl; +- incl = nullptr; + return; + } +- // Compact for caching ++ if (incl->isBogus()) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ // Compact for caching. + incl->compact(); ++ gInclusions[src].fSet = incl.orphan(); + ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); + } + +@@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC + return nullptr; + } + Inclusion &i = gInclusions[src]; +- umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); ++ umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); + return i.fSet; + } + ++void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { ++ // This function is invoked only via umtx_initOnce(). ++ U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); ++ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; ++ U_ASSERT(gInclusions[inclIndex].fSet == nullptr); ++ UPropertySource src = uprops_getSource(prop); ++ const UnicodeSet *incl = getInclusionsForSource(src, errorCode); ++ if (U_FAILURE(errorCode)) { ++ return; ++ } ++ ++ LocalPointer intPropIncl(new UnicodeSet(0, 0)); ++ if (intPropIncl.isNull()) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ int32_t numRanges = incl->getRangeCount(); ++ int32_t prevValue = 0; ++ for (int32_t i = 0; i < numRanges; ++i) { ++ UChar32 rangeEnd = incl->getRangeEnd(i); ++ for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { ++ // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. ++ int32_t value = u_getIntPropertyValue(c, prop); ++ if (value != prevValue) { ++ intPropIncl->add(c); ++ prevValue = value; ++ } ++ } ++ } ++ ++ if (intPropIncl->isBogus()) { ++ errorCode = U_MEMORY_ALLOCATION_ERROR; ++ return; ++ } ++ // Compact for caching. ++ intPropIncl->compact(); ++ gInclusions[inclIndex].fSet = intPropIncl.orphan(); ++ ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); ++} ++ ++} // namespace ++ ++U_NAMESPACE_BEGIN ++ + const UnicodeSet *CharacterProperties::getInclusionsForProperty( + UProperty prop, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } +- UPropertySource src = uprops_getSource(prop); +- return getInclusionsForSource(src, errorCode); ++ if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { ++ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; ++ Inclusion &i = gInclusions[inclIndex]; ++ umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); ++ return i.fSet; ++ } else { ++ UPropertySource src = uprops_getSource(prop); ++ return getInclusionsForSource(src, errorCode); ++ } + } + + U_NAMESPACE_END +@@ -216,7 +261,7 @@ namespace { + + UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } +- icu::LocalPointer set(new UnicodeSet()); ++ LocalPointer set(new UnicodeSet()); + if (set.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return nullptr; +diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp +index 13496ad5..b72e3183 100644 +--- a/source/common/ucptrie.cpp ++++ b/source/common/ucptrie.cpp +@@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start, + int32_t prevI3Block = -1; + int32_t prevBlock = -1; + UChar32 c = start; +- uint32_t value; ++ uint32_t trieValue, value; + bool haveValue = false; + do { + int32_t i3Block; +@@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start, + return c - 1; + } + } else { ++ trieValue = trie->nullValue; + value = nullValue; + if (pValue != nullptr) { *pValue = nullValue; } + haveValue = true; +@@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start, + return c - 1; + } + } else { ++ trieValue = trie->nullValue; + value = nullValue; + if (pValue != nullptr) { *pValue = nullValue; } + haveValue = true; +@@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start, + c = (c + dataBlockLength) & ~dataMask; + } else { + int32_t di = block + (c & dataMask); +- uint32_t value2 = getValue(trie->data, valueWidth, di); +- value2 = maybeFilterValue(value2, trie->nullValue, nullValue, +- filter, context); ++ uint32_t trieValue2 = getValue(trie->data, valueWidth, di); + if (haveValue) { +- if (value2 != value) { +- return c - 1; ++ if (trieValue2 != trieValue) { ++ if (filter == nullptr || ++ maybeFilterValue(trieValue2, trie->nullValue, nullValue, ++ filter, context) != value) { ++ return c - 1; ++ } ++ trieValue = trieValue2; // may or may not help + } + } else { +- value = value2; ++ trieValue = trieValue2; ++ value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, ++ filter, context); + if (pValue != nullptr) { *pValue = value; } + haveValue = true; + } + while ((++c & dataMask) != 0) { +- if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), +- trie->nullValue, nullValue, +- filter, context) != value) { +- return c - 1; ++ trieValue2 = getValue(trie->data, valueWidth, ++di); ++ if (trieValue2 != trieValue) { ++ if (filter == nullptr || ++ maybeFilterValue(trieValue2, trie->nullValue, nullValue, ++ filter, context) != value) { ++ return c - 1; ++ } ++ trieValue = trieValue2; // may or may not help + } + } + } +diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp +index 44af8309..926be468 100644 +--- a/source/common/umutablecptrie.cpp ++++ b/source/common/umutablecptrie.cpp +@@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange( + uint32_t nullValue = initialValue; + if (filter != nullptr) { nullValue = filter(context, nullValue); } + UChar32 c = start; +- uint32_t value; ++ uint32_t trieValue, value; + bool haveValue = false; + int32_t i = c >> UCPTRIE_SHIFT_3; + do { + if (flags[i] == ALL_SAME) { +- uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, +- filter, context); ++ uint32_t trieValue2 = index[i]; + if (haveValue) { +- if (value2 != value) { +- return c - 1; ++ if (trieValue2 != trieValue) { ++ if (filter == nullptr || ++ maybeFilterValue(trieValue2, initialValue, nullValue, ++ filter, context) != value) { ++ return c - 1; ++ } ++ trieValue = trieValue2; // may or may not help + } + } else { +- value = value2; ++ trieValue = trieValue2; ++ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + if (pValue != nullptr) { *pValue = value; } + haveValue = true; + } + c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; + } else /* MIXED */ { + int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); +- uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, +- filter, context); ++ uint32_t trieValue2 = data[di]; + if (haveValue) { +- if (value2 != value) { +- return c - 1; ++ if (trieValue2 != trieValue) { ++ if (filter == nullptr || ++ maybeFilterValue(trieValue2, initialValue, nullValue, ++ filter, context) != value) { ++ return c - 1; ++ } ++ trieValue = trieValue2; // may or may not help + } + } else { +- value = value2; ++ trieValue = trieValue2; ++ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); + if (pValue != nullptr) { *pValue = value; } + haveValue = true; + } + while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { +- if (maybeFilterValue(data[++di], initialValue, nullValue, +- filter, context) != value) { +- return c - 1; ++ trieValue2 = data[++di]; ++ if (trieValue2 != trieValue) { ++ if (filter == nullptr || ++ maybeFilterValue(trieValue2, initialValue, nullValue, ++ filter, context) != value) { ++ return c - 1; ++ } + } ++ trieValue = trieValue2; // may or may not help + } + } + ++i; +diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h +index 0abc7542..af56b872 100644 +--- a/source/common/unicode/uniset.h ++++ b/source/common/unicode/uniset.h +@@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN + + // Forward Declarations. + class BMPSet; +-class CharacterProperties; + class ParsePosition; + class RBBIRuleScanner; + class SymbolTable; +@@ -276,14 +275,23 @@ class RuleCharacterIterator; + * @stable ICU 2.0 + */ + class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { ++private: ++ /** ++ * Enough for sets with few ranges. ++ * For example, White_Space has 10 ranges, list length 21. ++ */ ++ static constexpr int32_t INITIAL_CAPACITY = 25; ++ // fFlags constant ++ static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) ++ ++ UChar32* list = stackList; // MUST be terminated with HIGH ++ int32_t capacity = INITIAL_CAPACITY; // capacity of list ++ int32_t len = 1; // length of list used; 1 <= len <= capacity ++ uint8_t fFlags = 0; // Bit flag (see constants above) + +- int32_t len; // length of list used; 0 <= len <= capacity +- int32_t capacity; // capacity of list +- UChar32* list; // MUST be terminated with HIGH +- BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. +- UChar32* buffer; // internal buffer, may be NULL +- int32_t bufferCapacity; // capacity of buffer +- int32_t patLen; ++ BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. ++ UChar32* buffer = nullptr; // internal buffer, may be NULL ++ int32_t bufferCapacity = 0; // capacity of buffer + + /** + * The pattern representation of this set. This may not be the +@@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { + * indicating that toPattern() must generate a pattern + * representation from the inversion list. + */ +- char16_t *pat; +- UVector* strings; // maintained in sorted order +- UnicodeSetStringSpan *stringSpan; ++ char16_t *pat = nullptr; ++ int32_t patLen = 0; ++ ++ UVector* strings = nullptr; // maintained in sorted order ++ UnicodeSetStringSpan *stringSpan = nullptr; ++ ++ /** ++ * Initial list array. ++ * Avoids some heap allocations, and list is never nullptr. ++ * Increases the object size a bit. ++ */ ++ UChar32 stackList[INITIAL_CAPACITY]; + +-private: +- enum { // constants +- kIsBogus = 1 // This set is bogus (i.e. not valid) +- }; +- uint8_t fFlags; // Bit flag (see constants above) + public: + /** + * Determine if this object contains a valid set. +@@ -1480,8 +1492,6 @@ private: + + friend class USetAccess; + +- int32_t getStringCount() const; +- + const UnicodeString* getString(int32_t index) const; + + //---------------------------------------------------------------- +@@ -1528,13 +1538,18 @@ private: + // Implementation: Utility methods + //---------------------------------------------------------------- + +- void ensureCapacity(int32_t newLen, UErrorCode& ec); ++ static int32_t nextCapacity(int32_t minCapacity); ++ ++ bool ensureCapacity(int32_t newLen); + +- void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); ++ bool ensureBufferCapacity(int32_t newLen); + + void swapBuffers(void); + + UBool allocateStrings(UErrorCode &status); ++ UBool hasStrings() const; ++ int32_t stringsSize() const; ++ UBool stringsContains(const UnicodeString &s) const; + + UnicodeString& _toPattern(UnicodeString& result, + UBool escapeUnprintable) const; +@@ -1614,7 +1629,6 @@ private: + UnicodeString& rebuiltPat, + UErrorCode& ec); + +- friend class CharacterProperties; + static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); + + /** +@@ -1646,7 +1660,10 @@ private: + /** + * Set the new pattern to cache. + */ +- void setPattern(const UnicodeString& newPat); ++ void setPattern(const UnicodeString& newPat) { ++ setPattern(newPat.getBuffer(), newPat.length()); ++ } ++ void setPattern(const char16_t *newPat, int32_t newPatLen); + /** + * Release existing cached pattern. + */ +diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp +index e8378e0a..20242776 100644 +--- a/source/common/uniset.cpp ++++ b/source/common/uniset.cpp +@@ -14,6 +14,7 @@ + #include "unicode/parsepos.h" + #include "unicode/symtable.h" + #include "unicode/uniset.h" ++#include "unicode/ustring.h" + #include "unicode/utf8.h" + #include "unicode/utf16.h" + #include "ruleiter.h" +@@ -53,11 +54,8 @@ + // LOW <= all valid values. ZERO for codepoints + #define UNICODESET_LOW 0x000000 + +-// initial storage. Must be >= 0 +-#define START_EXTRA 16 +- +-// extra amount for growth. Must be >= 0 +-#define GROW_EXTRA START_EXTRA ++/** Max list [0, 1, 2, ..., max code point, HIGH] */ ++constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; + + U_NAMESPACE_BEGIN + +@@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { + return a.compare(b); + } + ++UBool UnicodeSet::hasStrings() const { ++ return strings != nullptr && !strings->isEmpty(); ++} ++ ++int32_t UnicodeSet::stringsSize() const { ++ return strings == nullptr ? 0 : strings->size(); ++} ++ ++UBool UnicodeSet::stringsContains(const UnicodeString &s) const { ++ return strings != nullptr && strings->contains((void*) &s); ++} ++ + //---------------------------------------------------------------- + // Constructors &c + //---------------------------------------------------------------- +@@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { + /** + * Constructs an empty set. + */ +-UnicodeSet::UnicodeSet() : +- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- UErrorCode status = U_ZERO_ERROR; +- allocateStrings(status); +- if (U_FAILURE(status)) { +- setToBogus(); // If memory allocation failed, set to bogus state. +- return; +- } +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- if(list!=NULL){ +- list[0] = UNICODESET_HIGH; +- } else { // If memory allocation failed, set to bogus state. +- setToBogus(); +- return; +- } ++UnicodeSet::UnicodeSet() { ++ list[0] = UNICODESET_HIGH; + _dbgct(this); + } + +@@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() : + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range + */ +-UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : +- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- UErrorCode status = U_ZERO_ERROR; +- allocateStrings(status); +- if (U_FAILURE(status)) { +- setToBogus(); // If memory allocation failed, set to bogus state. +- return; +- } +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- if(list!=NULL){ +- list[0] = UNICODESET_HIGH; +- complement(start, end); +- } else { // If memory allocation failed, set to bogus state. +- setToBogus(); +- return; +- } ++UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { ++ list[0] = UNICODESET_HIGH; ++ add(start, end); + _dbgct(this); + } + + /** + * Constructs a set that is identical to the given UnicodeSet. + */ +-UnicodeSet::UnicodeSet(const UnicodeSet& o) : +- UnicodeFilter(o), +- len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), +- bmpSet(0), +- buffer(0), bufferCapacity(0), +- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- UErrorCode status = U_ZERO_ERROR; +- allocateStrings(status); +- if (U_FAILURE(status)) { +- setToBogus(); // If memory allocation failed, set to bogus state. +- return; +- } +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- if(list!=NULL){ +- *this = o; +- } else { // If memory allocation failed, set to bogus state. +- setToBogus(); +- return; +- } ++UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { ++ *this = o; + _dbgct(this); + } + + // Copy-construct as thawed. +-UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : +- UnicodeFilter(o), +- len(0), capacity(o.len + GROW_EXTRA), list(0), +- bmpSet(0), +- buffer(0), bufferCapacity(0), +- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- UErrorCode status = U_ZERO_ERROR; +- allocateStrings(status); +- if (U_FAILURE(status)) { +- setToBogus(); // If memory allocation failed, set to bogus state. +- return; +- } +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- if(list!=NULL){ ++UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { ++ if (ensureCapacity(o.len)) { + // *this = o except for bmpSet and stringSpan + len = o.len; + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); +- if (strings != NULL && o.strings != NULL) { +- strings->assign(*o.strings, cloneUnicodeString, status); +- } else { // Invalid strings. +- setToBogus(); +- return; ++ if (o.hasStrings()) { ++ UErrorCode status = U_ZERO_ERROR; ++ if (!allocateStrings(status) || ++ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { ++ setToBogus(); ++ return; ++ } + } + if (o.pat) { +- setPattern(UnicodeString(o.pat, o.patLen)); ++ setPattern(o.pat, o.patLen); + } +- } else { // If memory allocation failed, set to bogus state. +- setToBogus(); +- return; ++ _dbgct(this); + } +- _dbgct(this); + } + + /** +@@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : + */ + UnicodeSet::~UnicodeSet() { + _dbgdt(this); // first! +- uprv_free(list); ++ if (list != stackList) { ++ uprv_free(list); ++ } + delete bmpSet; +- if (buffer) { ++ if (buffer != stackList) { + uprv_free(buffer); + } + delete strings; +@@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { + setToBogus(); + return *this; + } +- UErrorCode ec = U_ZERO_ERROR; +- ensureCapacity(o.len, ec); +- if (U_FAILURE(ec)) { ++ if (!ensureCapacity(o.len)) { + // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. + return *this; + } + len = o.len; + uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); +- if (o.bmpSet == NULL || asThawed) { +- bmpSet = NULL; +- } else { ++ if (o.bmpSet != nullptr && !asThawed) { + bmpSet = new BMPSet(*o.bmpSet, list, len); + if (bmpSet == NULL) { // Check for memory allocation error. + setToBogus(); + return *this; + } + } +- if (strings != NULL && o.strings != NULL) { +- strings->assign(*o.strings, cloneUnicodeString, ec); +- } else { // Invalid strings. +- setToBogus(); +- return *this; ++ if (o.hasStrings()) { ++ UErrorCode status = U_ZERO_ERROR; ++ if ((strings == nullptr && !allocateStrings(status)) || ++ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { ++ setToBogus(); ++ return *this; ++ } ++ } else if (hasStrings()) { ++ strings->removeAllElements(); + } +- if (o.stringSpan == NULL || asThawed) { +- stringSpan = NULL; +- } else { ++ if (o.stringSpan != nullptr && !asThawed) { + stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); + if (stringSpan == NULL) { // Check for memory allocation error. + setToBogus(); +@@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { + } + releasePattern(); + if (o.pat) { +- setPattern(UnicodeString(o.pat, o.patLen)); ++ setPattern(o.pat, o.patLen); + } + return *this; + } +@@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { + for (int32_t i = 0; i < len; ++i) { + if (list[i] != o.list[i]) return FALSE; + } +- if (*strings != *o.strings) return FALSE; ++ if (hasStrings() != o.hasStrings()) { return FALSE; } ++ if (hasStrings() && *strings != *o.strings) return FALSE; + return TRUE; + } + +@@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const { + for (int32_t i = 0; i < count; ++i) { + n += getRangeEnd(i) - getRangeStart(i) + 1; + } +- return n + strings->size(); ++ return n + stringsSize(); + } + + /** +@@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const { + * @return true if this set contains no elements. + */ + UBool UnicodeSet::isEmpty(void) const { +- return len == 1 && strings->size() == 0; ++ return len == 1 && !hasStrings(); + } + + /** +@@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { + if (s.length() == 0) return FALSE; + int32_t cp = getSingleCP(s); + if (cp < 0) { +- return strings->contains((void*) &s); ++ return stringsContains(s); + } else { + return contains((UChar32) cp); + } +@@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { + return FALSE; + } + } +- if (!strings->containsAll(*c.strings)) return FALSE; +- return TRUE; ++ return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); + } + + /** +@@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { + return FALSE; + } + } +- if (!strings->containsNone(*c.strings)) return FALSE; +- return TRUE; ++ return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); + } + + /** +@@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { + return TRUE; + } + } +- if (strings->size() != 0) { ++ if (hasStrings()) { + for (i=0; isize(); ++i) { + const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); + //if (s.length() == 0) { +@@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, + return U_MISMATCH; + } + } else { +- if (strings->size() != 0) { // try strings first ++ if (hasStrings()) { // try strings first + + // might separate forward and backward loops later + // for now they are combined +@@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { + */ + UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { + if (pinCodePoint(start) < pinCodePoint(end)) { +- UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; ++ UChar32 limit = end + 1; ++ // Fast path for adding a new range after the last one. ++ // Odd list length: [..., lastStart, lastLimit, HIGH] ++ if ((len & 1) != 0) { ++ // If the list is empty, set lastLimit low enough to not be adjacent to 0. ++ UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; ++ if (lastLimit <= start && !isFrozen() && !isBogus()) { ++ if (lastLimit == start) { ++ // Extend the last range. ++ list[len - 2] = limit; ++ if (limit == UNICODESET_HIGH) { ++ --len; ++ } ++ } else { ++ list[len - 1] = start; ++ if (limit < UNICODESET_HIGH) { ++ if (ensureCapacity(len + 2)) { ++ list[len++] = limit; ++ list[len++] = UNICODESET_HIGH; ++ } ++ } else { // limit == UNICODESET_HIGH ++ if (ensureCapacity(len + 1)) { ++ list[len++] = UNICODESET_HIGH; ++ } ++ } ++ } ++ releasePattern(); ++ return *this; ++ } ++ } ++ // This is slow. Could be much faster using findCodePoint(start) ++ // and modifying the list, dealing with adjacent & overlapping ranges. ++ UChar32 range[3] = { start, limit, UNICODESET_HIGH }; + add(range, 2, 0); + } else if (start == end) { + add(start); +@@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { + list[i] = c; + // if we touched the HIGH mark, then add a new one + if (c == (UNICODESET_HIGH - 1)) { +- UErrorCode status = U_ZERO_ERROR; +- ensureCapacity(len+1, status); +- if (U_FAILURE(status)) { ++ if (!ensureCapacity(len+1)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; + } +@@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { + // ^ + // list[i] + +- UErrorCode status = U_ZERO_ERROR; +- ensureCapacity(len+2, status); +- if (U_FAILURE(status)) { ++ if (!ensureCapacity(len+2)) { + // ensureCapacity will mark the object as Bogus if OOM failure happens. + return *this; + } + +- //for (int32_t k=len-1; k>=i; --k) { +- // list[k+2] = list[k]; +- //} +- UChar32* src = list + len; +- UChar32* dst = src + 2; +- UChar32* srclimit = list + i; +- while (src > srclimit) *(--dst) = *(--src); +- ++ UChar32 *p = list + i; ++ uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); + list[i] = c; + list[i+1] = c+1; + len += 2; +@@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { + if (s.length() == 0 || isFrozen() || isBogus()) return *this; + int32_t cp = getSingleCP(s); + if (cp < 0) { +- if (!strings->contains((void*) &s)) { ++ if (!stringsContains(s)) { + _add(s); + releasePattern(); + } +@@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) { + if (isFrozen() || isBogus()) { + return; + } ++ UErrorCode ec = U_ZERO_ERROR; ++ if (strings == nullptr && !allocateStrings(ec)) { ++ setToBogus(); ++ return; ++ } + UnicodeString* t = new UnicodeString(s); + if (t == NULL) { // Check for memory allocation error. + setToBogus(); + return; + } +- UErrorCode ec = U_ZERO_ERROR; + strings->sortedInsert(t, compareUnicodeString, ec); + if (U_FAILURE(ec)) { + setToBogus(); +@@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { + } + + UnicodeSet& UnicodeSet::removeAllStrings() { +- strings->removeAllElements(); ++ if (!isFrozen() && hasStrings()) { ++ strings->removeAllElements(); ++ releasePattern(); ++ } + return *this; + } + +@@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { + if (s.length() == 0 || isFrozen() || isBogus()) return *this; + int32_t cp = getSingleCP(s); + if (cp < 0) { +- strings->removeElement((void*) &s); +- releasePattern(); ++ if (strings != nullptr && strings->removeElement((void*) &s)) { ++ releasePattern(); ++ } + } else { + remove((UChar32)cp, (UChar32)cp); + } +@@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) { + if (isFrozen() || isBogus()) { + return *this; + } +- UErrorCode status = U_ZERO_ERROR; + if (list[0] == UNICODESET_LOW) { +- ensureBufferCapacity(len-1, status); +- if (U_FAILURE(status)) { +- return *this; +- } +- uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); ++ uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); + --len; + } else { +- ensureBufferCapacity(len+1, status); +- if (U_FAILURE(status)) { ++ if (!ensureCapacity(len+1)) { + return *this; + } +- uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); +- buffer[0] = UNICODESET_LOW; ++ uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); ++ list[0] = UNICODESET_LOW; + ++len; + } +- swapBuffers(); + releasePattern(); + return *this; + } +@@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { + if (s.length() == 0 || isFrozen() || isBogus()) return *this; + int32_t cp = getSingleCP(s); + if (cp < 0) { +- if (strings->contains((void*) &s)) { ++ if (stringsContains(s)) { + strings->removeElement((void*) &s); + } else { + _add(s); +@@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { + if ( c.strings!=NULL ) { + for (int32_t i=0; isize(); ++i) { + const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); +- if (!strings->contains((void*) s)) { ++ if (!stringsContains(*s)) { + _add(*s); + } + } +@@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { + return *this; + } + retain(c.list, c.len, 0); +- strings->retainAll(*c.strings); ++ if (hasStrings()) { ++ if (!c.hasStrings()) { ++ strings->removeAllElements(); ++ } else { ++ strings->retainAll(*c.strings); ++ } ++ } + return *this; + } + +@@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { + return *this; + } + retain(c.list, c.len, 2); +- strings->removeAll(*c.strings); ++ if (hasStrings() && c.hasStrings()) { ++ strings->removeAll(*c.strings); ++ } + return *this; + } + +@@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { + } + exclusiveOr(c.list, c.len, 0); + +- for (int32_t i=0; isize(); ++i) { +- void* e = c.strings->elementAt(i); +- if (!strings->removeElement(e)) { +- _add(*(const UnicodeString*)e); ++ if (c.strings != nullptr) { ++ for (int32_t i=0; isize(); ++i) { ++ void* e = c.strings->elementAt(i); ++ if (strings == nullptr || !strings->removeElement(e)) { ++ _add(*(const UnicodeString*)e); ++ } + } + } + return *this; +@@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) { + if (isFrozen()) { + return *this; + } +- if (list != NULL) { +- list[0] = UNICODESET_HIGH; +- } ++ list[0] = UNICODESET_HIGH; + len = 1; + releasePattern(); + if (strings != NULL) { + strings->removeAllElements(); + } +- if (list != NULL && strings != NULL) { +- // Remove bogus +- fFlags = 0; +- } ++ // Remove bogus ++ fFlags = 0; + return *this; + } + +@@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { + return list[index*2 + 1] - 1; + } + +-int32_t UnicodeSet::getStringCount() const { +- return strings->size(); +-} +- + const UnicodeString* UnicodeSet::getString(int32_t index) const { + return (const UnicodeString*) strings->elementAt(index); + } +@@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() { + return *this; + } + // Delete buffer first to defragment memory less. +- if (buffer != NULL) { ++ if (buffer != stackList) { + uprv_free(buffer); + buffer = NULL; +- } +- if (len < capacity) { +- // Make the capacity equal to len or 1. +- // We don't want to realloc of 0 size. +- int32_t newCapacity = len + (len == 0); +- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); ++ bufferCapacity = 0; ++ } ++ if (list == stackList) { ++ // pass ++ } else if (len <= INITIAL_CAPACITY) { ++ uprv_memcpy(stackList, list, len * sizeof(UChar32)); ++ uprv_free(list); ++ list = stackList; ++ capacity = INITIAL_CAPACITY; ++ } else if ((len + 7) < capacity) { ++ // If we have more than a little unused capacity, shrink it to len. ++ UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); + if (temp) { + list = temp; +- capacity = newCapacity; ++ capacity = len; + } + // else what the heck happened?! We allocated less memory! + // Oh well. We'll keep our original array. + } ++ if (strings != nullptr && strings->isEmpty()) { ++ delete strings; ++ strings = nullptr; ++ } + return *this; + } + +@@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() { + /** + * Deserialize constructor. + */ +-UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) +- : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) { ++UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, ++ UErrorCode &ec) { + + if(U_FAILURE(ec)) { + setToBogus(); +@@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se + return; + } + +- allocateStrings(ec); +- if (U_FAILURE(ec)) { +- setToBogus(); +- return; +- } +- + // bmp? + int32_t headerSize = ((data[0]&0x8000)) ?2:1; + int32_t bmpLength = (headerSize==1)?data[0]:data[1]; + +- len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; ++ int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; + #ifdef DEBUG_SERIALIZE +- printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); ++ printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); + #endif +- capacity = len+1; +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- if(!list || U_FAILURE(ec)) { +- setToBogus(); ++ if(!ensureCapacity(newLength + 1)) { // +1 for HIGH + return; + } + // copy bmp +@@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se + #endif + } + // copy smp +- for(i=bmpLength;i MAX_LENGTH) { ++ newCapacity = MAX_LENGTH; ++ } ++ return newCapacity; ++ } ++} ++ ++bool UnicodeSet::ensureCapacity(int32_t newLen) { ++ if (newLen > MAX_LENGTH) { ++ newLen = MAX_LENGTH; ++ } + if (newLen <= capacity) { +- return; ++ return true; + } +- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); ++ int32_t newCapacity = nextCapacity(newLen); ++ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); + if (temp == NULL) { +- ec = U_MEMORY_ALLOCATION_ERROR; + setToBogus(); // set the object to bogus state if an OOM failure occurred. +- return; ++ return false; ++ } ++ // Copy only the actual contents. ++ uprv_memcpy(temp, list, len * sizeof(UChar32)); ++ if (list != stackList) { ++ uprv_free(list); + } + list = temp; +- capacity = newLen + GROW_EXTRA; +- // else we keep the original contents on the memory failure. ++ capacity = newCapacity; ++ return true; + } + +-void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { +- if (buffer != NULL && newLen <= bufferCapacity) +- return; +- UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); ++bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { ++ if (newLen > MAX_LENGTH) { ++ newLen = MAX_LENGTH; ++ } ++ if (newLen <= bufferCapacity) { ++ return true; ++ } ++ int32_t newCapacity = nextCapacity(newLen); ++ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); + if (temp == NULL) { +- ec = U_MEMORY_ALLOCATION_ERROR; + setToBogus(); +- return; ++ return false; ++ } ++ // The buffer has no contents to be copied. ++ // It is always filled from scratch after this call. ++ if (buffer != stackList) { ++ uprv_free(buffer); + } + buffer = temp; +- bufferCapacity = newLen + GROW_EXTRA; +- // else we keep the original contents on the memory failure. ++ bufferCapacity = newCapacity; ++ return true; + } + + /** +@@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola + if (isFrozen() || isBogus()) { + return; + } +- UErrorCode status = U_ZERO_ERROR; +- ensureBufferCapacity(len + otherLen, status); +- if (U_FAILURE(status)) { ++ if (!ensureBufferCapacity(len + otherLen)) { + return; + } + +@@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { + if (isFrozen() || isBogus() || other==NULL) { + return; + } +- UErrorCode status = U_ZERO_ERROR; +- ensureBufferCapacity(len + otherLen, status); +- if (U_FAILURE(status)) { ++ if (!ensureBufferCapacity(len + otherLen)) { + return; + } + +@@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) + if (isFrozen() || isBogus()) { + return; + } +- UErrorCode status = U_ZERO_ERROR; +- ensureBufferCapacity(len + otherLen, status); +- if (U_FAILURE(status)) { ++ if (!ensureBufferCapacity(len + otherLen)) { + return; + } + +@@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, + } + } + +- for (int32_t i = 0; isize(); ++i) { +- result.append(OPEN_BRACE); +- _appendToPat(result, +- *(const UnicodeString*) strings->elementAt(i), +- escapeUnprintable); +- result.append(CLOSE_BRACE); ++ if (strings != nullptr) { ++ for (int32_t i = 0; isize(); ++i) { ++ result.append(OPEN_BRACE); ++ _appendToPat(result, ++ *(const UnicodeString*) strings->elementAt(i), ++ escapeUnprintable); ++ result.append(CLOSE_BRACE); ++ } + } + return result.append(SET_CLOSE); + } +@@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() { + /** + * Set the new pattern to cache. + */ +-void UnicodeSet::setPattern(const UnicodeString& newPat) { ++void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { + releasePattern(); +- int32_t newPatLen = newPat.length(); + pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); + if (pat) { + patLen = newPatLen; +- newPat.extractBetween(0, patLen, pat); ++ u_memcpy(pat, newPat, patLen); + pat[patLen] = 0; + } + // else we don't care if malloc failed. This was just a nice cache. +@@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) { + + UnicodeFunctor *UnicodeSet::freeze() { + if(!isFrozen() && !isBogus()) { +- // Do most of what compact() does before freezing because +- // compact() will not work when the set is frozen. +- // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). +- +- // Delete buffer first to defragment memory less. +- if (buffer != NULL) { +- uprv_free(buffer); +- buffer = NULL; +- } +- if (capacity > (len + GROW_EXTRA)) { +- // Make the capacity equal to len or 1. +- // We don't want to realloc of 0 size. +- capacity = len + (len == 0); +- list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); +- if (list == NULL) { // Check for memory allocation error. +- setToBogus(); +- return this; +- } +- } ++ compact(); + + // Optimize contains() and span() and similar functions. +- if (!strings->isEmpty()) { ++ if (hasStrings()) { + stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); +- if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { ++ if (stringSpan == nullptr) { ++ setToBogus(); ++ return this; ++ } else if (!stringSpan->needsStringSpanUTF16()) { + // All strings are irrelevant for span() etc. because + // all of each string's code points are contained in this set. + // Do not check needsStringSpanUTF8() because UTF-8 has at most as +@@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC + } + if(stringSpan!=NULL) { + return stringSpan->span(s, length, spanCondition); +- } else if(!strings->isEmpty()) { ++ } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : + UnicodeSetStringSpan::FWD_UTF16_CONTAINED; +@@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s + } + if(stringSpan!=NULL) { + return stringSpan->spanBack(s, length, spanCondition); +- } else if(!strings->isEmpty()) { ++ } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : + UnicodeSetStringSpan::BACK_UTF16_CONTAINED; +@@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp + } + if(stringSpan!=NULL) { + return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); +- } else if(!strings->isEmpty()) { ++ } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : + UnicodeSetStringSpan::FWD_UTF8_CONTAINED; +@@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio + } + if(stringSpan!=NULL) { + return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); +- } else if(!strings->isEmpty()) { ++ } else if(hasStrings()) { + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? + UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : + UnicodeSetStringSpan::BACK_UTF8_CONTAINED; +diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp +index 0b7da796..882231ba 100644 +--- a/source/common/uniset_closure.cpp ++++ b/source/common/uniset_closure.cpp +@@ -31,10 +31,6 @@ + #include "util.h" + #include "uvector.h" + +-// initial storage. Must be >= 0 +-// *** same as in uniset.cpp ! *** +-#define START_EXTRA 16 +- + U_NAMESPACE_BEGIN + + // TODO memory debugging provided inside uniset.cpp +@@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN + UnicodeSet::UnicodeSet(const UnicodeString& pattern, + uint32_t options, + const SymbolTable* symbols, +- UErrorCode& status) : +- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- if(U_SUCCESS(status)){ +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- /* test for NULL */ +- if(list == NULL) { +- status = U_MEMORY_ALLOCATION_ERROR; +- }else{ +- allocateStrings(status); +- applyPattern(pattern, options, symbols, status); +- } +- } ++ UErrorCode& status) { ++ applyPattern(pattern, options, symbols, status); + _dbgct(this); + } + + UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, + uint32_t options, + const SymbolTable* symbols, +- UErrorCode& status) : +- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- if(U_SUCCESS(status)){ +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- /* test for NULL */ +- if(list == NULL) { +- status = U_MEMORY_ALLOCATION_ERROR; +- }else{ +- allocateStrings(status); +- applyPattern(pattern, pos, options, symbols, status); +- } +- } ++ UErrorCode& status) { ++ applyPattern(pattern, pos, options, symbols, status); + _dbgct(this); + } + +@@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { + // start with input set to guarantee inclusion + // USET_CASE: remove strings because the strings will actually be reduced (folded); + // therefore, start with no strings and add only those needed +- if (attribute & USET_CASE_INSENSITIVE) { ++ if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { + foldSet.strings->removeAllElements(); + } + +@@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { + } + } + } +- if (strings != NULL && strings->size() > 0) { ++ if (hasStrings()) { + if (attribute & USET_CASE_INSENSITIVE) { + for (int32_t j=0; jsize(); ++j) { + str = *(const UnicodeString *) strings->elementAt(j); +diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp +index 6cfd80a7..e98c175f 100644 +--- a/source/common/uniset_props.cpp ++++ b/source/common/uniset_props.cpp +@@ -47,10 +47,6 @@ + + U_NAMESPACE_USE + +-// initial storage. Must be >= 0 +-// *** same as in uniset.cpp ! *** +-#define START_EXTRA 16 +- + // Define UChar constants using hex for EBCDIC compatibility + // Used #define to reduce private static exports and memory access time. + #define SET_OPEN ((UChar)0x005B) /*[*/ +@@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { + * @param pattern a string specifying what characters are in the set + */ + UnicodeSet::UnicodeSet(const UnicodeString& pattern, +- UErrorCode& status) : +- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), +- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), +- fFlags(0) +-{ +- if(U_SUCCESS(status)){ +- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); +- /* test for NULL */ +- if(list == NULL) { +- status = U_MEMORY_ALLOCATION_ERROR; +- }else{ +- allocateStrings(status); +- applyPattern(pattern, status); +- } +- } ++ UErrorCode& status) { ++ applyPattern(pattern, status); + _dbgct(this); + } + +@@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) { + return u_getNumericValue(ch) == *(double*)context; + } + ++static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { ++ int32_t value = *(int32_t*)context; ++ return (U_GET_GC_MASK((UChar32) ch) & value) != 0; ++} ++ + static UBool versionFilter(UChar32 ch, void* context) { + static const UVersionInfo none = { 0, 0, 0, 0 }; + UVersionInfo v; +@@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) { + return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; + } + ++typedef struct { ++ UProperty prop; ++ int32_t value; ++} IntPropertyContext; ++ ++static UBool intPropertyFilter(UChar32 ch, void* context) { ++ IntPropertyContext* c = (IntPropertyContext*)context; ++ return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; ++} ++ + static UBool scriptExtensionsFilter(UChar32 ch, void* context) { + return uscript_hasScript(ch, *(UScriptCode*)context); + } +@@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, + + namespace { + +-/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ +-uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { +- uint32_t mask = *(const uint32_t *)context; +- value = U_MASK(value) & mask; +- if (value != 0) { value = 1; } +- return value; +-} +- +-/** Maps one map value to 1, all others to 0. */ +-uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { +- uint32_t v = *(const uint32_t *)context; +- return value == v ? 1 : 0; +-} +- +-} // namespace +- +-void UnicodeSet::applyIntPropertyValue(const UCPMap *map, +- UCPMapValueFilter *filter, const void *context, +- UErrorCode &errorCode) { +- if (U_FAILURE(errorCode)) { return; } +- clear(); +- UChar32 start = 0, end; +- uint32_t value; +- while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, +- filter, context, &value)) >= 0) { +- if (value != 0) { +- add(start, end); +- } +- start = end + 1; +- } +- if (isBogus()) { +- errorCode = U_MEMORY_ALLOCATION_ERROR; +- } +-} +- +-namespace { +- + static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { + /* Note: we use ' ' in compiler code page */ + int32_t j = 0; +@@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { + + UnicodeSet& + UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { +- if (U_FAILURE(ec)) { return *this; } +- // All of the following check isFrozen() before modifying this set. ++ if (U_FAILURE(ec) || isFrozen()) { return *this; } + if (prop == UCHAR_GENERAL_CATEGORY_MASK) { +- const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); +- applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); ++ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); ++ applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); + } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); + UScriptCode script = (UScriptCode)value; +@@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) + clear(); + } + } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { +- const UCPMap *map = u_getIntPropertyMap(prop, &ec); +- applyIntPropertyValue(map, intValueFilter, &value, ec); ++ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); ++ IntPropertyContext c = {prop, value}; ++ applyFilter(intPropertyFilter, &c, inclusions, ec); + } else { +- // This code used to always call getInclusions(property source) +- // which sets an error for an unsupported property. + ec = U_ILLEGAL_ARGUMENT_ERROR; +- // Otherwise we would just clear() this set because +- // getIntPropertyValue(c, prop) returns 0 for all code points. + } + return *this; + } +diff --git a/source/common/uprops.h b/source/common/uprops.h +index 1a8e4e84..34b3600b 100644 +--- a/source/common/uprops.h ++++ b/source/common/uprops.h +@@ -462,7 +462,6 @@ class UnicodeSet; + class CharacterProperties { + public: + CharacterProperties() = delete; +- static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); + static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); + }; + +diff --git a/source/common/uset.cpp b/source/common/uset.cpp +index 39ad0a34..eae7981d 100644 +--- a/source/common/uset.cpp ++++ b/source/common/uset.cpp +@@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ { + public: + /* Try to have the compiler inline these*/ + inline static int32_t getStringCount(const UnicodeSet& set) { +- return set.getStringCount(); ++ return set.stringsSize(); + } + inline static const UnicodeString* getString(const UnicodeSet& set, + int32_t i) { +diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp +index 93048ba2..79151690 100644 +--- a/source/common/usetiter.cpp ++++ b/source/common/usetiter.cpp +@@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() { + stringCount = 0; + } else { + endRange = set->getRangeCount() - 1; +- stringCount = set->strings->size(); ++ stringCount = set->stringsSize(); + } + range = 0; + endElement = -1; diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp index 3aff85b3f1..b416ef5227 100644 --- a/source/common/characterproperties.cpp +++ b/source/common/characterproperties.cpp @@ -23,6 +23,9 @@ #include "umutex.h" #include "uprops.h" +using icu::LocalPointer; +using icu::Normalizer2Factory; +using icu::Normalizer2Impl; using icu::UInitOnce; using icu::UnicodeSet; @@ -30,11 +33,13 @@ namespace { UBool U_CALLCONV characterproperties_cleanup(); +constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; + struct Inclusion { UnicodeSet *fSet; UInitOnce fInitOnce; }; -Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() +Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; @@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() { return TRUE; } -} // namespace - -U_NAMESPACE_BEGIN - -/* -Reduce excessive reallocation, and make it easier to detect initialization problems. -Usually you don't see smaller sets than this for Unicode 5.0. -*/ -constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; - -void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { +void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { // This function is invoked only via umtx_initOnce(). - // This function is a friend of class UnicodeSet. - U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); if (src == UPROPS_SRC_NONE) { errorCode = U_INTERNAL_PROGRAM_ERROR; return; } - UnicodeSet * &incl = gInclusions[src].fSet; - U_ASSERT(incl == nullptr); + U_ASSERT(gInclusions[src].fSet == nullptr); - incl = new UnicodeSet(); - if (incl == nullptr) { + LocalPointer incl(new UnicodeSet()); + if (incl.isNull()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } USetAdder sa = { - (USet *)incl, + (USet *)incl.getAlias(), _set_add, _set_addRange, _set_addString, @@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo nullptr // don't need removeRange() }; - incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); switch(src) { case UPROPS_SRC_CHAR: uchar_addPropertyStarts(&sa, &errorCode); @@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo } if (U_FAILURE(errorCode)) { - delete incl; - incl = nullptr; return; } - // Compact for caching + if (incl->isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Compact for caching. incl->compact(); + gInclusions[src].fSet = incl.orphan(); ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); } @@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC return nullptr; } Inclusion &i = gInclusions[src]; - umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); + umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); return i.fSet; } +void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). + U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; + U_ASSERT(gInclusions[inclIndex].fSet == nullptr); + UPropertySource src = uprops_getSource(prop); + const UnicodeSet *incl = getInclusionsForSource(src, errorCode); + if (U_FAILURE(errorCode)) { + return; + } + + LocalPointer intPropIncl(new UnicodeSet(0, 0)); + if (intPropIncl.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + int32_t numRanges = incl->getRangeCount(); + int32_t prevValue = 0; + for (int32_t i = 0; i < numRanges; ++i) { + UChar32 rangeEnd = incl->getRangeEnd(i); + for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. + int32_t value = u_getIntPropertyValue(c, prop); + if (value != prevValue) { + intPropIncl->add(c); + prevValue = value; + } + } + } + + if (intPropIncl->isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + // Compact for caching. + intPropIncl->compact(); + gInclusions[inclIndex].fSet = intPropIncl.orphan(); + ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); +} + +} // namespace + +U_NAMESPACE_BEGIN + const UnicodeSet *CharacterProperties::getInclusionsForProperty( UProperty prop, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - UPropertySource src = uprops_getSource(prop); - return getInclusionsForSource(src, errorCode); + if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { + int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; + Inclusion &i = gInclusions[inclIndex]; + umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); + return i.fSet; + } else { + UPropertySource src = uprops_getSource(prop); + return getInclusionsForSource(src, errorCode); + } } U_NAMESPACE_END @@ -216,7 +261,7 @@ namespace { UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } - icu::LocalPointer set(new UnicodeSet()); + LocalPointer set(new UnicodeSet()); if (set.isNull()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return nullptr; diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp index 13496ad56c..b72e318387 100644 --- a/source/common/ucptrie.cpp +++ b/source/common/ucptrie.cpp @@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start, int32_t prevI3Block = -1; int32_t prevBlock = -1; UChar32 c = start; - uint32_t value; + uint32_t trieValue, value; bool haveValue = false; do { int32_t i3Block; @@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start, return c - 1; } } else { + trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; @@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start, return c - 1; } } else { + trieValue = trie->nullValue; value = nullValue; if (pValue != nullptr) { *pValue = nullValue; } haveValue = true; @@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start, c = (c + dataBlockLength) & ~dataMask; } else { int32_t di = block + (c & dataMask); - uint32_t value2 = getValue(trie->data, valueWidth, di); - value2 = maybeFilterValue(value2, trie->nullValue, nullValue, - filter, context); + uint32_t trieValue2 = getValue(trie->data, valueWidth, di); if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & dataMask) != 0) { - if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), - trie->nullValue, nullValue, - filter, context) != value) { - return c - 1; + trieValue2 = getValue(trie->data, valueWidth, ++di); + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, trie->nullValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } } diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp index 44af83093d..926be46834 100644 --- a/source/common/umutablecptrie.cpp +++ b/source/common/umutablecptrie.cpp @@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange( uint32_t nullValue = initialValue; if (filter != nullptr) { nullValue = filter(context, nullValue); } UChar32 c = start; - uint32_t value; + uint32_t trieValue, value; bool haveValue = false; int32_t i = c >> UCPTRIE_SHIFT_3; do { if (flags[i] == ALL_SAME) { - uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, - filter, context); + uint32_t trieValue2 = index[i]; if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; } else /* MIXED */ { int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); - uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, - filter, context); + uint32_t trieValue2 = data[di]; if (haveValue) { - if (value2 != value) { - return c - 1; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } + trieValue = trieValue2; // may or may not help } } else { - value = value2; + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); if (pValue != nullptr) { *pValue = value; } haveValue = true; } while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { - if (maybeFilterValue(data[++di], initialValue, nullValue, - filter, context) != value) { - return c - 1; + trieValue2 = data[++di]; + if (trieValue2 != trieValue) { + if (filter == nullptr || + maybeFilterValue(trieValue2, initialValue, nullValue, + filter, context) != value) { + return c - 1; + } } + trieValue = trieValue2; // may or may not help } } ++i; diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h index 0abc754240..af56b87227 100644 --- a/source/common/unicode/uniset.h +++ b/source/common/unicode/uniset.h @@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN // Forward Declarations. class BMPSet; -class CharacterProperties; class ParsePosition; class RBBIRuleScanner; class SymbolTable; @@ -276,14 +275,23 @@ class RuleCharacterIterator; * @stable ICU 2.0 */ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { +private: + /** + * Enough for sets with few ranges. + * For example, White_Space has 10 ranges, list length 21. + */ + static constexpr int32_t INITIAL_CAPACITY = 25; + // fFlags constant + static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) + + UChar32* list = stackList; // MUST be terminated with HIGH + int32_t capacity = INITIAL_CAPACITY; // capacity of list + int32_t len = 1; // length of list used; 1 <= len <= capacity + uint8_t fFlags = 0; // Bit flag (see constants above) - int32_t len; // length of list used; 0 <= len <= capacity - int32_t capacity; // capacity of list - UChar32* list; // MUST be terminated with HIGH - BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. - UChar32* buffer; // internal buffer, may be NULL - int32_t bufferCapacity; // capacity of buffer - int32_t patLen; + BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. + UChar32* buffer = nullptr; // internal buffer, may be NULL + int32_t bufferCapacity = 0; // capacity of buffer /** * The pattern representation of this set. This may not be the @@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - char16_t *pat; - UVector* strings; // maintained in sorted order - UnicodeSetStringSpan *stringSpan; + char16_t *pat = nullptr; + int32_t patLen = 0; + + UVector* strings = nullptr; // maintained in sorted order + UnicodeSetStringSpan *stringSpan = nullptr; + + /** + * Initial list array. + * Avoids some heap allocations, and list is never nullptr. + * Increases the object size a bit. + */ + UChar32 stackList[INITIAL_CAPACITY]; -private: - enum { // constants - kIsBogus = 1 // This set is bogus (i.e. not valid) - }; - uint8_t fFlags; // Bit flag (see constants above) public: /** * Determine if this object contains a valid set. @@ -1480,8 +1492,6 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { friend class USetAccess; - int32_t getStringCount() const; - const UnicodeString* getString(int32_t index) const; //---------------------------------------------------------------- @@ -1528,13 +1538,18 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { // Implementation: Utility methods //---------------------------------------------------------------- - void ensureCapacity(int32_t newLen, UErrorCode& ec); + static int32_t nextCapacity(int32_t minCapacity); + + bool ensureCapacity(int32_t newLen); - void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); + bool ensureBufferCapacity(int32_t newLen); void swapBuffers(void); UBool allocateStrings(UErrorCode &status); + UBool hasStrings() const; + int32_t stringsSize() const; + UBool stringsContains(const UnicodeString &s) const; UnicodeString& _toPattern(UnicodeString& result, UBool escapeUnprintable) const; @@ -1614,7 +1629,6 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { UnicodeString& rebuiltPat, UErrorCode& ec); - friend class CharacterProperties; static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); /** @@ -1646,7 +1660,10 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { /** * Set the new pattern to cache. */ - void setPattern(const UnicodeString& newPat); + void setPattern(const UnicodeString& newPat) { + setPattern(newPat.getBuffer(), newPat.length()); + } + void setPattern(const char16_t *newPat, int32_t newPatLen); /** * Release existing cached pattern. */ diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp index e8378e0a22..2024277616 100644 --- a/source/common/uniset.cpp +++ b/source/common/uniset.cpp @@ -14,6 +14,7 @@ #include "unicode/parsepos.h" #include "unicode/symtable.h" #include "unicode/uniset.h" +#include "unicode/ustring.h" #include "unicode/utf8.h" #include "unicode/utf16.h" #include "ruleiter.h" @@ -53,11 +54,8 @@ // LOW <= all valid values. ZERO for codepoints #define UNICODESET_LOW 0x000000 -// initial storage. Must be >= 0 -#define START_EXTRA 16 - -// extra amount for growth. Must be >= 0 -#define GROW_EXTRA START_EXTRA +/** Max list [0, 1, 2, ..., max code point, HIGH] */ +constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; U_NAMESPACE_BEGIN @@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { return a.compare(b); } +UBool UnicodeSet::hasStrings() const { + return strings != nullptr && !strings->isEmpty(); +} + +int32_t UnicodeSet::stringsSize() const { + return strings == nullptr ? 0 : strings->size(); +} + +UBool UnicodeSet::stringsContains(const UnicodeString &s) const { + return strings != nullptr && strings->contains((void*) &s); +} + //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- @@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet() { + list[0] = UNICODESET_HIGH; _dbgct(this); } @@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() : * @param start first character, inclusive, of range * @param end last character, inclusive, of range */ -UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : - len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - list[0] = UNICODESET_HIGH; - complement(start, end); - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { + list[0] = UNICODESET_HIGH; + add(start, end); _dbgct(this); } /** * Constructs a set that is identical to the given UnicodeSet. */ -UnicodeSet::UnicodeSet(const UnicodeSet& o) : - UnicodeFilter(o), - len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ - *this = o; - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; - } +UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { + *this = o; _dbgct(this); } // Copy-construct as thawed. -UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : - UnicodeFilter(o), - len(0), capacity(o.len + GROW_EXTRA), list(0), - bmpSet(0), - buffer(0), bufferCapacity(0), - patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - UErrorCode status = U_ZERO_ERROR; - allocateStrings(status); - if (U_FAILURE(status)) { - setToBogus(); // If memory allocation failed, set to bogus state. - return; - } - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(list!=NULL){ +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { + if (ensureCapacity(o.len)) { // *this = o except for bmpSet and stringSpan len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, status); - } else { // Invalid strings. - setToBogus(); - return; + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if (!allocateStrings(status) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return; + } } if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } - } else { // If memory allocation failed, set to bogus state. - setToBogus(); - return; + _dbgct(this); } - _dbgct(this); } /** @@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : */ UnicodeSet::~UnicodeSet() { _dbgdt(this); // first! - uprv_free(list); + if (list != stackList) { + uprv_free(list); + } delete bmpSet; - if (buffer) { + if (buffer != stackList) { uprv_free(buffer); } delete strings; @@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { setToBogus(); return *this; } - UErrorCode ec = U_ZERO_ERROR; - ensureCapacity(o.len, ec); - if (U_FAILURE(ec)) { + if (!ensureCapacity(o.len)) { // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. return *this; } len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (o.bmpSet == NULL || asThawed) { - bmpSet = NULL; - } else { + if (o.bmpSet != nullptr && !asThawed) { bmpSet = new BMPSet(*o.bmpSet, list, len); if (bmpSet == NULL) { // Check for memory allocation error. setToBogus(); return *this; } } - if (strings != NULL && o.strings != NULL) { - strings->assign(*o.strings, cloneUnicodeString, ec); - } else { // Invalid strings. - setToBogus(); - return *this; + if (o.hasStrings()) { + UErrorCode status = U_ZERO_ERROR; + if ((strings == nullptr && !allocateStrings(status)) || + (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { + setToBogus(); + return *this; + } + } else if (hasStrings()) { + strings->removeAllElements(); } - if (o.stringSpan == NULL || asThawed) { - stringSpan = NULL; - } else { + if (o.stringSpan != nullptr && !asThawed) { stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); if (stringSpan == NULL) { // Check for memory allocation error. setToBogus(); @@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { } releasePattern(); if (o.pat) { - setPattern(UnicodeString(o.pat, o.patLen)); + setPattern(o.pat, o.patLen); } return *this; } @@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { for (int32_t i = 0; i < len; ++i) { if (list[i] != o.list[i]) return FALSE; } - if (*strings != *o.strings) return FALSE; + if (hasStrings() != o.hasStrings()) { return FALSE; } + if (hasStrings() && *strings != *o.strings) return FALSE; return TRUE; } @@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const { for (int32_t i = 0; i < count; ++i) { n += getRangeEnd(i) - getRangeStart(i) + 1; } - return n + strings->size(); + return n + stringsSize(); } /** @@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const { * @return true if this set contains no elements. */ UBool UnicodeSet::isEmpty(void) const { - return len == 1 && strings->size() == 0; + return len == 1 && !hasStrings(); } /** @@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { if (s.length() == 0) return FALSE; int32_t cp = getSingleCP(s); if (cp < 0) { - return strings->contains((void*) &s); + return stringsContains(s); } else { return contains((UChar32) cp); } @@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsAll(*c.strings)) return FALSE; - return TRUE; + return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); } /** @@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { return FALSE; } } - if (!strings->containsNone(*c.strings)) return FALSE; - return TRUE; + return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); } /** @@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { return TRUE; } } - if (strings->size() != 0) { + if (hasStrings()) { for (i=0; isize(); ++i) { const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); //if (s.length() == 0) { @@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, return U_MISMATCH; } } else { - if (strings->size() != 0) { // try strings first + if (hasStrings()) { // try strings first // might separate forward and backward loops later // for now they are combined @@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { */ UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { if (pinCodePoint(start) < pinCodePoint(end)) { - UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; + UChar32 limit = end + 1; + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if ((len & 1) != 0) { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; + if (lastLimit <= start && !isFrozen() && !isBogus()) { + if (lastLimit == start) { + // Extend the last range. + list[len - 2] = limit; + if (limit == UNICODESET_HIGH) { + --len; + } + } else { + list[len - 1] = start; + if (limit < UNICODESET_HIGH) { + if (ensureCapacity(len + 2)) { + list[len++] = limit; + list[len++] = UNICODESET_HIGH; + } + } else { // limit == UNICODESET_HIGH + if (ensureCapacity(len + 1)) { + list[len++] = UNICODESET_HIGH; + } + } + } + releasePattern(); + return *this; + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. + UChar32 range[3] = { start, limit, UNICODESET_HIGH }; add(range, 2, 0); } else if (start == end) { add(start); @@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { list[i] = c; // if we touched the HIGH mark, then add a new one if (c == (UNICODESET_HIGH - 1)) { - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+1, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+1)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } @@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { // ^ // list[i] - UErrorCode status = U_ZERO_ERROR; - ensureCapacity(len+2, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+2)) { // ensureCapacity will mark the object as Bogus if OOM failure happens. return *this; } - //for (int32_t k=len-1; k>=i; --k) { - // list[k+2] = list[k]; - //} - UChar32* src = list + len; - UChar32* dst = src + 2; - UChar32* srclimit = list + i; - while (src > srclimit) *(--dst) = *(--src); - + UChar32 *p = list + i; + uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); list[i] = c; list[i+1] = c+1; len += 2; @@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (!strings->contains((void*) &s)) { + if (!stringsContains(s)) { _add(s); releasePattern(); } @@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) { if (isFrozen() || isBogus()) { return; } + UErrorCode ec = U_ZERO_ERROR; + if (strings == nullptr && !allocateStrings(ec)) { + setToBogus(); + return; + } UnicodeString* t = new UnicodeString(s); if (t == NULL) { // Check for memory allocation error. setToBogus(); return; } - UErrorCode ec = U_ZERO_ERROR; strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); @@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { } UnicodeSet& UnicodeSet::removeAllStrings() { - strings->removeAllElements(); + if (!isFrozen() && hasStrings()) { + strings->removeAllElements(); + releasePattern(); + } return *this; } @@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - strings->removeElement((void*) &s); - releasePattern(); + if (strings != nullptr && strings->removeElement((void*) &s)) { + releasePattern(); + } } else { remove((UChar32)cp, (UChar32)cp); } @@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) { if (isFrozen() || isBogus()) { return *this; } - UErrorCode status = U_ZERO_ERROR; if (list[0] == UNICODESET_LOW) { - ensureBufferCapacity(len-1, status); - if (U_FAILURE(status)) { - return *this; - } - uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); + uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); --len; } else { - ensureBufferCapacity(len+1, status); - if (U_FAILURE(status)) { + if (!ensureCapacity(len+1)) { return *this; } - uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); - buffer[0] = UNICODESET_LOW; + uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); + list[0] = UNICODESET_LOW; ++len; } - swapBuffers(); releasePattern(); return *this; } @@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { if (s.length() == 0 || isFrozen() || isBogus()) return *this; int32_t cp = getSingleCP(s); if (cp < 0) { - if (strings->contains((void*) &s)) { + if (stringsContains(s)) { strings->removeElement((void*) &s); } else { _add(s); @@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { if ( c.strings!=NULL ) { for (int32_t i=0; isize(); ++i) { const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); - if (!strings->contains((void*) s)) { + if (!stringsContains(*s)) { _add(*s); } } @@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { return *this; } retain(c.list, c.len, 0); - strings->retainAll(*c.strings); + if (hasStrings()) { + if (!c.hasStrings()) { + strings->removeAllElements(); + } else { + strings->retainAll(*c.strings); + } + } return *this; } @@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { return *this; } retain(c.list, c.len, 2); - strings->removeAll(*c.strings); + if (hasStrings() && c.hasStrings()) { + strings->removeAll(*c.strings); + } return *this; } @@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { } exclusiveOr(c.list, c.len, 0); - for (int32_t i=0; isize(); ++i) { - void* e = c.strings->elementAt(i); - if (!strings->removeElement(e)) { - _add(*(const UnicodeString*)e); + if (c.strings != nullptr) { + for (int32_t i=0; isize(); ++i) { + void* e = c.strings->elementAt(i); + if (strings == nullptr || !strings->removeElement(e)) { + _add(*(const UnicodeString*)e); + } } } return *this; @@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) { if (isFrozen()) { return *this; } - if (list != NULL) { - list[0] = UNICODESET_HIGH; - } + list[0] = UNICODESET_HIGH; len = 1; releasePattern(); if (strings != NULL) { strings->removeAllElements(); } - if (list != NULL && strings != NULL) { - // Remove bogus - fFlags = 0; - } + // Remove bogus + fFlags = 0; return *this; } @@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { return list[index*2 + 1] - 1; } -int32_t UnicodeSet::getStringCount() const { - return strings->size(); -} - const UnicodeString* UnicodeSet::getString(int32_t index) const { return (const UnicodeString*) strings->elementAt(index); } @@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() { return *this; } // Delete buffer first to defragment memory less. - if (buffer != NULL) { + if (buffer != stackList) { uprv_free(buffer); buffer = NULL; - } - if (len < capacity) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - int32_t newCapacity = len + (len == 0); - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); + bufferCapacity = 0; + } + if (list == stackList) { + // pass + } else if (len <= INITIAL_CAPACITY) { + uprv_memcpy(stackList, list, len * sizeof(UChar32)); + uprv_free(list); + list = stackList; + capacity = INITIAL_CAPACITY; + } else if ((len + 7) < capacity) { + // If we have more than a little unused capacity, shrink it to len. + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); if (temp) { list = temp; - capacity = newCapacity; + capacity = len; } // else what the heck happened?! We allocated less memory! // Oh well. We'll keep our original array. } + if (strings != nullptr && strings->isEmpty()) { + delete strings; + strings = nullptr; + } return *this; } @@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() { /** * Deserialize constructor. */ -UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) - : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) { +UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, + UErrorCode &ec) { if(U_FAILURE(ec)) { setToBogus(); @@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se return; } - allocateStrings(ec); - if (U_FAILURE(ec)) { - setToBogus(); - return; - } - // bmp? int32_t headerSize = ((data[0]&0x8000)) ?2:1; int32_t bmpLength = (headerSize==1)?data[0]:data[1]; - len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; + int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; #ifdef DEBUG_SERIALIZE - printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); + printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); #endif - capacity = len+1; - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - if(!list || U_FAILURE(ec)) { - setToBogus(); + if(!ensureCapacity(newLength + 1)) { // +1 for HIGH return; } // copy bmp @@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se #endif } // copy smp - for(i=bmpLength;i MAX_LENGTH) { + newCapacity = MAX_LENGTH; + } + return newCapacity; + } +} + +bool UnicodeSet::ensureCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } if (newLen <= capacity) { - return; + return true; } - UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); // set the object to bogus state if an OOM failure occurred. - return; + return false; + } + // Copy only the actual contents. + uprv_memcpy(temp, list, len * sizeof(UChar32)); + if (list != stackList) { + uprv_free(list); } list = temp; - capacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + capacity = newCapacity; + return true; } -void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { - if (buffer != NULL && newLen <= bufferCapacity) - return; - UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); +bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { + if (newLen > MAX_LENGTH) { + newLen = MAX_LENGTH; + } + if (newLen <= bufferCapacity) { + return true; + } + int32_t newCapacity = nextCapacity(newLen); + UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); if (temp == NULL) { - ec = U_MEMORY_ALLOCATION_ERROR; setToBogus(); - return; + return false; + } + // The buffer has no contents to be copied. + // It is always filled from scratch after this call. + if (buffer != stackList) { + uprv_free(buffer); } buffer = temp; - bufferCapacity = newLen + GROW_EXTRA; - // else we keep the original contents on the memory failure. + bufferCapacity = newCapacity; + return true; } /** @@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { if (isFrozen() || isBogus() || other==NULL) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) if (isFrozen() || isBogus()) { return; } - UErrorCode status = U_ZERO_ERROR; - ensureBufferCapacity(len + otherLen, status); - if (U_FAILURE(status)) { + if (!ensureBufferCapacity(len + otherLen)) { return; } @@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, } } - for (int32_t i = 0; isize(); ++i) { - result.append(OPEN_BRACE); - _appendToPat(result, - *(const UnicodeString*) strings->elementAt(i), - escapeUnprintable); - result.append(CLOSE_BRACE); + if (strings != nullptr) { + for (int32_t i = 0; isize(); ++i) { + result.append(OPEN_BRACE); + _appendToPat(result, + *(const UnicodeString*) strings->elementAt(i), + escapeUnprintable); + result.append(CLOSE_BRACE); + } } return result.append(SET_CLOSE); } @@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() { /** * Set the new pattern to cache. */ -void UnicodeSet::setPattern(const UnicodeString& newPat) { +void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { releasePattern(); - int32_t newPatLen = newPat.length(); pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); if (pat) { patLen = newPatLen; - newPat.extractBetween(0, patLen, pat); + u_memcpy(pat, newPat, patLen); pat[patLen] = 0; } // else we don't care if malloc failed. This was just a nice cache. @@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) { UnicodeFunctor *UnicodeSet::freeze() { if(!isFrozen() && !isBogus()) { - // Do most of what compact() does before freezing because - // compact() will not work when the set is frozen. - // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). - - // Delete buffer first to defragment memory less. - if (buffer != NULL) { - uprv_free(buffer); - buffer = NULL; - } - if (capacity > (len + GROW_EXTRA)) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - capacity = len + (len == 0); - list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); - if (list == NULL) { // Check for memory allocation error. - setToBogus(); - return this; - } - } + compact(); // Optimize contains() and span() and similar functions. - if (!strings->isEmpty()) { + if (hasStrings()) { stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); - if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { + if (stringSpan == nullptr) { + setToBogus(); + return this; + } else if (!stringSpan->needsStringSpanUTF16()) { // All strings are irrelevant for span() etc. because // all of each string's code points are contained in this set. // Do not check needsStringSpanUTF8() because UTF-8 has at most as @@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC } if(stringSpan!=NULL) { return stringSpan->span(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF16_CONTAINED; @@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s } if(stringSpan!=NULL) { return stringSpan->spanBack(s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF16_CONTAINED; @@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp } if(stringSpan!=NULL) { return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::FWD_UTF8_CONTAINED; @@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio } if(stringSpan!=NULL) { return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); - } else if(!strings->isEmpty()) { + } else if(hasStrings()) { uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : UnicodeSetStringSpan::BACK_UTF8_CONTAINED; diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp index 0b7da79682..882231ba1a 100644 --- a/source/common/uniset_closure.cpp +++ b/source/common/uniset_closure.cpp @@ -31,10 +31,6 @@ #include "util.h" #include "uvector.h" -// initial storage. Must be >= 0 -// *** same as in uniset.cpp ! *** -#define START_EXTRA 16 - U_NAMESPACE_BEGIN // TODO memory debugging provided inside uniset.cpp @@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN UnicodeSet::UnicodeSet(const UnicodeString& pattern, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, options, symbols, status); - } - } + UErrorCode& status) { + applyPattern(pattern, options, symbols, status); _dbgct(this); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, uint32_t options, const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, pos, options, symbols, status); - } - } + UErrorCode& status) { + applyPattern(pattern, pos, options, symbols, status); _dbgct(this); } @@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { // start with input set to guarantee inclusion // USET_CASE: remove strings because the strings will actually be reduced (folded); // therefore, start with no strings and add only those needed - if (attribute & USET_CASE_INSENSITIVE) { + if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { foldSet.strings->removeAllElements(); } @@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { } } } - if (strings != NULL && strings->size() > 0) { + if (hasStrings()) { if (attribute & USET_CASE_INSENSITIVE) { for (int32_t j=0; jsize(); ++j) { str = *(const UnicodeString *) strings->elementAt(j); diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp index 6cfd80a705..e98c175f51 100644 --- a/source/common/uniset_props.cpp +++ b/source/common/uniset_props.cpp @@ -47,10 +47,6 @@ U_NAMESPACE_USE -// initial storage. Must be >= 0 -// *** same as in uniset.cpp ! *** -#define START_EXTRA 16 - // Define UChar constants using hex for EBCDIC compatibility // Used #define to reduce private static exports and memory access time. #define SET_OPEN ((UChar)0x005B) /*[*/ @@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { * @param pattern a string specifying what characters are in the set */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, status); - } - } + UErrorCode& status) { + applyPattern(pattern, status); _dbgct(this); } @@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) { return u_getNumericValue(ch) == *(double*)context; } +static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { + int32_t value = *(int32_t*)context; + return (U_GET_GC_MASK((UChar32) ch) & value) != 0; +} + static UBool versionFilter(UChar32 ch, void* context) { static const UVersionInfo none = { 0, 0, 0, 0 }; UVersionInfo v; @@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) { return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; } +typedef struct { + UProperty prop; + int32_t value; +} IntPropertyContext; + +static UBool intPropertyFilter(UChar32 ch, void* context) { + IntPropertyContext* c = (IntPropertyContext*)context; + return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; +} + static UBool scriptExtensionsFilter(UChar32 ch, void* context) { return uscript_hasScript(ch, *(UScriptCode*)context); } @@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, namespace { -/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ -uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { - uint32_t mask = *(const uint32_t *)context; - value = U_MASK(value) & mask; - if (value != 0) { value = 1; } - return value; -} - -/** Maps one map value to 1, all others to 0. */ -uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { - uint32_t v = *(const uint32_t *)context; - return value == v ? 1 : 0; -} - -} // namespace - -void UnicodeSet::applyIntPropertyValue(const UCPMap *map, - UCPMapValueFilter *filter, const void *context, - UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return; } - clear(); - UChar32 start = 0, end; - uint32_t value; - while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, - filter, context, &value)) >= 0) { - if (value != 0) { - add(start, end); - } - start = end + 1; - } - if (isBogus()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - } -} - -namespace { - static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { /* Note: we use ' ' in compiler code page */ int32_t j = 0; @@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { UnicodeSet& UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { - if (U_FAILURE(ec)) { return *this; } - // All of the following check isFrozen() before modifying this set. + if (U_FAILURE(ec) || isFrozen()) { return *this; } if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); - applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); + applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); UScriptCode script = (UScriptCode)value; @@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) clear(); } } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { - const UCPMap *map = u_getIntPropertyMap(prop, &ec); - applyIntPropertyValue(map, intValueFilter, &value, ec); + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); + IntPropertyContext c = {prop, value}; + applyFilter(intPropertyFilter, &c, inclusions, ec); } else { - // This code used to always call getInclusions(property source) - // which sets an error for an unsupported property. ec = U_ILLEGAL_ARGUMENT_ERROR; - // Otherwise we would just clear() this set because - // getIntPropertyValue(c, prop) returns 0 for all code points. } return *this; } diff --git a/source/common/uprops.h b/source/common/uprops.h index 1a8e4e84f7..34b3600b6c 100644 --- a/source/common/uprops.h +++ b/source/common/uprops.h @@ -462,7 +462,6 @@ class UnicodeSet; class CharacterProperties { public: CharacterProperties() = delete; - static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); }; diff --git a/source/common/uset.cpp b/source/common/uset.cpp index 39ad0a340c..eae7981d52 100644 --- a/source/common/uset.cpp +++ b/source/common/uset.cpp @@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ { public: /* Try to have the compiler inline these*/ inline static int32_t getStringCount(const UnicodeSet& set) { - return set.getStringCount(); + return set.stringsSize(); } inline static const UnicodeString* getString(const UnicodeSet& set, int32_t i) { diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp index 93048ba2a0..7915169049 100644 --- a/source/common/usetiter.cpp +++ b/source/common/usetiter.cpp @@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() { stringCount = 0; } else { endRange = set->getRangeCount() - 1; - stringCount = set->strings->size(); + stringCount = set->stringsSize(); } range = 0; endElement = -1;