diff options
Diffstat (limited to 'intl/icu/source/i18n/strmatch.cpp')
-rw-r--r-- | intl/icu/source/i18n/strmatch.cpp | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/strmatch.cpp b/intl/icu/source/i18n/strmatch.cpp new file mode 100644 index 000000000..a04f57f18 --- /dev/null +++ b/intl/icu/source/i18n/strmatch.cpp @@ -0,0 +1,296 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (c) 2001-2012, International Business Machines Corporation +* and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 07/23/01 aliu Creation. +********************************************************************** +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION + +#include "strmatch.h" +#include "rbt_data.h" +#include "util.h" +#include "unicode/uniset.h" +#include "unicode/utf16.h" + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) + +StringMatcher::StringMatcher(const UnicodeString& theString, + int32_t start, + int32_t limit, + int32_t segmentNum, + const TransliterationRuleData& theData) : + data(&theData), + segmentNumber(segmentNum), + matchStart(-1), + matchLimit(-1) +{ + theString.extractBetween(start, limit, pattern); +} + +StringMatcher::StringMatcher(const StringMatcher& o) : + UnicodeFunctor(o), + UnicodeMatcher(o), + UnicodeReplacer(o), + pattern(o.pattern), + data(o.data), + segmentNumber(o.segmentNumber), + matchStart(o.matchStart), + matchLimit(o.matchLimit) +{ +} + +/** + * Destructor + */ +StringMatcher::~StringMatcher() { +} + +/** + * Implement UnicodeFunctor + */ +UnicodeFunctor* StringMatcher::clone() const { + return new StringMatcher(*this); +} + +/** + * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer + * and return the pointer. + */ +UnicodeMatcher* StringMatcher::toMatcher() const { + StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); + UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this); + + return nonconst_base; +} + +/** + * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer + * and return the pointer. + */ +UnicodeReplacer* StringMatcher::toReplacer() const { + StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); + UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this); + + return nonconst_base; +} + +/** + * Implement UnicodeMatcher + */ +UMatchDegree StringMatcher::matches(const Replaceable& text, + int32_t& offset, + int32_t limit, + UBool incremental) { + int32_t i; + int32_t cursor = offset; + if (limit < cursor) { + // Match in the reverse direction + for (i=pattern.length()-1; i>=0; --i) { + UChar keyChar = pattern.charAt(i); + UnicodeMatcher* subm = data->lookupMatcher(keyChar); + if (subm == 0) { + if (cursor > limit && + keyChar == text.charAt(cursor)) { + --cursor; + } else { + return U_MISMATCH; + } + } else { + UMatchDegree m = + subm->matches(text, cursor, limit, incremental); + if (m != U_MATCH) { + return m; + } + } + } + // Record the match position, but adjust for a normal + // forward start, limit, and only if a prior match does not + // exist -- we want the rightmost match. + if (matchStart < 0) { + matchStart = cursor+1; + matchLimit = offset+1; + } + } else { + for (i=0; i<pattern.length(); ++i) { + if (incremental && cursor == limit) { + // We've reached the context limit without a mismatch and + // without completing our match. + return U_PARTIAL_MATCH; + } + UChar keyChar = pattern.charAt(i); + UnicodeMatcher* subm = data->lookupMatcher(keyChar); + if (subm == 0) { + // Don't need the cursor < limit check if + // incremental is TRUE (because it's done above); do need + // it otherwise. + if (cursor < limit && + keyChar == text.charAt(cursor)) { + ++cursor; + } else { + return U_MISMATCH; + } + } else { + UMatchDegree m = + subm->matches(text, cursor, limit, incremental); + if (m != U_MATCH) { + return m; + } + } + } + // Record the match position + matchStart = offset; + matchLimit = cursor; + } + + offset = cursor; + return U_MATCH; +} + +/** + * Implement UnicodeMatcher + */ +UnicodeString& StringMatcher::toPattern(UnicodeString& result, + UBool escapeUnprintable) const +{ + result.truncate(0); + UnicodeString str, quoteBuf; + if (segmentNumber > 0) { + result.append((UChar)40); /*(*/ + } + for (int32_t i=0; i<pattern.length(); ++i) { + UChar keyChar = pattern.charAt(i); + const UnicodeMatcher* m = data->lookupMatcher(keyChar); + if (m == 0) { + ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); + } else { + ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), + TRUE, escapeUnprintable, quoteBuf); + } + } + if (segmentNumber > 0) { + result.append((UChar)41); /*)*/ + } + // Flush quoteBuf out to result + ICU_Utility::appendToRule(result, -1, + TRUE, escapeUnprintable, quoteBuf); + return result; +} + +/** + * Implement UnicodeMatcher + */ +UBool StringMatcher::matchesIndexValue(uint8_t v) const { + if (pattern.length() == 0) { + return TRUE; + } + UChar32 c = pattern.char32At(0); + const UnicodeMatcher *m = data->lookupMatcher(c); + return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); +} + +/** + * Implement UnicodeMatcher + */ +void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { + UChar32 ch; + for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) { + ch = pattern.char32At(i); + const UnicodeMatcher* matcher = data->lookupMatcher(ch); + if (matcher == NULL) { + toUnionTo.add(ch); + } else { + matcher->addMatchSetTo(toUnionTo); + } + } +} + +/** + * UnicodeReplacer API + */ +int32_t StringMatcher::replace(Replaceable& text, + int32_t start, + int32_t limit, + int32_t& /*cursor*/) { + + int32_t outLen = 0; + + // Copy segment with out-of-band data + int32_t dest = limit; + // If there was no match, that means that a quantifier + // matched zero-length. E.g., x (a)* y matched "xy". + if (matchStart >= 0) { + if (matchStart != matchLimit) { + text.copy(matchStart, matchLimit, dest); + outLen = matchLimit - matchStart; + } + } + + text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text + + return outLen; +} + +/** + * UnicodeReplacer API + */ +UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, + UBool /*escapeUnprintable*/) const { + // assert(segmentNumber > 0); + rule.truncate(0); + rule.append((UChar)0x0024 /*$*/); + ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); + return rule; +} + +/** + * Remove any match info. This must be called before performing a + * set of matches with this segment. + */ + void StringMatcher::resetMatch() { + matchStart = matchLimit = -1; +} + +/** + * Union the set of all characters that may output by this object + * into the given set. + * @param toUnionTo the set into which to union the output characters + */ +void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { + // The output of this replacer varies; it is the source text between + // matchStart and matchLimit. Since this varies depending on the + // input text, we can't compute it here. We can either do nothing + // or we can add ALL characters to the set. It's probably more useful + // to do nothing. +} + +/** + * Implement UnicodeFunctor + */ +void StringMatcher::setData(const TransliterationRuleData* d) { + data = d; + int32_t i = 0; + while (i<pattern.length()) { + UChar32 c = pattern.char32At(i); + UnicodeFunctor* f = data->lookup(c); + if (f != NULL) { + f->setData(data); + } + i += U16_LENGTH(c); + } +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ + +//eof |