diff options
Diffstat (limited to 'intl/icu/source/i18n/brktrans.cpp')
-rw-r--r-- | intl/icu/source/i18n/brktrans.cpp | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/brktrans.cpp b/intl/icu/source/i18n/brktrans.cpp new file mode 100644 index 000000000..714a0a872 --- /dev/null +++ b/intl/icu/source/i18n/brktrans.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2008-2015, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* Date Name Description +* 05/11/2008 Andy Heninger Port from Java +********************************************************************** +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/brkiter.h" +#include "unicode/localpointer.h" +#include "unicode/uchar.h" +#include "unicode/unifilt.h" +#include "unicode/uniset.h" + +#include "brktrans.h" +#include "cmemory.h" +#include "mutex.h" +#include "uprops.h" +#include "uinvchar.h" +#include "util.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) + +static const UChar SPACE = 32; // ' ' + + +/** + * Constructs a transliterator with the default delimiters '{' and + * '}'. + */ +BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : + Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), + cachedBI(NULL), cachedBoundaries(NULL), fInsertion(SPACE) { + } + + +/** + * Destructor. + */ +BreakTransliterator::~BreakTransliterator() { +} + +/** + * Copy constructor. + */ +BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : + Transliterator(o), cachedBI(NULL), cachedBoundaries(NULL), fInsertion(o.fInsertion) { +} + + +/** + * Transliterator API. + */ +Transliterator* BreakTransliterator::clone(void) const { + return new BreakTransliterator(*this); +} + +/** + * Implements {@link Transliterator#handleTransliterate}. + */ +void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, + UBool isIncremental ) const { + + UErrorCode status = U_ZERO_ERROR; + LocalPointer<BreakIterator> bi; + LocalPointer<UVector32> boundaries; + + { + Mutex m; + BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); + boundaries.moveFrom(nonConstThis->cachedBoundaries); + bi.moveFrom(nonConstThis->cachedBI); + } + if (bi.isNull()) { + bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); + } + if (boundaries.isNull()) { + boundaries.adoptInstead(new UVector32(status)); + } + + if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { + return; + } + + boundaries->removeAllElements(); + UnicodeString sText = replaceableAsString(text); + bi->setText(sText); + bi->preceding(offsets.start); + + // To make things much easier, we will stack the boundaries, and then insert at the end. + // generally, we won't need too many, since we will be filtered. + + int32_t boundary; + for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { + if (boundary == 0) continue; + // HACK: Check to see that preceeding item was a letter + + UChar32 cp = sText.char32At(boundary-1); + int type = u_charType(cp); + //System.out.println(Integer.toString(cp,16) + " (before): " + type); + if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; + + cp = sText.char32At(boundary); + type = u_charType(cp); + //System.out.println(Integer.toString(cp,16) + " (after): " + type); + if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; + + boundaries->addElement(boundary, status); + // printf("Boundary at %d\n", boundary); + } + + int delta = 0; + int lastBoundary = 0; + + if (boundaries->size() != 0) { // if we found something, adjust + delta = boundaries->size() * fInsertion.length(); + lastBoundary = boundaries->lastElementi(); + + // we do this from the end backwards, so that we don't have to keep updating. + + while (boundaries->size() > 0) { + boundary = boundaries->popi(); + text.handleReplaceBetween(boundary, boundary, fInsertion); + } + } + + // Now fix up the return values + offsets.contextLimit += delta; + offsets.limit += delta; + offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; + + // Return break iterator & boundaries vector to the cache. + { + Mutex m; + BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); + if (nonConstThis->cachedBI.isNull()) { + nonConstThis->cachedBI.moveFrom(bi); + } + if (nonConstThis->cachedBoundaries.isNull()) { + nonConstThis->cachedBoundaries.moveFrom(boundaries); + } + } + + // TODO: do something with U_FAILURE(status); + // (need to look at transliterators overall, not just here.) +} + +// +// getInsertion() +// +const UnicodeString &BreakTransliterator::getInsertion() const { + return fInsertion; +} + +// +// setInsertion() +// +void BreakTransliterator::setInsertion(const UnicodeString &insertion) { + this->fInsertion = insertion; +} + +// +// replaceableAsString Hack to let break iterators work +// on the replaceable text from transliterators. +// In practice, the only real Replaceable type that we +// will be seeing is UnicodeString, so this function +// will normally be efficient. +// +UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { + UnicodeString s; + UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); + if (rs != NULL) { + s = *rs; + } else { + r.extractBetween(0, r.length(), s); + } + return s; +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ |