diff options
Diffstat (limited to 'intl/icu/source/common/dictbe.h')
-rw-r--r-- | intl/icu/source/common/dictbe.h | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/intl/icu/source/common/dictbe.h b/intl/icu/source/common/dictbe.h new file mode 100644 index 000000000..9390e92b0 --- /dev/null +++ b/intl/icu/source/common/dictbe.h @@ -0,0 +1,429 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/** + ******************************************************************************* + * Copyright (C) 2006-2014, International Business Machines Corporation * + * and others. All Rights Reserved. * + ******************************************************************************* + */ + +#ifndef DICTBE_H +#define DICTBE_H + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/utext.h" + +#include "brkeng.h" + +U_NAMESPACE_BEGIN + +class DictionaryMatcher; +class Normalizer2; + +/******************************************************************* + * DictionaryBreakEngine + */ + +/** + * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a + * dictionary to determine language-specific breaks.</p> + * + * <p>After it is constructed a DictionaryBreakEngine may be shared between + * threads without synchronization.</p> + */ +class DictionaryBreakEngine : public LanguageBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fSet; + + /** + * The set of break types handled by this engine + * @internal + */ + + uint32_t fTypes; + + /** + * <p>Default constructor.</p> + * + */ + DictionaryBreakEngine(); + + public: + + /** + * <p>Constructor setting the break types handled.</p> + * + * @param breakTypes A bitmap of types handled by the engine. 
+ */ + DictionaryBreakEngine( uint32_t breakTypes ); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~DictionaryBreakEngine(); + + /** + * <p>Indicate whether this engine handles a particular character for + * a particular kind of break.</p> + * + * @param c A character which begins a run that the engine might handle + * @param breakType The type of text break which the caller wants to determine + * @return TRUE if this engine handles the particular character and break + * type. + */ + virtual UBool handles( UChar32 c, int32_t breakType ) const; + + /** + * <p>Find any breaks within a run in the supplied text.</p> + * + * @param text A UText representing the text. The iterator is left at + * the end of the run of characters which the engine is capable of handling + * that starts from the first (or last) character in the range. + * @param startPos The start of the run within the supplied text. + * @param endPos The end of the run within the supplied text. + * @param reverse Whether the caller is looking for breaks in a reverse + * direction. + * @param breakType The type of break desired, or -1. + * @param foundBreaks A UStack of the breaks found, if any + * @return The number of breaks found. + */ + virtual int32_t findBreaks( UText *text, + int32_t startPos, + int32_t endPos, + UBool reverse, + int32_t breakType, + UStack &foundBreaks ) const; + + protected: + + /** + * <p>Set the character set handled by this engine.</p> + * + * @param set A UnicodeSet of the set of characters handled by the engine + */ + virtual void setCharacters( const UnicodeSet &set ); + + /** + * <p>Set the break types handled by this engine.</p> + * + * @param breakTypes A bitmap of types handled by the engine. 
+ */ +// virtual void setBreakTypes( uint32_t breakTypes ); + + /** + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const = 0; + +}; + +/******************************************************************* + * ThaiBreakEngine + */ + +/** + * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a + * dictionary and heuristics to determine Thai-specific breaks.</p> + * + * <p>After it is constructed a ThaiBreakEngine may be shared between + * threads without synchronization.</p> + */ +class ThaiBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fThaiWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fSuffixSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + + /** + * <p>Constructor adopting a dictionary.</p> + * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + * @param status ICU error code, passed by reference. 
+ */ + ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~ThaiBreakEngine(); + + protected: + /** + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; + +/******************************************************************* + * LaoBreakEngine + */ + +/** + * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a + * dictionary and heuristics to determine Lao-specific breaks.</p> + * + * <p>After it is constructed a LaoBreakEngine may be shared between + * threads without synchronization.</p> + */ +class LaoBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fLaoWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + + /** + * <p>Constructor adopting a dictionary.</p> + * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + * @param status ICU error code, passed by reference. 
+ */ + LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~LaoBreakEngine(); + + protected: + /** + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; + +/******************************************************************* + * BurmeseBreakEngine + */ + +/** + * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a + * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> + * + * <p>After it is constructed a BurmeseBreakEngine may be shared between + * threads without synchronization.</p> + */ +class BurmeseBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fBurmeseWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + + /** + * <p>Constructor adopting a dictionary.</p> + * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + * @param status ICU error code, passed by reference. 
+ */ + BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~BurmeseBreakEngine(); + + protected: + /** + * <p>Divide up a range of known dictionary characters.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; + +/******************************************************************* + * KhmerBreakEngine + */ + +/** + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> + * + * <p>After it is constructed a KhmerBreakEngine may be shared between + * threads without synchronization.</p> + */ +class KhmerBreakEngine : public DictionaryBreakEngine { + private: + /** + * The set of characters handled by this engine + * @internal + */ + + UnicodeSet fKhmerWordSet; + UnicodeSet fEndWordSet; + UnicodeSet fBeginWordSet; + UnicodeSet fMarkSet; + DictionaryMatcher *fDictionary; + + public: + + /** + * <p>Constructor adopting a dictionary.</p> + * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. + * @param status ICU error code, passed by reference. 
+ */ + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~KhmerBreakEngine(); + + protected: + /** + * <p>Divide up a range of known dictionary characters.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; + +#if !UCONFIG_NO_NORMALIZATION + +/******************************************************************* + * CjkBreakEngine + */ + +//indicates language/script that the CjkBreakEngine will handle +enum LanguageType { + kKorean, + kChineseJapanese +}; + +/** + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a + * dictionary with costs associated with each word and + * Viterbi decoding to determine CJK-specific breaks.</p> + */ +class CjkBreakEngine : public DictionaryBreakEngine { + protected: + /** + * The set of characters handled by this engine + * @internal + */ + UnicodeSet fHangulWordSet; + UnicodeSet fHanWordSet; + UnicodeSet fKatakanaWordSet; + UnicodeSet fHiraganaWordSet; + + DictionaryMatcher *fDictionary; + const Normalizer2 *nfkcNorm2; + + public: + + /** + * <p>Constructor adopting a dictionary for the given language type.</p> + * + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the + * engine is deleted. The DictionaryMatcher must contain costs for each word + * in order for the dictionary to work properly. + * @param type The language/script (a LanguageType value) that this engine will handle. + * @param status ICU error code, passed by reference. 
+ */ + CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); + + /** + * <p>Virtual destructor.</p> + */ + virtual ~CjkBreakEngine(); + + protected: + /** + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> + * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output UStack of break positions + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; + +#endif + +U_NAMESPACE_END + + /* DICTBE_H */ +#endif |