diff options
Diffstat (limited to 'intl/icu/source/i18n/unicode/search.h')
-rw-r--r-- | intl/icu/source/i18n/unicode/search.h | 577 |
1 files changed, 577 insertions, 0 deletions
diff --git a/intl/icu/source/i18n/unicode/search.h b/intl/icu/source/i18n/unicode/search.h new file mode 100644 index 000000000..0acfcced3 --- /dev/null +++ b/intl/icu/source/i18n/unicode/search.h @@ -0,0 +1,577 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2001-2011 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 03/22/2000 helena Creation. +********************************************************************** +*/ + +#ifndef SEARCH_H +#define SEARCH_H + +#include "unicode/utypes.h" + +/** + * \file + * \brief C++ API: SearchIterator object. + */ + +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION + +#include "unicode/uobject.h" +#include "unicode/unistr.h" +#include "unicode/chariter.h" +#include "unicode/brkiter.h" +#include "unicode/usearch.h" + +/** +* @stable ICU 2.0 +*/ +struct USearch; +/** +* @stable ICU 2.0 +*/ +typedef struct USearch USearch; + +U_NAMESPACE_BEGIN + +/** + * + * <tt>SearchIterator</tt> is an abstract base class that provides + * methods to search for a pattern within a text string. Instances of + * <tt>SearchIterator</tt> maintain a current position and scan over the + * target text, returning the indices at which the pattern is matched and the length + * of each match. + * <p> + * <tt>SearchIterator</tt> defines a protocol for text searching. + * Subclasses provide concrete implementations of various search algorithms. + * For example, <tt>StringSearch</tt> implements language-sensitive pattern + * matching based on the comparison rules defined in a + * <tt>RuleBasedCollator</tt> object. + * <p> + * Other options for searching include using a BreakIterator to restrict + * the points at which matches are detected. 
+ * <p> + * <tt>SearchIterator</tt> provides an API that is similar to that of + * other text iteration classes such as <tt>BreakIterator</tt>. Using + * this class, it is easy to scan through text looking for all occurrences of + * a given pattern. The following example uses a <tt>StringSearch</tt> + * object to find all instances of "fox" in the target string. Any other + * subclass of <tt>SearchIterator</tt> can be used in an identical + * manner. + * <pre><code> + * UnicodeString target("The quick brown fox jumped over the lazy fox"); + * UnicodeString pattern("fox"); + * + * SearchIterator *iter = new StringSearch(pattern, target); + * UErrorCode error = U_ZERO_ERROR; + * for (int pos = iter->first(error); pos != USEARCH_DONE; + * pos = iter->next(error)) { + * printf("Found match at %d pos, length is %d\n", pos, + * iter->getMatchedLength()); + * } + * </code></pre> + * + * @see StringSearch + * @see RuleBasedCollator + */ +class U_I18N_API SearchIterator : public UObject { + +public: + + // public constructors and destructors ------------------------------- + + /** + * Copy constructor that creates a SearchIterator instance with the same + * behavior, and iterating over the same text. + * @param other the SearchIterator instance to be copied. + * @stable ICU 2.0 + */ + SearchIterator(const SearchIterator &other); + + /** + * Destructor. Cleans up the search iterator data struct. + * @stable ICU 2.0 + */ + virtual ~SearchIterator(); + + // public get and set methods ---------------------------------------- + + /** + * Sets the index to point to the given position, and clears any state + * that's affected. + * <p> + * This method takes the argument index and sets the position in the text + * string accordingly without checking if the index is pointing to a + * valid starting point to begin searching. + * @param position within the text to be set. 
If position is less + * than or greater than the text range for searching, + * an U_INDEX_OUTOFBOUNDS_ERROR will be returned + * @param status for errors if it occurs + * @stable ICU 2.0 + */ + virtual void setOffset(int32_t position, UErrorCode &status) = 0; + + /** + * Return the current index in the text being searched. + * If the iteration has gone past the end of the text + * (or past the beginning for a backwards search), USEARCH_DONE + * is returned. + * @return current index in the text being searched. + * @stable ICU 2.0 + */ + virtual int32_t getOffset(void) const = 0; + + /** + * Sets the text searching attributes located in the enum + * USearchAttribute with values from the enum USearchAttributeValue. + * USEARCH_DEFAULT can be used for all attributes for resetting. + * @param attribute text attribute (enum USearchAttribute) to be set + * @param value text attribute value + * @param status for errors if it occurs + * @stable ICU 2.0 + */ + void setAttribute(USearchAttribute attribute, + USearchAttributeValue value, + UErrorCode &status); + + /** + * Gets the text searching attributes + * @param attribute text attribute (enum USearchAttribute) to be retrieve + * @return text attribute value + * @stable ICU 2.0 + */ + USearchAttributeValue getAttribute(USearchAttribute attribute) const; + + /** + * Returns the index to the match in the text string that was searched. + * This call returns a valid result only after a successful call to + * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. + * Just after construction, or after a searching method returns + * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. + * <p> + * Use getMatchedLength to get the matched string length. + * @return index of a substring within the text string that is being + * searched. 
+ * @see #first + * @see #next + * @see #previous + * @see #last + * @stable ICU 2.0 + */ + int32_t getMatchedStart(void) const; + + /** + * Returns the length of text in the string which matches the search + * pattern. This call returns a valid result only after a successful call + * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. + * Just after construction, or after a searching method returns + * <tt>USEARCH_DONE</tt>, this method will return 0. + * @return The length of the match in the target text, or 0 if there + * is no match currently. + * @see #first + * @see #next + * @see #previous + * @see #last + * @stable ICU 2.0 + */ + int32_t getMatchedLength(void) const; + + /** + * Returns the text that was matched by the most recent call to + * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. + * If the iterator is not pointing at a valid match (e.g. just after + * construction or after <tt>USEARCH_DONE</tt> has been returned), + * returns an empty string. + * @param result stores the matched string or an empty string if a match + * is not found. + * @see #first + * @see #next + * @see #previous + * @see #last + * @stable ICU 2.0 + */ + void getMatchedText(UnicodeString &result) const; + + /** + * Set the BreakIterator that will be used to restrict the points + * at which matches are detected. The user is responsible for deleting + * the breakiterator. + * @param breakiter A BreakIterator that will be used to restrict the + * points at which matches are detected. If a match is + * found, but the match's start or end index is not a + * boundary as determined by the <tt>BreakIterator</tt>, + * the match will be rejected and another will be searched + * for. If this parameter is <tt>NULL</tt>, no break + * detection is attempted. 
+ * @param status for errors if it occurs + * @see BreakIterator + * @stable ICU 2.0 + */ + void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); + + /** + * Returns the BreakIterator that is used to restrict the points at + * which matches are detected. This will be the same object that was + * passed to the constructor or to <tt>setBreakIterator</tt>. + * Note that <tt>NULL</tt> is a legal value; it means that break + * detection should not be attempted. + * @return BreakIterator used to restrict matchings. + * @see #setBreakIterator + * @stable ICU 2.0 + */ + const BreakIterator * getBreakIterator(void) const; + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. The user is responsible for deleting the text. + * @param text string to be searched. + * @param status for errors. If the text length is 0, + * an U_ILLEGAL_ARGUMENT_ERROR is returned. + * @stable ICU 2.0 + */ + virtual void setText(const UnicodeString &text, UErrorCode &status); + + /** + * Set the string text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search for the same pattern within a different + * body of text. + * <p> + * Note: No parsing of the text within the <tt>CharacterIterator</tt> + * will be done during searching for this version. The block of text + * in <tt>CharacterIterator</tt> will be used as it is. + * The user is responsible for deleting the text. + * @param text string iterator to be searched. + * @param status for errors if any. If the text length is 0 then an + * U_ILLEGAL_ARGUMENT_ERROR is returned. + * @stable ICU 2.0 + */ + virtual void setText(CharacterIterator &text, UErrorCode &status); + + /** + * Return the string text to be searched. 
+ * @return text string to be searched. + * @stable ICU 2.0 + */ + const UnicodeString & getText(void) const; + + // operator overloading ---------------------------------------------- + + /** + * Equality operator. + * @param that SearchIterator instance to be compared. + * @return TRUE if both SearchIterators are of the same class, have the + * same behavior, iterate over the same text and have the same + * attributes. FALSE otherwise. + * @stable ICU 2.0 + */ + virtual UBool operator==(const SearchIterator &that) const; + + /** + * Not-equal operator. + * @param that SearchIterator instance to be compared. + * @return FALSE if operator== returns TRUE, and vice versa. + * @stable ICU 2.0 + */ + UBool operator!=(const SearchIterator &that) const; + + // public methods ---------------------------------------------------- + + /** + * Returns a copy of SearchIterator with the same behavior, and + * iterating over the same text, as this one. Note that all data will be + * replicated, except for the text string to be searched. + * @return cloned object + * @stable ICU 2.0 + */ + virtual SearchIterator* safeClone(void) const = 0; + + /** + * Returns the first index at which the string text matches the search + * pattern. The iterator is adjusted so that its current index (as + * returned by <tt>getOffset</tt>) is the match position if one + * was found. + * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and + * the iterator will be adjusted to the index USEARCH_DONE + * @param status for errors if it occurs + * @return The character index of the first match, or + * <tt>USEARCH_DONE</tt> if there are no matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t first(UErrorCode &status); + + /** + * Returns the first index equal to or greater than <tt>position</tt> at which the + * string text matches the search pattern. The iterator is adjusted so + * that its current index (as returned by <tt>getOffset</tt>) is the + * match position if one was found. 
+ * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the + * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. + * @param position where search is to start from. If position is less + * than or greater than the text range for searching, + * an U_INDEX_OUTOFBOUNDS_ERROR will be returned + * @param status for errors if it occurs + * @return The character index of the first match following + * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no + * matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t following(int32_t position, UErrorCode &status); + + /** + * Returns the last index in the target text at which it matches the + * search pattern. The iterator is adjusted so that its current index + * (as returned by <tt>getOffset</tt>) is the match position if one was + * found. + * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and + * the iterator will be adjusted to the index USEARCH_DONE. + * @param status for errors if it occurs + * @return The index of the last match, or <tt>USEARCH_DONE</tt> if + * there are no matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t last(UErrorCode &status); + + /** + * Returns the first index less than <tt>position</tt> at which the string + * text matches the search pattern. The iterator is adjusted so that its + * current index (as returned by <tt>getOffset</tt>) is the match + * position if one was found. If a match is not found, + * <tt>USEARCH_DONE</tt> will be returned and the iterator will be + * adjusted to the index USEARCH_DONE + * <p> + * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the + * result match is always less than <tt>position</tt>. + * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across + * <tt>position</tt>. + * + * @param position where search is to start from. 
If position is less + * than or greater than the text range for searching, + * an U_INDEX_OUTOFBOUNDS_ERROR will be returned + * @param status for errors if it occurs + * @return The character index of the first match preceding + * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are + * no matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t preceding(int32_t position, UErrorCode &status); + + /** + * Returns the index of the next point at which the text matches the + * search pattern, starting from the current position + * The iterator is adjusted so that its current index (as returned by + * <tt>getOffset</tt>) is the match position if one was found. + * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and + * the iterator will be adjusted to a position after the end of the text + * string. + * @param status for errors if it occurs + * @return The index of the next match after the current position, + * or <tt>USEARCH_DONE</tt> if there are no more matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t next(UErrorCode &status); + + /** + * Returns the index of the previous point at which the string text + * matches the search pattern, starting at the current position. + * The iterator is adjusted so that its current index (as returned by + * <tt>getOffset</tt>) is the match position if one was found. + * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and + * the iterator will be adjusted to the index USEARCH_DONE + * @param status for errors if it occurs + * @return The index of the previous match before the current position, + * or <tt>USEARCH_DONE</tt> if there are no more matches. + * @see #getOffset + * @stable ICU 2.0 + */ + int32_t previous(UErrorCode &status); + + /** + * Resets the iteration. + * Search will begin at the start of the text string if a forward + * iteration is initiated before a backwards iteration. 
Otherwise if a + * backwards iteration is initiated before a forwards iteration, the + * search will begin at the end of the text string. + * @stable ICU 2.0 + */ + virtual void reset(); + +protected: + // protected data members --------------------------------------------- + + /** + * C search data struct + * @stable ICU 2.0 + */ + USearch *m_search_; + + /** + * Break iterator. + * Currently the C++ breakiterator does not have getRules etc to reproduce + * another in C. Hence we keep the original around and do the verification + * at the end of the match. The user is responsible for deleting this + * break iterator. + * @stable ICU 2.0 + */ + BreakIterator *m_breakiterator_; + + /** + * Unicode string version of the search text + * @stable ICU 2.0 + */ + UnicodeString m_text_; + + // protected constructors and destructors ----------------------------- + + /** + * Default constructor. + * Initializes data to the default values. + * @stable ICU 2.0 + */ + SearchIterator(); + + /** + * Constructor for use by subclasses. + * @param text The target text to be searched. + * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a + * match, but the match's start or end index is not a + * boundary as determined by the <tt>BreakIterator</tt>, + * the match is rejected and <tt>handleNext</tt> or + * <tt>handlePrev</tt> is called again. If this parameter + * is <tt>NULL</tt>, no break detection is attempted. + * @see #handleNext + * @see #handlePrev + * @stable ICU 2.0 + */ + SearchIterator(const UnicodeString &text, + BreakIterator *breakiter = NULL); + + /** + * Constructor for use by subclasses. + * <p> + * Note: No parsing of the text within the <tt>CharacterIterator</tt> + * will be done during searching for this version. The block of text + * in <tt>CharacterIterator</tt> will be used as it is. + * @param text The target text to be searched. 
+ * @param breakiter A {@link BreakIterator} that is used to restrict the + * points at which matches are detected. If + * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a + * match, but the match's start or end index is not a + * boundary as determined by the <tt>BreakIterator</tt>, + * the match is rejected and <tt>handleNext</tt> or + * <tt>handlePrev</tt> is called again. If this parameter + * is <tt>NULL</tt>, no break detection is attempted. + * @see #handleNext + * @see #handlePrev + * @stable ICU 2.0 + */ + SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); + + // protected methods -------------------------------------------------- + + /** + * Assignment operator. Sets this iterator to have the same behavior, + * and iterate over the same text, as the one passed in. + * @param that instance to be copied. + * @stable ICU 2.0 + */ + SearchIterator & operator=(const SearchIterator &that); + + /** + * Abstract method which subclasses override to provide the mechanism + * for finding the next match in the target text. This allows different + * subclasses to provide different search algorithms. + * <p> + * If a match is found, the implementation should return the index at + * which the match starts and should call + * <tt>setMatchLength</tt> with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + * <p> + * @param position The index in the target text at which the search + * should start. + * @param status for error codes if it occurs. + * @return index at which the match starts, else if match is not found + * USEARCH_DONE is returned + * @see #setMatchLength + * @stable ICU 2.0 + */ + virtual int32_t handleNext(int32_t position, UErrorCode &status) + = 0; + + /** + * Abstract method which subclasses override to provide the mechanism for + * finding the previous match in the target text. 
This allows different + * subclasses to provide different search algorithms. + * <p> + * If a match is found, the implementation should return the index at + * which the match starts and should call + * <tt>setMatchLength</tt> with the number of characters + * in the target text that make up the match. If no match is found, the + * method should return USEARCH_DONE. + * <p> + * @param position The index in the target text at which the search + * should start. + * @param status for error codes if it occurs. + * @return index at which the match starts, else if match is not found + * USEARCH_DONE is returned + * @see #setMatchLength + * @stable ICU 2.0 + */ + virtual int32_t handlePrev(int32_t position, UErrorCode &status) + = 0; + + /** + * Sets the length of the currently matched string in the text string to + * be searched. + * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> + * methods should call this when they find a match in the target text. + * @param length length of the matched text. + * @see #handleNext + * @see #handlePrev + * @stable ICU 2.0 + */ + virtual void setMatchLength(int32_t length); + + /** + * Sets the offset of the currently matched string in the text string to + * be searched. + * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> + * methods should call this when they find a match in the target text. + * @param position start offset of the matched text. + * @see #handleNext + * @see #handlePrev + * @stable ICU 2.0 + */ + virtual void setMatchStart(int32_t position); + + /** + * sets match not found + * @stable ICU 2.0 + */ + void setMatchNotFound(); +}; + +inline UBool SearchIterator::operator!=(const SearchIterator &that) const +{ + return !operator==(that); +} +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ + +#endif + |