// com.ibm.icu.text.SearchIterator Maven / Gradle / Ivy
// Show all versions of icu4j Show documentation
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
/**
* SearchIterator is an abstract base class that defines a protocol
* for text searching. Subclasses provide concrete implementations of
* various search algorithms. A concrete subclass, StringSearch, is
* provided that implements language-sensitive pattern matching based
* on the comparison rules defined in a RuleBasedCollator
* object. Instances of SearchIterator maintain a current position and
* scan over the target text, returning the indices where a match is
* found and the length of each match. Generally, the sequence of forward
* matches will be equivalent to the sequence of backward matches. One
* case where this statement may not hold is when non-overlapping mode
* is set on and there are continuous repetitive patterns in the text.
* Consider the case searching for pattern "aba" in the text
* "ababababa", setting overlapping mode off will produce forward matches
* at offsets 0, 4. However when a backwards search is done, the
* results will be at offsets 6 and 2.
*
* If the matches searched for have boundary restrictions, BreakIterators
* can be used to define the valid boundaries of such a match. Once a
* BreakIterator is set, potential matches will be tested against the
* BreakIterator to determine if the boundaries are valid and that all
* characters in the potential match are equivalent to the pattern
* searched for. For example, looking for the pattern "fox" in the text
* "foxy fox" will produce match results at offset 0 and 5 with length 3
* if no BreakIterators were set. However if a WordBreakIterator is set,
* the only match that would be found will be at the offset 5. Since
* the SearchIterator guarantees that if a BreakIterator is set, all its
* matches will match the given pattern exactly, a potential match that
* passes the BreakIterator might still not produce a valid match. For
* instance the pattern "e" will not be found in the string
* "\u00e9" (latin small letter e with acute) if a
* CharacterBreakIterator is used. Even though "e" is
* a part of the character "\u00e9" and the potential match at
* offset 0 length 1 passes the CharacterBreakIterator test, "\u00e9"
* is not equivalent to "e", hence the SearchIterator rejects the potential
* match. By default, the SearchIterator
* does not impose any boundary restriction on the matches, it will
* return all results that match the pattern. Illustrating with the
* above example, "e" will
* be found in the string "\u00e9" if no BreakIterator is
* specified.
*
* SearchIterator also provides a means to handle overlapping
* matches via the API setOverlapping(boolean). For example, if
* overlapping mode is set, searching for the pattern "abab" in the
* text "ababab" will match at positions 0 and 2, whereas if
* overlapping is not set, SearchIterator will only match at position
* 0. By default, overlapping mode is not set.
*
* The APIs in SearchIterator are similar to that of other text
* iteration classes such as BreakIterator. Using this class, it is
* easy to scan through text looking for all occurrences of a
* match.
*
* Example of use:
*
* String target = "The quick brown fox jumped over the lazy fox";
* String pattern = "fox";
* SearchIterator iter = new StringSearch(pattern, target);
* for (int pos = iter.first(); pos != SearchIterator.DONE;
* pos = iter.next()) {
* // println matches at offset 16 and 41 with length 3
* System.out.println("Found match at " + pos + ", length is "
* + iter.getMatchLength());
* }
* target = "ababababa";
* pattern = "aba";
* iter.setTarget(new StringCharacterIterator(target));
* iter.setOverlapping(false);
* System.out.println("Overlapping mode set to false");
* System.out.println("Forward matches of pattern " + pattern + " in text "
* + target + ": ");
* for (int pos = iter.first(); pos != SearchIterator.DONE;
* pos = iter.next()) {
* // println matches at offset 0 and 4 with length 3
* System.out.println("offset " + pos + ", length "
* + iter.getMatchLength());
* }
* System.out.println("Backward matches of pattern " + pattern + " in text "
* + target + ": ");
* for (int pos = iter.last(); pos != SearchIterator.DONE;
* pos = iter.previous()) {
* // println matches at offset 6 and 2 with length 3
* System.out.println("offset " + pos + ", length "
* + iter.getMatchLength());
* }
* System.out.println("Overlapping mode set to true");
* System.out.println("Index set to 2");
* iter.setIndex(2);
* iter.setOverlapping(true);
* System.out.println("Forward matches of pattern " + pattern + " in text "
* + target + ": ");
* for (int pos = iter.first(); pos != SearchIterator.DONE;
* pos = iter.next()) {
* // println matches at offset 2, 4 and 6 with length 3
* System.out.println("offset " + pos + ", length "
* + iter.getMatchLength());
* }
* System.out.println("Index set to 2");
* iter.setIndex(2);
* System.out.println("Backward matches of pattern " + pattern + " in text "
* + target + ": ");
* for (int pos = iter.last(); pos != SearchIterator.DONE;
* pos = iter.previous()) {
* // println matches at offset 0 with length 3
* System.out.println("offset " + pos + ", length "
* + iter.getMatchLength());
* }
*
*
* @author Laura Werner, synwee
* @stable ICU 2.0
* @see BreakIterator
*/
public abstract class SearchIterator {

    // public data members -------------------------------------------------

    /**
     * Sentinel returned by {@link #previous} and {@link #next} after all
     * valid matches have been returned, and by {@link #first} and
     * {@link #last} if there are no matches at all.
     * @see #previous
     * @see #next
     * @stable ICU 2.0
     */
    public static final int DONE = -1;

    // public methods -----------------------------------------------------

    // public setters -----------------------------------------------------

    /**
     * Sets the position in the target text at which the next search will
     * start. This method clears any previous match: afterwards
     * {@link #getMatchLength} returns 0 and {@link #getMatchStart} returns
     * {@link #DONE}.
     *
     * @param position position from which to start the next search
     * @exception IndexOutOfBoundsException thrown if argument position is out
     *            of the target text range.
     * @see #getIndex
     * @stable ICU 2.8
     */
    public void setIndex(int position) {
        if (position < targetText.getBeginIndex()
                || position > targetText.getEndIndex()) {
            throw new IndexOutOfBoundsException(
                "setIndex(int) expected position to be between " +
                targetText.getBeginIndex() + " and " + targetText.getEndIndex());
        }
        // The position is only recorded here; next()/previous() pick it up
        // (and clear it) the next time they run.
        m_setOffset_ = position;
        m_reset_ = false;
        // Forget the previous match completely, start index included, so
        // getMatchStart() does not keep reporting a stale match.
        matchLength = 0;
        m_lastMatchStart_ = DONE;
    }

    /**
     * Determines whether overlapping matches are returned. See the class
     * documentation for more information about overlapping matches.
     * <p>
     * The default setting of this property is false.
     *
     * @param allowOverlap flag indicator if overlapping matches are allowed
     * @see #isOverlapping
     * @stable ICU 2.8
     */
    public void setOverlapping(boolean allowOverlap) {
        m_isOverlap_ = allowOverlap;
    }

    /**
     * Set the BreakIterator that is used to restrict the points at which
     * matches are detected.
     * Using null as the parameter is legal; it means that break
     * detection should not be attempted.
     * A non-null BreakIterator is immediately attached to the current
     * target text.
     * See class documentation for more information.
     * @param breakiter A BreakIterator that will be used to restrict the
     *                  points at which matches are detected.
     * @see #getBreakIterator
     * @see BreakIterator
     * @stable ICU 2.0
     */
    public void setBreakIterator(BreakIterator breakiter) {
        breakIterator = breakiter;
        if (breakIterator != null) {
            breakIterator.setText(targetText);
        }
    }

    /**
     * Set the target text to be searched. Text iteration will then begin at
     * the start of the text string. This method is useful if you want to
     * reuse an iterator to search within a different body of text.
     * Any previous match and any position stored by {@link #setIndex} are
     * discarded, and the argument iterator is moved to its begin index.
     * @param text new text iterator to look for match,
     * @exception IllegalArgumentException thrown when text is null or has
     *            0 length
     * @see #getTarget
     * @stable ICU 2.4
     */
    public void setTarget(CharacterIterator text) {
        // Emptiness is a property of the range [begin, end), not of where the
        // iterator currently points; this is the same test the constructor
        // applies.
        if (text == null || text.getEndIndex() == text.getBeginIndex()) {
            throw new IllegalArgumentException("Illegal null or empty text");
        }
        targetText = text;
        targetText.setIndex(targetText.getBeginIndex());
        matchLength = 0;
        m_lastMatchStart_ = DONE;
        // An offset recorded for the old text must not leak into the new one.
        m_setOffset_ = DONE;
        m_reset_ = true;
        m_isForwardSearching_ = true;
        if (breakIterator != null) {
            breakIterator.setText(targetText);
        }
    }

    // public getters ----------------------------------------------------

    /**
     * Returns the index of the most recent match in the target text.
     * This call returns a valid result only after a successful call to
     * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
     * Just after construction, or after a searching method returns
     * {@link #DONE}, this method will return {@link #DONE}.
     * <p>
     * Use {@link #getMatchLength} to get the length of the matched text.
     * {@link #getMatchedText} will return the subtext in the searched
     * target text from index getMatchStart() with length getMatchLength().
     *
     * @return index to a substring within the text string that is being
     *         searched.
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @stable ICU 2.8
     */
    public int getMatchStart() {
        return m_lastMatchStart_;
    }

    /**
     * Return the index in the target text at which the iterator is currently
     * positioned.
     * If the iteration has gone past the end of the target text, or past
     * the beginning for a backwards search, {@link #DONE} is returned.
     * @return index in the target text at which the iterator is currently
     *         positioned.
     * @stable ICU 2.8
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     */
    public abstract int getIndex();

    /**
     * Returns the length of the most recent match in the target text.
     * This call returns a valid result only after a successful
     * call to {@link #first}, {@link #next}, {@link #previous}, or
     * {@link #last}.
     * Just after construction, or after a searching method returns
     * {@link #DONE}, this method will return 0. See
     * {@link #getMatchStart} for more details.
     *
     * @return The length of the most recent match in the target text, or 0 if
     *         there is no match.
     * @see #getMatchStart
     * @see #getMatchedText
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public int getMatchLength() {
        return matchLength;
    }

    /**
     * Returns the BreakIterator that is used to restrict the indexes at which
     * matches are detected. This will be the same object that was passed to
     * the constructor or to {@link #setBreakIterator}.
     * If the BreakIterator has not been set, null will be returned.
     * See {@link #setBreakIterator} for more information.
     * @return the BreakIterator set to restrict logic matches
     * @see #setBreakIterator
     * @see BreakIterator
     * @stable ICU 2.0
     */
    public BreakIterator getBreakIterator() {
        return breakIterator;
    }

    /**
     * Return the target text that is being searched.
     * The returned object is the iterator used internally, not a copy.
     * @return target text being searched.
     * @see #setTarget
     * @stable ICU 2.0
     */
    public CharacterIterator getTarget() {
        return targetText;
    }

    /**
     * Returns the text that was matched by the most recent call to
     * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
     * If the iterator is not pointing at a valid match, for instance just
     * after construction or after {@link #DONE} has been returned,
     * {@code null} is returned. See {@link #getMatchStart} for more
     * information.
     * <p>
     * On return the target text iterator is positioned at the start of the
     * match.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @return the substring in the target text of the most recent match, or
     *         {@code null} if there is no current match
     * @stable ICU 2.0
     */
    public String getMatchedText() {
        if (matchLength <= 0 || m_lastMatchStart_ == DONE) {
            return null;
        }
        // Clamp to the end of the text: CharacterIterator.next() pins the
        // index at getEndIndex(), so an unclamped limit would never be reached.
        int limit = Math.min(m_lastMatchStart_ + matchLength,
                             targetText.getEndIndex());
        StringBuilder result = new StringBuilder(matchLength);
        // Position explicitly at the match start; the iterator is shared with
        // callers (getTarget) and subclasses, so its current index cannot be
        // trusted to still be there.
        targetText.setIndex(m_lastMatchStart_);
        while (targetText.getIndex() < limit) {
            result.append(targetText.current());
            targetText.next();
        }
        targetText.setIndex(m_lastMatchStart_);
        return result.toString();
    }

    // miscellaneous public methods -----------------------------------------

    /**
     * Search forwards in the target text for the next valid match,
     * starting the search from the current iterator position. The iterator is
     * adjusted so that its current index, as returned by {@link #getIndex},
     * is the starting position of the match if one was found. If a match is
     * found, the index of the match is returned, otherwise {@link #DONE} is
     * returned. If overlapping mode is set, the beginning of the found match
     * can be before the end of the current match, if any.
     * <p>
     * When the previous search call went backwards, this call only switches
     * the direction and returns the current position without searching.
     * @return The starting index of the next forward match after the current
     *         iterator position, or {@link #DONE} if there are no more
     *         matches.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #following
     * @see #preceding
     * @see #previous
     * @see #first
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public int next() {
        int start = targetText.getIndex();
        // A position stored by setIndex() takes precedence, exactly once.
        if (m_setOffset_ != DONE) {
            start = m_setOffset_;
            m_setOffset_ = DONE;
        }
        if (m_isForwardSearching_) {
            if (!m_reset_
                    && start + matchLength >= targetText.getEndIndex()) {
                // not enough characters left to match
                matchLength = 0;
                targetText.setIndex(targetText.getEndIndex());
                m_lastMatchStart_ = DONE;
                return DONE;
            }
            m_reset_ = false;
        } else {
            // Switching direction: the position reached by the backward
            // search is handed back unchanged; the actual forward search is
            // performed by the following call to next().
            m_isForwardSearching_ = true;
            if (start != DONE) {
                return start;
            }
        }
        if (start == DONE) {
            start = targetText.getBeginIndex();
        }
        if (matchLength > 0) {
            // A zero match length means there is no previous match to step
            // over. Otherwise move one position on (overlapping) or past the
            // whole previous match (non-overlapping).
            start += m_isOverlap_ ? 1 : matchLength;
        }
        m_lastMatchStart_ = handleNext(start);
        return m_lastMatchStart_;
    }

    /**
     * Search backwards in the target text for the next valid match,
     * starting the search from the current iterator position. The iterator is
     * adjusted so that its current index, as returned by {@link #getIndex},
     * is the starting position of the match if one was found. If a match is
     * found, the index is returned, otherwise {@link #DONE} is returned. If
     * overlapping mode is set, the end of the found match can be after the
     * beginning of the previous match, if any.
     * <p>
     * When the previous search call went forwards and the current position
     * is not the end of the text, this call only switches the direction and
     * returns the current position without searching.
     * @return The starting index of the next backwards match after the current
     *         iterator position, or {@link #DONE} if there are no more
     *         matches.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #following
     * @see #preceding
     * @see #next
     * @see #first
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public int previous() {
        int start = targetText.getIndex();
        // A position stored by setIndex() takes precedence, exactly once.
        if (m_setOffset_ != DONE) {
            start = m_setOffset_;
            m_setOffset_ = DONE;
        }
        if (m_reset_) {
            // Fresh iteration: a backward search begins at the end of the text.
            m_isForwardSearching_ = false;
            m_reset_ = false;
            start = targetText.getEndIndex();
        }
        if (m_isForwardSearching_) {
            // Switching direction: unless the forward search ran to the end
            // of the text, hand back the current position unchanged.
            m_isForwardSearching_ = false;
            if (start != targetText.getEndIndex()) {
                return start;
            }
        } else {
            if (start == targetText.getBeginIndex()) {
                // not enough characters left to match
                matchLength = 0;
                targetText.setIndex(targetText.getBeginIndex());
                m_lastMatchStart_ = DONE;
                return DONE;
            }
        }
        m_lastMatchStart_ = handlePrevious(start);
        return m_lastMatchStart_;
    }

    /**
     * Return true if the overlapping property has been set.
     * See {@link #setOverlapping(boolean)} for more information.
     * @see #setOverlapping
     * @return true if the overlapping property has been set, false otherwise
     * @stable ICU 2.8
     */
    public boolean isOverlapping() {
        return m_isOverlap_;
    }

    /**
     * Resets the search iteration. All properties will be reset to their
     * default values, and any previous match is cleared.
     * <p>
     * If a forward iteration is initiated, the next search will begin at the
     * start of the target text. Otherwise, if a backwards iteration is
     * initiated, the next search will begin at the end of the target text.
     *
     * @stable ICU 2.8
     */
    public void reset() {
        matchLength = 0;
        // setIndex() is overridable, so it is still invoked to give
        // subclasses the chance to reset their own position state.
        setIndex(targetText.getBeginIndex());
        // Rewind the shared iterator as setTarget() does; next() reads its
        // index to decide where a forward search starts.
        targetText.setIndex(targetText.getBeginIndex());
        m_lastMatchStart_ = DONE;
        m_isOverlap_ = false;
        m_isForwardSearching_ = true;
        m_reset_ = true;
        m_setOffset_ = DONE;
    }

    /**
     * Return the index of the first forward match in the target text.
     * This method sets the iteration to begin at the start of the
     * target text and searches forward from there.
     * @return The index of the first forward match, or {@link #DONE}
     *         if there are no matches.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #following
     * @see #preceding
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public final int first() {
        m_isForwardSearching_ = true;
        setIndex(targetText.getBeginIndex());
        return next();
    }

    /**
     * Return the index of the first forward match in target text that
     * is at or after argument position.
     * This method sets the iteration to begin at the specified
     * position in the target text and searches forward from there.
     * @param position index in the target text at which the forward search
     *                 starts
     * @return The index of the first forward match, or {@link #DONE}
     *         if there are no matches.
     * @exception IndexOutOfBoundsException thrown if argument position is out
     *            of the target text range.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #first
     * @see #preceding
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public final int following(int position) {
        m_isForwardSearching_ = true;
        // setIndex validates position
        setIndex(position);
        return next();
    }

    /**
     * Return the index of the first backward match in target text.
     * This method sets the iteration to begin at the end of the
     * target text and searches backwards from there.
     * @return The starting index of the first backward match, or
     *         {@link #DONE} if there are no matches.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #first
     * @see #preceding
     * @see #next
     * @see #previous
     * @see #following
     * @see #DONE
     * @stable ICU 2.0
     */
    public final int last() {
        m_isForwardSearching_ = false;
        setIndex(targetText.getEndIndex());
        return previous();
    }

    /**
     * Return the index of the first backwards match in target
     * text that ends at or before argument position.
     * This method sets the iteration to begin at the argument
     * position index of the target text and searches backwards from there.
     * @param position index in the target text at which the backward search
     *                 starts
     * @return The starting index of the first backwards match, or
     *         {@link #DONE} if there are no matches.
     * @exception IndexOutOfBoundsException thrown if argument position is out
     *            of the target text range.
     * @see #getMatchStart
     * @see #getMatchLength
     * @see #getMatchedText
     * @see #first
     * @see #following
     * @see #next
     * @see #previous
     * @see #last
     * @see #DONE
     * @stable ICU 2.0
     */
    public final int preceding(int position) {
        m_isForwardSearching_ = false;
        // setIndex validates position
        setIndex(position);
        return previous();
    }

    // protected data member ----------------------------------------------

    /**
     * The BreakIterator to define the boundaries of a logical match.
     * This value can be a null.
     * See class documentation for more information.
     * @see #setBreakIterator(BreakIterator)
     * @see #getBreakIterator
     * @see BreakIterator
     * @stable ICU 2.0
     */
    protected BreakIterator breakIterator;

    /**
     * Target text for searching.
     * @see #setTarget(CharacterIterator)
     * @see #getTarget
     * @stable ICU 2.0
     */
    protected CharacterIterator targetText;

    /**
     * Length of the most current match in target text.
     * Value 0 is the default value.
     * @see #setMatchLength
     * @see #getMatchLength
     * @stable ICU 2.0
     */
    protected int matchLength;

    // protected constructor ----------------------------------------------

    /**
     * Protected constructor for use by subclasses.
     * Initializes the iterator with the argument target text for searching
     * and sets the BreakIterator.
     * See class documentation for more details on the use of the target text
     * and BreakIterator.
     * @param target The target text to be searched.
     * @param breaker A {@link BreakIterator} that is used to determine the
     *                boundaries of a logical match. This argument can be null.
     * @exception IllegalArgumentException thrown when argument target is null,
     *            or of length 0
     * @see BreakIterator
     * @stable ICU 2.0
     */
    protected SearchIterator(CharacterIterator target, BreakIterator breaker) {
        if (target == null
                || (target.getEndIndex() - target.getBeginIndex()) == 0) {
            throw new IllegalArgumentException(
                "Illegal argument target. " +
                " Argument can not be null or of length 0");
        }
        targetText = target;
        breakIterator = breaker;
        if (breakIterator != null) {
            breakIterator.setText(target);
        }
        matchLength = 0;
        m_lastMatchStart_ = DONE;
        m_isOverlap_ = false;
        m_isForwardSearching_ = true;
        m_reset_ = true;
        m_setOffset_ = DONE;
    }

    // protected methods --------------------------------------------------

    /**
     * Sets the length of the most recent match in the target text.
     * Subclasses' handleNext() and handlePrevious() methods should call this
     * after they find a match in the target text.
     * @param length new length to set
     * @see #handleNext
     * @see #handlePrevious
     * @stable ICU 2.0
     */
    protected void setMatchLength(int length) {
        matchLength = length;
    }

    /**
     * Abstract method that subclasses override to provide the mechanism
     * for finding the next forwards match in the target text. This
     * allows different subclasses to provide different search algorithms.
     * <p>
     * If a match is found, this function must call setMatchLength(int) to
     * set the length of the result match.
     * The iterator is adjusted so that its current index, as returned by
     * {@link #getIndex}, is the starting position of the match if one was
     * found. If a match is not found, {@link #DONE} will be returned.
     *
     * @param start index in the target text at which the forwards search
     *              should begin.
     * @return the starting index of the next forwards match if found,
     *         {@link #DONE} otherwise
     * @see #setMatchLength(int)
     * @see #handlePrevious(int)
     * @see #DONE
     * @stable ICU 2.0
     */
    protected abstract int handleNext(int start);

    /**
     * Abstract method which subclasses override to provide the mechanism
     * for finding the next backwards match in the target text.
     * This allows different subclasses to provide different search
     * algorithms.
     * <p>
     * If a match is found, this function must call setMatchLength(int) to
     * set the length of the result match.
     * The iterator is adjusted so that its current index, as returned by
     * {@link #getIndex}, is the starting position of the match if one was
     * found. If a match is not found, {@link #DONE} will be returned.
     *
     * @param startAt index in the target text at which the backwards search
     *                should begin.
     * @return the starting index of the next backwards match if found,
     *         {@link #DONE} otherwise
     * @see #setMatchLength(int)
     * @see #handleNext(int)
     * @see #DONE
     * @stable ICU 2.0
     */
    protected abstract int handlePrevious(int startAt);

    // private data members ------------------------------------------------

    /**
     * Flag indicates if we are doing a forwards search
     */
    private boolean m_isForwardSearching_;

    /**
     * Flag to indicate if overlapping search is to be done.
     * E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1.
     */
    private boolean m_isOverlap_;

    /**
     * Flag indicates that a fresh iteration is about to start. It is set by
     * the constructor, setTarget() and reset(), and cleared by setIndex(),
     * next() and previous(). While it is set, next() skips its end-of-text
     * check and previous() starts from the end of the target text.
     */
    private boolean m_reset_;

    /**
     * Data member to store user defined position in setIndex().
     * If setIndex() is not called, this value will be DONE; next() and
     * previous() reset it to DONE once they have used it.
     */
    private int m_setOffset_;

    /**
     * Offset of the beginning of the last match, or DONE if there is none
     */
    private int m_lastMatchStart_;
}