com.ibm.icu.text.RuleBasedBreakIterator Maven / Gradle / Ivy
Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2005-2016 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.MissingResourceException;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
import com.ibm.icu.impl.breakiter.CjkBreakEngine;
import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
import com.ibm.icu.impl.breakiter.LaoBreakEngine;
import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;
/**
* Rule Based Break Iterator
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
*
* @stable ICU 2.0
*/
public class RuleBasedBreakIterator extends BreakIterator {
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
 * Private no-argument constructor used by the factories and the rules constructor.
 * Takes a snapshot of the shared break-engine list (under its lock) and zeroes the
 * dictionary character counter; the rule data is filled in by the caller.
 */
private RuleBasedBreakIterator() {
    // Copy the global list while holding its lock so the local list can later be
    // read without synchronization.
    synchronized (gAllBreakEngines) {
        fBreakEngines = new ArrayList<>(gAllBreakEngines);
    }
    fDictionaryCharCount = 0;
}
/**
 * Create a break iterator from a precompiled set of break rules.
 *
 * Creating a break iterator from the binary rules is much faster than
 * creating one from source rules.
 *
 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
 * Binary break iterator rules are not guaranteed to be compatible between
 * different versions of ICU.
 *
 * @param is an input stream supplying the compiled binary rules.
 * @throws IOException if there is an error while reading the rules from the InputStream.
 * @see #compileRules(String, OutputStream)
 * @stable ICU 4.8
 */
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
    final RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
    final ByteBuffer compiledRules = ICUBinary.getByteBufferFromInputStreamAndCloseStream(is);
    iterator.fRData = RBBIDataWrapper.get(compiledRules);
    // One look-ahead slot per look-ahead result declared by the forward table.
    final int lookAheadSlots = iterator.fRData.fFTable.fLookAheadResultsSize;
    iterator.fLookAheadMatches = new int[lookAheadSlots];
    return iterator;
}
/**
 * Package-private factory: create a break iterator from a precompiled set of break
 * rules and set its phrase-breaking flag.
 *
 * Creating a break iterator from the binary rules is much faster than
 * creating one from source rules.
 *
 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
 * Binary break iterator rules are not guaranteed to be compatible between
 * different versions of ICU.
 *
 * @param bytes a buffer supplying the compiled binary rules.
 * @param phraseBreaking a flag indicating if phrase breaking is required.
 * @throws IOException if there is an error while reading the rules from the buffer.
 * @see #compileRules(String, OutputStream)
 * @internal
 */
/* package-protected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
        ByteBuffer bytes, boolean phraseBreaking) throws IOException {
    // Build the iterator through the single-argument factory, then apply the flag.
    final RuleBasedBreakIterator iterator = getInstanceFromCompiledRules(bytes);
    iterator.fPhraseBreaking = phraseBreaking;
    return iterator;
}
/**
 * Create a break iterator from a precompiled set of break rules.
 *
 * Creating a break iterator from the binary rules is much faster than
 * creating one from source rules.
 *
 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
 * Binary break iterator rules are not guaranteed to be compatible between
 * different versions of ICU.
 *
 * @param bytes a buffer supplying the compiled binary rules.
 * @throws IOException if there is an error while reading the rules from the buffer.
 * @see #compileRules(String, OutputStream)
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
    final RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
    iterator.fRData = RBBIDataWrapper.get(bytes);
    // One look-ahead slot per look-ahead result declared by the forward table.
    final int lookAheadSlots = iterator.fRData.fFTable.fLookAheadResultsSize;
    iterator.fLookAheadMatches = new int[lookAheadSlots];
    return iterator;
}
/**
 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
 * The rules are compiled to their binary form in memory and then loaded exactly
 * as precompiled rules would be.
 *
 * @param rules The break rules to be used.
 * @throws RuntimeException if the compiled rules cannot be read back; the
 *         underlying IOException is attached as the cause.
 * @stable ICU 2.2
 */
public RuleBasedBreakIterator(String rules) {
    this();
    try {
        ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
        compileRules(rules, ruleOS);
        fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
        fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
    } catch (IOException e) {
        ///CLOVER:OFF
        // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
        // causing bogus compiled rules to be produced, but with no compile error raised.
        // Chain the original exception so its stack trace is not lost.
        throw new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                + e.getMessage(), e);
        ///CLOVER:ON
    }
}
//=======================================================================
// Boilerplate
//=======================================================================
/**
 * Clones this iterator.
 * The copy gets its own text iterator (when text is set), its own snapshot of the
 * shared break-engine list, a fresh look-ahead array, and copies of both caches.
 * @return A newly-constructed RuleBasedBreakIterator with the same
 * behavior as this one.
 * @stable ICU 2.0
 */
@Override
public Object clone() {
    final RuleBasedBreakIterator copy = (RuleBasedBreakIterator) super.clone();
    if (fText != null) {
        copy.fText = (CharacterIterator) fText.clone();
    }
    // The global engine list is shared between iterators; read it under its lock.
    synchronized (gAllBreakEngines) {
        copy.fBreakEngines = new ArrayList<>(gAllBreakEngines);
    }
    copy.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
    // The caches are inner classes, so each copy must be bound to the new iterator.
    copy.fBreakCache = copy.new BreakCache(fBreakCache);
    copy.fDictionaryCache = copy.new DictionaryCache(fDictionaryCache);
    return copy;
}
/**
 * Returns true if the other object is also a RuleBasedBreakIterator, has the same
 * rules, and iterates over the same text at the same position.
 *
 * @param that the object to compare with; may be null.
 * @return true if the two iterators are equal as described above.
 * @stable ICU 2.0
 */
@Override
public boolean equals(Object that) {
    if (this == that) {
        return true;
    }
    // An explicit type test (which also rejects null) replaces the former
    // catch of ClassCastException, so an unrelated ClassCastException thrown by a
    // CharacterIterator's equals() is no longer silently turned into "false".
    if (!(that instanceof RuleBasedBreakIterator)) {
        return false;
    }
    RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
    // Exactly one side has rule data: not equal.
    if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
        return false;
    }
    // Both sides have rule data: the rule source text must match.
    if (fRData != null && other.fRData != null
            && !fRData.fRuleSource.equals(other.fRData.fRuleSource)) {
        return false;
    }
    if (fText == null && other.fText == null) {
        return true;
    }
    if (fText == null || other.fText == null || !fText.equals(other.fText)) {
        return false;
    }
    return fPosition == other.fPosition;
}
/**
 * Returns the description (rules) used to create this iterator, or an empty
 * string when no rule data is attached.
 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
 * @stable ICU 2.0
 */
@Override
public String toString() {
    return (fRData != null) ? fRData.fRuleSource : "";
}
/**
 * Compute a hashcode for this BreakIterator, derived from the rule source.
 * An iterator without rule data hashes to 0 instead of throwing, matching
 * equals() and toString(), which both tolerate a null fRData.
 * @return A hash code
 * @stable ICU 2.0
 */
@Override
public int hashCode()
{
    if (fRData == null) {
        return 0;
    }
    return fRData.fRuleSource.hashCode();
}
// State numbers used by the state machines in handleNext() and handleSafePrevious().
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
// of user text. A variable with this enum type keeps track of where we
// are. The state machine only fetches user text input while in RUN mode.
private static final int RBBI_START = 0; // extra iteration before the first text character
private static final int RBBI_RUN = 1; // consuming characters of the user text
private static final int RBBI_END = 2; // extra iteration after the end of the text
/**
 * The character iterator through which this BreakIterator accesses the text.
 * Starts out as an iterator over the empty string; replaced by setText().
 */
private CharacterIterator fText = new java.text.StringCharacterIterator("");
/**
 * The rule data for this BreakIterator instance.
 * Not intended for public use. Declared public for testing purposes only.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public RBBIDataWrapper fRData;
/**
 * The iteration state - current position, rule status for the current position,
 * and whether the iterator ran off the end, yielding UBRK_DONE.
 * Current position is always set to a boundary.
 *
 * The current position of the iterator. It lies between the start and the end
 * of the text, inclusive (first() leaves it at the text's starting offset).
 * Never has the value UBRK_DONE (-1).
 */
private int fPosition;
/**
 * Index of the Rule {tag} values for the most recent match.
 * Used as an index into fRData.fStatusTable.
 */
private int fRuleStatusIndex;
/**
 * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
 */
private boolean fDone;
/**
 * Array of look-ahead tentative results.
 * Sized from fRData.fFTable.fLookAheadResultsSize wherever the rule data is attached.
 */
private int[] fLookAheadMatches;
/**
 * Cache of previously determined boundary positions.
 */
private BreakCache fBreakCache = new BreakCache();
/**
 * Flag used to indicate if phrase breaking is required.
 */
private boolean fPhraseBreaking = false;
/**
 * Counter for the number of characters encountered with the "dictionary"
 * flag set. Normal RBBI iterators don't use it, although the code
 * for updating it is live. Dictionary Based break iterators (a subclass
 * of us) access this field directly.
 * NOTE(review): the field is private, so the remark about subclass access looks stale - confirm.
 * @internal
 */
private int fDictionaryCharCount;
// Boundaries produced for runs of dictionary characters.
private DictionaryCache fDictionaryCache = new DictionaryCache();
/**
 * ICU debug argument name for RBBI
 */
private static final String RBBI_DEBUG_ARG = "rbbi";
/**
 * Debugging flag. Trace operation of state machine when true.
 * True only when the "rbbi" debug argument is enabled and its value contains "trace".
 */
private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
/**
 * The "default" break engine - just skips over ranges of dictionary words,
 * producing no breaks. Should only be used if characters need to be handled
 * by a dictionary but we have no dictionary implementation for them.
 *
 * Only one instance; shared by all break iterators.
 */
private static final UnhandledBreakEngine gUnhandledBreakEngine;
/**
 * List of all known break engines, common for all break iterators.
 * Lazily updated as break engines are needed, because instantiation of
 * break engines is expensive.
 *
 * Because gAllBreakEngines can be referenced concurrently from different
 * BreakIterator instances, all access is synchronized.
 */
private static final List<LanguageBreakEngine> gAllBreakEngines;
static {
    // The unhandled engine is always present, so the global list is never empty.
    gUnhandledBreakEngine = new UnhandledBreakEngine();
    gAllBreakEngines = new ArrayList<>();
    gAllBreakEngines.add(gUnhandledBreakEngine);
}
/**
 * List of all known break engines. Similar to gAllBreakEngines, but local to a
 * break iterator, allowing it to be used without synchronization.
 */
private List<LanguageBreakEngine> fBreakEngines;
/**
 * Dump the contents of the state table and character classes for this break iterator.
 * For debugging only.
 * @param out the stream to write to; System.out is used when this is null.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public void dump(java.io.PrintStream out) {
    final java.io.PrintStream target = (out == null) ? System.out : out;
    this.fRData.dump(target);
}
/**
 * Compile a set of source break rules into the binary state tables used
 * by the break iterator engine. Creating a break iterator from precompiled
 * rules is much faster than creating one from source rules.
 *
 * Binary break rules are not guaranteed to be compatible between different
 * versions of ICU.
 *
 * The work is delegated to RBBIRuleBuilder.
 *
 * @param rules The source form of the break rules
 * @param ruleBinary An output stream to receive the compiled rules.
 * @throws IOException If there is an error writing the output.
 * @see #getInstanceFromCompiledRules(InputStream)
 * @stable ICU 4.8
 */
public static void compileRules(String rules, OutputStream ruleBinary)
        throws IOException {
    RBBIRuleBuilder.compileRules(rules, ruleBinary);
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Sets the current iteration position to the beginning of the text.
 * (i.e., the CharacterIterator's starting offset).
 * @return The offset of the beginning of the text, or BreakIterator.DONE
 * when no text is set.
 * @stable ICU 2.0
 */
@Override
public int first() {
    if (fText == null) {
        return BreakIterator.DONE;
    }
    fText.first();
    final int start = fText.getIndex();
    // Position the cache on the start offset, filling it first if the offset is not cached.
    final boolean alreadyCached = fBreakCache.seek(start);
    if (!alreadyCached) {
        fBreakCache.populateNear(start);
    }
    fBreakCache.current();
    assert fPosition == start;
    return fPosition;
}
/**
 * Sets the current iteration position to the end of the text.
 * (i.e., the CharacterIterator's ending offset).
 * @return The text's past-the-end offset, or BreakIterator.DONE when no text is set.
 * @stable ICU 2.0
 */
@Override
public int last() {
    if (fText == null) {
        return BreakIterator.DONE;
    }
    final int endPos = fText.getEndIndex();
    // isBoundary() is called for its side effect of setting the iterator position,
    // so it must stay outside the assert statements.
    final boolean endShouldBeBoundary = isBoundary(endPos);
    assert endShouldBeBoundary;
    assert fPosition == endPos;
    return endPos;
}
/**
 * Advances the iterator either forward or backward the specified number of steps.
 * Negative values move backward, and positive values move forward. This is
 * equivalent to repeatedly calling next() or previous(). Stepping stops early
 * once DONE is returned.
 * @param n The number of steps to move. The sign indicates the direction
 * (negative is backwards, and positive is forwards).
 * @return The character offset of the boundary position n boundaries away from
 * the current one.
 * @stable ICU 2.0
 */
@Override
public int next(int n) {
    if (n == 0) {
        return current();
    }
    int result = 0;
    if (n > 0) {
        while (n > 0 && result != DONE) {
            result = next();
            n--;
        }
    } else {
        while (n < 0 && result != DONE) {
            result = previous();
            n++;
        }
    }
    return result;
}
/**
 * Advances the iterator to the next boundary position.
 * @return The position of the first boundary after this one, or DONE if
 * iteration has run off the end.
 * @stable ICU 2.0
 */
@Override
public int next() {
    // The cache updates fPosition and fDone as it moves.
    fBreakCache.next();
    if (fDone) {
        return DONE;
    }
    return fPosition;
}
/**
 * Moves the iterator backwards, to the boundary preceding the current one.
 * @return The position of the boundary immediately preceding the starting position,
 * or DONE if iteration has run off the beginning.
 * @stable ICU 2.0
 */
@Override
public int previous() {
    // The cache updates fPosition and fDone as it moves.
    fBreakCache.previous();
    if (fDone) {
        return DONE;
    }
    return fPosition;
}
/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 * @param startPos The position from which to begin searching for a break position.
 * @return The position of the first break after startPos, or DONE if there is none.
 * If startPos is before the beginning of the text, or no text is set,
 * the result of first() is returned.
 * @stable ICU 2.0
 */
@Override
public int following(int startPos) {
    // With no text set, or a position before the beginning, defer to first():
    // it returns DONE for null text and the text's starting offset otherwise.
    // The null guard mirrors first(), last() and preceding(), which all accept null text.
    if (fText == null || startPos < fText.getBeginIndex()) {
        return first();
    }
    // Move requested offset to a code point start. It might be between a lead and trail surrogate.
    // Or it may be beyond the end of the text.
    startPos = CISetIndex32(fText, startPos);
    fBreakCache.following(startPos);
    return fDone ? DONE : fPosition;
}
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 * @param offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 * Offsets past the end (or a missing text) yield last(); offsets before
 * the beginning yield first().
 * @stable ICU 2.0
 */
@Override
public int preceding(int offset) {
    if (fText == null || offset > fText.getEndIndex()) {
        return last();
    }
    if (offset < fText.getBeginIndex()) {
        return first();
    }
    // TODO: restore to match ICU4C behavior - move the requested offset to a code point
    // start via CISetIndex32(fText, offset); it might be between a lead and trail
    // surrogate. For now the offset is used unadjusted.
    fBreakCache.preceding(offset);
    if (fDone) {
        return DONE;
    }
    return fPosition;
}
/**
 * Throw IllegalArgumentException unless begin <= offset <= end, where begin and
 * end are the text's begin and end indexes (the end index itself is accepted).
 * @param offset the offset to validate.
 * @param text the text whose bounds are used.
 * @throws IllegalArgumentException if offset lies outside the text's bounds.
 * @stable ICU 2.0
 */
protected static final void checkOffset(int offset, CharacterIterator text) {
    final boolean inBounds = offset >= text.getBeginIndex() && offset <= text.getEndIndex();
    if (!inBounds) {
        throw new IllegalArgumentException("offset out of bounds");
    }
}
/**
 * Returns true if the specified position is a boundary position. As a side
 * effect, leaves the iterator pointing to the first boundary position at
 * or after "offset".
 * @param offset the offset to check.
 * @return True if "offset" is a boundary position.
 * @throws IllegalArgumentException if offset is outside the bounds of the text.
 * @stable ICU 2.0
 */
@Override
public boolean isBoundary(int offset) {
    // TODO: behavior difference with ICU4C, which considers out-of-range offsets
    // to not be boundaries, and to not be errors.
    checkOffset(offset, fText);

    // Snap the offset to a code point start within the text. An offset that is not
    // on a code point boundary is never a boundary, but the iterator must still be
    // left on the following boundary.
    final int codePointStart = CISetIndex32(fText, offset);

    // populateNear() is consulted only when seek() misses the cache.
    final boolean cachePositioned =
            fBreakCache.seek(codePointStart) || fBreakCache.populateNear(codePointStart);
    // Compare against the caller's original offset, not the adjusted one.
    final boolean onBoundary = cachePositioned && fBreakCache.current() == offset;

    if (!onBoundary) {
        // The cache was left on the preceding boundary; step to the following one.
        next();
    }
    return onBoundary;
}
/**
 * Returns the current iteration position. While text is set, DONE is never
 * returned from this function; if iteration has run to the end of a
 * string, current() will return the length of the string while
 * next() will return BreakIterator.DONE. When no text is set,
 * BreakIterator.DONE is returned.
 * @return The current iteration position.
 * @stable ICU 2.0
 */
@Override
public int current() {
    if (fText == null) {
        return BreakIterator.DONE;
    }
    return fPosition;
}
/**
 * Return the status tag from the break rule that determined the boundary at
 * the current iteration position. The values appear in the rule source
 * within brackets, {123}, for example. For rules that do not specify a
 * status, a default value of 0 is returned. If more than one rule applies,
 * the numerically largest of the possible status values is returned.
 * <p>
 * Of the standard types of ICU break iterators, only the word and line break
 * iterator provides status values. The values are defined in
 * class RuleBasedBreakIterator, and allow distinguishing between words
 * that contain alphabetic letters, "words" that appear to be numbers,
 * punctuation and spaces, words containing ideographic characters, and
 * more. Call <code>getRuleStatus</code> after obtaining a boundary
 * position from <code>next()</code>, <code>previous()</code>, or
 * any other break iterator functions that returns a boundary position.
 * <p>
 * Note that <code>getRuleStatus()</code> returns the value corresponding to
 * <code>current()</code> index even after <code>next()</code> has returned DONE.
 *
 * @return the status from the break rule that determined the boundary
 * at the current iteration position.
 *
 * @stable ICU 60
 */
@Override
public int getRuleStatus() {
    // Layout of a status record in fRData.fStatusTable:
    //   Count N          <-- fRuleStatusIndex points here.
    //   Status val 0
    //   Status val 1
    //   ...
    //   Status val N-1   <-- the value to return
    // The values are sorted ascending, so the last one is the largest.
    final int valueCount = fRData.fStatusTable[fRuleStatusIndex];
    return fRData.fStatusTable[fRuleStatusIndex + valueCount];
}
/**
* Get the status (tag) values from the break rule(s) that determined the boundary
* at the current iteration position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
*
* The status values used by the standard ICU break rules are defined
* as public constants in class RuleBasedBreakIterator.
*
* If the size of the output array is insufficient to hold the data,
* the output will be truncated to the available length. No exception
* will be thrown.
*
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from the rules that determined
* the boundary at the current iteration position.
* In the event that the array is too small, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @stable ICU 60
*/
@Override
public int getRuleStatusVec(int[] fillInArray) {
int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i
* Caution:The state of the returned CharacterIterator
* must not be modified in any way while the BreakIterator is still in use.
* Doing so will lead to undefined behavior of the BreakIterator.
* Clone the returned CharacterIterator first and work with that.
*
* The returned CharacterIterator is a reference
* to the actual iterator being used by the BreakIterator.
* No guarantees are made about the current position
* of this iterator when it is returned; it may differ from the
* BreakIterators current position. If you need to move that
* position to examine the text, clone this function's return value first.
* @return An iterator over the text being analyzed.
* @stable ICU 2.0
*/
@Override
public CharacterIterator getText() {
return fText;
}
/**
 * Set the iterator to analyze a new piece of text. This function resets
 * the current iteration position to the beginning of the text.
 * (The old iterator is dropped.)
 * <p>
 * <b><i>Caution:</i></b> The supplied CharacterIterator is used
 * directly by the BreakIterator, and must not be altered in any
 * way by code outside of the BreakIterator.
 * Doing so will lead to undefined behavior of the BreakIterator.
 *
 * @param newText An iterator over the text to analyze; may be null.
 * @stable ICU 2.0
 */
@Override
public void setText(CharacterIterator newText) {
    // Discard all cached boundaries; with real text, restart the cache at its begin index.
    if (newText == null) {
        fBreakCache.reset();
    } else {
        fBreakCache.reset(newText.getBeginIndex(), 0);
    }
    fDictionaryCache.reset();
    fText = newText;
    // Position the iterator at the start of the new text.
    this.first();
}
/**
 * Control debug, trace and dump options.
 * Holds the value of the "rbbi" ICU debug argument when that argument is enabled,
 * and null otherwise.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
ICUDebug.value(RBBI_DEBUG_ARG) : null;
/**
 * Find the language break engine that handles a dictionary character, creating
 * one on demand.
 *
 * The iterator's own engine list is searched first, without locking. On a miss,
 * the shared global list is searched under its lock, and if that also misses a
 * new engine is chosen by the character's script and registered in both lists.
 *
 * @param c the dictionary character (code point) that needs an engine.
 * @return the engine handling c; the shared unhandled engine when the script has
 *         no dedicated engine; or null if creating an engine failed with an IOException.
 */
private LanguageBreakEngine getLanguageBreakEngine(int c) {
    // Fast path: an engine this iterator already knows about.
    for (LanguageBreakEngine localEngine : fBreakEngines) {
        if (localEngine.handles(c)) {
            return localEngine;
        }
    }

    synchronized (gAllBreakEngines) {
        // Another break iterator may already have instantiated the desired engine.
        for (LanguageBreakEngine sharedEngine : gAllBreakEngines) {
            if (sharedEngine.handles(c)) {
                fBreakEngines.add(sharedEngine);
                return sharedEngine;
            }
        }

        // No existing engine: build one, keyed by script.
        final int rawScript = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
        // Katakana, Hiragana and Han are handled by the same dictionary engine.
        final int script = (rawScript == UScript.KATAKANA || rawScript == UScript.HIRAGANA)
                ? UScript.HAN
                : rawScript;

        LanguageBreakEngine created;
        try {
            switch (script) {
            case UScript.THAI:
                // Prefer the LSTM engine; fall back to the dictionary engine when
                // the LSTM resource is missing.
                try {
                    created = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
                } catch (MissingResourceException e) {
                    created = new ThaiBreakEngine();
                }
                break;
            case UScript.LAO:
                created = new LaoBreakEngine();
                break;
            case UScript.MYANMAR:
                try {
                    created = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
                } catch (MissingResourceException e) {
                    created = new BurmeseBreakEngine();
                }
                break;
            case UScript.KHMER:
                created = new KhmerBreakEngine();
                break;
            case UScript.HAN:
                created = new CjkBreakEngine(false);
                break;
            case UScript.HANGUL:
                created = new CjkBreakEngine(true);
                break;
            default:
                // No dedicated engine: let the shared unhandled engine take the character.
                gUnhandledBreakEngine.handleChar(c);
                created = gUnhandledBreakEngine;
                break;
            }
        } catch (IOException e) {
            // Engine creation failed; the caller receives null.
            created = null;
        }

        // The unhandled engine is already in both lists, and null is never registered.
        final boolean isNewEngine = created != null && created != gUnhandledBreakEngine;
        if (isNewEngine) {
            gAllBreakEngines.add(created);
            fBreakEngines.add(created);
        }
        return created;
    } // end synchronized(gAllBreakEngines)
}
/**
 * The State Machine Engine for moving forward is here.
 * This function is the heart of the RBBI run time engine.
 *
 * Input
 * fPosition, the position in the text to begin from.
 * Output
 * fPosition: the boundary following the starting position.
 * fDictionaryCharCount the number of dictionary characters encountered.
 * If > 0, the segment will be further subdivided
 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
 * fDone set to true when the starting position is already at the end of the text.
 *
 * @return the new iterator position, or BreakIterator.DONE when there is no
 * text left at the starting position.
 *
 * A note on supplementary characters and the position of underlying
 * Java CharacterIterator: Normally, a character iterator is positioned at
 * the char most recently returned by next(). Within this function, when
 * a supplementary char is being processed, the char iterator is left
 * sitting on the trail surrogate, in the middle of the code point.
 * This is different from everywhere else, where an iterator always
 * points at the lead surrogate of a supplementary.
 */
private int handleNext() {
if (TRACE) {
System.out.println("Handle Next pos char state category");
}
// handleNext always sets the break tag value.
// Set the default for it.
fRuleStatusIndex = 0;
fDictionaryCharCount = 0;
// caches for quicker access
CharacterIterator text = fText;
CodePointTrie trie = fRData.fTrie;
char[] stateTable = fRData.fFTable.fTable;
int initialPosition = fPosition;
text.setIndex(initialPosition);
int result = initialPosition;
// Set up the starting char
int c = text.current();
// Values at or above the lead surrogate range need the helper.
// NOTE(review): nextTrail32 presumably combines a surrogate pair into one code point
// and maps the iterator's end-of-text value to DONE32 - confirm in CharacterIteration.
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
if (c == DONE32) {
// Nothing left to scan: flag the iterator as having run off the end.
fDone = true;
return BreakIterator.DONE;
}
}
// Set the initial state for the state machine
int state = START_STATE;
int row = fRData.getRowIndex(state);
short category = 3;
int flagsState = fRData.fFTable.fFlags;
int dictStart = fRData.fFTable.fDictCategoriesStart;
int mode = RBBI_RUN;
// When the rules require a beginning-of-input pass, run one extra iteration first
// with category 2 and without consuming any text.
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
}
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
break;
}
// Run the loop one last time with the fake end-of-input character category
mode = RBBI_END;
category = 1;
}
else if (mode == RBBI_RUN) {
// Get the char category. An incoming category of 1 or 2 means that
// we are preset for doing the beginning or end of input, and
// that we shouldn't get a category from an actual text input character.
//
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) trie.get(c);
// Check for categories that require word dictionary handling.
if (category >= dictStart) {
fDictionaryCharCount++;
}
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// Advance to the next character.
// (A beginning-of-input iteration does not reach this branch, so it does not advance;
// the next iteration will be processing the first real input character.)
c = text.next();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
}
}
else {
// This was the beginning-of-input iteration; switch to normal running.
mode = RBBI_RUN;
}
// look up a state transition in the state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
// Match found, common case
result = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the start of it.
result--;
}
// Remember the break status (tag) values.
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
} else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches[accepting];
if (lookaheadResult >= 0) {
// Return the position recorded earlier for this look-ahead rule.
fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
fPosition = lookaheadResult;
return lookaheadResult;
}
}
// If we are at the position of the '/' in a look-ahead (hard break) rule;
// record the current position, to be returned later, if the full rule matches.
// TODO: Move this check before the previous check of fAccepting.
// This would enable hard-break rules with no following context.
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
pos--;
}
fLookAheadMatches[rule] = pos;
}
} // End of state machine main loop
// The state machine is done. Check whether it found a match...
// If the iterator failed to advance in the match engine force it ahead by one.
// This indicates a defect in the break rules, which should always match
// at least one character.
if (result == initialPosition) {
if (TRACE) {
System.out.println("Iterator did not move. Advancing by 1.");
}
text.setIndex(initialPosition);
next32(text);
result = text.getIndex();
fRuleStatusIndex = 0;
}
// Leave the iterator at our result position.
// (we may have advanced beyond the last accepting position chasing after
// longer matches that never completed.)
fPosition = result;
if (TRACE) {
System.out.println("result = " + result);
}
return result;
}
/**
 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
 * This locates a "Safe Position" from which the forward break rules
 * will operate correctly. A Safe Position is not necessarily a boundary itself.
 *
 * The logic of this function is very similar to handleNext(), above, but simpler
 * because the safe table does not require as many options.
 *
 * @param fromPosition the position in the input text to begin the iteration.
 * @return the index of the text iterator when the machine stops, or
 *         BreakIterator.DONE if the starting position is already the start of the text.
 * @internal
 */
private int handleSafePrevious(int fromPosition) {
    // caches for quicker access
    final CharacterIterator text = fText;
    final CodePointTrie trie = fRData.fTrie;
    final char[] stateTable = fRData.fRTable.fTable;

    CISetIndex32(text, fromPosition);
    if (TRACE) {
        System.out.print("Handle Previous pos char state category");
    }

    // Nothing precedes the start of the text.
    if (text.getIndex() == text.getBeginIndex()) {
        return BreakIterator.DONE;
    }

    // Prime the machine with the character before the starting position.
    int c = CharacterIteration.previous32(text);
    char state = START_STATE;
    int row = fRData.getRowIndex(state);

    // Walk backwards until the text is exhausted or the machine reaches state 0.
    while (c != DONE32) {
        // The character's category selects the column of the state table.
        final short category = (short) trie.get(c);
        if (TRACE) {
            System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
            System.out.print(RBBIDataWrapper.intToHexString(c, 10));
            System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
        }

        // State transition - move the machine to its next state.
        assert category < fRData.fHeader.fCatCount;
        state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
        row = fRData.getRowIndex(state);
        if (state == STOP_STATE) {
            // Normal exit: a transition to state zero means a safe point was found.
            break;
        }
        c = CharacterIteration.previous32(text);
    }

    // The state machine is done.
    final int result = text.getIndex();
    if (TRACE) {
        System.out.println("result = " + result);
    }
    return result;
}
/**
 * Position a CharacterIterator at the given index.
 * The index is pinned to the valid range BeginIndex <= index <= EndIndex.
 * If the index lands on the trail surrogate of a supplementary character, it is
 * moved back to the start (lead surrogate) of that character. An unpaired trail
 * surrogate is left where it is.
 *
 * @param ci A CharacterIterator to set
 * @param index the index to set
 * @return the resulting index, possibly pinned or adjusted.
 */
private static int CISetIndex32(CharacterIterator ci, int index) {
    if (index <= ci.getBeginIndex()) {
        ci.first();
        return ci.getIndex();
    }
    if (index >= ci.getEndIndex()) {
        ci.setIndex(ci.getEndIndex());
        return ci.getIndex();
    }
    final char unitAtIndex = ci.setIndex(index);
    // Only step back permanently when the preceding unit really is a lead surrogate;
    // otherwise undo the look-behind.
    if (Character.isLowSurrogate(unitAtIndex) && !Character.isHighSurrogate(ci.previous())) {
        ci.next();
    }
    return ci.getIndex();
}
/** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
* boundaries. The public BreakIterator API always fetches directly
* from the main BreakCache, not from here.
*
* In common situations, the number of boundaries in a single dictionary run
* should be quite small, it will be terminated by punctuation, spaces,
* or any other non-dictionary characters. The main BreakCache may end
* up with boundaries from multiple dictionary based runs.
*
* The boundaries are stored in a simple vector-like list of ints
* (a DictionaryBreakEngine.DequeI), with the assumption that they will
* be accessed sequentially.
*/
class DictionaryCache {
/**
 * Discard all stored boundaries and clear the iteration position, the covered
 * text range and the two rule status indexes.
 */
void reset() {
    fBreaks.removeAllElements();
    fPositionInCache = -1;
    fStart = 0;
    fLimit = 0;
    fFirstRuleStatusIndex = 0;
    fOtherRuleStatusIndex = 0;
}
/**
 * Find the cached dictionary boundary that follows a text position.
 * On success the result is left in fBoundary and fStatusIndex, and
 * fPositionInCache remembers where it was found.
 *
 * @param fromPos the text position to search from.
 * @return true if a following boundary was found in this cache; false if
 *         fromPos lies outside [fStart, fLimit) or no boundary follows it.
 */
boolean following(int fromPos) {
    final boolean withinRange = fStart <= fromPos && fromPos < fLimit;
    if (!withinRange) {
        fPositionInCache = -1;
        return false;
    }

    final int count = fBreaks.size();

    // Sequential iteration: the previous call left us sitting on fromPos,
    // so the answer is simply the next entry.
    final boolean sittingOnFromPos = fPositionInCache >= 0
            && fPositionInCache < count
            && fBreaks.elementAt(fPositionInCache) == fromPos;
    if (sittingOnFromPos) {
        final int nextSlot = fPositionInCache + 1;
        if (nextSlot >= count) {
            fPositionInCache = -1;
            return false;
        }
        fPositionInCache = nextSlot;
        final int boundary = fBreaks.elementAt(nextSlot);
        assert(boundary > fromPos);
        fBoundary = boundary;
        fStatusIndex = fOtherRuleStatusIndex;
        return true;
    }

    // Random access: linear search for the first boundary beyond fromPos.
    fPositionInCache = 0;
    while (fPositionInCache < count) {
        final int candidate = fBreaks.elementAt(fPositionInCache);
        if (candidate > fromPos) {
            fBoundary = candidate;
            fStatusIndex = fOtherRuleStatusIndex;
            return true;
        }
        ++fPositionInCache;
    }

    // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
    assert(false);
    fPositionInCache = -1;
    return false;
}
/**
 * Find the cached dictionary boundary that precedes a text position.
 * On success the result is left in fBoundary and fStatusIndex, and
 * fPositionInCache remembers where it was found. A boundary equal to fStart
 * reports fFirstRuleStatusIndex; every other boundary reports fOtherRuleStatusIndex.
 *
 * @param fromPos the text position to search from.
 * @return true if a preceding boundary was found in this cache; false if
 *         fromPos lies outside (fStart, fLimit] or no boundary precedes it.
 */
boolean preceding(int fromPos) {
    final boolean withinRange = fStart < fromPos && fromPos <= fLimit;
    if (!withinRange) {
        fPositionInCache = -1;
        return false;
    }

    final int count = fBreaks.size();

    // Starting from the very end of the cached range: jump straight to the last entry.
    if (fromPos == fLimit) {
        fPositionInCache = count - 1;
        assert(fPositionInCache < 0 || fBreaks.elementAt(fPositionInCache) == fromPos);
    }

    // Sequential iteration: we are sitting on fromPos, so step back one entry.
    final boolean sittingOnFromPos = fPositionInCache > 0
            && fPositionInCache < count
            && fBreaks.elementAt(fPositionInCache) == fromPos;
    if (sittingOnFromPos) {
        --fPositionInCache;
        final int boundary = fBreaks.elementAt(fPositionInCache);
        assert(boundary < fromPos);
        fBoundary = boundary;
        if (boundary == fStart) {
            fStatusIndex = fFirstRuleStatusIndex;
        } else {
            fStatusIndex = fOtherRuleStatusIndex;
        }
        return true;
    }

    // Already at the first entry: nothing precedes it.
    if (fPositionInCache == 0) {
        fPositionInCache = -1;
        return false;
    }

    // Random access: search backwards for the last boundary before fromPos.
    fPositionInCache = count - 1;
    while (fPositionInCache >= 0) {
        final int candidate = fBreaks.elementAt(fPositionInCache);
        if (candidate < fromPos) {
            fBoundary = candidate;
            if (candidate == fStart) {
                fStatusIndex = fFirstRuleStatusIndex;
            } else {
                fStatusIndex = fOtherRuleStatusIndex;
            }
            return true;
        }
        --fPositionInCache;
    }

    // Internal error: fromPos is inside the cached range, but nothing precedes it.
    assert(false);
    fPositionInCache = -1;
    return false;
}
/**
* Populate the cache with the dictionary based boundaries within a region of text.
*
* If the range spans at most one index, this returns immediately and the existing
* cache contents are left as they were. Otherwise the cache is reset, the range is
* scanned for characters whose trie category is at or above fDictCategoriesStart,
* and the language break engine chosen for each such character is asked to append
* its boundaries to fBreaks.
*
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param firstRuleStatus The rule status index that applies to the break at startPos
* @param otherRuleStatus The rule status index that applies to boundaries other than startPos
* @internal
*/
void populateDictionary(int startPos, int endPos,
int firstRuleStatus, int otherRuleStatus) {
if ((endPos - startPos) <= 1) {
return;
}
reset();
fFirstRuleStatusIndex = firstRuleStatus;
fOtherRuleStatusIndex = otherRuleStatus;
int rangeStart = startPos;
int rangeEnd = endPos;
int category;
int current;
int foundBreakCount = 0;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
// any breaks within the span.
fText.setIndex(rangeStart);
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.get(c);
// Categories at or above this value mark dictionary characters.
int dictStart = fRData.fFTable.fDictCategoriesStart;
while(true) {
// Skip forward over non-dictionary characters.
while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
c = CharacterIteration.next32(fText); // pre-increment
category = (short)fRData.fTrie.get(c);
}
if (current >= rangeEnd) {
break;
}
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
LanguageBreakEngine lbe = getLanguageBreakEngine(c);
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
// NOTE(review): if lbe were ever null the text pointer would not advance here;
// presumably getLanguageBreakEngine() always returns an engine - confirm.
if (lbe != null) {
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
}
// Reload the loop variables for the next go-round
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.get(c);
}
// If we found breaks, ensure that the first and last entries are
// the original starting and ending position. And initialize the
// cache iteration position to the first entry.
// System.out.printf("foundBreakCount = %d%n", foundBreakCount);
if (foundBreakCount > 0) {
assert(foundBreakCount == fBreaks.size());
if (startPos < fBreaks.elementAt(0)) {
// The dictionary did not place a boundary at the start of the segment of text.
// Add one now. This should not commonly happen, but it would be easy for interactions
// of the rules for dictionary segments and the break engine implementations to
// inadvertently cause it. Cover it here, just in case.
// NOTE(review): offer() is expected to insert at the front, since fStart is
// read back from elementAt(0) below - confirm against DequeI.
fBreaks.offer(startPos);
}
if (endPos > fBreaks.peek()) {
fBreaks.push(endPos);
}
fPositionInCache = 0;
// Note: Dictionary matching may extend beyond the original limit.
fStart = fBreaks.elementAt(0);
fLimit = fBreaks.peek();
} else {
// there were no language-based breaks, even though the segment contained
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
// for this range will fail, and the calling code will fall back to the rule based boundaries.
// (reset() above left fStart == fLimit == 0.)
}
};
/** Create an empty cache with no current iteration position. */
DictionaryCache() {
    fBreaks = new DictionaryBreakEngine.DequeI();
    fPositionInCache = -1;
}
/**
 * Copy constructor. Used by RuleBasedBreakIterator.clone().
 * The list of boundaries is cloned; all scalar state is copied as is.
 *
 * @param src the source object to be copied.
 */
DictionaryCache(DictionaryCache src) {
    fPositionInCache = src.fPositionInCache;
    fStart = src.fStart;
    fLimit = src.fLimit;
    fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
    fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
    fBoundary = src.fBoundary;
    fStatusIndex = src.fStatusIndex;
    try {
        fBreaks = (DictionaryBreakEngine.DequeI) src.fBreaks.clone();
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
}
// A data structure containing the boundaries themselves. Essentially a vector of raw ints.
DictionaryBreakEngine.DequeI fBreaks;
int fPositionInCache; // Index in fBreaks of last boundary returned by following()
// // or preceding(). Optimizes sequential access.
int fStart; // Text position of first boundary in cache.
int fLimit; // Last boundary in cache. Which is the limit of the
// // text segment being handled by the dictionary.
int fFirstRuleStatusIndex; // Rule status info for first boundary.
int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
int fBoundary; // Current boundary. Set by preceding(), following().
int fStatusIndex; // Current rule status index. Set by preceding, following().
};
/*
* class BreakCache
*
* Cache of break boundary positions and rule status values.
* Break iterator API functions, next(), previous(), etc., will use cached results
* when possible, and otherwise cache new results as they are obtained.
*
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
*
* The cache is implemented as a single circular buffer.
*/
/*
* The size of the circular cache buffer is given by the constant CACHE_SIZE,
* declared with the other fields at the end of the class.
*/
class BreakCache {
/** Create a cache in its reset state (see reset()). */
BreakCache() {
    reset();
}
/**
 * Reinitialize the cache so that it holds exactly one boundary, stored in
 * slot 0 of the buffer, and make that boundary the current position.
 *
 * @param pos the text position of the single boundary.
 * @param ruleStatus the rule status index for that boundary (stored as a short).
 */
void reset(int pos, int ruleStatus) {
    fStartBufIdx = fEndBufIdx = fBufIdx = 0;
    fBoundaries[0] = pos;
    fStatuses[0] = (short) ruleStatus;
    fTextIdx = pos;
}
/** Reinitialize the cache to hold the single boundary 0 with rule status index 0. */
void reset() {
    reset(0, 0);
}
/**
 * Advance to the next boundary and publish it to the enclosing iterator
 * (fPosition, fRuleStatusIndex). When the current position is the last cached
 * entry, more boundaries are requested via populateFollowing(), and fDone is set
 * if none could be added.
 */
void next() {
    if (fBufIdx != fEndBufIdx) {
        // Fast path: the following boundary is already cached.
        fBufIdx = modChunkSize(fBufIdx + 1);
        fPosition = fBoundaries[fBufIdx];
        fTextIdx = fPosition;
    } else {
        // At the end of the cache; try to extend it.
        fDone = !populateFollowing();
        fPosition = fTextIdx;
    }
    fRuleStatusIndex = fStatuses[fBufIdx];
}
/**
 * Move back to the preceding boundary and publish it to the enclosing iterator
 * (fPosition, fRuleStatusIndex). fDone is set when the buffer position did not
 * change, i.e. there was nothing to move back to.
 */
void previous() {
    final int bufIdxOnEntry = fBufIdx;
    if (fBufIdx != fStartBufIdx) {
        // The cache already holds the preceding boundary.
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    } else {
        // At the start of the cache. Prepend to it.
        populatePreceding();
    }
    fDone = (fBufIdx == bufIdxOnEntry);
    fPosition = fTextIdx;
    fRuleStatusIndex = fStatuses[fBufIdx];
}
/**
 * Move the iteration state to the boundary following startPos.
 * The input position must already be pinned to the input length.
 * Nothing happens if the cache cannot be positioned at or near startPos.
 */
void following(int startPos) {
    final boolean positioned =
            startPos == fTextIdx || seek(startPos) || populateNear(startPos);
    if (!positioned) {
        return;
    }
    // startPos is in the cache. Do a next() from that position.
    // TODO: an awkward set of interactions with fDone.
    //       seek() does not clear it; it can't because of interactions with populateNear().
    //       next() does not clear it in the fast-path case. Maybe it should.
    //       So clear it here, for the case where seek() succeeded on an iterator
    //       that had previously run off the end.
    fDone = false;
    next();
}
/**
 * Move the iteration state to the boundary preceding startPos.
 * Nothing happens if the cache cannot be positioned at or near startPos.
 */
void preceding(int startPos) {
    if (startPos != fTextIdx && !seek(startPos) && !populateNear(startPos)) {
        return;
    }
    if (startPos == fTextIdx) {
        // Sitting exactly on startPos: step back one boundary.
        previous();
        return;
    }
    // seek() leaves the BreakCache positioned at the preceding boundary
    // if the requested position is between two boundaries.
    // current() pushes the BreakCache position out to the BreakIterator itself.
    assert(startPos > fTextIdx);
    current();
}
/**
 * Publish the cache's current state to the enclosing break iterator:
 * copy the cached position and rule status index into fPosition and
 * fRuleStatusIndex, and clear fDone.
 *
 * @return the current cached text position.
 */
int current() {
    fDone = false;
    fRuleStatusIndex = fStatuses[fBufIdx];
    fPosition = fTextIdx;
    return fTextIdx;
}
/**
* Add boundaries to the cache near the specified position.
* The given position need not be a boundary itself.
* The input position must be within the range of the text, and
* on a code point boundary.
* If the requested position is a break boundary, leave the iteration
* position on it.
* If the requested position is not a boundary, leave the iteration
* position on the preceding boundary and include both the
* preceding and following boundaries in the cache.
* Additional boundaries, either preceding or following, may be added
* to the cache as a side effect.
*
* The caller is expected to invoke this only when the position lies outside
* the range of boundaries currently cached (see the leading assert).
*
* @param position the text position to populate the cache around.
* @return false if the operation failed, true otherwise.
*/
boolean populateNear(int position) {
assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
// If the requested position is not near already cached positions, clear the existing cache,
// find a near-by boundary and begin new cache contents there.
// Threshold for a text position to be considered near to existing cache contents.
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
// This value is subject to change. See the ticket for more details.
final int CACHE_NEAR = 15;
int startOfText = fText.getBeginIndex();
// aBoundary stays -1 only on the paths that retain the existing cache.
int aBoundary = -1;
int ruleStatusIndex = 0;
boolean retainCache = false;
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
// Requested position is near the existing cache. Retain it.
retainCache = true;
} else if (position <= startOfText + CACHE_NEAR) {
// Requested position is near the start of the text. Fill cache from start, skipping
// the need to find a safe point.
retainCache = false;
aBoundary = startOfText;
} else {
// Requested position is not near the existing cache.
// Find a safe point to refill the cache from.
int backupPos = handleSafePrevious(position);
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
// The requested position is beyond the end of the existing cache, but the
// reverse rules produced a position near or before the cached region.
// Retain the existing cache, and fill from the end of it.
retainCache = true;
} else if (backupPos < startOfText + CACHE_NEAR) {
// The safe reverse rules moved us to near the start of text.
// Take that (index 0) as the backup boundary, avoiding the complication
// (in the following block) of moving forward from the safe point to a known boundary.
//
// Retain the cache if it begins not too far from the requested position.
aBoundary = startOfText;
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
} else {
// The safe reverse rules produced a position that is neither near the existing
// cache, nor near the start of text.
// Advance to the boundary following.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
retainCache = false;
// handleNext() takes its starting point from fPosition.
fPosition = backupPos;
aBoundary = handleNext();
if (aBoundary == backupPos + 1 ||
(aBoundary == backupPos + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
aBoundary = handleNext();
}
if (aBoundary == BreakIterator.DONE) {
aBoundary = fText.getEndIndex();
}
ruleStatusIndex = fRuleStatusIndex;
}
}
if (!retainCache) {
assert(aBoundary != -1);
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
// Fill in boundaries between existing cache content and the new requested position.
if (fBoundaries[fEndBufIdx] < position) {
// The last position in the cache precedes the requested position.
// Add following position(s) to the cache.
while (fBoundaries[fEndBufIdx] < position) {
if (!populateFollowing()) {
assert false;
return false;
}
}
fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer.
fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries.
while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos.
previous();
}
return true;
}
if (fBoundaries[fStartBufIdx] > position) {
// The first position in the cache is beyond the requested position.
// back up more until we get a boundary <= the requested position.
while (fBoundaries[fStartBufIdx] > position) {
populatePreceding();
}
fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer.
fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries.
while (fTextIdx < position) { // Move forwards to a position at or following the requested pos.
next();
}
if (fTextIdx > position) {
// If position is not itself a boundary, the next() loop above will overshoot.
// Back up one, leaving cache position at the boundary preceding the requested position.
previous();
}
return true;
}
// Neither fill direction was needed: the cache already sits on the requested position.
assert fTextIdx == position;
return true;
};
/**
 * Add boundary(s) to the cache following the current last boundary.
 * The dictionary cache is consulted first; otherwise the rules are run via
 * handleNext(), and a segment containing dictionary characters is handed to the
 * dictionary cache for subdivision.
 * Leave iteration position at the first newly added boundary, or unchanged if
 * no boundary was added.
 *
 * @return false if at the end of the text, and no more boundaries can be added.
 */
boolean populateFollowing() {
    final int lastCachedPos = fBoundaries[fEndBufIdx];
    final int lastCachedStatusIdx = fStatuses[fEndBufIdx];

    // The dictionary cache may already hold the boundary we want.
    if (fDictionaryCache.following(lastCachedPos)) {
        addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
        return true;
    }

    // Otherwise run the rules forward from the last cached boundary.
    fPosition = lastCachedPos;
    int boundary = handleNext();
    if (boundary == BreakIterator.DONE) {
        return false;
    }
    final int boundaryStatusIdx = fRuleStatusIndex;

    if (fDictionaryCharCount > 0) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with subdivided results going into the dictionary cache.
        fDictionaryCache.populateDictionary(lastCachedPos, boundary, lastCachedStatusIdx, boundaryStatusIdx);
        if (fDictionaryCache.following(lastCachedPos)) {
            addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
            return true;
            // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
            //       But be careful with interactions with populateNear().
        }
    }

    // Either the rule based segment had no dictionary characters, or it did but the
    // dictionary segmenter didn't handle them (we did not return above).
    // Add the segment's end point to the cache.
    addFollowing(boundary, boundaryStatusIdx, UpdateCachePosition);

    // Add several more non-dictionary boundaries now, to optimize straightforward iteration
    // (subsequent calls to BreakIterator.next() will take the fast path, getting cached results).
    int lookaheadLeft = 6;
    while (lookaheadLeft-- > 0) {
        boundary = handleNext();
        if (boundary == BreakIterator.DONE || fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(boundary, fRuleStatusIndex, RetainCachePosition);
    }
    return true;
}
/**
* Add one or more boundaries to the cache preceding the first currently cached boundary.
* Leave the iteration position on the first added boundary.
*
* The dictionary cache is consulted first. Otherwise a boundary before the first cached
* one is located by backing up with the safe reverse rules, then the forward rules are
* run from there, collecting every boundary up to (but not including) the first cached
* boundary into fSideBuffer before moving them into the circular buffer.
*
* @return false if no boundaries could be added (if at the start of the text.)
*/
boolean populatePreceding() {
int textBegin = fText.getBeginIndex();
int fromPosition = fBoundaries[fStartBufIdx];
if (fromPosition == textBegin) {
return false;
}
int position = textBegin;
int positionStatusIdx = 0;
if (fDictionaryCache.preceding(fromPosition)) {
addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
return true;
}
int backupPosition = fromPosition;
// Find a boundary somewhere preceding the first already-cached boundary
// Each pass backs up a further 30 indexes until the boundary found is before fromPosition.
do {
backupPosition = backupPosition - 30;
if (backupPosition <= textBegin) {
backupPosition = textBegin;
} else {
backupPosition = handleSafePrevious(backupPosition);
}
if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
position = textBegin;
positionStatusIdx = 0;
} else {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = handleNext();
if (position == backupPosition + 1 ||
(position == backupPosition + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
position = handleNext();
}
positionStatusIdx = fRuleStatusIndex;
}
} while (position >= fromPosition);
// Find boundaries between the one we just located and the first already-cached boundary
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
// Entries are pushed as (position, status) pairs, so they are popped status first.
fSideBuffer.removeAllElements();
fSideBuffer.push(position);
fSideBuffer.push(positionStatusIdx);
do {
// handleNext() takes its starting point from fPosition.
int prevPosition = fPosition = position;
int prevStatusIdx = positionStatusIdx;
position = handleNext();
positionStatusIdx = fRuleStatusIndex;
if (position == BreakIterator.DONE) {
break;
}
boolean segmentHandledByDictionary = false;
if (fDictionaryCharCount != 0) {
// Segment from the rules includes dictionary characters.
// Subdivide it, with subdivided results going into the dictionary cache.
int dictSegEndPosition = position;
fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
while (fDictionaryCache.following(prevPosition)) {
position = fDictionaryCache.fBoundary;
positionStatusIdx = fDictionaryCache.fStatusIndex;
segmentHandledByDictionary = true;
assert(position > prevPosition);
if (position >= fromPosition) {
break;
}
assert(position <= dictSegEndPosition);
fSideBuffer.push(position);
fSideBuffer.push(positionStatusIdx);
prevPosition = position;
}
assert(position==dictSegEndPosition || position>=fromPosition);
}
// Boundaries from the dictionary were already pushed inside the loop above.
if (!segmentHandledByDictionary && position < fromPosition) {
fSideBuffer.push(position);
fSideBuffer.push(positionStatusIdx);
}
} while (position < fromPosition);
// Move boundaries from the side buffer to the main circular buffer.
// The first one popped (the boundary closest to fromPosition) becomes the iteration position.
boolean success = false;
if (!fSideBuffer.isEmpty()) {
positionStatusIdx = fSideBuffer.pop();
position = fSideBuffer.pop();
addPreceding(position, positionStatusIdx, UpdateCachePosition);
success = true;
}
while (!fSideBuffer.isEmpty()) {
positionStatusIdx = fSideBuffer.pop();
position = fSideBuffer.pop();
if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
// No space in circular buffer to hold a new preceding result while
// also retaining the current cache (iteration) position.
// Bailing out is safe; the cache will refill again if needed.
break;
}
}
return success;
};
// Values for the 'update' argument of addFollowing() and addPreceding():
// RetainCachePosition leaves the current iteration position (fBufIdx, fTextIdx) as it was,
// UpdateCachePosition moves it to the newly added boundary.
static final boolean RetainCachePosition = false;
static final boolean UpdateCachePosition = true;
/**
 * Append a boundary after the last cached boundary.
 * If the circular buffer is full, the start index is advanced (by 6 slots),
 * discarding the oldest entries to make room.
 * The current position can be left as it was, or changed to the newly added
 * boundary, as specified by the update parameter.
 *
 * @param position the text position of the new boundary; must be beyond the last cached one.
 * @param ruleStatusIdx the rule status index for the boundary; must fit in a short.
 * @param update UpdateCachePosition or RetainCachePosition.
 */
void addFollowing(int position, int ruleStatusIdx, boolean update) {
    assert(position > fBoundaries[fEndBufIdx]);
    assert(ruleStatusIdx <= Short.MAX_VALUE);

    final int slot = modChunkSize(fEndBufIdx + 1);
    if (slot == fStartBufIdx) {
        fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
    }
    fBoundaries[slot] = position;
    fStatuses[slot] = (short) ruleStatusIdx;
    fEndBufIdx = slot;

    if (update == RetainCachePosition) {
        // Retaining the original cache position.
        // Check that the added boundary did not wrap around the buffer and over-write it.
        // It's the responsibility of callers of this function to not add too many.
        assert(slot != fBufIdx);
        return;
    }
    // Set current position to the newly added boundary.
    fBufIdx = slot;
    fTextIdx = position;
}
/**
 * Prepend a boundary before the first cached boundary.
 * If the circular buffer is full, the last cached entry is dropped to make room,
 * unless that entry is the current iteration position and the caller asked to
 * retain it.
 * The current position can be left as it was, or changed to the newly added
 * boundary, as specified by the update parameter.
 *
 * @param position the text position of the new boundary; must precede the first cached one.
 * @param ruleStatusIdx the rule status index for the boundary; must fit in a short.
 * @param update UpdateCachePosition or RetainCachePosition.
 * @return false if the boundary could not be added without losing the retained
 *         iteration position; true otherwise.
 */
boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
    assert(position < fBoundaries[fStartBufIdx]);
    assert(ruleStatusIdx <= Short.MAX_VALUE);

    final int slot = modChunkSize(fStartBufIdx - 1);
    if (slot == fEndBufIdx) {
        // The buffer is full; the new entry would take over the slot of the last one.
        final boolean wouldLoseCurrent = (fBufIdx == fEndBufIdx) && (update == RetainCachePosition);
        if (wouldLoseCurrent) {
            // Failure. The insertion would claim the buffer slot that is the current
            // iteration position, which the caller wants to keep.
            // (The buffer is already completely full of entries that precede the iteration position.)
            return false;
        }
        fEndBufIdx = modChunkSize(fEndBufIdx - 1);
    }
    fBoundaries[slot] = position;
    fStatuses[slot] = (short) ruleStatusIdx;
    fStartBufIdx = slot;

    if (update == UpdateCachePosition) {
        fBufIdx = slot;
        fTextIdx = position;
    }
    return true;
}
/**
 * Set the cache position to the specified position, or, if the position
 * falls between two cached boundaries, to the preceding boundary.
 * Fails if the requested position is outside of the range of boundaries
 * currently held by the cache.
 * The position must be on a code point boundary.
 *
 * @param pos the text position to seek to.
 * @return true if successful, false if the specified position is after
 *         the last cached boundary or before the first.
 */
boolean seek(int pos) {
    final int firstBoundary = fBoundaries[fStartBufIdx];
    final int lastBoundary = fBoundaries[fEndBufIdx];
    if (pos < firstBoundary || pos > lastBoundary) {
        return false;
    }
    if (pos == firstBoundary) {
        // Common case: seek(0), from BreakIterator.first()
        fBufIdx = fStartBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }
    if (pos == lastBoundary) {
        fBufIdx = fEndBufIdx;
        fTextIdx = fBoundaries[fBufIdx];
        return true;
    }

    // Binary search over the circular buffer for the first boundary greater than pos.
    int lo = fStartBufIdx;
    int hi = fEndBufIdx;
    while (lo != hi) {
        int sum = lo + hi;
        if (lo > hi) {
            // The live range wraps around the end of the buffer; unwrap before halving.
            sum += CACHE_SIZE;
        }
        final int probe = modChunkSize(sum / 2);
        if (fBoundaries[probe] > pos) {
            hi = probe;
        } else {
            lo = modChunkSize(probe + 1);
        }
    }
    assert(fBoundaries[hi] > pos);

    // Step back one slot to the boundary at or before pos.
    fBufIdx = modChunkSize(hi - 1);
    fTextIdx = fBoundaries[fBufIdx];
    assert(fTextIdx <= pos);
    return true;
}
/**
 * Copy constructor, used from RuleBasedBreakIterator.clone().
 * The boundary and status arrays are cloned; the side buffer starts out empty.
 *
 * @param src the cache to copy.
 */
BreakCache(BreakCache src) {
    fBoundaries = src.fBoundaries.clone();
    fStatuses = src.fStatuses.clone();
    fStartBufIdx = src.fStartBufIdx;
    fEndBufIdx = src.fEndBufIdx;
    fBufIdx = src.fBufIdx;
    fTextIdx = src.fTextIdx;
    fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents.
}
/**
 * Debugging aid: print the current position, followed by every cached
 * (buffer index, boundary) pair from the start index through the end index.
 */
void dumpCache() {
    System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx);
    int i = fStartBufIdx;
    while (true) {
        System.out.printf("%d %d%n", i, fBoundaries[i]);
        if (i == fEndBufIdx) {
            break;
        }
        i = modChunkSize(i + 1);
    }
}
/**
 * Wrap an index into the circular buffer by masking with CACHE_SIZE - 1.
 * Relies on CACHE_SIZE being a power of two.
 */
private final int modChunkSize(int index) {
    final int mask = CACHE_SIZE - 1;
    return index & mask;
}
// Number of slots in the circular buffer.
static final int CACHE_SIZE = 128;
// static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
// Buffer index of the first cached boundary.
int fStartBufIdx;
// Buffer index of the last cached boundary.
int fEndBufIdx; // inclusive
// Text position of the current iteration boundary; kept equal to fBoundaries[fBufIdx].
int fTextIdx;
// Buffer index of the current iteration boundary.
int fBufIdx;
// Cached boundary positions, and the rule status index for each, stored at the same buffer index.
int[] fBoundaries = new int[CACHE_SIZE];
short[] fStatuses = new short[CACHE_SIZE];
// Scratch stack of (position, status) pairs used by populatePreceding().
DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI();
};
}