// com.ibm.icu.text.RuleBasedBreakIterator (the "Maven / Gradle / Ivy" heading was left over from the page hosting this listing)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2005-2016 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.concurrent.ConcurrentHashMap;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
/**
* Rule Based Break Iterator
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
*
* @stable ICU 2.0
*/
public class RuleBasedBreakIterator extends BreakIterator {
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
 * Shared base constructor, reachable only from the factories and the
 * public constructors of this class. It leaves the rule data unset and
 * registers the fallback (unhandled) break engine under the key -1.
 */
private RuleBasedBreakIterator() {
    fDictionaryCharCount = 0;
    fLastStatusIndexValid = true;
    fBreakEngines.put(-1, fUnhandledBreakEngine);
}
/**
 * Builds a break iterator from break rules that were compiled ahead of time.
 *
 * Loading precompiled rules is much faster than compiling rule source.
 * The binary form is produced by RuleBasedBreakIterator.compileRules() and is
 * not guaranteed to be compatible between different versions of ICU.
 *
 * @param is an input stream supplying the compiled binary rules.
 * @return a new iterator backed by the rules read from the stream.
 * @throws IOException if there is an error while reading the rules from the InputStream.
 * @see #compileRules(String, OutputStream)
 * @stable ICU 4.8
 */
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
    final RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
    // The whole stream is pulled into a buffer before the rule data is parsed.
    final ByteBuffer compiled = ICUBinary.getByteBufferFromInputStreamAndCloseStream(is);
    iterator.fRData = RBBIDataWrapper.get(compiled);
    return iterator;
}
/**
 * Builds a break iterator from break rules that were compiled ahead of time
 * and are already available in a buffer.
 *
 * Loading precompiled rules is much faster than compiling rule source.
 * The binary form is produced by RuleBasedBreakIterator.compileRules() and is
 * not guaranteed to be compatible between different versions of ICU.
 *
 * @param bytes a buffer supplying the compiled binary rules.
 * @return a new iterator backed by the rules held in the buffer.
 * @throws IOException if there is an error while reading the rules from the buffer.
 * @see #compileRules(String, OutputStream)
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
    final RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
    iterator.fRData = RBBIDataWrapper.get(bytes);
    return iterator;
}
/**
 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
 *
 * The rules are compiled to their binary form in memory and then loaded
 * exactly as precompiled rules would be.
 *
 * @param rules The break rules to be used.
 * @throws RuntimeException if the freshly compiled rules cannot be read back;
 *         the underlying IOException is attached as the cause.
 * @stable ICU 2.2
 */
public RuleBasedBreakIterator(String rules) {
    this();
    try {
        ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
        compileRules(rules, ruleOS);
        fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
    } catch (IOException e) {
        ///CLOVER:OFF
        // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
        // causing bogus compiled rules to be produced, but with no compile error raised.
        // Keep the original exception as the cause so its stack trace is not lost.
        throw new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                + e.getMessage(), e);
        ///CLOVER:ON
    }
}
//=======================================================================
// Boilerplate
//=======================================================================
/**
 * Clones this iterator.
 *
 * The copy receives its own clone of the text iterator (when there is one);
 * every other field is copied as-is by the superclass clone.
 *
 * @return A newly-constructed RuleBasedBreakIterator with the same
 * behavior as this one.
 * @stable ICU 2.0
 */
@Override
public Object clone()
{
    final RuleBasedBreakIterator copy = (RuleBasedBreakIterator) super.clone();
    final CharacterIterator sourceText = fText;
    if (sourceText != null) {
        copy.fText = (CharacterIterator) sourceText.clone();
    }
    return copy;
}
/**
 * Returns true if the other object is a RuleBasedBreakIterator with the same
 * rule source that iterates over equal text.
 *
 * Two iterators without rule data are treated as having the same rules; an
 * iterator with rule data never equals one without.
 *
 * @param that the object to compare with; may be null.
 * @return true if the two iterators have equal rule source and equal text.
 * @stable ICU 2.0
 */
@Override
public boolean equals(Object that) {
    if (this == that) {
        return true;
    }
    // An explicit type test (which also rejects null) replaces the former
    // cast-and-catch-ClassCastException, so an exception raised inside a
    // nested equals() is no longer silently turned into "not equal".
    if (!(that instanceof RuleBasedBreakIterator)) {
        return false;
    }
    RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;

    // Exactly one side lacking rule data means the iterators differ.
    if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
        return false;
    }
    if (fRData != null && other.fRData != null &&
            (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
        return false;
    }

    // Same rules: the result now depends only on the text being iterated.
    if (fText == null && other.fText == null) {
        return true;
    }
    if (fText == null || other.fText == null) {
        return false;
    }
    return fText.equals(other.fText);
}
/**
 * Returns the description (rules) used to create this iterator, or the
 * empty string when no rule data has been loaded.
 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
 * @stable ICU 2.0
 */
@Override
public String toString() {
    return (fRData == null) ? "" : fRData.fRuleSource;
}
/**
 * Compute a hashcode for this BreakIterator.
 *
 * The hash is derived from the rule source only. An iterator without rule
 * data hashes to 0 instead of throwing, matching equals() and toString(),
 * which both accept that state.
 *
 * @return A hash code
 * @stable ICU 2.0
 */
@Override
public int hashCode()
{
    if (fRData == null) {
        return 0;
    }
    return fRData.fRuleSource.hashCode();
}
// State-machine constants.
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
// of user text. A variable with this enum type keeps track of where we
// are. The state machine only fetches user text input while in RUN mode.
private static final int RBBI_START = 0;
private static final int RBBI_RUN = 1;
private static final int RBBI_END = 2;
/*
* The character iterator through which this BreakIterator accesses the text.
* Starts out as an iterator over the empty string. Several methods in this
* class still test it for null before use.
*/
private CharacterIterator fText = new java.text.StringCharacterIterator("");
/**
* The rule data for this BreakIterator instance. Package private.
* Left unset by the private no-argument constructor; assigned by the
* factory methods and by the String constructor.
*/
RBBIDataWrapper fRData;
/*
* Index of the Rule {tag} values for the most recent match.
* Used as an index into fRData.fStatusTable.
*/
private int fLastRuleStatusIndex;
/*
* Rule tag value valid flag.
* Some iterator operations don't intrinsically set the correct tag value.
* This flag lets us lazily compute the value if we are ever asked for it.
*/
private boolean fLastStatusIndexValid;
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set. Normal RBBI iterators don't use it, although the code
* for updating it is live. Dictionary Based break iterators (a subclass
* of us) access this field directly.
* NOTE(review): the field is declared private, so the remark about subclasses
* accessing it directly looks out of date - confirm.
* @internal
*/
private int fDictionaryCharCount;
/*
* ICU debug argument name for RBBI
*/
private static final String RBBI_DEBUG_ARG = "rbbi";
/**
* Debugging flag. Trace operation of state machine when true.
* True only when the "rbbi" debug argument is enabled and its value
* contains the text "trace".
*/
private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
/**
* What kind of break iterator this is. Set to KIND_LINE by default,
* since this produces sensible output.
*/
private int fBreakType = KIND_LINE;
/**
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
* by a dictionary but we have no dictionary implementation for them.
*/
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
* Null whenever no cache is active.
*/
private int[] fCachedBreakPositions;
/**
* if fCachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
private int fPositionInCache;
/**
 * Break engines created so far, keyed by script code. The key -1 is
 * reserved for the fallback (unhandled) engine.
 *
 * The type arguments are required: lookups on this map are assigned
 * straight to LanguageBreakEngine variables, which does not compile
 * against a raw ConcurrentHashMap.
 */
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
        new ConcurrentHashMap<Integer, LanguageBreakEngine>();
/**
 * Discards the cached dictionary break positions and zeroes the related
 * counters. Used when the text or the iteration position changes in a way
 * that makes the cache meaningless.
 */
private void reset() {
    fPositionInCache = 0;
    fDictionaryCharCount = 0;
    fCachedBreakPositions = null;
}
/**
 * Dump the contents of the state table and character classes for this break iterator.
 * For debugging only.
 * @param out destination stream; System.out is used when null.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public void dump(java.io.PrintStream out) {
    final java.io.PrintStream target = (out != null) ? out : System.out;
    this.fRData.dump(target);
}
/**
* Compile a set of source break rules into the binary state tables used
* by the break iterator engine. Creating a break iterator from precompiled
* rules is much faster than creating one from source rules.
*
* Binary break rules are not guaranteed to be compatible between different
* versions of ICU.
*
* This method only delegates to RBBIRuleBuilder.compileRules().
*
* @param rules The source form of the break rules
* @param ruleBinary An output stream to receive the compiled rules.
* @throws IOException If there is an error writing the output.
* @see #getInstanceFromCompiledRules(InputStream)
* @stable ICU 4.8
*/
public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
RBBIRuleBuilder.compileRules(rules, ruleBinary);
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Moves the iteration position to the start of the text (the
 * CharacterIterator's starting offset), discarding any cached dictionary
 * breaks and resetting the rule status to its default.
 * @return The offset of the beginning of the text, or BreakIterator.DONE
 *         when there is no text.
 * @stable ICU 2.0
 */
@Override
public int first() {
    // Drop the dictionary cache and its bookkeeping.
    fPositionInCache = 0;
    fDictionaryCharCount = 0;
    fCachedBreakPositions = null;
    // The status at the start of text is always index 0 and always valid.
    fLastStatusIndexValid = true;
    fLastRuleStatusIndex = 0;
    if (fText != null) {
        fText.first();
        return fText.getIndex();
    }
    return BreakIterator.DONE;
}
/**
 * Moves the iteration position to the end of the text (the
 * CharacterIterator's ending offset), discarding any cached dictionary breaks.
 * @return The text's past-the-end offset, or BreakIterator.DONE when there
 *         is no text.
 * @stable ICU 2.0
 */
@Override
public int last() {
    fPositionInCache = 0;
    fDictionaryCharCount = 0;
    fCachedBreakPositions = null;
    if (fText != null) {
        // The rule status for the end position is not known yet; it is
        // recomputed lazily if somebody asks for it.
        fLastStatusIndexValid = false;
        // Position explicitly on the past-the-end offset (CharacterIterator.last()
        // would stop on the final character instead), so that a loop like
        // for(p=it.last(); p!=DONE; p=it.previous()) ... works correctly.
        final int endOffset = fText.getEndIndex();
        fText.setIndex(endOffset);
        return endOffset;
    }
    fLastRuleStatusIndex = 0;
    fLastStatusIndexValid = true;
    return BreakIterator.DONE;
}
/**
 * Moves the iterator the given number of boundaries forward or backward.
 * This is equivalent to calling next() n times for a positive n, or
 * previous() -n times for a negative n; zero leaves the position unchanged.
 * @param n The number of steps to move. The sign indicates the direction
 * (negative is backwards, and positive is forwards).
 * @return The character offset of the boundary position n boundaries away from
 * the current one.
 * @stable ICU 2.0
 */
@Override
public int next(int n) {
    int boundary = current();
    for (int remaining = n; remaining > 0; --remaining) {
        boundary = next();
    }
    for (int remaining = n; remaining < 0; ++remaining) {
        boundary = previous();
    }
    return boundary;
}
/**
 * Advances the iterator to the next boundary position.
 * @return The position of the first boundary after this one.
 * @stable ICU 2.0
 */
@Override
public int next() {
    // While the dictionary cache still has an entry after the current one,
    // stepping forward is just a move to the next cached position.
    final int[] cache = fCachedBreakPositions;
    if (cache != null) {
        final int lastSlot = cache.length - 1;
        if (fPositionInCache >= lastSlot) {
            // Walked off the end of the cached range: throw the cache away.
            reset();
        } else {
            final int cached = cache[++fPositionInCache];
            fText.setIndex(cached);
            return cached;
        }
    }

    // Otherwise run the forward rules, then let the dictionary refine the
    // result if any dictionary characters were seen on the way.
    final int origin = current();
    fDictionaryCharCount = 0;
    final int ruleBreak = handleNext(fRData.fFTable);
    return (fDictionaryCharCount > 0) ? checkDictionary(origin, ruleBreak, false) : ruleBreak;
}
/**
* checkDictionary This function handles all processing of characters in
* the "dictionary" set. It will determine the appropriate
* course of action, and possibly set up a cache in the
* process.
*
* @param startPos lower text offset of the span proposed by the rule tables.
* @param endPos upper text offset of the span proposed by the rule tables.
* @param reverse true when called while iterating backwards (from previous()).
* @return the boundary to report: endPos (forward) or startPos (reverse) when
*         no dictionary breaks were found, otherwise the value produced by
*         following(startPos) / preceding(endPos) against the new cache.
*/
private int checkDictionary(int startPos, int endPos, boolean reverse) {
// Reset the old break cache first.
reset();
// note: code segment below assumes that dictionary chars are in the
// startPos-endPos range
// value returned should be next character in sequence
// A span of at most one code unit is returned as-is; the text index is
// not touched on this path.
if ((endPos - startPos) <= 1) {
return (reverse ? startPos : endPos);
}
// Starting from the starting point, scan towards the proposed result,
// looking for the first dictionary character (which may be the one
// we're on, if we're starting in the middle of a range).
fText.setIndex(reverse ? endPos : startPos);
if (reverse) {
CharacterIteration.previous32(fText);
}
int rangeStart = startPos;
int rangeEnd = endPos;
int category;
int current;
DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
int foundBreakCount = 0;
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
// Is the character we're starting on a dictionary character? If so, we
// need to back up to include the entire run; otherwise the results of
// the break algorithm will differ depending on where we start. Since
// the result is cached and there is typically a non-dictionary break
// within a small number of words, there should be little performance impact.
// (Bit 0x4000 of the trie category value is the dictionary flag tested throughout.)
if ((category & 0x4000) != 0) {
if (reverse) {
// Reverse: extend the range end forward over the whole dictionary run.
do {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
// Back up to the last dictionary character
rangeEnd = fText.getIndex();
// NOTE(review): both branches below execute the same statement.
if (c == CharacterIteration.DONE32) {
// c = fText->last32();
// TODO: why was this if needed?
c = CharacterIteration.previous32(fText);
}
else {
c = CharacterIteration.previous32(fText);
}
}
else {
// Forward: extend the range start backward over the whole dictionary run.
do {
c = CharacterIteration.previous32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
}
while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
// Back up to the last dictionary character
if (c == CharacterIteration.DONE32) {
// c = fText->first32();
c = CharacterIteration.current32(fText);
}
else {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
}
rangeStart = fText.getIndex();
}
category = (short)fRData.fTrie.getCodePointValue(c);
}
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
// any breaks within the span.
// Note: we always do this in the forward direction, so that the break
// cache is built in the right order.
if (reverse) {
fText.setIndex(rangeStart);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
}
LanguageBreakEngine lbe = null;
while(true) {
// Skip forward over non-dictionary characters.
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
}
if (current >= rangeEnd) {
break;
}
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
lbe = getLanguageBreakEngine(c);
// Ask the language object if there are any breaks. It will leave the text
// pointer on the other side of its range, ready to search for the next one.
// NOTE(review): when lbe is null nothing in this iteration advances fText,
// so the loop would revisit the same character - confirm that
// getLanguageBreakEngine() cannot return null in practice.
if (lbe != null) {
int startingIdx = fText.getIndex();
// The engine is always asked to search forward (fourth argument false),
// regardless of the "reverse" parameter.
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
assert fText.getIndex() > startingIdx;
}
// Reload the loop variables for the next go-round
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
}
// If we found breaks, build a new break cache. The first and last entries must
// be the original starting and ending position.
if (foundBreakCount > 0) {
// Diagnostic output left in place ahead of the assertion below.
if (foundBreakCount != breaks.size()) {
System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass());
}
assert foundBreakCount == breaks.size();
// Add startPos / endPos to the deque unless they are already its extremes.
// (presumably peekLast() yields the smallest collected offset and peek() the
// largest - DequeI is not visible here; verify.)
if (startPos < breaks.peekLast()) {
breaks.offer(startPos);
}
if (endPos > breaks.peek()) {
breaks.push(endPos);
}
// TODO: get rid of this array, use results from the deque directly
fCachedBreakPositions = new int[breaks.size()];
int i = 0;
// Drain the deque from its "last" end into the array, starting at index 0.
while (breaks.size() > 0) {
fCachedBreakPositions[i++] = breaks.pollLast();
}
// If there are breaks, then by definition, we are replacing the original
// proposed break by one of the breaks we found. Use following() and
// preceding() to do the work. They should never recurse in this case.
if (reverse) {
return preceding(endPos);
}
else {
return following(startPos);
}
}
// If we get here, there were no language-based breaks. Set the text pointer
// to the original proposed break.
fText.setIndex(reverse ? startPos : endPos);
return (reverse ? startPos : endPos);
}
/**
* Moves the iterator backwards, to the last boundary preceding this one.
*
* Three paths are used, in this order: the cached dictionary break positions
* (when a cache is active), the exact reverse rule table (when the rule data
* carries a safe-reverse or safe-forward table), and finally the "old rule
* syntax" path, which backs up to some earlier boundary and walks forward
* again with next().
*
* @return The position of the last boundary position preceding this one, or
*         BreakIterator.DONE when already at the beginning of the text.
* @stable ICU 2.0
*/
@Override
public int previous() {
int result;
int startPos;
CharacterIterator text = getText();
// The status of the boundary we are about to land on is unknown until computed.
fLastStatusIndexValid = false;
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
if (fCachedBreakPositions != null) {
if (fPositionInCache > 0) {
--fPositionInCache;
// If we're at the beginning of the cache, need to reevaluate the
// rule status
if (fPositionInCache <= 0) {
fLastStatusIndexValid = false;
}
int pos = fCachedBreakPositions[fPositionInCache];
text.setIndex(pos);
return pos;
} else {
// Already on the first cached position: discard the cache and fall
// through to the rule-based paths.
reset();
}
}
// if we're already sitting at the beginning of the text, return DONE
startPos = current();
if (fText == null || startPos == fText.getBeginIndex()) {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
return BreakIterator.DONE;
}
// Rules with an exact reverse table are handled here.
if (fRData.fSRTable != null || fRData.fSFTable != null) {
result = handlePrevious(fRData.fRTable);
// NOTE(review): unlike next(), fDictionaryCharCount is not zeroed here
// before the table is run - presumably handlePrevious() maintains it; verify.
if (fDictionaryCharCount > 0) {
result = checkDictionary(result, startPos, true);
}
return result;
}
// old rule syntax
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
// iterator up one step to prevent handlePrevious() from returning
// the current position), but not necessarily the last one before
// where we started
int start = current();
previous32(fText);
int lastResult = handlePrevious(fRData.fRTable);
if (lastResult == BreakIterator.DONE) {
// No earlier boundary was found: restart from the beginning of the text.
lastResult = fText.getBeginIndex();
fText.setIndex(lastResult);
}
result = lastResult;
int lastTag = 0;
boolean breakTagValid = false;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
for (;;) {
result = next();
if (result == BreakIterator.DONE || result >= start) {
break;
}
lastResult = result;
lastTag = fLastRuleStatusIndex;
breakTagValid = true;
}
// fLastBreakTag wants to have the value for section of text preceding
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// tag value. But, if handlePrevious() took us to exactly the correct result position,
// we wont have a tag value for that position, which is only set by handleNext().
// Set the current iteration position to be the last break position
// before where we started, and then return that value.
fText.setIndex(lastResult);
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
fLastStatusIndexValid = breakTagValid;
return lastResult;
}
/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 *
 * When an active dictionary cache covers the offset, the answer is read
 * straight from the cache; otherwise the cache is dropped and the rule
 * tables are consulted.
 *
 * @param offset The position from which to begin searching for a break position.
 * @return The position of the first break after the current position.
 * @stable ICU 2.0
 */
@Override
public int following(int offset) {
    final CharacterIterator iter = getText();
    final int[] cache = fCachedBreakPositions;

    // No cache, or the offset lies outside [first cached, last cached):
    // forget the cache and let the rule-based search do the work (it may
    // build a fresh cache along the way).
    if (cache == null || offset < cache[0] || offset >= cache[cache.length - 1]) {
        fCachedBreakPositions = null;
        return rulesFollowing(offset);
    }

    // The offset is inside the cached range: pick the first cached
    // position that is strictly greater than it.
    int slot = 0;
    while (slot < cache.length && offset >= cache[slot]) {
        ++slot;
    }
    fPositionInCache = slot;
    iter.setIndex(cache[slot]);
    return iter.getIndex();
}
/**
* Rule-table implementation of following(): finds the first boundary after
* "offset" without consulting the dictionary break cache directly.
*
* Depending on which tables the rule data provides, one of three strategies
* is used: safe-reverse table, safe-forward table, or the old-syntax fallback
* built on previous() and next().
*
* @param offset the text offset to search from.
* @return the first boundary after offset; for offsets before the start of
*         the text, the result of first(); for offsets at or past the end
*         (or with no text), the result of last() followed by next().
*/
private int rulesFollowing(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
// (the past-the-end case is implemented as last() followed by next();
// presumably next() then yields DONE - verify)
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
if (fText == null || offset >= fText.getEndIndex()) {
last();
return next();
}
else if (offset < fText.getBeginIndex()) {
return first();
}
// otherwise, set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
int result = 0;
if (fRData.fSRTable != null) {
// Safe Point Reverse rules exist.
// This allows us to use the optimum algorithm.
fText.setIndex(offset);
// move forward one codepoint to prepare for moving back to a
// safe point.
// this handles offset being between a supplementary character
next32(fText);
// handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fRData.fSRTable);
// Walk forward from the safe point until a boundary beyond offset is reached.
result = next();
while (result <= offset) {
result = next();
}
return result;
}
if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table.
//
fText.setIndex(offset);
previous32(fText);
// handle next will give result >= offset
handleNext(fRData.fSFTable);
// previous will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
// Step backwards while still beyond offset; the last boundary seen that was
// greater than offset is the answer.
int oldresult = previous();
while (oldresult > offset) {
result = previous();
if (result <= offset) {
return oldresult;
}
oldresult = result;
}
// previous() landed at or before offset straight away: move forward again,
// at most twice, to get past offset.
result = next();
if (result <= offset) {
return next();
}
return result;
}
// otherwise, we have to sync up first. Use handlePrevious() to back
// us up to a known break position before the specified position (if
// we can determine that the specified position is a break position,
// we don't back up at all). This may or may not be the last break
// position at or before our starting position. Advance forward
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
// old rule syntax
fText.setIndex(offset);
if (offset == fText.getBeginIndex()) {
return next();
}
result = previous();
while (result != BreakIterator.DONE && result <= offset) {
result = next();
}
return result;
}
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 *
 * When an active dictionary cache covers the offset, the answer is read
 * straight from the cache; otherwise the cache is dropped and the rule
 * tables are consulted.
 *
 * @param offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 * @stable ICU 2.0
 */
@Override
public int preceding(int offset) {
    final CharacterIterator iter = getText();
    final int[] cache = fCachedBreakPositions;

    // No cache, or the offset lies outside (first cached, last cached]:
    // forget the cache and let the rule-based search do the work (it may
    // build a fresh cache along the way).
    if (cache == null || offset <= cache[0] || offset > cache[cache.length - 1]) {
        fCachedBreakPositions = null;
        return rulesPreceding(offset);
    }

    // The offset is inside the cached range: find the first cached position
    // that is not smaller than it, then step back one slot.
    int slot = 0;
    while (slot < cache.length && offset > cache[slot]) {
        ++slot;
    }
    fPositionInCache = slot - 1;
    iter.setIndex(cache[fPositionInCache]);
    return iter.getIndex();
}
/**
* Rule-table implementation of preceding(): finds the last boundary before
* "offset" without consulting the dictionary break cache directly.
*
* Depending on which tables the rule data provides, one of three strategies
* is used: safe-forward table, safe-reverse table, or the old-syntax fallback
* that simply positions the text at offset and calls previous().
*
* @param offset the text offset to search from.
* @return the last boundary before offset; the result of last() when offset
*         is past the end of the text (or there is no text); the result of
*         first() when offset is before the beginning.
*/
private int rulesPreceding(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
// (as written, the past-the-end case returns last() rather than DONE)
if (fText == null || offset > fText.getEndIndex()) {
// return BreakIterator::DONE;
return last();
}
else if (offset < fText.getBeginIndex()) {
return first();
}
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
int result;
if (fRData.fSFTable != null) {
/// todo synwee
// new rule syntax
fText.setIndex(offset);
// move backwards one codepoint to prepare for moving forwards to a
// safe point.
// this handles offset being between a supplementary character
previous32(fText);
handleNext(fRData.fSFTable);
// Walk backwards from the safe point until a boundary before offset is reached.
result = previous();
while (result >= offset) {
result = previous();
}
return result;
}
if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available
fText.setIndex(offset);
next32(fText);
// handle previous will give result <= offset
handlePrevious(fRData.fSRTable);
// next will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
// Step forwards while still before offset; the last boundary seen that was
// smaller than offset is the answer.
int oldresult = next();
while (oldresult < offset) {
result = next();
if (result >= offset) {
return oldresult;
}
oldresult = result;
}
// next() landed at or after offset straight away: move back again,
// at most twice, to get before offset.
result = previous();
if (result >= offset) {
return previous();
}
return result;
}
// old rule syntax
fText.setIndex(offset);
return previous();
}
/**
 * Throw IllegalArgumentException unless begin &lt;= offset &lt;= end, where
 * begin and end are the text's begin and end indexes. The end index itself
 * is accepted.
 * @param offset the offset to validate.
 * @param text the text whose bounds are checked against.
 * @stable ICU 2.0
 */
protected static final void checkOffset(int offset, CharacterIterator text) {
    final boolean withinText = text.getBeginIndex() <= offset && offset <= text.getEndIndex();
    if (!withinText) {
        throw new IllegalArgumentException("offset out of bounds");
    }
}
/**
 * Returns true if the specified position is a boundary position. As a side
 * effect, leaves the iterator pointing to the first boundary position at
 * or after "offset".
 * @param offset the offset to check.
 * @return True if "offset" is a boundary position.
 * @throws IllegalArgumentException if offset lies outside the text.
 * @stable ICU 2.0
 */
@Override
public boolean isBoundary(int offset) {
    checkOffset(offset, fText);

    // Both ends of the text are boundaries by definition. first() / last()
    // are called for their side effects on the position and tag values.
    if (fText.getBeginIndex() == offset) {
        first();
        return true;
    }
    if (fText.getEndIndex() == offset) {
        last();
        return true;
    }

    // Otherwise step back one code point from offset and ask for the next
    // boundary after that spot; offset is a boundary exactly when that
    // search lands on it again.
    // TODO: check whether it is safe to revert to the simpler
    // following(offset - 1) == offset form.
    // The safe rules may take care of unpaired surrogates ok.
    fText.setIndex(offset);
    previous32(fText);
    return offset == following(fText.getIndex());
}
/**
 * Returns the current iteration position.
 * @return The current iteration position, or BreakIterator.DONE when there
 *         is no text.
 * @stable ICU 2.0
 */
@Override
public int current() {
    if (fText == null) {
        return BreakIterator.DONE;
    }
    return fText.getIndex();
}
/**
 * Ensures fLastRuleStatusIndex describes the current position. Does nothing
 * when the cached status is already valid; otherwise the status is either
 * set to zero (start of text / no text) or recomputed by iterating forward
 * from the beginning of the text to the current position.
 */
private void makeRuleStatusValid() {
    if (fLastStatusIndexValid) {
        return;
    }
    // No cached status is available.
    final int here = current();
    if (here == BreakIterator.DONE || here == fText.getBeginIndex()) {
        // At start of text, or there is no text. Status is always zero.
        fLastRuleStatusIndex = 0;
        fLastStatusIndexValid = true;
    } else {
        // Not at start of text. Find status the tedious way: restart at the
        // beginning and step forward until the original position is reached.
        final int target = fText.getIndex();
        first();
        int reached = current();
        while (fText.getIndex() < target) {
            reached = next();
        }
        Assert.assrt(target == reached);
    }
    Assert.assrt(fLastStatusIndexValid == true);
    Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
}
/**
 * Return the status tag from the break rule that determined the most recently
 * returned break position. The values appear in the rule source
 * within brackets, {123}, for example. For rules that do not specify a
 * status, a default value of 0 is returned. If more than one rule applies,
 * the numerically largest of the possible status values is returned.
 *
 * Of the standard types of ICU break iterators, only the word break
 * iterator provides status values. The values are defined in
 * class RuleBasedBreakIterator, and allow distinguishing between words
 * that contain alphabetic letters, "words" that appear to be numbers,
 * punctuation and spaces, words containing ideographic characters, and
 * more. Call {@code getRuleStatus} after obtaining a boundary
 * position from {@code next()}, {@code previous()}, or
 * any other break iterator function that returns a boundary position.
 *
 * @return the status from the break rule that determined the most recently
 * returned break position.
 *
 * @draft ICU 3.0 (retain)
 * @provisional This is a draft API and might change in a future release of ICU.
 */
@Override
public int getRuleStatus() {
    makeRuleStatusValid();
    // A status record is laid out as
    //     [count N][val 0][val 1] ... [val N-1]
    // with fLastRuleStatusIndex pointing at the count. The values are sorted
    // in ascending order, so the entry N slots past the count is the largest
    // one, which is what this method reports.
    final int valueCount = fRData.fStatusTable[fLastRuleStatusIndex];
    return fRData.fStatusTable[fLastRuleStatusIndex + valueCount];
}
/**
* Get the status (tag) values from the break rule(s) that determined the most
* recently returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
*
* The status values used by the standard ICU break rules are defined
* as public constants in class RuleBasedBreakIterator.
*
* If the size of the output array is insufficient to hold the data,
* the output will be truncated to the available length. No exception
* will be thrown.
*
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* In the event that the array is too small, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @draft ICU 3.0 (retain)
* @provisional This is a draft API and might change in a future release of ICU.
*/
@Override
public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid();
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i engine.
script = UScript.HAN;
}
LanguageBreakEngine eng = fBreakEngines.get(script);
/*
if (eng != null && !eng.handles(c, fBreakType)) {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
} else */ {
try {
switch (script) {
case UScript.THAI:
eng = new ThaiBreakEngine();
break;
case UScript.LAO:
eng = new LaoBreakEngine();
break;
case UScript.MYANMAR:
eng = new BurmeseBreakEngine();
break;
case UScript.KHMER:
eng = new KhmerBreakEngine();
break;
case UScript.HAN:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(false);
}
else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
}
break;
case UScript.HANGUL:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(true);
} else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
}
break;
default:
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
break;
}
} catch (IOException e) {
eng = null;
}
}
if (eng != null && eng != fUnhandledBreakEngine) {
LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
if (existingEngine != null) {
// There was a race & another thread was first to register an engine for this script.
// Use theirs and discard the one we just created.
eng = existingEngine;
}
// assert eng.handles(c, fBreakType);
}
return eng;
}
private static final int kMaxLookaheads = 8;
private static class LookAheadResults {
int fUsedSlotLimit;
int[] fPositions;
int[] fKeys;
LookAheadResults() {
fUsedSlotLimit= 0;
fPositions = new int[kMaxLookaheads];
fKeys = new int[kMaxLookaheads];
}
int getPosition(int key) {
for (int i=0; i= kMaxLookaheads) {
assert(false);
i = kMaxLookaheads - 1;
}
fKeys[i] = key;
fPositions[i] = position;
assert(fUsedSlotLimit == i);
fUsedSlotLimit = i + 1;
}
void reset() {
fUsedSlotLimit = 0;
}
};
private LookAheadResults fLookAheadMatches = new LookAheadResults();
/**
* The State Machine Engine for moving forward is here.
* This function is the heart of the RBBI run time engine.
*
* Starting at the current index of fText, code points are fed one at a time
* through the supplied state table until the machine reaches STOP_STATE or the
* input is exhausted (after one extra pass with the end-of-input category).
* The position of the last accepting state seen, or of a completed look-ahead
* rule, becomes the result, and fText is left positioned there.
*
* Side effects visible in this method: fLastStatusIndexValid is set to true,
* fLastRuleStatusIndex is set to the tag index of the accepting row (or 0),
* fLookAheadMatches is reset and refilled, and fDictionaryCharCount is
* incremented for every character whose category carries the 0x4000 flag.
*
* @param stateTable the state transition table to run.
* @return the new iterator position, or BreakIterator.DONE when there is no
* character to start from. If the machine never reaches an accepting
* state past the starting position, the result is the position one
* code point after the start.
*
* A note on supplementary characters and the position of underlying
* Java CharacterIterator: Normally, a character iterator is positioned at
* the char most recently returned by next(). Within this function, when
* a supplementary char is being processed, the char iterator is left
* sitting on the trail surrogate, in the middle of the code point.
* This is different from everywhere else, where an iterator always
* points at the lead surrogate of a supplementary.
*/
private int handleNext(short stateTable[]) {
if (TRACE) {
System.out.println("Handle Next pos char state category");
}
// No matter what, handleNext always correctly sets the break tag value.
fLastStatusIndexValid = true;
fLastRuleStatusIndex = 0;
// caches for quicker access
CharacterIterator text = fText;
CharTrie trie = fRData.fTrie;
// Set up the starting char.
// Anything at or above the lead-surrogate range is passed through nextTrail32();
// presumably that combines a surrogate pair into one code point and maps the
// iterator's end marker to DONE32 -- confirm in CharacterIteration.
int c = text.current();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
if (c == DONE32) {
// Nothing to process at the starting position.
return BreakIterator.DONE;
}
}
int initialPosition = text.getIndex();
int result = initialPosition;
// Set the initial state for the state machine
int state = START_STATE;
int row = fRData.getRowIndex(state);
short category = 3;
int flagsState = fRData.getStateTableFlags(stateTable);
int mode = RBBI_RUN;
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
// The table wants a beginning-of-input step first: the first loop pass uses
// category 2 and does not consume a character (see the final else branch
// of the mode handling inside the loop).
category = 2;
mode = RBBI_START;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
}
fLookAheadMatches.reset();
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
break;
}
// Run the loop one last time with the fake end-of-input character category
mode = RBBI_END;
category = 1;
}
else if (mode == RBBI_RUN) {
// Get the char category. This branch is skipped when the category has been
// preset to 1 or 2 for the end or the beginning of input, because in those
// cases the category must not come from an actual text input character.
//
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) trie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// Advance to the next character. From here on c is the character that
// FOLLOWS the one whose category is about to drive the transition, and
// the iterator index is the boundary position after the consumed character.
c = text.next();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(text, c);
}
}
else {
// Beginning-of-input pass (mode was RBBI_START): do not advance.
// The next iteration will be processing the first real input character.
mode = RBBI_RUN;
}
// look up a state transition in the state table
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case
result = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the start of it.
result--;
}
// Remember the break status (tag) values.
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
}
// A positive ACCEPTING value is used as a key into fLookAheadMatches.
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
// The boundary is the position recorded earlier for this rule, not the
// current position; return immediately without finishing the loop.
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
text.setIndex(lookaheadResult);
return lookaheadResult;
}
}
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
pos--;
}
fLookAheadMatches.setPosition(rule, pos);
}
} // End of state machine main loop
// The state machine is done. Check whether it found a match...
// If the iterator failed to advance in the match engine force it ahead by one.
// This indicates a defect in the break rules, which should always match
// at least one character.
if (result == initialPosition) {
if (TRACE) {
System.out.println("Iterator did not move. Advancing by 1.");
}
text.setIndex(initialPosition);
next32(text);
result = text.getIndex();
}
else {
// Leave the iterator at our result position.
// (we may have advanced beyond the last accepting position chasing after
// longer matches that never completed.)
text.setIndex(result);
}
if (TRACE) {
System.out.println("result = " + result);
}
return result;
}
/**
 * The State Machine Engine for moving backward.
 *
 * Starting at the current index of fText, code points are read backwards with
 * previous32() and fed through the supplied state table until the machine
 * reaches STOP_STATE, a look-ahead rule completes, or the start of the text
 * has been processed. fText is left positioned at the returned index.
 *
 * This method does not compute a rule status: it sets fLastStatusIndexValid to
 * false and fLastRuleStatusIndex to 0. It also resets fLookAheadMatches and
 * increments fDictionaryCharCount for every character whose category carries
 * the 0x4000 flag.
 *
 * @param stateTable the state transition table to run.
 * @return the new iterator position, or 0 if there is no text or no state
 *         table. If the machine finds no position other than the starting
 *         one, the result is the position one code point before the start.
 */
private int handlePrevious(short stateTable[]) {
    if (fText == null || stateTable == null) {
        return 0;
    }

    fLookAheadMatches.reset();

    // handlePrevious() never gets the rule status.
    // Flag the status as invalid; if the user ever asks for status, we will need
    // to back up, then re-find the break position using handleNext(), which does
    // get the status value.
    fLastStatusIndexValid = false;
    fLastRuleStatusIndex = 0;

    // Set up the starting char.
    final int initialPosition = fText.getIndex();
    int result = initialPosition;
    int c = previous32(fText);

    // Set up the initial state for the state machine.
    int state = START_STATE;
    int row = fRData.getRowIndex(state);
    int category = 3; // TODO: obsolete? from the old start/run mode scheme?
    int mode = RBBI_RUN;
    if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
        // First pass uses the preset category 2 and does not consume a character.
        category = 2;
        mode = RBBI_START;
    }
    if (TRACE) {
        System.out.println("Handle Prev pos char state category ");
    }

    // Loop until we reach the beginning of the text or transition to state 0.
    mainLoop: for (;;) {
        if (c == DONE32) {
            // Ran off the start of the input.
            if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
                // Either this is the old (ICU 3.2 and earlier) format data which
                // does not support matching {eof}, or we have already done the
                // {eof} iteration. Bail out unconditionally. If no match was
                // found, the fallback after the loop moves one position towards
                // the start, so nothing needs to be repositioned here.
                break mainLoop;
            }
            // Run the loop one last time with the end-of-input category.
            mode = RBBI_END;
            category = 1;
        }

        if (mode == RBBI_RUN) {
            // Look up the current character's category, which tells us
            // which column in the state table to look at.
            category = (short) fRData.fTrie.getCodePointValue(c);

            // Check the dictionary bit in the character's category.
            // Counter is only used by dictionary based iterators (subclasses).
            // Chars that need to be handled by a dictionary have a flag bit set
            // in their category values.
            if ((category & 0x4000) != 0) {
                fDictionaryCharCount++;
                // And off the dictionary flag bit.
                category &= ~0x4000;
            }
        }

        if (TRACE) {
            System.out.print(" " + fText.getIndex() + " ");
            if (0x20 <= c && c < 0x7f) {
                // Printable ASCII: show the character itself, not its numeric value.
                System.out.print(" " + (char) c + " ");
            } else {
                System.out.print(" " + Integer.toHexString(c) + " ");
            }
            System.out.println(" " + state + " " + category + " ");
        }

        // State transition - move machine to its next state.
        state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
        row = fRData.getRowIndex(state);

        if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
            // Match found, common case. There could still be a look-ahead
            // to check, so keep going.
            result = fText.getIndex();
        }

        // A positive ACCEPTING value is used as a key into fLookAheadMatches.
        int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
        if (completedRule > 0) {
            // Lookahead match is completed.
            int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
            if (lookaheadResult >= 0) {
                result = lookaheadResult;
                break mainLoop;
            }
        }

        int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
        if (rule != 0) {
            // At the position of a '/' in a look-ahead match. Record it.
            fLookAheadMatches.setPosition(rule, fText.getIndex());
        }

        if (state == STOP_STATE) {
            // Normal loop exit is here.
            break mainLoop;
        }

        // Then move the iterator position backwards one character, unless this
        // pass was a beginning- or end-of-input pseudo step.
        if (mode == RBBI_RUN) {
            c = previous32(fText);
        } else if (mode == RBBI_START) {
            mode = RBBI_RUN;
        }
    } // End of the main loop.

    // The state machine is done. Check whether it found a match...
    //
    // If the iterator failed to move in the match engine, force it back by one.
    // (This really indicates a defect in the break rules. They should always match
    // at least one character.)
    if (result == initialPosition) {
        fText.setIndex(initialPosition);
        previous32(fText);
        result = fText.getIndex();
    }

    fText.setIndex(result);
    if (TRACE) {
        System.out.println("Result = " + result);
    }
    return result;
}
}