com.ibm.icu.text.RuleBasedBreakIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of com.liferay.commerce.currency.service
Liferay Commerce Currency Service
There is a newer version: 4.0.52
/*
 *******************************************************************************
 * Copyright (C) 2005-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import static com.ibm.icu.impl.CharacterIteration.previous32;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.concurrent.ConcurrentHashMap;

import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;

/**
 * Rule Based Break Iterator
 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * @stable ICU 2.0
 */
public class RuleBasedBreakIterator extends BreakIterator {
    //=======================================================================
    // Constructors & Factories
    //=======================================================================

    /**
     * private constructor
     */
    private RuleBasedBreakIterator() {
        fLastStatusIndexValid = true;
        fDictionaryCharCount  = 0;
        fBreakEngines.put(-1, fUnhandledBreakEngine);
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * Creating a break iterator from the binary rules is much faster than
     * creating one from source rules.
     *
     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between
     * different versions of ICU.
     *
     * @param is an input stream supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the InputStream.
     * @see    #compileRules(String, OutputStream)
     * @stable ICU 4.8
     */
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
        return This;
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * Creating a break iterator from the binary rules is much faster than
     * creating one from source rules.
     *
     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between
     * different versions of ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see    #compileRules(String, OutputStream)
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(bytes);
        return This;
    }

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
     * @param rules The break rules to be used.
     * @stable ICU 2.2
     */
    public RuleBasedBreakIterator(String rules)  {
        this();
        try {
            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
            compileRules(rules, ruleOS);
            fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
        } catch (IOException e) {
            ///CLOVER:OFF
            // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
            //  causing bogus compiled rules to be produced, but with no compile error raised.
            RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                    + e.getMessage());
            throw rte;
            ///CLOVER:ON
        }
    }

    //=======================================================================
    // Boilerplate
    //=======================================================================

    /**
     * Clones this iterator.
     * @return A newly-constructed RuleBasedBreakIterator with the same
     * behavior as this one.
     * @stable ICU 2.0
     */
    public Object clone()
    {
        RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
        if (fText != null) {
            result.fText = (CharacterIterator)(fText.clone());
        }
        return result;
    }

    /**
     * Returns true if both BreakIterators are of the same class, have the same
     * rules, and iterate over the same text.
     * @stable ICU 2.0
     */
    public boolean equals(Object that) {
        if (that == null) {
            return false;
        }
        if (this == that) {
            return true;
        }
        try {
            RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
            if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
                return false;
            }
            if (fRData != null && other.fRData != null &&
                    (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
                return false;
            }
            if (fText == null && other.fText == null) {
                return true;
            }
            if (fText == null || other.fText == null) {
                return false;
            }
            return fText.equals(other.fText);
        }
        catch(ClassCastException e) {
            return false;
        }
     }

    /**
     * Returns the description (rules) used to create this iterator.
     * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
     * @stable ICU 2.0
     */
    public String toString() {
        String retStr = "";
        if (fRData != null) {
            retStr =  fRData.fRuleSource;
        }
        return retStr;
    }

    /**
     * Compute a hashcode for this BreakIterator
     * @return A hash code
     * @stable ICU 2.0
     */
    public int hashCode()
    {
        return fRData.fRuleSource.hashCode();
    }


    private static final int  START_STATE = 1;     // The state number of the starting state
    private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"

    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
    //               of user text.  A variable with this enum type keeps track of where we
    //               are.  The state machine only fetches user text input while in RUN mode.
    private static final int  RBBI_START  = 0;
    private static final int  RBBI_RUN    = 1;
    private static final int  RBBI_END    = 2;

    /*
     * The character iterator through which this BreakIterator accesses the text.
     */
    private CharacterIterator   fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance. Package private.
     */
    RBBIDataWrapper             fRData;

    /*
     * Index of the Rule {tag} values for the most recent match.
     */
    private int                 fLastRuleStatusIndex;

    /*
     * Rule tag value valid flag.
     * Some iterator operations don't intrinsically set the correct tag value.
     * This flag lets us lazily compute the value if we are ever asked for it.
     */
    private boolean             fLastStatusIndexValid;

    /**
     * Counter for the number of characters encountered with the "dictionary"
     *   flag set.  Normal RBBI iterators don't use it, although the code
     *   for updating it is live.  Dictionary Based break iterators (a subclass
     *   of us) access this field directly.
     * @internal
     */
    private int fDictionaryCharCount;

    /*
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag.  Trace operation of state machine when true.
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * What kind of break iterator this is. Set to KIND_LINE by default,
     * since this produces sensible output.
     */
    private int fBreakType = KIND_LINE;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
     */
    private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();

    /**
     * when a range of characters is divided up using the dictionary, the break
     * positions that are discovered are stored here, preventing us from having
     * to use either the dictionary or the state table again until the iterator
     * leaves this range of text
     */
    private int[] fCachedBreakPositions;

    /**
     * if fCachedBreakPositions is not null, this indicates which item in the
     * cache the current iteration position refers to
     */
    private int fPositionInCache;


    private final ConcurrentHashMap fBreakEngines =
            new ConcurrentHashMap();
    /**
     * Dumps caches and performs other actions associated with a complete change
     * in text or iteration position.
     */
    private void reset() {
        fCachedBreakPositions = null;
        // fNumCachedBreakPositions = 0;
        fDictionaryCharCount = 0;
        fPositionInCache = 0;

    }
    /**
     * Dump the contents of the state table and character classes for this break iterator.
     * For debugging only.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public void dump() {
        this.fRData.dump();
    }

    /**
     * Compile a set of source break rules into the binary state tables used
     * by the break iterator engine.  Creating a break iterator from precompiled
     * rules is much faster than creating one from source rules.
     *
     * Binary break rules are not guaranteed to be compatible between different
     * versions of ICU.
     *
     *
     * @param rules  The source form of the break rules
     * @param ruleBinary  An output stream to receive the compiled rules.
     * @throws IOException If there is an error writing the output.
     * @see #getInstanceFromCompiledRules(InputStream)
     * @stable ICU 4.8
     */
    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
     * @stable ICU 2.0
     */
    public int first() {
        fCachedBreakPositions = null;
        fDictionaryCharCount = 0;
        fPositionInCache = 0;
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        if (fText == null) {
            return BreakIterator.DONE;
        }
        fText.first();
        return fText.getIndex();
    }

    /**
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     * @return The text's past-the-end offset.
     * @stable ICU 2.0
     */
    public int last() {
        fCachedBreakPositions = null;
        fDictionaryCharCount = 0;
        fPositionInCache = 0;

        if (fText == null) {
            fLastRuleStatusIndex  = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        // t.last() returns the offset of the last character,
        // rather than the past-the-end offset
        // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
        // will work correctly.
        fLastStatusIndexValid = false;
        int pos = fText.getEndIndex();
        fText.setIndex(pos);
        return pos;
    }

    /**
     * Advances the iterator either forward or backward the specified number of steps.
     * Negative values move backward, and positive values move forward.  This is
     * equivalent to repeatedly calling next() or previous().
     * @param n The number of steps to move.  The sign indicates the direction
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one.
     * @stable ICU 2.0
     */
    public int next(int n) {
        int result = current();
        while (n > 0) {
            result = next();
            --n;
        }
        while (n < 0) {
            result = previous();
            ++n;
        }
        return result;
    }

    /**
     * Advances the iterator to the next boundary position.
     * @return The position of the first boundary after this one.
     * @stable ICU 2.0
     */
    public int next() {
        // if we have cached break positions and we're still in the range
        // covered by them, just move one step forward in the cache
        if (fCachedBreakPositions != null) {
            if (fPositionInCache < fCachedBreakPositions.length - 1) {
                ++fPositionInCache;
                int pos = fCachedBreakPositions[fPositionInCache];
                fText.setIndex(pos);
                return pos;
            }
            else {
                reset();
            }
        }

        int startPos = current();
        fDictionaryCharCount = 0;
        int result = handleNext(fRData.fFTable);
        if (fDictionaryCharCount > 0) {
            result = checkDictionary(startPos, result, false);
        }
        return result;
    }

    /**
      *  checkDictionary      This function handles all processing of characters in
      *                       the "dictionary" set. It will determine the appropriate
      *                       course of action, and possibly set up a cache in the
      *                       process.
      */
    private int checkDictionary(int startPos, int endPos, boolean reverse) {

        // Reset the old break cache first.
        reset();

        // note: code segment below assumes that dictionary chars are in the
        // startPos-endPos range
        // value returned should be next character in sequence
        if ((endPos - startPos) <= 1) {
            return (reverse ? startPos : endPos);
        }

        // Starting from the starting point, scan towards the proposed result,
        // looking for the first dictionary character (which may be the one
        // we're on, if we're starting in the middle of a range).
        fText.setIndex(reverse ? endPos : startPos);
        if (reverse) {
            CharacterIteration.previous32(fText);
        }

        int  rangeStart = startPos;
        int  rangeEnd = endPos;

        int    category;
        int    current;
        DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
        int     foundBreakCount = 0;
        int     c = CharacterIteration.current32(fText);
        category = (short)fRData.fTrie.getCodePointValue(c);

        // Is the character we're starting on a dictionary character? If so, we
        // need to back up to include the entire run; otherwise the results of
        // the break algorithm will differ depending on where we start. Since
        // the result is cached and there is typically a non-dictionary break
        // within a small number of words, there should be little performance impact.
        if ((category & 0x4000) != 0) {
            if (reverse) {
                do {
                    CharacterIteration.next32(fText);
                    c = CharacterIteration.current32(fText);
                    category = (short)fRData.fTrie.getCodePointValue(c);
                } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);

                // Back up to the last dictionary character
                rangeEnd = fText.getIndex();
                if (c == CharacterIteration.DONE32) {
                    // c = fText->last32();
                    //   TODO:  why was this if needed?
                    c = CharacterIteration.previous32(fText);
                }
                else {
                    c = CharacterIteration.previous32(fText);
                }
            }
            else {
                do {
                    c = CharacterIteration.previous32(fText);
                    category = (short)fRData.fTrie.getCodePointValue(c);
                }
                while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
                // Back up to the last dictionary character
                if (c == CharacterIteration.DONE32) {
                    // c = fText->first32();
                    c = CharacterIteration.current32(fText);
                }
                else {
                    CharacterIteration.next32(fText);
                    c = CharacterIteration.current32(fText);
                }
                rangeStart = fText.getIndex();
            }
            category = (short)fRData.fTrie.getCodePointValue(c);
        }


        // Loop through the text, looking for ranges of dictionary characters.
        // For each span, find the appropriate break engine, and ask it to find
        // any breaks within the span.
        // Note: we always do this in the forward direction, so that the break
        // cache is built in the right order.
        if (reverse) {
            fText.setIndex(rangeStart);
            c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.getCodePointValue(c);
        }
        LanguageBreakEngine lbe = null;
        while(true) {
            while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
                CharacterIteration.next32(fText);
                c = CharacterIteration.current32(fText);
                category = (short)fRData.fTrie.getCodePointValue(c);
            }
            if (current >= rangeEnd) {
                break;
            }

            // We now have a dictionary character. Get the appropriate language object
            // to deal with it.
            lbe = getLanguageBreakEngine(c);

            // Ask the language object if there are any breaks. It will leave the text
            // pointer on the other side of its range, ready to search for the next one.
            if (lbe != null) {
                int startingIdx = fText.getIndex();
                foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
                assert fText.getIndex() > startingIdx;
            }

            // Reload the loop variables for the next go-round
            c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.getCodePointValue(c);
        }

        // If we found breaks, build a new break cache. The first and last entries must
        // be the original starting and ending position.
        if (foundBreakCount > 0) {
            if (foundBreakCount != breaks.size()) {
                System.out.println("oops, foundBreakCount != breaks.size().  LBE = " + lbe.getClass());
            }
            assert foundBreakCount == breaks.size();
            if (startPos < breaks.peekLast()) {
                breaks.offer(startPos);
            }
            if (endPos > breaks.peek()) {
                breaks.push(endPos);
            }

            // TODO: get rid of this array, use results from the deque directly
            fCachedBreakPositions = new int[breaks.size()];

            int i = 0;
            while (breaks.size() > 0) {
                fCachedBreakPositions[i++] = breaks.pollLast();
            }

            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos);
            }
            else {
                return following(startPos);
            }
        }

        // If we get here, there were no language-based breaks. Set the text pointer
        // to the original proposed break.
        fText.setIndex(reverse ? startPos : endPos);
        return (reverse ? startPos : endPos);

        }


    /**
     * Moves the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one.
     * @stable ICU 2.0
     */
    public int previous() {
        int result;
        int startPos;

        CharacterIterator text = getText();

        fLastStatusIndexValid = false;

        // if we have cached break positions and we're still in the range
        // covered by them, just move one step backward in the cache
        if (fCachedBreakPositions != null) {
            if (fPositionInCache > 0) {
                --fPositionInCache;
                // If we're at the beginning of the cache, need to reevaluate the
                // rule status
                if (fPositionInCache <= 0) {
                    fLastStatusIndexValid = false;
                }
                int pos = fCachedBreakPositions[fPositionInCache];
                text.setIndex(pos);
                return pos;
            } else {
                reset();
            }
        }

        // if we're already sitting at the beginning of the text, return DONE
        startPos = current();
        if (fText == null || startPos == fText.getBeginIndex()) {
            fLastRuleStatusIndex  = 0;
            fLastStatusIndexValid = true;
            return BreakIterator.DONE;
        }

        // Rules with an exact reverse table are handled here.
        if (fRData.fSRTable != null || fRData.fSFTable != null) {
            result =  handlePrevious(fRData.fRTable);
            if (fDictionaryCharCount > 0) {
                result = checkDictionary(result, startPos, true);
            }
            return result;
        }

        // old rule syntax
        // set things up.  handlePrevious() will back us up to some valid
        // break position before the current position (we back our internal
        // iterator up one step to prevent handlePrevious() from returning
        // the current position), but not necessarily the last one before
        // where we started

        int       start = current();

        previous32(fText);
        int       lastResult    = handlePrevious(fRData.fRTable);
        if (lastResult == BreakIterator.DONE) {
            lastResult = fText.getBeginIndex();
            fText.setIndex(lastResult);
        }
        result = lastResult;
        int      lastTag       = 0;
        boolean  breakTagValid = false;

        // iterate forward from the known break position until we pass our
        // starting point.  The last break position before the starting
        // point is our return value

        for (;;) {
            result         = next();
            if (result == BreakIterator.DONE || result >= start) {
                break;
            }
            lastResult     = result;
            lastTag        = fLastRuleStatusIndex;
            breakTagValid  = true;
        }

        // fLastBreakTag wants to have the value for section of text preceding
        // the result position that we are to return (in lastResult.)  If
        // the backwards rules overshot and the above loop had to do two or more
        // handleNext()s to move up to the desired return position, we will have a valid
        // tag value. But, if handlePrevious() took us to exactly the correct result position,
        // we wont have a tag value for that position, which is only set by handleNext().

        // Set the current iteration position to be the last break position
        // before where we started, and then return that value.
        fText.setIndex(lastResult);
        fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
        fLastStatusIndexValid = breakTagValid;
        return lastResult;
    }

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     * @stable ICU 2.0
     */
    public int following(int offset) {
        CharacterIterator text = getText();

        // if we have no cached break positions, or if "offset" is outside the
        // range covered by the cache, then dump the cache and call our
        // inherited following() method.  This will call other methods in this
        // class that may refresh the cache.
        if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
                offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
            fCachedBreakPositions = null;
            return rulesFollowing(offset);
        }

        // on the other hand, if "offset" is within the range covered by the
        // cache, then just search the cache for the first break position
        // after "offset"
        else {
            fPositionInCache = 0;
            while (fPositionInCache < fCachedBreakPositions.length
                   && offset >= fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            text.setIndex(fCachedBreakPositions[fPositionInCache]);
            return text.getIndex();
        }
    }

    private int rulesFollowing(int offset) {
        // if the offset passed in is already past the end of the text,
        // just return DONE; if it's before the beginning, return the
        // text's starting offset
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        if (fText == null || offset >= fText.getEndIndex()) {
            last();
            return next();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // otherwise, set our internal iteration position (temporarily)
        // to the position passed in.  If this is the _beginning_ position,
        // then we can just use next() to get our return value

        int result = 0;

        if (fRData.fSRTable != null) {
            // Safe Point Reverse rules exist.
            //   This allows us to use the optimum algorithm.
            fText.setIndex(offset);
            // move forward one codepoint to prepare for moving back to a
            // safe point.
            // this handles offset being between a supplementary character
            next32(fText);
            // handlePrevious will move most of the time to < 1 boundary away
            handlePrevious(fRData.fSRTable);
            result = next();
            while (result <= offset) {
                result = next();
            }
            return result;
        }
        if (fRData.fSFTable != null) {
            // No Safe point reverse table, but there is a safe pt forward table.
            //
            fText.setIndex(offset);
            previous32(fText);
            // handle next will give result >= offset
            handleNext(fRData.fSFTable);
            // previous will give result 0 or 1 boundary away from offset,
            // most of the time
            // we have to
            int oldresult = previous();
            while (oldresult > offset) {
                result = previous();
                if (result <= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = next();
            if (result <= offset) {
                return next();
            }
            return result;
        }
        // otherwise, we have to sync up first.  Use handlePrevious() to back
        // us up to a known break position before the specified position (if
        // we can determine that the specified position is a break position,
        // we don't back up at all).  This may or may not be the last break
        // position at or before our starting position.  Advance forward
        // from here until we've passed the starting position.  The position
        // we stop on will be the first break position after the specified one.
        // old rule syntax

        fText.setIndex(offset);
        if (offset == fText.getBeginIndex()) {
            return next();
        }
        result = previous();

        while (result != BreakIterator.DONE && result <= offset) {
            result = next();
        }

        return result;
    }
    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    public int preceding(int offset) {
        CharacterIterator text = getText();

        // if we have no cached break positions, or "offset" is outside the
        // range covered by the cache, we can just call the inherited routine
        // (which will eventually call other routines in this class that may
        // refresh the cache)
        if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
                offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
            fCachedBreakPositions = null;
            return rulesPreceding(offset);
        }

        // on the other hand, if "offset" is within the range covered by the cache,
        // then all we have to do is search the cache for the last break position
        // before "offset"
        else {
            fPositionInCache = 0;
            while (fPositionInCache < fCachedBreakPositions.length
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
            text.setIndex(fCachedBreakPositions[fPositionInCache]);
            return text.getIndex();
        }
    }

    private int rulesPreceding(int offset) {
        // if the offset passed in is already past the end of the text,
        // just return DONE; if it's before the beginning, return the

        // text's starting offset
        if (fText == null || offset > fText.getEndIndex()) {
            // return BreakIterator::DONE;
            return last();
        }
        else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // if we start by updating the current iteration position to the
        // position specified by the caller, we can just use previous()
        // to carry out this operation

        int  result;
        if (fRData.fSFTable != null) {
            /// todo synwee
            // new rule syntax
            fText.setIndex(offset);
            // move backwards one codepoint to prepare for moving forwards to a
            // safe point.
            // this handles offset being between a supplementary character
            previous32(fText);
            handleNext(fRData.fSFTable);
            result = previous();
            while (result >= offset) {
                result = previous();
            }
            return result;
        }
        if (fRData.fSRTable != null) {
            // backup plan if forward safe table is not available
            fText.setIndex(offset);
            next32(fText);
            // handle previous will give result <= offset
            handlePrevious(fRData.fSRTable);

            // next will give result 0 or 1 boundary away from offset,
            // most of the time
            // we have to
            int oldresult = next();
            while (oldresult < offset) {
                result = next();
                if (result >= offset) {
                    return oldresult;
                }
                oldresult = result;
            }
            result = previous();
            if (result >= offset) {
                return previous();
            }
            return result;
        }

        // old rule syntax
        fText.setIndex(offset);
        return previous();
    }

    /**
     * Throw IllegalArgumentException unless begin <= offset < end.
     * @stable ICU 2.0
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }


    /**
     * Returns true if the specified position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @stable ICU 2.0
     */
    public boolean isBoundary(int offset) {
        checkOffset(offset, fText);

        // the beginning index of the iterator is always a boundary position by definition
        if (offset == fText.getBeginIndex()) {
            first();       // For side effects on current position, tag values.
            return true;
        }

        if (offset == fText.getEndIndex()) {
            last();       // For side effects on current position, tag values.
            return true;
        }

        // otherwise, we can use following() on the position before the specified
        // one and return true if the position we get back is the one the user
        // specified

        // return following(offset - 1) == offset;
        // TODO:  check whether it is safe to revert to the simpler offset-1 code
        //         The safe rules may take care of unpaired surrogates ok.
        fText.setIndex(offset);
        previous32(fText);
        int  pos = fText.getIndex();
        boolean result = following(pos) == offset;
        return result;
    }

    /**
     * Returns the current iteration position.
     * @return The current iteration position.
     * @stable ICU 2.0
     */
    public int current() {
        return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
    }

    private void makeRuleStatusValid() {
        if (fLastStatusIndexValid == false) {
            //  No cached status is available.
            int curr = current();
            if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
                //  At start of text, or there is no text.  Status is always zero.
                fLastRuleStatusIndex = 0;
                fLastStatusIndexValid = true;
            } else {
                //  Not at start of text.  Find status the tedious way.
                int pa = fText.getIndex();
                first();
                int pb = current();
                while (fText.getIndex() < pa) {
                    pb = next();
                }
                Assert.assrt(pa == pb);
            }
            Assert.assrt(fLastStatusIndexValid == true);
            Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);
        }
    }

    /**
     * Return the status tag from the break rule that determined the most recently
     * returned break position.  The values appear in the rule source
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.  If more than one rule applies,
     * the numerically largest of the possible status values is returned.
     * 
     * Of the standard types of ICU break iterators, only the word break
     * iterator provides status values.  The values are defined in
     * class RuleBasedBreakIterator, and allow distinguishing between words
     * that contain alphabetic letters, "words" that appear to be numbers,
     * punctuation and spaces, words containing ideographic characters, and
     * more.  Call getRuleStatus after obtaining a boundary
     * position from next(), previous(), or
     * any other break iterator functions that returns a boundary position.
     * 

     * @return the status from the break rule that determined the most recently
     * returned break position.
     *
     * @draft ICU 3.0 (retain)
     * @provisional This is a draft API and might change in a future release of ICU.
     */

    public int  getRuleStatus() {
        makeRuleStatusValid();
        //   Status records have this form:
        //           Count N         <--  fLastRuleStatusIndex points here.
        //           Status val 0
        //           Status val 1
        //              ...
        //           Status val N-1  <--  the value we need to return
        //   The status values are sorted in ascending order.
        //   This function returns the last (largest) of the array of status values.
        int  idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
        int  tagVal = fRData.fStatusTable[idx];
        return tagVal;
    }

    /**
     * Get the status (tag) values from the break rule(s) that determined the most
     * recently returned break position.  The values appear in the rule source
     * within brackets, {123}, for example.  The default status value for rules
     * that do not explicitly provide one is zero.
     * 

     * The status values used by the standard ICU break rules are defined
     * as public constants in class RuleBasedBreakIterator.
     * 
     * If the size  of the output array is insufficient to hold the data,
     *  the output will be truncated to the available length.  No exception
     *  will be thrown.
     *
     * @param fillInArray an array to be filled in with the status values.
     * @return          The number of rule status values from rules that determined
     *                  the most recent boundary returned by the break iterator.
     *                  In the event that the array is too small, the return value
     *                  is the total number of status values that were available,
     *                  not the reduced number that were actually returned.
     * @draft ICU 3.0 (retain)
     * @provisional This is a draft API and might change in a future release of ICU.
     */
    public int getRuleStatusVec(int[] fillInArray) {
        makeRuleStatusValid();
        int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
        if (fillInArray != null) {
            int numToCopy = Math.min(numStatusVals, fillInArray.length);
            for (int i=0; i engine.
            script = UScript.HAN;
        }

        LanguageBreakEngine eng = fBreakEngines.get(script);
        /*
        if (eng != null && !eng.handles(c, fBreakType)) {
            fUnhandledBreakEngine.handleChar(c, getBreakType());
            eng = fUnhandledBreakEngine;
        } else  */  {
            try {
                switch (script) {
                case UScript.THAI:
                    eng = new ThaiBreakEngine();
                    break;
                case UScript.LAO:
                    eng = new LaoBreakEngine();
                    break;
                case UScript.MYANMAR:
                    eng = new BurmeseBreakEngine();
                    break;
                case UScript.KHMER:
                    eng = new KhmerBreakEngine();
                    break;
                case UScript.HAN:
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(false);
                    }
                    else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                case UScript.HANGUL:
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(true);
                    } else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                default:
                    fUnhandledBreakEngine.handleChar(c, getBreakType());
                    eng = fUnhandledBreakEngine;
                    break;
                }
            } catch (IOException e) {
                eng = null;
            }
        }

        if (eng != null && eng != fUnhandledBreakEngine) {
            LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
            if (existingEngine != null) {
                // There was a race & another thread was first to register an engine for this script.
                // Use theirs and discard the one we just created.
                eng = existingEngine;
            }
            // assert eng.handles(c, fBreakType);
        }
        return eng;
    }

    private static final int kMaxLookaheads = 8;
    private static class LookAheadResults {
        int      fUsedSlotLimit;
        int[]    fPositions;
        int[]    fKeys;

        LookAheadResults() {
            fUsedSlotLimit= 0;
            fPositions = new int[kMaxLookaheads];
            fKeys = new int[kMaxLookaheads];
        }

        int getPosition(int key) {
            for (int i=0; i= kMaxLookaheads) {
                assert(false);
                i = kMaxLookaheads - 1;
            }
            fKeys[i] = key;
            fPositions[i] = position;
            assert(fUsedSlotLimit == i);
            fUsedSlotLimit = i + 1;
        }

        void reset() {
            fUsedSlotLimit = 0;
        }
    };
    private LookAheadResults fLookAheadMatches = new LookAheadResults();


    /**
     * The State Machine Engine for moving forward is here.
     * This function is the heart of the RBBI run time engine.
     *
     * @param stateTable
     * @return the new iterator position
     *
     * A note on supplementary characters and the position of underlying
     * Java CharacterIterator:   Normally, a character iterator is positioned at
     * the char most recently returned by next().  Within this function, when
     * a supplementary char is being processed, the char iterator is left
     * sitting on the trail surrogate, in the middle of the code point.
     * This is different from everywhere else, where an iterator always
     * points at the lead surrogate of a supplementary.
     */
    private int handleNext(short stateTable[]) {
        if (TRACE) {
            System.out.println("Handle Next   pos      char  state category");
        }

        // No matter what, handleNext alway correctly sets the break tag value.
        fLastStatusIndexValid = true;
        fLastRuleStatusIndex  = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CharTrie trie = fRData.fTrie;

        // Set up the starting char
        int c               = text.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = nextTrail32(text, c);
            if (c == DONE32) {
                return BreakIterator.DONE;
            }
        }
        int initialPosition = text.getIndex();
        int result          = initialPosition;

        // Set the initial state for the state machine
        int state           = START_STATE;
        int row             = fRData.getRowIndex(state);
        short category      = 3;
        int flagsState      = fRData.getStateTableFlags(stateTable);
        int mode            = RBBI_RUN;
        if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            category = 2;
            mode     = RBBI_START;
            if (TRACE) {
                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }
        }
        fLookAheadMatches.reset();

        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            }
            else if (mode == RBBI_RUN) {
                // Get the char category.  An incoming category of 1 or 2 mens that
                //      we are preset for doing the beginning or end of input, and
                //      that we shouldn't get a category from an actual text input character.
                //

                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) trie.getCodePointValue(c);

                // Check the dictionary bit in the character's category.
                //    Counter is only used by dictionary based iterators (subclasses).
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values.
                //
                if ((category & 0x4000) != 0)  {
                    fDictionaryCharCount++;
                    //  And off the dictionary flag bit.
                    category &= ~0x4000;
                }

                if (TRACE) {
                    System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                    System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
                }

                // Advance to the next character.
                // If this is a beginning-of-input loop iteration, don't advance.
                //    The next iteration will be processing the first real input character.
                c = (int)text.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = nextTrail32(text, c);
                }
            }
            else {
                mode = RBBI_RUN;
            }

            // look up a state transition in the state table
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row   = fRData.getRowIndex(state);

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Match found, common case
                result = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                //  Remember the break status (tag) values.
                fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
            }

            int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
            if (completedRule > 0) {
                // Lookahead match is completed
                int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
                if (lookaheadResult >= 0) {
                    fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
                    text.setIndex(lookaheadResult);
                    return lookaheadResult;
                }
            }

            int rule =  stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            if (rule != 0) {
                // At the position of a '/' in a look-ahead match. Record it.
                int  pos = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning  of it.
                    pos--;
                }
                fLookAheadMatches.setPosition(rule, pos);
            }


        }        // End of state machine main loop

        // The state machine is done.  Check whether it found a match...

        // If the iterator failed to advance in the match engine force it ahead by one.
        // This indicates a defect in the break rules, which should always match
        // at least one character.

        if (result == initialPosition) {
            if (TRACE) {
                System.out.println("Iterator did not move. Advancing by 1.");
            }
            text.setIndex(initialPosition);
            next32(text);
            result = text.getIndex();
        }
        else {
            // Leave the iterator at our result position.
            //   (we may have advanced beyond the last accepting position chasing after
            //    longer matches that never completed.)
            text.setIndex(result);
        }
        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    private int handlePrevious(short stateTable[]) {
        if (fText == null || stateTable == null) {
            return 0;
        }

        int            state;
        int            category           = 0;
        int            mode;
        int            row;
        int            c;
        int            result             = 0;
        int            initialPosition    = 0;
        fLookAheadMatches.reset();

        // handlePrevious() never gets the rule status.
        // Flag the status as invalid; if the user ever asks for status, we will need
        // to back up, then re-find the break position using handleNext(), which does
        // get the status value.
        fLastStatusIndexValid = false;
        fLastRuleStatusIndex  = 0;

        // set up the starting char
        initialPosition = fText.getIndex();
        result          = initialPosition;
        c               = previous32(fText);

        // Set up the initial state for the state machine
        state = START_STATE;
        row = fRData.getRowIndex(state);
        category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
        mode     = RBBI_RUN;
        if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            category = 2;
            mode     = RBBI_START;
        }

        if (TRACE) {
            System.out.println("Handle Prev   pos   char  state category ");
        }

        // loop until we reach the beginning of the text or transition to state 0
        //
        mainLoop: for (;;) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
                    // Either this is the old (ICU 3.2 and earlier) format data which
                    // does not support explicit support for matching {eof}, or
                    // we have already done the {eof} iteration.  Now is the time
                    // to unconditionally bail out.
                    if (result == initialPosition) {
                        // Ran off start, no match found.
                        // Move one position (towards the start, since we are doing previous.)
                        fText.setIndex(initialPosition);
                        previous32(fText);
                    }
                    break mainLoop;
                }
                mode = RBBI_END;
                category = 1;
            }

            if (mode == RBBI_RUN) {
                // look up the current character's category, which tells us
                // which column in the state table to look at.
                //
                category = (short) fRData.fTrie.getCodePointValue(c);

                // Check the dictionary bit in the character's category.
                //    Counter is only used by dictionary based iterators (subclasses).
                //    Chars that need to be handled by a dictionary have a flag bit set
                //    in their category values.
                //
                if ((category & 0x4000) != 0)  {
                    fDictionaryCharCount++;
                    //  And off the dictionary flag bit.
                    category &= ~0x4000;
                }
            }


            if (TRACE) {
                System.out.print("             " + fText.getIndex() + "   ");
                if (0x20 <= c && c < 0x7f) {
                    System.out.print("  " + c + "  ");
                } else {
                    System.out.print(" " + Integer.toHexString(c) + " ");
                }
                System.out.println(" " + state + "  " + category + " ");
            }

            // State Transition - move machine to its next state
            //
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);

            if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                // Match found, common case, could have lookahead so we move
                // on to check it
                result = fText.getIndex();
            }


            int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
            if (completedRule > 0) {
                // Lookahead match is completed.
                int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
                if (lookaheadResult >= 0) {
                    result = lookaheadResult;
                    break mainLoop;
                }
            }
            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            if (rule != 0) {
                // At the position of a '/' in a look-ahead match. Record it.
                int pos = fText.getIndex();
                fLookAheadMatches.setPosition(rule, pos);
            }

            if (state == STOP_STATE) {
                // Normal loop exit is here
                break mainLoop;
            }

            // then move iterator position backwards one character
            //
            if (mode == RBBI_RUN) {
                c = previous32(fText);
            } else {
                if (mode == RBBI_START) {
                    mode = RBBI_RUN;
                }
            }


        }   // End of the main loop.

        // The state machine is done.  Check whether it found a match...
        //
        // If the iterator failed to advance in the match engine, force it ahead by one.
        //   (This really indicates a defect in the break rules.  They should always match
        //    at least one character.)
        if (result == initialPosition) {
            result = fText.setIndex(initialPosition);
            previous32(fText);
            result = fText.getIndex();
        }

        fText.setIndex(result);
        if (TRACE) {
            System.out.println("Result = " + result);
        }

        return result;
    }
}