com.ibm.icu.text.RuleBasedBreakIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2005-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.MissingResourceException;

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
import com.ibm.icu.impl.breakiter.CjkBreakEngine;
import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
import com.ibm.icu.impl.breakiter.LaoBreakEngine;
import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;

/**
 * Rule Based Break Iterator
 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * @stable ICU 2.0
 */
public class RuleBasedBreakIterator extends BreakIterator {
    //=======================================================================
    // Constructors & Factories
    //=======================================================================

    /**
     * private constructor
     */
    private RuleBasedBreakIterator() {
        fDictionaryCharCount  = 0;
        synchronized(gAllBreakEngines) {
            fBreakEngines = new ArrayList<>(gAllBreakEngines);
        }
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * Creating a break iterator from the binary rules is much faster than
     * creating one from source rules.
     *
     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between
     * different versions of ICU.
     *
     * @param is an input stream supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the InputStream.
     * @see    #compileRules(String, OutputStream)
     * @stable ICU 4.8
     */
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
        This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
        return This;
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * Creating a break iterator from the binary rules is much faster than
     * creating one from source rules.
     *
     * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between
     * different versions of ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see    #compileRules(String, OutputStream)
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
        RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(bytes);
        This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
        return This;
    }

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
     * @param rules The break rules to be used.
     * @stable ICU 2.2
     */
    public RuleBasedBreakIterator(String rules)  {
        this();
        try {
            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
            compileRules(rules, ruleOS);
            fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
            fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
        } catch (IOException e) {
            ///CLOVER:OFF
            // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
            //  causing bogus compiled rules to be produced, but with no compile error raised.
            RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                    + e.getMessage());
            throw rte;
            ///CLOVER:ON
        }
    }

    //=======================================================================
    // Boilerplate
    //=======================================================================

    /**
     * Clones this iterator.
     * @return A newly-constructed RuleBasedBreakIterator with the same
     * behavior as this one.
     * @stable ICU 2.0
     */
    @Override
    public Object clone()  {
        RuleBasedBreakIterator result;
        result = (RuleBasedBreakIterator)super.clone();
        if (fText != null) {
            result.fText = (CharacterIterator)(fText.clone());
        }
        synchronized (gAllBreakEngines)  {
            result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
        }
        result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
        result.fBreakCache = result.new BreakCache(fBreakCache);
        result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
        return result;
    }


    /**
     * Returns true if both BreakIterators are of the same class, have the same
     * rules, and iterate over the same text.
     * @stable ICU 2.0
     */
    @Override
    public boolean equals(Object that) {
        if (that == null) {
            return false;
        }
        if (this == that) {
            return true;
        }
        try {
            RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
            if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
                return false;
            }
            if (fRData != null && other.fRData != null &&
                    (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
                return false;
            }
            if (fText == null && other.fText == null) {
                return true;
            }
            if (fText == null || other.fText == null || !fText.equals(other.fText)) {
                return false;
            }
            return fPosition == other.fPosition;
        }
        catch(ClassCastException e) {
            return false;
        }
     }

    /**
     * Returns the description (rules) used to create this iterator.
     * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
     * @stable ICU 2.0
     */
    @Override
    public String toString() {
        String retStr = "";
        if (fRData != null) {
            retStr =  fRData.fRuleSource;
        }
        return retStr;
    }

    /**
     * Compute a hashcode for this BreakIterator
     * @return A hash code
     * @stable ICU 2.0
     */
    @Override
    public int hashCode()
    {
        return fRData.fRuleSource.hashCode();
    }


    private static final int  START_STATE = 1;     // The state number of the starting state
    private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"

    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
    //               of user text.  A variable with this enum type keeps track of where we
    //               are.  The state machine only fetches user text input while in RUN mode.
    private static final int  RBBI_START  = 0;
    private static final int  RBBI_RUN    = 1;
    private static final int  RBBI_END    = 2;

    /**
     * The character iterator through which this BreakIterator accesses the text.
     */
    private CharacterIterator   fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance.
     * Not intended for public use. Declared public for testing purposes only.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RBBIDataWrapper    fRData;

    /**
     *  The iteration state - current position, rule status for the current position,
     *                        and whether the iterator ran off the end, yielding UBRK_DONE.
     *                        Current position is pinned to be 0 < position <= text.length.
     *                        Current position is always set to a boundary.
     *
     *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
     *  Never has the value UBRK_DONE (-1).
     */
    private int                fPosition;

    /**
     * Index of the Rule {tag} values for the most recent match.
     */
    private int                fRuleStatusIndex;

    /**
     * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
     */
    private boolean            fDone;

    /**
     *  Array of look-ahead tentative results.
     */
    private int[]              fLookAheadMatches;

    /**
     *   Cache of previously determined boundary positions.
     */
    private BreakCache         fBreakCache = new BreakCache();


    /**
     * Counter for the number of characters encountered with the "dictionary"
     *   flag set.  Normal RBBI iterators don't use it, although the code
     *   for updating it is live.  Dictionary Based break iterators (a subclass
     *   of us) access this field directly.
     * @internal
     */
    private int fDictionaryCharCount;

    private DictionaryCache     fDictionaryCache = new DictionaryCache();

    /**
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag.  Trace operation of state machine when true.
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
     *
     * Only one instance; shared by all break iterators.
     */
    private static final UnhandledBreakEngine gUnhandledBreakEngine;

    /**
     * List of all known break engines, common for all break iterators.
     * Lazily updated as break engines are needed, because instantiation of
     * break engines is expensive.
     *
     * Because gAllBreakEngines can be referenced concurrently from different
     * BreakIterator instances, all access is synchronized.
     */
    private static final List gAllBreakEngines;

    static {
        gUnhandledBreakEngine = new UnhandledBreakEngine();
        gAllBreakEngines = new ArrayList<>();
        gAllBreakEngines.add(gUnhandledBreakEngine);
    }

    /**
     * List of all known break engines. Similar to gAllBreakEngines, but local to a
     * break iterator, allowing it to be used without synchronization.
     */
    private List fBreakEngines;

    /**
     * Dump the contents of the state table and character classes for this break iterator.
     * For debugging only.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public void dump(java.io.PrintStream out) {
        if (out == null) {
            out = System.out;
        }
        this.fRData.dump(out);
    }

    /**
     * Compile a set of source break rules into the binary state tables used
     * by the break iterator engine.  Creating a break iterator from precompiled
     * rules is much faster than creating one from source rules.
     *
     * Binary break rules are not guaranteed to be compatible between different
     * versions of ICU.
     *
     *
     * @param rules  The source form of the break rules
     * @param ruleBinary  An output stream to receive the compiled rules.
     * @throws IOException If there is an error writing the output.
     * @see #getInstanceFromCompiledRules(InputStream)
     * @stable ICU 4.8
     */
    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
     * @stable ICU 2.0
     */
    @Override
    public int first() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        fText.first();
        int start =  fText.getIndex();
        if (!fBreakCache.seek(start)) {
            fBreakCache.populateNear(start);
        }
        fBreakCache.current();
        assert(fPosition == start);
        return fPosition;
    }

    /**
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     * @return The text's past-the-end offset.
     * @stable ICU 2.0
     */
    @Override
    public int last() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        int endPos = fText.getEndIndex();
        boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
        assert(endShouldBeBoundary);
        if (fPosition != endPos) {
            assert(fPosition == endPos);
        }
        return endPos;
    }

    /**
     * Advances the iterator either forward or backward the specified number of steps.
     * Negative values move backward, and positive values move forward.  This is
     * equivalent to repeatedly calling next() or previous().
     * @param n The number of steps to move.  The sign indicates the direction
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one.
     * @stable ICU 2.0
     */
    @Override
    public int next(int n) {
        int result = 0;
        if (n > 0) {
            for (; n > 0 && result != DONE; --n) {
                result = next();
            }
        } else if (n < 0) {
            for (; n < 0 && result != DONE; ++n) {
                result = previous();
            }
        } else {
            result = current();
        }
        return result;
    }

    /**
     * Advances the iterator to the next boundary position.
     * @return The position of the first boundary after this one.
     * @stable ICU 2.0
     */
    @Override
    public int next() {
        fBreakCache.next();
        return fDone ? DONE : fPosition;
    }

    /**
     * Moves the iterator backwards, to the boundary preceding the current one.
     * @return The position of the boundary position immediately preceding the starting position.
     * @stable ICU 2.0
     */
    @Override
    public int previous() {
        fBreakCache.previous();
        return fDone ? DONE : fPosition;
    }

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param startPos The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     * @stable ICU 2.0
     */
    @Override
    public int following(int startPos) {
        // if the supplied position is before the beginning, return the
        // text's starting offset
        if (startPos < fText.getBeginIndex()) {
            return first();
        }

        // Move requested offset to a code point start. It might be between a lead and trail surrogate.
        // Or it may be beyond the end of the text.
        startPos = CISetIndex32(fText, startPos);
        fBreakCache.following(startPos);
        return fDone ? DONE : fPosition;
    }


    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    @Override
    public int preceding(int offset) {
        if (fText == null || offset > fText.getEndIndex()) {
            return last();
        } else if (offset < fText.getBeginIndex()) {
            return first();
        }

        // Move requested offset to a code point start. It might be between a lead and trail surrogate.
        // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
        int adjustedOffset = offset;
        fBreakCache.preceding(adjustedOffset);
        return fDone ? DONE : fPosition;

    }


    /**
     * Throw IllegalArgumentException unless begin <= offset < end.
     * @stable ICU 2.0
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }


    /**
     * Returns true if the specified position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @stable ICU 2.0
     */
    @Override
    public boolean isBoundary(int offset) {
        // TODO: behavior difference with ICU4C, which considers out-of-range offsets
        //       to not be boundaries, and to not be errors.
        checkOffset(offset, fText);

        // Adjust offset to be on a code point boundary and not beyond the end of the text.
        // Note that isBoundary() is always false for offsets that are not on code point boundaries.
        // But we still need the side effect of leaving iteration at the following boundary.
        int adjustedOffset = CISetIndex32(fText, offset);

        boolean result = false;
        if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
            result = (fBreakCache.current() == offset);
        }

        if (!result) {
            // Not on a boundary. isBoundary() must leave iterator on the following boundary.
            // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
            next();
        }
        return result;

    }

    /**
     * Returns the current iteration position.  Note that DONE is never
     * returned from this function; if iteration has run to the end of a
     * string, current() will return the length of the string while
     * next() will return BreakIterator.DONE).
     * @return The current iteration position.
     * @stable ICU 2.0
     */
    @Override
    public int current() {
        return (fText != null) ? fPosition : BreakIterator.DONE;
    }


    /**
     * Return the status tag from the break rule that determined the boundary at
     * the current iteration position.  The values appear in the rule source
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.  If more than one rule applies,
     * the numerically largest of the possible status values is returned.
     * 
     * Of the standard types of ICU break iterators, only the word and line break
     * iterator provides status values.  The values are defined in
     * class RuleBasedBreakIterator, and allow distinguishing between words
     * that contain alphabetic letters, "words" that appear to be numbers,
     * punctuation and spaces, words containing ideographic characters, and
     * more.  Call getRuleStatus after obtaining a boundary
     * position from next(), previous(), or
     * any other break iterator functions that returns a boundary position.
     * 

     * Note that getRuleStatus() returns the value corresponding to
     * current() index even after next() has returned DONE.
     * 


     * @return the status from the break rule that determined the boundary
     * at the current iteration position.
     *
     * @stable ICU 60
     */

    @Override
    public int  getRuleStatus() {
        //   Status records have this form:
        //           Count N         <--  fLastRuleStatusIndex points here.
        //           Status val 0
        //           Status val 1
        //              ...
        //           Status val N-1  <--  the value we need to return
        //   The status values are sorted in ascending order.
        //   This function returns the last (largest) of the array of status values.
        int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
        int  tagVal = fRData.fStatusTable[idx];
        return tagVal;
    }

    /**
     * Get the status (tag) values from the break rule(s) that determined the boundary
     * at the current iteration position.  The values appear in the rule source
     * within brackets, {123}, for example.  The default status value for rules
     * that do not explicitly provide one is zero.
     * 

     * The status values used by the standard ICU break rules are defined
     * as public constants in class RuleBasedBreakIterator.
     * 

     * If the size  of the output array is insufficient to hold the data,
     *  the output will be truncated to the available length.  No exception
     *  will be thrown.
     *
     * @param fillInArray an array to be filled in with the status values.
     * @return          The number of rule status values from the rules that determined
     *                  the boundary at the current iteration position.
     *                  In the event that the array is too small, the return value
     *                  is the total number of status values that were available,
     *                  not the reduced number that were actually returned.
     * @stable ICU 60
     */
    @Override
    public int getRuleStatusVec(int[] fillInArray) {
        int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
        if (fillInArray != null) {
            int numToCopy = Math.min(numStatusVals, fillInArray.length);
            for (int i=0; i
     * Caution:The state of the returned CharacterIterator
     * must not be modified in any way while the BreakIterator is still in use.
     * Doing so will lead to undefined behavior of the BreakIterator.
     * Clone the returned CharacterIterator first and work with that.
     * 

     * The returned CharacterIterator is a reference
     * to the actual iterator being used by the BreakIterator.
     * No guarantees are made about the current position
     * of this iterator when it is returned; it may differ from the
     * BreakIterators current position.  If you need to move that
     * position to examine the text, clone this function's return value first.
     * @return An iterator over the text being analyzed.
     * @stable ICU 2.0
     */
    @Override
    public CharacterIterator getText() {
        return fText;
    }

    /**
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
     * (The old iterator is dropped.)
     * 
     * Caution: The supplied CharacterIterator is used
     * directly by the BreakIterator, and must not be altered in any
     * way by code outside of the BreakIterator.
     * Doing so will lead to undefined behavior of the BreakIterator.
     *
     * @param newText An iterator over the text to analyze.
     * @stable ICU 2.0
     */
    @Override
    public void setText(CharacterIterator newText) {
        if (newText != null) {
            fBreakCache.reset(newText.getBeginIndex(), 0);
        } else {
            fBreakCache.reset();
        }
        fDictionaryCache.reset();
        fText = newText;
        this.first();
    }

     /**
     * Control debug, trace and dump options.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
                                           ICUDebug.value(RBBI_DEBUG_ARG) : null;


    private LanguageBreakEngine getLanguageBreakEngine(int c) {

        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
        for (LanguageBreakEngine candidate : fBreakEngines) {
            if (candidate.handles(c)) {
                return candidate;
            }
        }

        synchronized (gAllBreakEngines) {
            // This break iterator's list of break engines didn't handle the character.
            // Check the global list, another break iterator may have instantiated the
            // desired engine.
            for (LanguageBreakEngine candidate : gAllBreakEngines) {
                if (candidate.handles(c)) {
                    fBreakEngines.add(candidate);
                    return candidate;
                }
            }

            // The global list doesn't have an existing engine, build one.
            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
            if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
                // Katakana, Hiragana and Han are handled by the same dictionary engine.
                // Fold them together for mapping from script -> engine.
                script = UScript.HAN;
            }

            LanguageBreakEngine eng;
            try {
                switch (script) {
                case UScript.THAI:
                    try {
                        eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
                    } catch (MissingResourceException e) {
                        eng = new ThaiBreakEngine();
                    }
                    break;
                case UScript.LAO:
                    eng = new LaoBreakEngine();
                    break;
                case UScript.MYANMAR:
                    try {
                        eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
                    } catch (MissingResourceException e) {
                        eng = new BurmeseBreakEngine();
                    }
                    break;
                case UScript.KHMER:
                    eng = new KhmerBreakEngine();
                    break;
                case UScript.HAN:
                    eng = new CjkBreakEngine(false);
                     break;
                case UScript.HANGUL:
                    eng = new CjkBreakEngine(true);
                    break;
                default:
                    gUnhandledBreakEngine.handleChar(c);
                    eng = gUnhandledBreakEngine;
                    break;
                }
            } catch (IOException e) {
                eng = null;
            }

            if (eng != null && eng != gUnhandledBreakEngine) {
                gAllBreakEngines.add(eng);
                fBreakEngines.add(eng);
            }
            return eng;
        }   // end synchronized(gAllBreakEngines)
    }

    /**
     * The State Machine Engine for moving forward is here.
     * This function is the heart of the RBBI run time engine.
     *
     * Input
     *    fPosition, the position in the text to begin from.
     * Output
     *    fPosition:           the boundary following the starting position.
     *    fDictionaryCharCount the number of dictionary characters encountered.
     *                         If > 0, the segment will be further subdivided
     *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
     *
     * @return the new iterator position
     *
     * A note on supplementary characters and the position of underlying
     * Java CharacterIterator:   Normally, a character iterator is positioned at
     * the char most recently returned by next().  Within this function, when
     * a supplementary char is being processed, the char iterator is left
     * sitting on the trail surrogate, in the middle of the code point.
     * This is different from everywhere else, where an iterator always
     * points at the lead surrogate of a supplementary.
     */
    private int handleNext() {
        if (TRACE) {
            System.out.println("Handle Next   pos      char  state category");
        }

        // handleNext always sets the break tag value.
        // Set the default for it.
        fRuleStatusIndex  = 0;
        fDictionaryCharCount = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CodePointTrie trie = fRData.fTrie;

        char[] stateTable  = fRData.fFTable.fTable;
        int initialPosition = fPosition;
        text.setIndex(initialPosition);
        int result          = initialPosition;

        // Set up the starting char
        int c = text.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = nextTrail32(text, c);
            if (c == DONE32) {
                fDone = true;
                return BreakIterator.DONE;
            }
        }

        // Set the initial state for the state machine
        int state           = START_STATE;
        int row             = fRData.getRowIndex(state);
        short category      = 3;
        int flagsState      = fRData.fFTable.fFlags;
        int dictStart       = fRData.fFTable.fDictCategoriesStart;
        int mode            = RBBI_RUN;
        if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            category = 2;
            mode     = RBBI_START;
            if (TRACE) {
                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }
        }

        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            }
            else if (mode == RBBI_RUN) {
                // Get the char category.  An incoming category of 1 or 2 mens that
                //      we are preset for doing the beginning or end of input, and
                //      that we shouldn't get a category from an actual text input character.
                //

                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) trie.get(c);

                // Check for categories that require word dictionary handling.
                if (category >= dictStart) {
                    fDictionaryCharCount++;
                }

                if (TRACE) {
                    System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                    System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
                }

                // Advance to the next character.
                // If this is a beginning-of-input loop iteration, don't advance.
                //    The next iteration will be processing the first real input character.
                c = text.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = nextTrail32(text, c);
                }
            }
            else {
                mode = RBBI_RUN;
            }

            // look up a state transition in the state table
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row   = fRData.getRowIndex(state);
            int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
            if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Match found, common case
                result = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                //  Remember the break status (tag) values.
                fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
            } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Lookahead match is completed
                int lookaheadResult = fLookAheadMatches[accepting];
                if (lookaheadResult >= 0) {
                    fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
                    fPosition = lookaheadResult;
                    return lookaheadResult;
                }
            }


            // If we are at the position of the '/' in a look-ahead (hard break) rule;
            // record the current position, to be returned later, if the full rule matches.
            // TODO: Move this check before the previous check of fAccepting.
            //       This would enable hard-break rules with no following context.
            //       But there are line break test failures when trying this. Investigate.
            //       Issue ICU-20837
            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            if (rule != 0) {
                int  pos = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning  of it.
                    pos--;
                }
                fLookAheadMatches[rule] = pos;
            }


        }        // End of state machine main loop

        // The state machine is done.  Check whether it found a match...

        // If the iterator failed to advance in the match engine force it ahead by one.
        // This indicates a defect in the break rules, which should always match
        // at least one character.

        if (result == initialPosition) {
            if (TRACE) {
                System.out.println("Iterator did not move. Advancing by 1.");
            }
            text.setIndex(initialPosition);
            next32(text);
            result = text.getIndex();
            fRuleStatusIndex = 0;
        }

        // Leave the iterator at our result position.
        //   (we may have advanced beyond the last accepting position chasing after
        //    longer matches that never completed.)
        fPosition = result;

        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    /**
     * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
     * This locates a "Safe Position" from which the forward break rules
     * will operate correctly. A Safe Position is not necessarily a boundary itself.
     *
     * The logic of this function is very similar to handleNext(), above, but simpler
     * because the safe table does not require as many options.
     *
     * @param fromPosition the position in the input text to begin the iteration.
     * @internal
     */
    private int handleSafePrevious(int fromPosition) {
        char            state;
        short           category = 0;
        int             result = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CodePointTrie trie = fRData.fTrie;
        char[] stateTable  = fRData.fRTable.fTable;

        CISetIndex32(text, fromPosition);
        if (TRACE) {
            System.out.print("Handle Previous   pos   char  state category");
        }

        // if we're already at the start of the text, return DONE.
        if (text.getIndex() == text.getBeginIndex()) {
            return BreakIterator.DONE;
        }

        //  Set the initial state for the state machine
        int c = CharacterIteration.previous32(text);
        state = START_STATE;
        int row = fRData.getRowIndex(state);

        // loop until we reach the start of the text or transition to state 0
        //
        for (; c != DONE32; c = CharacterIteration.previous32(text)) {

            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            //
            //  And off the dictionary flag bit. For reverse iteration it is not used.
            category = (short) trie.get(c);
            if (TRACE) {
                System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }

            // State Transition - move machine to its next state
            //
            assert(category < fRData.fHeader.fCatCount);
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row   = fRData.getRowIndex(state);

            if (state == STOP_STATE) {
                // This is the normal exit from the lookup state machine.
                // Transition to state zero means we have found a safe point.
                break;
            }
        }

        // The state machine is done.
        result = text.getIndex();
        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    /**
     * Set the index of a CharacterIterator.
     * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
     * If the index points to a trail surrogate of a supplementary character, adjust it
     * to the start (lead surrogate) index.
     *
     * @param ci A CharacterIterator to set
     * @param index the index to set
     * @return the resulting index, possibly pinned or adjusted.
     */
    private static int CISetIndex32(CharacterIterator ci, int index) {
        if (index <= ci.getBeginIndex()) {
            ci.first();
        } else if (index >= ci.getEndIndex()) {
            ci.setIndex(ci.getEndIndex());
        } else if (Character.isLowSurrogate(ci.setIndex(index))) {
            if (!Character.isHighSurrogate(ci.previous())) {
                ci.next();
            }
        }
        return ci.getIndex();
    }

    /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
     *                 Dictionary boundaries are moved first to this cache, then from here
     *                 to the main BreakCache, where they may inter-leave with non-dictionary
     *                 boundaries. The public BreakIterator API always fetches directly
     *                 from the main BreakCache, not from here.
     *
     *                 In common situations, the number of boundaries in a single dictionary run
     *                 should be quite small, it will be terminated by punctuation, spaces,
     *                 or any other non-dictionary characters. The main BreakCache may end
     *                 up with boundaries from multiple dictionary based runs.
     *
     *                 The boundaries are stored in a simple ArrayList (vector), with the
     *                 assumption that they will be accessed sequentially.
     */
    class DictionaryCache  {

         void reset() {
             fPositionInCache = -1;
             fStart = 0;
             fLimit = 0;
             fFirstRuleStatusIndex = 0;
             fOtherRuleStatusIndex = 0;
             fBreaks.removeAllElements();
         };

         boolean following(int fromPos) {
             if (fromPos >= fLimit || fromPos < fStart) {
                 fPositionInCache = -1;
                 return false;
             }

             // Sequential iteration, move from previous boundary to the following

             int r = 0;
             if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
                 ++fPositionInCache;
                 if (fPositionInCache >= fBreaks.size()) {
                     fPositionInCache = -1;
                     return false;
                 }
                 r = fBreaks.elementAt(fPositionInCache);
                 assert(r > fromPos);
                 fBoundary = r;
                 fStatusIndex = fOtherRuleStatusIndex;
                 return true;
             }

             // Random indexing. Linear search for the boundary following the given position.

             for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
                 r= fBreaks.elementAt(fPositionInCache);
                 if (r > fromPos) {
                     fBoundary = r;
                     fStatusIndex = fOtherRuleStatusIndex;
                     return true;
                 }
             }

             // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
             assert(false);
             fPositionInCache = -1;
             return false;
         };

         boolean preceding(int fromPos) {
             if (fromPos <= fStart || fromPos > fLimit) {
                 fPositionInCache = -1;
                 return false;
             }

             if (fromPos == fLimit) {
                 fPositionInCache = fBreaks.size() - 1;
                 if (fPositionInCache >= 0) {
                     assert(fBreaks.elementAt(fPositionInCache) == fromPos);
                 }
             }

             int r;
             if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
                 --fPositionInCache;
                 r = fBreaks.elementAt(fPositionInCache);
                 assert(r < fromPos);
                 fBoundary = r;
                 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
                 return true;
             }

             if (fPositionInCache == 0) {
                 fPositionInCache = -1;
                 return false;
             }

             for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
                 r = fBreaks.elementAt(fPositionInCache);
                 if (r < fromPos) {
                     fBoundary = r;
                     fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
                     return true;
                 }
             }
             assert(false);
             fPositionInCache = -1;
             return false;
         };

        /**
         * Populate the cache with the dictionary based boundaries within a region of text.
         * @param startPos  The start position of a range of text
         * @param endPos    The end position of a range of text
         * @param firstRuleStatus The rule status index that applies to the break at startPos
         * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
         * @internal
         */
        void populateDictionary(int startPos, int endPos,
                                int firstRuleStatus, int otherRuleStatus) {
            if ((endPos - startPos) <= 1) {
                return;
            }

            reset();
            fFirstRuleStatusIndex = firstRuleStatus;
            fOtherRuleStatusIndex = otherRuleStatus;

            int rangeStart = startPos;
            int rangeEnd = endPos;

            int         category;
            int         current;
            int         foundBreakCount = 0;

            // Loop through the text, looking for ranges of dictionary characters.
            // For each span, find the appropriate break engine, and ask it to find
            // any breaks within the span.

            fText.setIndex(rangeStart);
            int     c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.get(c);
            int dictStart = fRData.fFTable.fDictCategoriesStart;

            while(true) {
                while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
                    c = CharacterIteration.next32(fText);    // pre-increment
                    category = (short)fRData.fTrie.get(c);
                }
                if (current >= rangeEnd) {
                    break;
                }

                // We now have a dictionary character. Get the appropriate language object
                // to deal with it.
                LanguageBreakEngine lbe = getLanguageBreakEngine(c);

                // Ask the language object if there are any breaks. It will add them to the cache and
                // leave the text pointer on the other side of its range, ready to search for the next one.
                if (lbe != null) {
                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
                }

                // Reload the loop variables for the next go-round
                c = CharacterIteration.current32(fText);
                category = (short)fRData.fTrie.get(c);
            }

            // If we found breaks, ensure that the first and last entries are
            // the original starting and ending position. And initialize the
            // cache iteration position to the first entry.

            // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
            if (foundBreakCount > 0) {
                assert(foundBreakCount == fBreaks.size());
                if (startPos < fBreaks.elementAt(0)) {
                    // The dictionary did not place a boundary at the start of the segment of text.
                    // Add one now. This should not commonly happen, but it would be easy for interactions
                    // of the rules for dictionary segments and the break engine implementations to
                    // inadvertently cause it. Cover it here, just in case.
                    fBreaks.offer(startPos);
                }
                if (endPos > fBreaks.peek()) {
                    fBreaks.push(endPos);
                }
                fPositionInCache = 0;
                // Note: Dictionary matching may extend beyond the original limit.
                fStart = fBreaks.elementAt(0);
                fLimit = fBreaks.peek();
            } else {
                // there were no language-based breaks, even though the segment contained
                // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
                // for this range will fail, and the calling code will fall back to the rule based boundaries.
            }

        };


        DictionaryCache() {
            fPositionInCache = -1;
            fBreaks = new DictionaryBreakEngine.DequeI();
        }

        /**
         * copy constructor. Used by RuleBasedBreakIterator.clone().
         *
         * @param src the source object to be copied.
         */
        DictionaryCache(DictionaryCache src)  {
            try {
                fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
            }
            catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
            fPositionInCache      = src.fPositionInCache;
            fStart                = src.fStart;
            fLimit                = src.fLimit;
            fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
            fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
            fBoundary             = src.fBoundary;
            fStatusIndex          = src.fStatusIndex;
        }

        // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
        DictionaryBreakEngine.DequeI fBreaks;
        int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
        //                                      //    or preceding(). Optimizes sequential access.
        int             fStart;                 // Text position of first boundary in cache.
        int             fLimit;                 // Last boundary in cache. Which is the limit of the
        //                                      //    text segment being handled by the dictionary.
        int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
        int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
        int             fBoundary;              // Current boundary. Set by preceding(), following().
        int             fStatusIndex;           // Current rule status index. Set by preceding, following().
    };




/*
 * class BreakCache
 *
 * Cache of break boundary positions and rule status values.
 * Break iterator API functions, next(), previous(), etc., will use cached results
 * when possible, and otherwise cache new results as they are obtained.
 *
 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
 *
 * The cache is implemented as a single circular buffer.
 */

/*
 * size of the circular cache buffer.
 */

class BreakCache {

    BreakCache() {
        reset();
    };

    void  reset(int pos, int ruleStatus) {
        fStartBufIdx = 0;
        fEndBufIdx = 0;
        fTextIdx = pos;
        fBufIdx = 0;
        fBoundaries[0] = pos;
        fStatuses[0] = (short)ruleStatus;
    }

    void  reset() {reset(0, 0); };

    void  next() {
        if (fBufIdx == fEndBufIdx) {
            fDone = !populateFollowing();
            fPosition = fTextIdx;
            fRuleStatusIndex = fStatuses[fBufIdx];
        } else {
            fBufIdx = modChunkSize(fBufIdx + 1);
            fTextIdx = fPosition = fBoundaries[fBufIdx];
            fRuleStatusIndex = fStatuses[fBufIdx];
        }
    };

    void  previous() {
        int initialBufIdx = fBufIdx;
        if (fBufIdx == fStartBufIdx) {
            // At start of cache. Prepend to it.
            populatePreceding();
        } else {
            // Cache already holds the next boundary
            fBufIdx = modChunkSize(fBufIdx - 1);
            fTextIdx = fBoundaries[fBufIdx];
        }
        fDone = (fBufIdx == initialBufIdx);
        fPosition = fTextIdx;
        fRuleStatusIndex = fStatuses[fBufIdx];
        return;
    };

    // Move the iteration state to the position following the startPosition.
    // Input position must be pinned to the input length.
    void following(int startPos) {
        if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
            // startPos is in the cache. Do a next() from that position.
            // TODO: an awkward set of interactions with bi->fDone
            //       seek() does not clear it; it can't because of interactions with populateNear().
            //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
            //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
            fDone = false;
            next();
        }

    };

    void  preceding(int startPos) {
        if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
            if (startPos == fTextIdx) {
                previous();
            } else {
                // seek() leaves the BreakCache positioned at the preceding boundary
                //        if the requested position is between two boundaries.
                // current() pushes the BreakCache position out to the BreakIterator itself.
                assert(startPos > fTextIdx);
                current();
            }
        }
        return;
    };

    /**
     * Update the state of the public BreakIterator (fBI) to reflect the
     * current state of the break iterator cache (this).
     */
    int current() {
        fPosition = fTextIdx;
        fRuleStatusIndex = fStatuses[fBufIdx];
        fDone = false;
        return fTextIdx;
    };

    /**
     * Add boundaries to the cache near the specified position.
     * The given position need not be a boundary itself.
     * The input position must be within the range of the text, and
     * on a code point boundary.
     * If the requested position is a break boundary, leave the iteration
     * position on it.
     * If the requested position is not a boundary, leave the iteration
     * position on the preceding boundary and include both the the
     * preceding and following boundaries in the cache.
     * Additional boundaries, either preceding or following, may be added
     * to the cache as a side effect.
     *
     * Return false if the operation failed.
     */
    boolean populateNear(int position) {
        assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

        // Find a boundary somewhere in the vicinity of the requested position.
        // Depending on the safe rules and the text data, it could be either before, at, or after
        // the requested position.


        // If the requested position is not near already cached positions, clear the existing cache,
        // find a near-by boundary and begin new cache contents there.

        if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
            int aBoundary = fText.getBeginIndex();
            int ruleStatusIndex = 0;
            if (position > aBoundary + 20) {
                int backupPos = handleSafePrevious(position);
                if (backupPos > aBoundary) {
                    // Advance to the boundary following the backup position.
                    // There is a complication: the safe reverse rules identify pairs of code points
                    // that are safe. If advancing from the safe point moves forwards by less than
                    // two code points, we need to advance one more time to ensure that the boundary
                    // is good, including a correct rules status value.
                    //
                    fPosition = backupPos;
                    aBoundary = handleNext();
                    if (aBoundary == backupPos + 1 ||
                            (aBoundary == backupPos + 2 &&
                            Character.isHighSurrogate(fText.setIndex(backupPos)) &&
                            Character.isLowSurrogate(fText.next()))) {
                        // The initial handleNext() only advanced by a single code point. Go again.
                        // Safe rules identify safe pairs.
                        aBoundary = handleNext();
                    }
                }
                ruleStatusIndex = fRuleStatusIndex;
            }
            reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
        }

        // Fill in boundaries between existing cache content and the new requested position.

        if (fBoundaries[fEndBufIdx] < position) {
            // The last position in the cache precedes the requested position.
            // Add following position(s) to the cache.
            while (fBoundaries[fEndBufIdx] < position) {
                if (!populateFollowing()) {
                    assert false;
                    return false;
                }
            }
            fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
            fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
            while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
                previous();
            }
            return true;
        }

        if (fBoundaries[fStartBufIdx] > position) {
            // The first position in the cache is beyond the requested position.
            // back up more until we get a boundary <= the requested position.
            while (fBoundaries[fStartBufIdx] > position) {
                populatePreceding();
            }
            fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
            fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
            while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
                next();
            }
            if (fTextIdx > position) {
                // If position is not itself a boundary, the next() loop above will overshoot.
                // Back up one, leaving cache position at the boundary preceding the requested position.
                previous();
            }
            return true;
        }

        assert fTextIdx == position;
        return true;

    };

    /**
     *  Add boundary(s) to the cache following the current last boundary.
     *  Return false if at the end of the text, and no more boundaries can be added.
     *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
     */
    boolean populateFollowing() {
        int fromPosition = fBoundaries[fEndBufIdx];
        int fromRuleStatusIdx = fStatuses[fEndBufIdx];
        int pos = 0;
        int ruleStatusIdx = 0;

        if (fDictionaryCache.following(fromPosition)) {
            addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
            return true;
        }

        fPosition = fromPosition;
        pos = handleNext();
        if (pos == BreakIterator.DONE) {
            return false;
        }

        ruleStatusIdx = fRuleStatusIndex;
        if (fDictionaryCharCount > 0) {
            // The text segment obtained from the rules includes dictionary characters.
            // Subdivide it, with subdivided results going into the dictionary cache.
            fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
            if (fDictionaryCache.following(fromPosition)) {
                addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
                return true;
                // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
                //       But be careful with interactions with populateNear().
            }
        }

        // Rule based segment did not include dictionary characters.
        // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
        //    meaning that we didn't take the return, above.
        // Add its end point to the cache.
        addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

        // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
        //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
        //
        for (int count=0; count<6; ++count) {
            pos = handleNext();
            if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
                break;
            }
            addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
        }
        return true;
    };

    /**
     *  Add one or more boundaries to the cache preceding the first currently cached boundary.
     *  Leave the iteration position on the first added boundary.
     *  Return false if no boundaries could be added (if at the start of the text.)
     */
    boolean populatePreceding() {
        int textBegin = fText.getBeginIndex();
        int fromPosition = fBoundaries[fStartBufIdx];
        if (fromPosition == textBegin) {
            return false;
        }

        int position = textBegin;
        int positionStatusIdx = 0;

        if (fDictionaryCache.preceding(fromPosition)) {
            addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
            return true;
        }

        int backupPosition = fromPosition;

        // Find a boundary somewhere preceding the first already-cached boundary
        do {
            backupPosition = backupPosition - 30;
            if (backupPosition <= textBegin) {
                backupPosition = textBegin;
            } else {
                backupPosition = handleSafePrevious(backupPosition);
            }
            if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
                position = textBegin;
                positionStatusIdx = 0;
            } else {
                // Advance to the boundary following the backup position.
                // There is a complication: the safe reverse rules identify pairs of code points
                // that are safe. If advancing from the safe point moves forwards by less than
                // two code points, we need to advance one more time to ensure that the boundary
                // is good, including a correct rules status value.
                //
                fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
                position = handleNext();
                if (position == backupPosition + 1 ||
                        (position == backupPosition + 2 &&
                        Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
                        Character.isLowSurrogate(fText.next()))) {
                    // The initial handleNext() only advanced by a single code point. Go again.
                    // Safe rules identify safe pairs.
                    position = handleNext();
                }
                positionStatusIdx = fRuleStatusIndex;
            }
        } while (position >= fromPosition);

        // Find boundaries between the one we just located and the first already-cached boundary
        // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.

        fSideBuffer.removeAllElements();
        fSideBuffer.push(position);
        fSideBuffer.push(positionStatusIdx);

        do {
            int prevPosition = fPosition = position;
            int prevStatusIdx = positionStatusIdx;
            position = handleNext();
            positionStatusIdx = fRuleStatusIndex;
            if (position == BreakIterator.DONE) {
                break;
            }

            boolean segmentHandledByDictionary = false;
            if (fDictionaryCharCount != 0) {
                // Segment from the rules includes dictionary characters.
                // Subdivide it, with subdivided results going into the dictionary cache.
                int dictSegEndPosition = position;
                fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
                while (fDictionaryCache.following(prevPosition)) {
                    position = fDictionaryCache.fBoundary;
                    positionStatusIdx = fDictionaryCache.fStatusIndex;
                    segmentHandledByDictionary = true;
                    assert(position > prevPosition);
                    if (position >= fromPosition) {
                        break;
                    }
                    assert(position <= dictSegEndPosition);
                    fSideBuffer.push(position);
                    fSideBuffer.push(positionStatusIdx);
                    prevPosition = position;
                }
                assert(position==dictSegEndPosition || position>=fromPosition);
            }

            if (!segmentHandledByDictionary && position < fromPosition) {
                fSideBuffer.push(position);
                fSideBuffer.push(positionStatusIdx);
            }
        } while (position < fromPosition);

        // Move boundaries from the side buffer to the main circular buffer.
        boolean success = false;
        if (!fSideBuffer.isEmpty()) {
            positionStatusIdx = fSideBuffer.pop();
            position = fSideBuffer.pop();
            addPreceding(position, positionStatusIdx, UpdateCachePosition);
            success = true;
        }

        while (!fSideBuffer.isEmpty()) {
            positionStatusIdx = fSideBuffer.pop();
            position = fSideBuffer.pop();
            if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
                // No space in circular buffer to hold a new preceding result while
                // also retaining the current cache (iteration) position.
                // Bailing out is safe; the cache will refill again if needed.
                break;
            }
        }
        return success;
    };


    static final boolean RetainCachePosition = false;
    static final boolean UpdateCachePosition = true;

    /**
     * Add the boundary following the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    void addFollowing(int position, int ruleStatusIdx, boolean update) {
        assert(position > fBoundaries[fEndBufIdx]);
        assert(ruleStatusIdx <= Short.MAX_VALUE);
        int nextIdx = modChunkSize(fEndBufIdx + 1);
        if (nextIdx == fStartBufIdx) {
            fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
        }
        fBoundaries[nextIdx] = position;
        fStatuses[nextIdx] = (short)ruleStatusIdx;
        fEndBufIdx = nextIdx;
        if (update == UpdateCachePosition) {
            // Set current position to the newly added boundary.
            fBufIdx = nextIdx;
            fTextIdx = position;
        } else {
            // Retaining the original cache position.
            // Check if the added boundary wraps around the buffer, and would over-write the original position.
            // It's the responsibility of callers of this function to not add too many.
            assert(nextIdx != fBufIdx);
        }

    };


    /**
     * Add the boundary preceding the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
        assert(position < fBoundaries[fStartBufIdx]);
        assert(ruleStatusIdx <= Short.MAX_VALUE);
        int nextIdx = modChunkSize(fStartBufIdx - 1);
        if (nextIdx == fEndBufIdx) {
            if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
                // Failure. The insertion of the new boundary would claim the buffer position that is the
                // current iteration position. And we also want to retain the current iteration position.
                // (The buffer is already completely full of entries that precede the iteration position.)
                return false;
            }
            fEndBufIdx = modChunkSize(fEndBufIdx - 1);
        }
        fBoundaries[nextIdx] = position;
        fStatuses[nextIdx] = (short)ruleStatusIdx;
        fStartBufIdx = nextIdx;
        if (update == UpdateCachePosition) {
            fBufIdx = nextIdx;
            fTextIdx = position;
        }
        return true;
    };

    /**
     *  Set the cache position to the specified position, or, if the position
     *  falls between to cached boundaries, to the preceding boundary.
     *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
     *  The startPosition must be on a code point boundary.
     *
     *  Return true if successful, false if the specified position is after
     *  the last cached boundary or before the first.
     */
    boolean seek(int pos) {
        if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
            return false;
        }
        if (pos == fBoundaries[fStartBufIdx]) {
            // Common case: seek(0), from BreakIterator::first()
            fBufIdx = fStartBufIdx;
            fTextIdx = fBoundaries[fBufIdx];
            return true;
        }
        if (pos == fBoundaries[fEndBufIdx]) {
            fBufIdx = fEndBufIdx;
            fTextIdx = fBoundaries[fBufIdx];
            return true;
        }

        int min = fStartBufIdx;
        int max = fEndBufIdx;
        while (min != max) {
            int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
            probe = modChunkSize(probe);
            if (fBoundaries[probe] > pos) {
                max = probe;
            } else {
                min = modChunkSize(probe + 1);
            }
        }
        assert(fBoundaries[max] > pos);
        fBufIdx = modChunkSize(max - 1);
        fTextIdx = fBoundaries[fBufIdx];
        assert(fTextIdx <= pos);
        return true;

    };


    /**
     * copy constructor, used from RuleBasedBreakIterator.clone().
     *
     * @param src
     */
    BreakCache(BreakCache src)  {
        fStartBufIdx = src.fStartBufIdx;
        fEndBufIdx = src.fEndBufIdx;
        fTextIdx = src.fTextIdx;
        fBufIdx = src.fBufIdx;
        fBoundaries = src.fBoundaries.clone();
        fStatuses = src.fStatuses.clone();
        fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
    }

    void dumpCache() {
        System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
        for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
            System.out.printf("%d  %d%n", i, fBoundaries[i]);
            if (i == fEndBufIdx) {
                break;
            }
        }
    };

    private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };

    static final int CACHE_SIZE = 128;
    // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

    int                 fStartBufIdx;
    int                 fEndBufIdx;    // inclusive

    int                 fTextIdx;
    int                 fBufIdx;

    int[]               fBoundaries = new int[CACHE_SIZE];
    short[]             fStatuses = new short[CACHE_SIZE];

    DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
};




}