All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.RuleBasedBreakIterator Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2005-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.MissingResourceException;

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
import com.ibm.icu.impl.breakiter.CjkBreakEngine;
import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
import com.ibm.icu.impl.breakiter.LaoBreakEngine;
import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.util.CodePointTrie;

/**
 * Rule Based Break Iterator
 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * @stable ICU 2.0
 */
public class RuleBasedBreakIterator extends BreakIterator {
    //=======================================================================
    // Constructors & Factories
    //=======================================================================

    /**
     * Private no-argument constructor shared by the factory methods and the
     * rule-string constructor. It takes a snapshot of the shared break engine
     * list for this iterator and zeroes the dictionary character counter.
     * The rule data ({@code fRData}) is NOT set here; callers must assign it.
     */
    private RuleBasedBreakIterator() {
        synchronized (gAllBreakEngines) {
            // Copy while holding the lock that guards the shared list.
            this.fBreakEngines = new ArrayList<>(gAllBreakEngines);
        }
        this.fDictionaryCharCount = 0;
    }

    /**
     * Create a break iterator from precompiled binary break rules supplied by a stream.
     *
     * Loading binary rules is much faster than building an iterator from rule source.
     * Binary rules are produced by {@link #compileRules(String, OutputStream)} and are
     * not guaranteed to be compatible between different versions of ICU.
     *
     * The stream is handed to {@code ICUBinary.getByteBufferFromInputStreamAndCloseStream};
     * as that helper's name indicates, the stream is expected to be closed by it.
     *
     * @param is an input stream supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the InputStream.
     * @see    #compileRules(String, OutputStream)
     * @stable ICU 4.8
     */
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
        RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
        ByteBuffer ruleBytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(is);
        iterator.fRData = RBBIDataWrapper.get(ruleBytes);
        // One look-ahead slot per look-ahead result declared by the forward table.
        int lookAheadSlots = iterator.fRData.fFTable.fLookAheadResultsSize;
        iterator.fLookAheadMatches = new int[lookAheadSlots];
        return iterator;
    }

    /**
     * Package-private factory (no access modifier on purpose): builds a break iterator
     * from precompiled binary rules and records whether phrase breaking is requested.
     *
     * Loading binary rules is much faster than building an iterator from rule source.
     * Binary rules are produced by {@link #compileRules(String, OutputStream)} and are
     * not guaranteed to be compatible between different versions of ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @param phraseBreaking a flag indicating if phrase breaking is required.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see    #compileRules(String, OutputStream)
     * @internal
     */
    /* package-protected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
            ByteBuffer bytes, boolean phraseBreaking) throws IOException {
        // Delegate the rule loading, then stamp the phrase-breaking flag on the result.
        RuleBasedBreakIterator iterator = getInstanceFromCompiledRules(bytes);
        iterator.fPhraseBreaking = phraseBreaking;
        return iterator;
    }

    /**
     * Create a break iterator from precompiled binary break rules held in a buffer.
     *
     * Loading binary rules is much faster than building an iterator from rule source.
     * Binary rules are produced by {@link #compileRules(String, OutputStream)} and are
     * not guaranteed to be compatible between different versions of ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see    #compileRules(String, OutputStream)
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
        RuleBasedBreakIterator iterator = new RuleBasedBreakIterator();
        iterator.fRData = RBBIDataWrapper.get(bytes);
        // One look-ahead slot per look-ahead result declared by the forward table.
        int lookAheadSlots = iterator.fRData.fFTable.fLookAheadResultsSize;
        iterator.fLookAheadMatches = new int[lookAheadSlots];
        return iterator;
    }

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
     * The rules are compiled to their binary form in memory and then loaded exactly
     * as precompiled rules would be.
     *
     * @param rules The break rules to be used.
     * @throws RuntimeException if the freshly compiled rules cannot be read back;
     *         the underlying {@link IOException} is attached as the cause.
     * @stable ICU 2.2
     */
    public RuleBasedBreakIterator(String rules)  {
        this();
        try {
            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
            compileRules(rules, ruleOS);
            fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
            fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
        } catch (IOException e) {
            ///CLOVER:OFF
            // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
            //  causing bogus compiled rules to be produced, but with no compile error raised.
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                    + e.getMessage(), e);
            ///CLOVER:ON
        }
    }

    //=======================================================================
    // Boilerplate
    //=======================================================================

    /**
     * Clones this iterator.
     * The copy gets its own clone of the text iterator (when one is set), a fresh
     * snapshot of the shared break engine list, a new look-ahead scratch array,
     * and its own copies of the break cache and dictionary cache.
     *
     * @return A newly-constructed RuleBasedBreakIterator with the same
     * behavior as this one.
     * @stable ICU 2.0
     */
    @Override
    public Object clone()  {
        RuleBasedBreakIterator copy = (RuleBasedBreakIterator) super.clone();
        if (fText != null) {
            copy.fText = (CharacterIterator) fText.clone();
        }
        synchronized (gAllBreakEngines)  {
            // The copy starts from the shared engine list, not from this iterator's list.
            copy.fBreakEngines = new ArrayList<>(gAllBreakEngines);
        }
        int lookAheadSlots = fRData.fFTable.fLookAheadResultsSize;
        copy.fLookAheadMatches = new int[lookAheadSlots];
        // The caches are inner-class instances, so they must be created against the copy.
        copy.fBreakCache = copy.new BreakCache(fBreakCache);
        copy.fDictionaryCache = copy.new DictionaryCache(fDictionaryCache);
        return copy;
    }


    /**
     * Returns true if {@code that} is a RuleBasedBreakIterator (or a subclass instance)
     * with the same rule source that iterates over equal text at the same position.
     * Two iterators with no text are equal when their rules match.
     *
     * @param that the object to compare with; may be null.
     * @return true if the two iterators are equivalent as described above.
     * @stable ICU 2.0
     */
    @Override
    public boolean equals(Object that) {
        if (this == that) {
            return true;
        }
        // Explicit type test instead of relying on a caught ClassCastException;
        // instanceof is also false for null.
        if (!(that instanceof RuleBasedBreakIterator)) {
            return false;
        }
        RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;

        // Exactly one side lacking rule data means the iterators differ.
        if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
            return false;
        }
        if (fRData != null && other.fRData != null &&
                (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
            return false;
        }

        // With matching rules, compare the text and then the current position.
        if (fText == null && other.fText == null) {
            return true;
        }
        if (fText == null || other.fText == null || !fText.equals(other.fText)) {
            return false;
        }
        return fPosition == other.fPosition;
    }

    /**
     * Returns the description (rules) used to create this iterator, or the empty
     * string when no rule data is attached.
     * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
     *
     * @return the rule source text.
     * @stable ICU 2.0
     */
    @Override
    public String toString() {
        if (fRData == null) {
            return "";
        }
        return fRData.fRuleSource;
    }

    /**
     * Compute a hashcode for this BreakIterator.
     * The hash is derived from the rule source only, which keeps it consistent
     * with {@link #equals(Object)}: equal iterators always have equal rule source.
     * An iterator without rule data hashes to 0 instead of throwing, matching the
     * null-tolerance of {@code equals} and {@code toString}.
     *
     * @return A hash code
     * @stable ICU 2.0
     */
    @Override
    public int hashCode()
    {
        return (fRData == null) ? 0 : fRData.fRuleSource.hashCode();
    }


    // State numbers used by the state machines in handleNext() and handleSafePrevious().
    private static final int  START_STATE = 1;     // The state number of the starting state
    private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"

    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
    //               of user text.  A variable with this enum type keeps track of where we
    //               are.  The state machine only fetches user text input while in RUN mode.
    private static final int  RBBI_START  = 0;     // extra iteration before the first text character
    private static final int  RBBI_RUN    = 1;     // consuming characters from the text
    private static final int  RBBI_END    = 2;     // extra iteration after the end of the text

    /**
     * The character iterator through which this BreakIterator accesses the text.
     * Starts out as an iterator over the empty string; may be set to null via setText().
     */
    private CharacterIterator   fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance.
     * Not intended for public use. Declared public for testing purposes only.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RBBIDataWrapper    fRData;

    /**
     *  The iteration state - current position, rule status for the current position,
     *                        and whether the iterator ran off the end, yielding DONE.
     *                        Current position is always set to a boundary.
     *
     *  The current position of the iterator. first() leaves it at the text's begin
     *  index, so the pinned range is begin index <= fPosition <= end index
     *  (0 <= fPosition <= text.length for a plain string).
     *  Never has the value DONE (-1).
     */
    private int                fPosition;

    /**
     * Index of the Rule {tag} values for the most recent match.
     * It is an index into fRData.fStatusTable.
     */
    private int                fRuleStatusIndex;

    /**
     * True when iteration has run off the end, and iterator functions should return DONE.
     */
    private boolean            fDone;

    /**
     *  Array of look-ahead tentative results.
     *  Sized from fRData.fFTable.fLookAheadResultsSize when the rule data is loaded.
     */
    private int[]              fLookAheadMatches;

    /**
     *   Cache of previously determined boundary positions.
     */
    private BreakCache         fBreakCache = new BreakCache();

    /**
     * Flag used to indicate if phrase breaking is required.
     */
    private boolean            fPhraseBreaking = false;


    /**
     * Counter for the number of characters encountered with the "dictionary"
     *   flag set.  Normal RBBI iterators don't use it, although the code
     *   for updating it is live.  Dictionary Based break iterators (a subclass
     *   of us) access this field directly.
     *   NOTE(review): the field is private, so the remark about subclasses
     *   accessing it directly looks stale - confirm and reword.
     * @internal
     */
    private int fDictionaryCharCount;

    /**
     * Cache used for dictionary-based subdivision; reset whenever the text changes.
     */
    private DictionaryCache     fDictionaryCache = new DictionaryCache();

    /**
     * ICU debug argument name for RBBI
     */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /**
     * Debugging flag.  Trace operation of state machine when true.
     * Enabled only when the "rbbi" debug argument is enabled and its value
     * contains the text "trace".
     */
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
     *
     * Only one instance; shared by all break iterators.
     */
    private static final UnhandledBreakEngine gUnhandledBreakEngine;

    /**
     * List of all known break engines, common for all break iterators.
     * Lazily updated as break engines are needed, because instantiation of
     * break engines is expensive.
     *
     * Because gAllBreakEngines can be referenced concurrently from different
     * BreakIterator instances, all access is synchronized.
     *
     * The element type is declared explicitly: the list is iterated as
     * LanguageBreakEngine, which a raw List does not permit.
     */
    private static final List<LanguageBreakEngine> gAllBreakEngines;

    static {
        gUnhandledBreakEngine = new UnhandledBreakEngine();
        gAllBreakEngines = new ArrayList<>();
        // The unhandled engine is always present as the fallback entry.
        gAllBreakEngines.add(gUnhandledBreakEngine);
    }

    /**
     * List of all known break engines. Similar to gAllBreakEngines, but local to a
     * break iterator, allowing it to be used without synchronization.
     */
    private List<LanguageBreakEngine> fBreakEngines;

    /**
     * Dump the contents of the state table and character classes for this break iterator.
     * For debugging only.
     *
     * @param out the stream to write to; {@code System.out} is used when null.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public void dump(java.io.PrintStream out) {
        java.io.PrintStream target = (out != null) ? out : System.out;
        this.fRData.dump(target);
    }

    /**
     * Compile a set of source break rules into the binary state tables used
     * by the break iterator engine.  Creating a break iterator from precompiled
     * rules is much faster than creating one from source rules.
     *
     * Binary break rules are not guaranteed to be compatible between different
     * versions of ICU.
     *
     * This method simply delegates to {@code RBBIRuleBuilder.compileRules}.
     *
     * @param rules  The source form of the break rules
     * @param ruleBinary  An output stream to receive the compiled rules.
     * @throws IOException If there is an error writing the output.
     * @see #getInstanceFromCompiledRules(InputStream)
     * @stable ICU 4.8
     */
    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary);
    }

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     *
     * @return The offset of the beginning of the text, or
     *         {@code BreakIterator.DONE} when no text is set.
     * @stable ICU 2.0
     */
    @Override
    public int first() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        fText.first();
        final int begin = fText.getIndex();
        // Position the cache on the start offset, filling it first if it is not there yet.
        boolean alreadyCached = fBreakCache.seek(begin);
        if (!alreadyCached) {
            fBreakCache.populateNear(begin);
        }
        // current() is called for its side effect of publishing the cache position.
        fBreakCache.current();
        assert(fPosition == begin);
        return fPosition;
    }

    /**
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     *
     * @return The text's past-the-end offset, or {@code BreakIterator.DONE}
     *         when no text is set.
     * @stable ICU 2.0
     */
    @Override
    public int last() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        final int textEnd = fText.getEndIndex();
        // isBoundary() is called for its side effect of moving the iterator to textEnd;
        // it must stay outside the assert so it also runs with assertions disabled.
        boolean endIsBoundary = isBoundary(textEnd);
        assert(endIsBoundary);
        assert(fPosition == textEnd);
        return textEnd;
    }

    /**
     * Advances the iterator either forward or backward the specified number of steps.
     * Negative values move backward, and positive values move forward.  This is
     * equivalent to repeatedly calling next() or previous(); stepping stops early
     * as soon as one of those calls returns DONE. A step count of zero returns
     * the current position.
     *
     * @param n The number of steps to move.  The sign indicates the direction
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one, or DONE if the iteration ran off either end.
     * @stable ICU 2.0
     */
    @Override
    public int next(int n) {
        if (n == 0) {
            return current();
        }
        int boundary = 0;
        // At most one of the two loops executes, depending on the sign of n.
        while (n > 0 && boundary != DONE) {
            boundary = next();
            n--;
        }
        while (n < 0 && boundary != DONE) {
            boundary = previous();
            n++;
        }
        return boundary;
    }

    /**
     * Advances the iterator to the next boundary position.
     *
     * @return The position of the first boundary after this one, or DONE when
     *         the iteration has run off the end.
     * @stable ICU 2.0
     */
    @Override
    public int next() {
        // The cache moves the iterator and updates fPosition / fDone.
        fBreakCache.next();
        if (fDone) {
            return DONE;
        }
        return fPosition;
    }

    /**
     * Moves the iterator backwards, to the boundary preceding the current one.
     *
     * @return The position of the boundary position immediately preceding the
     *         starting position, or DONE when the iteration has run off the start.
     * @stable ICU 2.0
     */
    @Override
    public int previous() {
        // The cache moves the iterator and updates fPosition / fDone.
        fBreakCache.previous();
        if (fDone) {
            return DONE;
        }
        return fPosition;
    }

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     *
     * @param startPos The position from which to begin searching for a break position.
     * @return The position of the first break after the current position, or DONE
     *         if there is none. Also returns DONE when no text is set, in line with
     *         first() and last().
     * @stable ICU 2.0
     */
    @Override
    public int following(int startPos) {
        // setText(null) leaves the iterator without text; report DONE rather than
        // failing with a NullPointerException, as first()/last()/current() do.
        if (fText == null) {
            return BreakIterator.DONE;
        }

        // if the supplied position is before the beginning, return the
        // text's starting offset
        if (startPos < fText.getBeginIndex()) {
            return first();
        }

        // Move requested offset to a code point start. It might be between a lead and trail surrogate.
        // Or it may be beyond the end of the text.
        startPos = CISetIndex32(fText, startPos);
        fBreakCache.following(startPos);
        return fDone ? DONE : fPosition;
    }


    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position. An offset past the end of the text (or a missing text)
     * yields last(); an offset before the beginning yields first().
     *
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    @Override
    public int preceding(int offset) {
        if (fText == null || offset > fText.getEndIndex()) {
            return last();
        }
        if (offset < fText.getBeginIndex()) {
            return first();
        }

        // TODO: to match ICU4C behavior the offset should first be moved to a code point
        //       start with CISetIndex32(fText, offset); for now it is used unadjusted,
        //       even if it falls between a lead and trail surrogate.
        fBreakCache.preceding(offset);
        if (fDone) {
            return DONE;
        }
        return fPosition;
    }


    /**
     * Throw IllegalArgumentException unless begin &lt;= offset &lt;= end, where begin
     * and end are the text's begin index and end index (the end index itself is accepted).
     *
     * @param offset the offset to validate.
     * @param text the text whose index range is checked against.
     * @throws IllegalArgumentException if the offset lies outside that range.
     * @stable ICU 2.0
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        boolean withinText = offset >= text.getBeginIndex() && offset <= text.getEndIndex();
        if (!withinText) {
            throw new IllegalArgumentException("offset out of bounds");
        }
    }


    /**
     * Returns true if the specified position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     *
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @throws IllegalArgumentException if the offset is outside the text's index range.
     * @stable ICU 2.0
     */
    @Override
    public boolean isBoundary(int offset) {
        // TODO: behavior difference with ICU4C, which considers out-of-range offsets
        //       to not be boundaries, and to not be errors.
        checkOffset(offset, fText);

        // Snap the offset to a code point start that is not beyond the end of the text.
        // An offset in the middle of a code point is never a boundary, but the iterator
        // must still end up on the boundary that follows it.
        final int codePointStart = CISetIndex32(fText, offset);

        // Position the cache at (or just before) the snapped offset, populating it on a miss.
        boolean positioned = fBreakCache.seek(codePointStart) || fBreakCache.populateNear(codePointStart);
        // Compare against the caller's offset, not the snapped one.
        boolean onBoundary = positioned && (fBreakCache.current() == offset);

        if (!onBoundary) {
            // The cache was left on the preceding boundary; step to the following one.
            next();
        }
        return onBoundary;
    }

    /**
     * Returns the current iteration position.  When text is set, DONE is never
     * returned from this function; if iteration has run to the end of a
     * string, current() will return the length of the string while
     * next() will return BreakIterator.DONE. DONE is returned only when the
     * iterator has no text.
     *
     * @return The current iteration position.
     * @stable ICU 2.0
     */
    @Override
    public int current() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        return fPosition;
    }


    /**
     * Return the status tag from the break rule that determined the boundary at
     * the current iteration position.  The values appear in the rule source
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.  If more than one rule applies,
     * the numerically largest of the possible status values is returned.
     * 

* Of the standard types of ICU break iterators, only the word and line break * iterator provides status values. The values are defined in * class RuleBasedBreakIterator, and allow distinguishing between words * that contain alphabetic letters, "words" that appear to be numbers, * punctuation and spaces, words containing ideographic characters, and * more. Call getRuleStatus after obtaining a boundary * position from next(), previous(), or * any other break iterator functions that returns a boundary position. *

* Note that getRuleStatus() returns the value corresponding to * current() index even after next() has returned DONE. *

* @return the status from the break rule that determined the boundary * at the current iteration position. * * @stable ICU 60 */ @Override public int getRuleStatus() { // Status records have this form: // Count N <-- fLastRuleStatusIndex points here. // Status val 0 // Status val 1 // ... // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; int tagVal = fRData.fStatusTable[idx]; return tagVal; } /** * Get the status (tag) values from the break rule(s) that determined the boundary * at the current iteration position. The values appear in the rule source * within brackets, {123}, for example. The default status value for rules * that do not explicitly provide one is zero. *

* The status values used by the standard ICU break rules are defined * as public constants in class RuleBasedBreakIterator. *

* If the size of the output array is insufficient to hold the data, * the output will be truncated to the available length. No exception * will be thrown. * * @param fillInArray an array to be filled in with the status values. * @return The number of rule status values from the rules that determined * the boundary at the current iteration position. * In the event that the array is too small, the return value * is the total number of status values that were available, * not the reduced number that were actually returned. * @stable ICU 60 */ @Override public int getRuleStatusVec(int[] fillInArray) { int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i * Caution:The state of the returned CharacterIterator * must not be modified in any way while the BreakIterator is still in use. * Doing so will lead to undefined behavior of the BreakIterator. * Clone the returned CharacterIterator first and work with that. *

* The returned CharacterIterator is a reference * to the actual iterator being used by the BreakIterator. * No guarantees are made about the current position * of this iterator when it is returned; it may differ from the * BreakIterators current position. If you need to move that * position to examine the text, clone this function's return value first. * @return An iterator over the text being analyzed. * @stable ICU 2.0 */ @Override public CharacterIterator getText() { return fText; } /** * Set the iterator to analyze a new piece of text. This function resets * the current iteration position to the beginning of the text. * (The old iterator is dropped.) *

* Caution: The supplied CharacterIterator is used * directly by the BreakIterator, and must not be altered in any * way by code outside of the BreakIterator. * Doing so will lead to undefined behavior of the BreakIterator. * * @param newText An iterator over the text to analyze. * @stable ICU 2.0 */ @Override public void setText(CharacterIterator newText) { if (newText != null) { fBreakCache.reset(newText.getBeginIndex(), 0); } else { fBreakCache.reset(); } fDictionaryCache.reset(); fText = newText; this.first(); } /** * Control debug, trace and dump options. * @internal * @deprecated This API is ICU internal only. */ @Deprecated public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? ICUDebug.value(RBBI_DEBUG_ARG) : null; private LanguageBreakEngine getLanguageBreakEngine(int c) { // We have a dictionary character. // Does an already instantiated break engine handle it? for (LanguageBreakEngine candidate : fBreakEngines) { if (candidate.handles(c)) { return candidate; } } synchronized (gAllBreakEngines) { // This break iterator's list of break engines didn't handle the character. // Check the global list, another break iterator may have instantiated the // desired engine. for (LanguageBreakEngine candidate : gAllBreakEngines) { if (candidate.handles(c)) { fBreakEngines.add(candidate); return candidate; } } // The global list doesn't have an existing engine, build one. int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { // Katakana, Hiragana and Han are handled by the same dictionary engine. // Fold them together for mapping from script -> engine. 
script = UScript.HAN; } LanguageBreakEngine eng; try { switch (script) { case UScript.THAI: try { eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); } catch (MissingResourceException e) { eng = new ThaiBreakEngine(); } break; case UScript.LAO: eng = new LaoBreakEngine(); break; case UScript.MYANMAR: try { eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); } catch (MissingResourceException e) { eng = new BurmeseBreakEngine(); } break; case UScript.KHMER: eng = new KhmerBreakEngine(); break; case UScript.HAN: eng = new CjkBreakEngine(false); break; case UScript.HANGUL: eng = new CjkBreakEngine(true); break; default: gUnhandledBreakEngine.handleChar(c); eng = gUnhandledBreakEngine; break; } } catch (IOException e) { eng = null; } if (eng != null && eng != gUnhandledBreakEngine) { gAllBreakEngines.add(eng); fBreakEngines.add(eng); } return eng; } // end synchronized(gAllBreakEngines) } /** * The State Machine Engine for moving forward is here. * This function is the heart of the RBBI run time engine. * * Input * fPosition, the position in the text to begin from. * Output * fPosition: the boundary following the starting position. * fDictionaryCharCount the number of dictionary characters encountered. * If > 0, the segment will be further subdivided * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. * * @return the new iterator position * * A note on supplementary characters and the position of underlying * Java CharacterIterator: Normally, a character iterator is positioned at * the char most recently returned by next(). Within this function, when * a supplementary char is being processed, the char iterator is left * sitting on the trail surrogate, in the middle of the code point. * This is different from everywhere else, where an iterator always * points at the lead surrogate of a supplementary. 
*/
    private int handleNext() {
        // Overview: starting at fPosition, run the forward state table (fRData.fFTable)
        // over the text and return the boundary that was found. Side effects visible in
        // this method: fPosition, fRuleStatusIndex, fDictionaryCharCount and
        // fLookAheadMatches[] are updated, and fDone is set when the starting
        // character is already the end-of-input marker.
        if (TRACE) {
            System.out.println("Handle Next pos char state category");
        }

        // handleNext always sets the break tag value.
        // Set the default for it.
        fRuleStatusIndex = 0;
        fDictionaryCharCount = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CodePointTrie trie = fRData.fTrie;

        char[] stateTable = fRData.fFTable.fTable;
        int initialPosition = fPosition;
        text.setIndex(initialPosition);
        int result = initialPosition;

        // Set up the starting char
        // NOTE(review): nextTrail32() presumably combines a lead surrogate with its trail
        // and maps the iterator's end marker to DONE32 - confirm in CharacterIteration.
        int c = text.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = nextTrail32(text, c);
            if (c == DONE32) {
                // Nothing left to scan: flag the iterator as done and report DONE.
                fDone = true;
                return BreakIterator.DONE;
            }
        }

        // Set the initial state for the state machine
        int state = START_STATE;
        int row = fRData.getRowIndex(state);
        short category = 3;
        int flagsState = fRData.fFTable.fFlags;
        int dictStart = fRData.fFTable.fDictCategoriesStart;
        int mode = RBBI_RUN;
        if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
            // The table asks for a beginning-of-input pseudo character: preset category 2
            // and use RBBI_START so the first loop iteration does not consume real text.
            category = 2;
            mode = RBBI_START;
            if (TRACE) {
                System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
            }
        }

        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            } else if (mode == RBBI_RUN) {
                // Get the char category. An incoming category of 1 or 2 means that
                // we are preset for doing the beginning or end of input, and
                // that we shouldn't get a category from an actual text input character.
                //
                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) trie.get(c);

                // Check for categories that require word dictionary handling.
                // (Categories at or above dictStart are counted as dictionary characters.)
                if (category >= dictStart) {
                    fDictionaryCharCount++;
                }

                if (TRACE) {
                    System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                    System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
                }

                // Advance to the next character.
                // If this is a beginning-of-input loop iteration, don't advance.
                // The next iteration will be processing the first real input character.
                c = text.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = nextTrail32(text, c);
                }
            } else {
                // mode was RBBI_START: the preset beginning-of-input category is used for
                // this iteration only; real characters are processed from the next one.
                mode = RBBI_RUN;
            }

            // look up a state transition in the state table
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);
            int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
            if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Match found, common case
                result = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                // Remember the break status (tag) values.
                fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
            } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Lookahead match is completed
                // The accepting value indexes fLookAheadMatches[]; a non-negative entry is
                // returned as the boundary.
                int lookaheadResult = fLookAheadMatches[accepting];
                if (lookaheadResult >= 0) {
                    fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
                    fPosition = lookaheadResult;
                    return lookaheadResult;
                }
            }

            // If we are at the position of the '/' in a look-ahead (hard break) rule;
            // record the current position, to be returned later, if the full rule matches.
            // TODO: Move this check before the previous check of fAccepting.
            //       This would enable hard-break rules with no following context.
            //       But there are line break test failures when trying this. Investigate.
            //       Issue ICU-20837
            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            if (rule != 0) {
                int pos = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning of it.
                    pos--;
                }
                fLookAheadMatches[rule] = pos;
            }
        } // End of state machine main loop

        // The state machine is done. Check whether it found a match...

        // If the iterator failed to advance in the match engine force it ahead by one.
        // This indicates a defect in the break rules, which should always match
        // at least one character.
        if (result == initialPosition) {
            if (TRACE) {
                System.out.println("Iterator did not move. Advancing by 1.");
            }
            text.setIndex(initialPosition);
            next32(text);
            result = text.getIndex();
            fRuleStatusIndex = 0;
        }

        // Leave the iterator at our result position.
        // (we may have advanced beyond the last accepting position chasing after
        // longer matches that never completed.)
        fPosition = result;

        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    /**
     * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
     * This locates a "Safe Position" from which the forward break rules
     * will operate correctly. A Safe Position is not necessarily a boundary itself.
     *
     * The logic of this function is very similar to handleNext(), above, but simpler
     * because the safe table does not require as many options.
     *
     * @param fromPosition the position in the input text to begin the iteration.
* @internal */ private int handleSafePrevious(int fromPosition) { char state; short category = 0; int result = 0; // caches for quicker access CharacterIterator text = fText; CodePointTrie trie = fRData.fTrie; char[] stateTable = fRData.fRTable.fTable; CISetIndex32(text, fromPosition); if (TRACE) { System.out.print("Handle Previous pos char state category"); } // if we're already at the start of the text, return DONE. if (text.getIndex() == text.getBeginIndex()) { return BreakIterator.DONE; } // Set the initial state for the state machine int c = CharacterIteration.previous32(text); state = START_STATE; int row = fRData.getRowIndex(state); // loop until we reach the start of the text or transition to state 0 // for (; c != DONE32; c = CharacterIteration.previous32(text)) { // look up the current character's character category, which tells us // which column in the state table to look at. // // And off the dictionary flag bit. For reverse iteration it is not used. category = (short) trie.get(c); if (TRACE) { System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } // State Transition - move machine to its next state // assert(category < fRData.fHeader.fCatCount); state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; row = fRData.getRowIndex(state); if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // Transition to state zero means we have found a safe point. break; } } // The state machine is done. result = text.getIndex(); if (TRACE) { System.out.println("result = " + result); } return result; } /** * Set the index of a CharacterIterator. * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. * If the index points to a trail surrogate of a supplementary character, adjust it * to the start (lead surrogate) index. 
*
     * @param ci A CharacterIterator to set
     * @param index the index to set
     * @return the resulting index, possibly pinned or adjusted.
     */
    private static int CISetIndex32(CharacterIterator ci, int index) {
        if (index <= ci.getBeginIndex()) {
            // At or before the beginning: pin to the first position.
            ci.first();
        } else if (index >= ci.getEndIndex()) {
            // At or past the end: pin to the end index.
            ci.setIndex(ci.getEndIndex());
        } else if (Character.isLowSurrogate(ci.setIndex(index))) {
            // Landed on a trail surrogate. Step back onto the lead surrogate if there is
            // one; otherwise (unpaired trail) undo the step and stay where we were.
            if (!Character.isHighSurrogate(ci.previous())) {
                ci.next();
            }
        }
        return ci.getIndex();
    }

    /** DictionaryCache stores the boundaries obtained from a run of dictionary characters.
     *                 Dictionary boundaries are moved first to this cache, then from here
     *                 to the main BreakCache, where they may inter-leave with non-dictionary
     *                 boundaries. The public BreakIterator API always fetches directly
     *                 from the main BreakCache, not from here.
     *
     *                 In common situations, the number of boundaries in a single dictionary run
     *                 should be quite small, it will be terminated by punctuation, spaces,
     *                 or any other non-dictionary characters. The main BreakCache may end
     *                 up with boundaries from multiple dictionary based runs.
     *
     *                 The boundaries are stored in a simple ArrayList (vector), with the
     *                 assumption that they will be accessed sequentially.
     */
    class DictionaryCache {

        /** Empty the cache: clear all boundaries and reset positions and status indexes. */
        void reset() {
            fPositionInCache = -1;
            fStart = 0;
            fLimit = 0;
            fFirstRuleStatusIndex = 0;
            fOtherRuleStatusIndex = 0;
            fBreaks.removeAllElements();
        };

        /**
         * Find the cached dictionary boundary following fromPos.
         * On success the result is left in fBoundary / fStatusIndex.
         *
         * @param fromPos the text position to search from.
         * @return true if a boundary was found; false if fromPos is outside of
         *         [fStart, fLimit) or no following boundary is cached.
         */
        boolean following(int fromPos) {
            if (fromPos >= fLimit || fromPos < fStart) {
                fPositionInCache = -1;
                return false;
            }

            // Sequential iteration, move from previous boundary to the following

            int r = 0;
            if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
                ++fPositionInCache;
                if (fPositionInCache >= fBreaks.size()) {
                    fPositionInCache = -1;
                    return false;
                }
                r = fBreaks.elementAt(fPositionInCache);
                assert(r > fromPos);
                fBoundary = r;
                fStatusIndex = fOtherRuleStatusIndex;
                return true;
            }

            // Random indexing. Linear search for the boundary following the given position.

            for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
                r= fBreaks.elementAt(fPositionInCache);
                if (r > fromPos) {
                    fBoundary = r;
                    fStatusIndex = fOtherRuleStatusIndex;
                    return true;
                }
            }

            // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
            assert(false);
            fPositionInCache = -1;
            return false;
        };

        /**
         * Find the cached dictionary boundary preceding fromPos.
         * On success the result is left in fBoundary / fStatusIndex; the status is
         * fFirstRuleStatusIndex when the boundary equals fStart, otherwise
         * fOtherRuleStatusIndex.
         *
         * @param fromPos the text position to search from.
         * @return true if a boundary was found; false if fromPos is outside of
         *         (fStart, fLimit] or no preceding boundary is cached.
         */
        boolean preceding(int fromPos) {
            if (fromPos <= fStart || fromPos > fLimit) {
                fPositionInCache = -1;
                return false;
            }

            if (fromPos == fLimit) {
                // Starting from the limit: position on the last cached entry.
                fPositionInCache = fBreaks.size() - 1;
                if (fPositionInCache >= 0) {
                    assert(fBreaks.elementAt(fPositionInCache) == fromPos);
                }
            }

            int r;
            // Sequential iteration: the cache position is already on fromPos, step back one.
            if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
                --fPositionInCache;
                r = fBreaks.elementAt(fPositionInCache);
                assert(r < fromPos);
                fBoundary = r;
                fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
                return true;
            }

            if (fPositionInCache == 0) {
                fPositionInCache = -1;
                return false;
            }

            // Random indexing: linear search backwards for the first boundary before fromPos.
            for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
                r = fBreaks.elementAt(fPositionInCache);
                if (r < fromPos) {
                    fBoundary = r;
                    fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
                    return true;
                }
            }
            assert(false);
            fPositionInCache = -1;
            return false;
        };

        /**
         * Populate the cache with the dictionary based boundaries within a region of text.
         * @param startPos  The start position of a range of text
         * @param endPos    The end position of a range of text
         * @param firstRuleStatus The rule status index that applies to the break at startPos
         * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
         * @internal
         */
        void populateDictionary(int startPos, int endPos,
                                int firstRuleStatus, int otherRuleStatus) {
            // A range of at most one index position cannot be subdivided; note that the
            // existing cache contents are left untouched in that case.
            if ((endPos - startPos) <= 1) {
                return;
            }

            reset();
            fFirstRuleStatusIndex = firstRuleStatus;
            fOtherRuleStatusIndex = otherRuleStatus;

            int rangeStart = startPos;
            int rangeEnd = endPos;

            int category;
            int current;
            int foundBreakCount = 0;

            // Loop through the text, looking for ranges of dictionary characters.
            // For each span, find the appropriate break engine, and ask it to find
            // any breaks within the span.

            fText.setIndex(rangeStart);
            int c = CharacterIteration.current32(fText);
            category = (short)fRData.fTrie.get(c);
            int dictStart = fRData.fFTable.fDictCategoriesStart;

            while(true) {
                // Skip forward over non-dictionary characters (category below dictStart).
                while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
                    c = CharacterIteration.next32(fText);    // pre-increment
                    category = (short)fRData.fTrie.get(c);
                }
                if (current >= rangeEnd) {
                    break;
                }

                // We now have a dictionary character. Get the appropriate language object
                // to deal with it.
                LanguageBreakEngine lbe = getLanguageBreakEngine(c);

                // Ask the language object if there are any breaks. It will add them to the cache and
                // leave the text pointer on the other side of its range, ready to search for the next one.
                if (lbe != null) {
                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
                }

                // Reload the loop variables for the next go-round
                c = CharacterIteration.current32(fText);
                category = (short)fRData.fTrie.get(c);
            }

            // If we found breaks, ensure that the first and last entries are
            // the original starting and ending position. And initialize the
            // cache iteration position to the first entry.

            // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
            if (foundBreakCount > 0) {
                assert(foundBreakCount == fBreaks.size());
                if (startPos < fBreaks.elementAt(0)) {
                    // The dictionary did not place a boundary at the start of the segment of text.
                    // Add one now. This should not commonly happen, but it would be easy for interactions
                    // of the rules for dictionary segments and the break engine implementations to
                    // inadvertently cause it. Cover it here, just in case.
                    fBreaks.offer(startPos);
                }
                if (endPos > fBreaks.peek()) {
                    fBreaks.push(endPos);
                }
                fPositionInCache = 0;
                // Note: Dictionary matching may extend beyond the original limit.
                fStart = fBreaks.elementAt(0);
                fLimit = fBreaks.peek();
            } else {
                // there were no language-based breaks, even though the segment contained
                // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
                // for this range will fail, and the calling code will fall back to the rule based boundaries.
            }

        };

        /** Create an empty cache with no current position. */
        DictionaryCache() {
            fPositionInCache = -1;
            fBreaks = new DictionaryBreakEngine.DequeI();
        }

        /**
         * copy constructor. Used by RuleBasedBreakIterator.clone().
         *
         * @param src the source object to be copied.
         */
        DictionaryCache(DictionaryCache src) {
            try {
                fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
            }
            catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
            fPositionInCache = src.fPositionInCache;
            fStart = src.fStart;
            fLimit = src.fLimit;
            fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
            fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
            fBoundary = src.fBoundary;
            fStatusIndex = src.fStatusIndex;
        }

        // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
        DictionaryBreakEngine.DequeI fBreaks;
        int fPositionInCache;       // Index in fBreaks of last boundary returned by following()
        //                          //    or preceding(). Optimizes sequential access.
        int fStart;                 // Text position of first boundary in cache.
        int fLimit;                 // Last boundary in cache. Which is the limit of the
        //                          //    text segment being handled by the dictionary.
        int fFirstRuleStatusIndex;  // Rule status info for first boundary.
        int fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
        int fBoundary;              // Current boundary. Set by preceding(), following().
        int fStatusIndex;           // Current rule status index. Set by preceding, following().
    };




    /*
     * class BreakCache
     *
     * Cache of break boundary positions and rule status values.
     * Break iterator API functions, next(), previous(), etc., will use cached results
     * when possible, and otherwise cache new results as they are obtained.
     *
     * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
     *
     * The cache is implemented as a single circular buffer.
     */

    /*
     * size of the circular cache buffer.
     */

    class BreakCache {

        /** Create a cache holding only the boundary at text position 0 with status 0. */
        BreakCache() {
            reset();
        };

        /**
         * Reset the cache to hold a single boundary.
         *
         * @param pos the text position of the one boundary to keep.
         * @param ruleStatus the rule status index for that boundary (stored as a short).
         */
        void reset(int pos, int ruleStatus) {
            fStartBufIdx = 0;
            fEndBufIdx = 0;
            fTextIdx = pos;
            fBufIdx = 0;
            fBoundaries[0] = pos;
            fStatuses[0] = (short)ruleStatus;
        }

        /** Reset the cache to hold only position 0 with rule status 0. */
        void reset() {reset(0, 0); };

        /**
         * Advance to the next boundary, filling the cache when positioned on its last
         * entry. Updates fPosition and fRuleStatusIndex of the enclosing iterator;
         * fDone is set from the result of populateFollowing() on the slow path only.
         */
        void next() {
            if (fBufIdx == fEndBufIdx) {
                fDone = !populateFollowing();
                fPosition = fTextIdx;
                fRuleStatusIndex = fStatuses[fBufIdx];
            } else {
                fBufIdx = modChunkSize(fBufIdx + 1);
                fTextIdx = fPosition = fBoundaries[fBufIdx];
                fRuleStatusIndex = fStatuses[fBufIdx];
            }
        };

        /**
         * Move to the previous boundary, prepending to the cache when positioned on its
         * first entry. fDone is set when the buffer position did not move.
         */
        void previous() {
            int initialBufIdx = fBufIdx;
            if (fBufIdx == fStartBufIdx) {
                // At start of cache. Prepend to it.
                populatePreceding();
            } else {
                // Cache already holds the next boundary
                fBufIdx = modChunkSize(fBufIdx - 1);
                fTextIdx = fBoundaries[fBufIdx];
            }
            fDone = (fBufIdx == initialBufIdx);
            fPosition = fTextIdx;
            fRuleStatusIndex = fStatuses[fBufIdx];
            return;
        };

        // Move the iteration state to the position following the startPosition.
        // Input position must be pinned to the input length.
        void following(int startPos) {
            if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
                // startPos is in the cache. Do a next() from that position.
                // TODO: an awkward set of interactions with bi->fDone
                //       seek() does not clear it; it can't because of interactions with populateNear().
                //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
                //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
                fDone = false;
                next();
            }

        };

        // Move the iteration state to the boundary preceding startPos.
        void preceding(int startPos) {
            if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
                if (startPos == fTextIdx) {
                    previous();
                } else {
                    // seek() leaves the BreakCache positioned at the preceding boundary
                    //        if the requested position is between two boundaries.
                    // current() pushes the BreakCache position out to the BreakIterator itself.
                    assert(startPos > fTextIdx);
                    current();
                }
            }
            return;
        };

        /**
         * Update the state of the public BreakIterator (fBI) to reflect the
         * current state of the break iterator cache (this).
         */
        int current() {
            fPosition = fTextIdx;
            fRuleStatusIndex = fStatuses[fBufIdx];
            fDone = false;
            return fTextIdx;
        };

        /**
         * Add boundaries to the cache near the specified position.
         * The given position need not be a boundary itself.
         * The input position must be within the range of the text, and
         * on a code point boundary.
         * If the requested position is a break boundary, leave the iteration
         * position on it.
         * If the requested position is not a boundary, leave the iteration
         * position on the preceding boundary and include both the
         * preceding and following boundaries in the cache.
         * Additional boundaries, either preceding or following, may be added
         * to the cache as a side effect.
         *
         * Return false if the operation failed.
         */
        boolean populateNear(int position) {
            assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

            // Add boundaries to the cache near the specified position.
            // The given position need not be a boundary itself.
            // The input position must be within the range of the text, and
            // on a code point boundary.
            // If the requested position is a break boundary, leave the iteration
            // position on it.
            // If the requested position is not a boundary, leave the iteration
            // position on the preceding boundary and include both the
            // preceding and following boundaries in the cache.
            // Additional boundaries, either preceding or following, may be added
            // to the cache as a side effect.

            // If the requested position is not near already cached positions, clear the existing cache,
            // find a near-by boundary and begin new cache contents there.

            // Threshold for a text position to be considered near to existing cache contents.
            // TODO: See issue ICU-22024 "perf tuning of Cache needed."
            //       This value is subject to change. See the ticket for more details.
            final int CACHE_NEAR = 15;

            int startOfText = fText.getBeginIndex();
            int aBoundary = -1;
            int ruleStatusIndex = 0;
            boolean retainCache = false;
            if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
                // Requested position is near the existing cache. Retain it.
                retainCache = true;
            } else if (position <= startOfText + CACHE_NEAR) {
                // Requested position is near the start of the text. Fill cache from start, skipping
                // the need to find a safe point.
                retainCache = false;
                aBoundary = startOfText;
            } else {
                // Requested position is not near the existing cache.
                // Find a safe point to refill the cache from.
                int backupPos = handleSafePrevious(position);

                if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
                    // The requested position is beyond the end of the existing cache, but the
                    // reverse rules produced a position near or before the cached region.
                    // Retain the existing cache, and fill from the end of it.
                    retainCache = true;
                } else if (backupPos < startOfText + CACHE_NEAR) {
                    // The safe reverse rules moved us to near the start of text.
                    // Take that (index 0) as the backup boundary, avoiding the complication
                    // (in the following block) of moving forward from the safe point to a known boundary.
                    //
                    // Retain the cache if it begins not too far from the requested position.
                    aBoundary = startOfText;
                    retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
                } else {
                    // The safe reverse rules produced a position that is neither near the existing
                    // cache, nor near the start of text.
                    // Advance to the boundary following.
                    // There is a complication: the safe reverse rules identify pairs of code points
                    // that are safe. If advancing from the safe point moves forwards by less than
                    // two code points, we need to advance one more time to ensure that the boundary
                    // is good, including a correct rules status value.
                    //
                    retainCache = false;
                    fPosition = backupPos;
                    aBoundary = handleNext();
                    if (aBoundary == backupPos + 1 ||
                            (aBoundary == backupPos + 2 &&
                            Character.isHighSurrogate(fText.setIndex(backupPos)) &&
                            Character.isLowSurrogate(fText.next()))) {
                        // The initial handleNext() only advanced by a single code point. Go again.
                        // Safe rules identify safe pairs.
                        aBoundary = handleNext();
                    }
                    if (aBoundary == BreakIterator.DONE) {
                        aBoundary = fText.getEndIndex();
                    }
                    ruleStatusIndex = fRuleStatusIndex;
                }
            }

            if (!retainCache) {
                assert(aBoundary != -1);
                reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
            }


            // Fill in boundaries between existing cache content and the new requested position.

            if (fBoundaries[fEndBufIdx] < position) {
                // The last position in the cache precedes the requested position.
                // Add following position(s) to the cache.
                while (fBoundaries[fEndBufIdx] < position) {
                    if (!populateFollowing()) {
                        assert false;
                        return false;
                    }
                }
                fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
                fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
                while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
                    previous();
                }
                return true;
            }

            if (fBoundaries[fStartBufIdx] > position) {
                // The first position in the cache is beyond the requested position.
                // back up more until we get a boundary <= the requested position.
                while (fBoundaries[fStartBufIdx] > position) {
                    populatePreceding();
                }
                fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
                fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
                while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
                    next();
                }
                if (fTextIdx > position) {
                    // If position is not itself a boundary, the next() loop above will overshoot.
                    // Back up one, leaving cache position at the boundary preceding the requested position.
                    previous();
                }
                return true;
            }

            assert fTextIdx == position;
            return true;

        };

        /**
         * Add boundary(s) to the cache following the current last boundary.
         * Return false if at the end of the text, and no more boundaries can be added.
         * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
         */
        boolean populateFollowing() {
            int fromPosition = fBoundaries[fEndBufIdx];
            int fromRuleStatusIdx = fStatuses[fEndBufIdx];
            int pos = 0;
            int ruleStatusIdx = 0;

            // Prefer a boundary already held by the dictionary cache, if there is one.
            if (fDictionaryCache.following(fromPosition)) {
                addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
                return true;
            }

            // Otherwise run the rules forward from the last cached boundary.
            fPosition = fromPosition;
            pos = handleNext();
            if (pos == BreakIterator.DONE) {
                return false;
            }

            ruleStatusIdx = fRuleStatusIndex;
            if (fDictionaryCharCount > 0) {
                // The text segment obtained from the rules includes dictionary characters.
                // Subdivide it, with subdivided results going into the dictionary cache.
                fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
                if (fDictionaryCache.following(fromPosition)) {
                    addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
                    return true;
                    // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
                    //       But be careful with interactions with populateNear().
                }
            }

            // Rule based segment did not include dictionary characters.
            // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
            //    meaning that we didn't take the return, above.
            // Add its end point to the cache.
            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

            // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
            //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
            //
            for (int count=0; count<6; ++count) {
                pos = handleNext();
                if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
                    break;
                }
                addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
            }
            return true;
        };

        /**
         * Add one or more boundaries to the cache preceding the first currently cached boundary.
         * Leave the iteration position on the first added boundary.
         * Return false if no boundaries could be added (if at the start of the text.)
         */
        boolean populatePreceding() {
            int textBegin = fText.getBeginIndex();
            int fromPosition = fBoundaries[fStartBufIdx];
            if (fromPosition == textBegin) {
                return false;
            }

            int position = textBegin;
            int positionStatusIdx = 0;

            // Prefer a boundary already held by the dictionary cache, if there is one.
            if (fDictionaryCache.preceding(fromPosition)) {
                addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
                return true;
            }

            int backupPosition = fromPosition;

            // Find a boundary somewhere preceding the first already-cached boundary
            // (each attempt backs up a further 30 index positions before looking for a safe point).
            do {
                backupPosition = backupPosition - 30;
                if (backupPosition <= textBegin) {
                    backupPosition = textBegin;
                } else {
                    backupPosition = handleSafePrevious(backupPosition);
                }
                if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
                    position = textBegin;
                    positionStatusIdx = 0;
                } else {
                    // Advance to the boundary following the backup position.
                    // There is a complication: the safe reverse rules identify pairs of code points
                    // that are safe. If advancing from the safe point moves forwards by less than
                    // two code points, we need to advance one more time to ensure that the boundary
                    // is good, including a correct rules status value.
                    //
                    fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
                    position = handleNext();
                    if (position == backupPosition + 1 ||
                            (position == backupPosition + 2 &&
                            Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
                            Character.isLowSurrogate(fText.next()))) {
                        // The initial handleNext() only advanced by a single code point. Go again.
                        // Safe rules identify safe pairs.
                        position = handleNext();
                    }
                    positionStatusIdx = fRuleStatusIndex;
                }
            } while (position >= fromPosition);

            // Find boundaries between the one we just located and the first already-cached boundary
            // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
            // (Entries are pushed as pairs: position first, then its status index.)

            fSideBuffer.removeAllElements();
            fSideBuffer.push(position);
            fSideBuffer.push(positionStatusIdx);

            do {
                int prevPosition = fPosition = position;
                int prevStatusIdx = positionStatusIdx;
                position = handleNext();
                positionStatusIdx = fRuleStatusIndex;
                if (position == BreakIterator.DONE) {
                    break;
                }

                boolean segmentHandledByDictionary = false;
                if (fDictionaryCharCount != 0) {
                    // Segment from the rules includes dictionary characters.
                    // Subdivide it, with subdivided results going into the dictionary cache.
                    int dictSegEndPosition = position;
                    fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
                    while (fDictionaryCache.following(prevPosition)) {
                        position = fDictionaryCache.fBoundary;
                        positionStatusIdx = fDictionaryCache.fStatusIndex;
                        segmentHandledByDictionary = true;
                        assert(position > prevPosition);
                        if (position >= fromPosition) {
                            break;
                        }
                        assert(position <= dictSegEndPosition);
                        fSideBuffer.push(position);
                        fSideBuffer.push(positionStatusIdx);
                        prevPosition = position;
                    }
                    assert(position==dictSegEndPosition || position>=fromPosition);
                }

                if (!segmentHandledByDictionary && position < fromPosition) {
                    fSideBuffer.push(position);
                    fSideBuffer.push(positionStatusIdx);
                }
            } while (position < fromPosition);

            // Move boundaries from the side buffer to the main circular buffer.
            // (Popped in reverse push order: status index first, then position.)
            boolean success = false;
            if (!fSideBuffer.isEmpty()) {
                positionStatusIdx = fSideBuffer.pop();
                position = fSideBuffer.pop();
                addPreceding(position, positionStatusIdx, UpdateCachePosition);
                success = true;
            }

            while (!fSideBuffer.isEmpty()) {
                positionStatusIdx = fSideBuffer.pop();
                position = fSideBuffer.pop();
                if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
                    // No space in circular buffer to hold a new preceding result while
                    // also retaining the current cache (iteration) position.
                    // Bailing out is safe; the cache will refill again if needed.
                    break;
                }
            }
            return success;
        };


        // Values for the 'update' argument of addFollowing() / addPreceding().
        static final boolean RetainCachePosition = false;
        static final boolean UpdateCachePosition = true;

        /**
         * Add the boundary following the current position.
         * The current position can be left as it was, or changed to the newly added boundary,
         * as specified by the update parameter.
         */
        void addFollowing(int position, int ruleStatusIdx, boolean update) {
            assert(position > fBoundaries[fEndBufIdx]);
            assert(ruleStatusIdx <= Short.MAX_VALUE);
            int nextIdx = modChunkSize(fEndBufIdx + 1);
            if (nextIdx == fStartBufIdx) {
                // Buffer is full: discard the oldest entries at the start to make room.
                fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
            }
            fBoundaries[nextIdx] = position;
            fStatuses[nextIdx] = (short)ruleStatusIdx;
            fEndBufIdx = nextIdx;
            if (update == UpdateCachePosition) {
                // Set current position to the newly added boundary.
                fBufIdx = nextIdx;
                fTextIdx = position;
            } else {
                // Retaining the original cache position.
                // Check if the added boundary wraps around the buffer, and would over-write the original position.
                // It's the responsibility of callers of this function to not add too many.
                assert(nextIdx != fBufIdx);
            }

        };


        /**
         * Add the boundary preceding the current position.
         * The current position can be left as it was, or changed to the newly added boundary,
         * as specified by the update parameter.
         */
        boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
            assert(position < fBoundaries[fStartBufIdx]);
            assert(ruleStatusIdx <= Short.MAX_VALUE);
            int nextIdx = modChunkSize(fStartBufIdx - 1);
            if (nextIdx == fEndBufIdx) {
                if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
                    // Failure. The insertion of the new boundary would claim the buffer position that is the
                    // current iteration position. And we also want to retain the current iteration position.
                    // (The buffer is already completely full of entries that precede the iteration position.)
                    return false;
                }
                // Buffer is full: drop the last entry to make room at the front.
                fEndBufIdx = modChunkSize(fEndBufIdx - 1);
            }
            fBoundaries[nextIdx] = position;
            fStatuses[nextIdx] = (short)ruleStatusIdx;
            fStartBufIdx = nextIdx;
            if (update == UpdateCachePosition) {
                fBufIdx = nextIdx;
                fTextIdx = position;
            }
            return true;
        };

        /**
         * Set the cache position to the specified position, or, if the position
         * falls between two cached boundaries, to the preceding boundary.
         * Fails if the requested position is outside of the range of boundaries currently held by the cache.
         * The startPosition must be on a code point boundary.
         *
         * Return true if successful, false if the specified position is after
         * the last cached boundary or before the first.
         */
        boolean seek(int pos) {
            if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
                return false;
            }
            if (pos == fBoundaries[fStartBufIdx]) {
                // Common case: seek(0), from BreakIterator::first()
                fBufIdx = fStartBufIdx;
                fTextIdx = fBoundaries[fBufIdx];
                return true;
            }
            if (pos == fBoundaries[fEndBufIdx]) {
                fBufIdx = fEndBufIdx;
                fTextIdx = fBoundaries[fBufIdx];
                return true;
            }

            // Binary search over the circular buffer for the first boundary greater than pos.
            // When the live region wraps (min > max), CACHE_SIZE is added before halving so
            // that the midpoint is computed in unwrapped coordinates.
            int min = fStartBufIdx;
            int max = fEndBufIdx;
            while (min != max) {
                int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
                probe = modChunkSize(probe);
                if (fBoundaries[probe] > pos) {
                    max = probe;
                } else {
                    min = modChunkSize(probe + 1);
                }
            }
            assert(fBoundaries[max] > pos);
            fBufIdx = modChunkSize(max - 1);
            fTextIdx = fBoundaries[fBufIdx];
            assert(fTextIdx <= pos);
            return true;

        };


        /**
         * copy constructor, used from RuleBasedBreakIterator.clone().
         *
         * @param src
         */
        BreakCache(BreakCache src)  {
            fStartBufIdx = src.fStartBufIdx;
            fEndBufIdx = src.fEndBufIdx;
            fTextIdx = src.fTextIdx;
            fBufIdx = src.fBufIdx;
            fBoundaries = src.fBoundaries.clone();
            fStatuses = src.fStatuses.clone();
            fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
        }

        /** Debugging aid: print the current position and every cached boundary to System.out. */
        void dumpCache() {
            System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx);
            for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
                System.out.printf("%d %d%n", i, fBoundaries[i]);
                if (i == fEndBufIdx) {
                    break;
                }
            }
        };

        /** Wrap a buffer index into the range 0 .. CACHE_SIZE-1 (CACHE_SIZE is a power of two). */
        private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); };

        static final int CACHE_SIZE = 128;
        // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

        int fStartBufIdx;       // Buffer index of the first (lowest position) cached boundary.
        int fEndBufIdx;         // inclusive
        int fTextIdx;           // Text position of the current cache (iteration) position.
        int fBufIdx;            // Buffer index of the current cache (iteration) position.

        int[] fBoundaries = new int[CACHE_SIZE];      // Cached boundary positions.
        short[] fStatuses = new short[CACHE_SIZE];    // Rule status index for each cached boundary.

        // Scratch storage used by populatePreceding() for (position, status) pairs.
        DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI();
    };

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy