com.ibm.icu.text.CollationElementIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;

import java.text.CharacterIterator;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.impl.CharacterIteratorWrapper;
import com.ibm.icu.impl.coll.Collation;
import com.ibm.icu.impl.coll.CollationData;
import com.ibm.icu.impl.coll.CollationIterator;
import com.ibm.icu.impl.coll.ContractionsAndExpansions;
import com.ibm.icu.impl.coll.FCDIterCollationIterator;
import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
import com.ibm.icu.impl.coll.IterCollationIterator;
import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.impl.coll.UVector32;

/**
 * <code>CollationElementIterator</code> is an iterator created by
 * a RuleBasedCollator to walk through a string. The return result of
 * each iteration is a 32-bit collation element (CE) that defines the
 * ordering priority of the next character or sequence of characters
 * in the source string.
 *
 * <p>For illustration, consider the following in Slovak and in traditional Spanish collation:
 * <blockquote>
 * <pre>
 * "ca" -&gt; the first collation element is CE('c') and the second
 *         collation element is CE('a').
 * "cha" -&gt; the first collation element is CE('ch') and the second
 *          collation element is CE('a').
 * </pre>
 * </blockquote>
 * And in German phonebook collation,
 * <blockquote>
 * <pre>
 * Since the character 'æ' is a composed character of 'a' and 'e', the
 * iterator returns two collation elements for the single character 'æ'
 *
 * "æb" -&gt; the first collation element is collation_element('a'), the
 *              second collation element is collation_element('e'), and the
 *              third collation element is collation_element('b').
 * </pre>
 * </blockquote>
 *
 * <p>For collation ordering comparison, the collation element results
 * cannot be compared simply by using basic arithmetic operators,
 * e.g. &lt;, == or &gt;; further processing has to be done. Details
 * can be found in the ICU User Guide. An example of using the
 * CollationElementIterator for collation ordering comparison is the class
 * {@link com.ibm.icu.text.StringSearch}.
 *
 * <p>To construct a CollationElementIterator object, users
 * call the method getCollationElementIterator() on a
 * RuleBasedCollator that defines the desired sorting order.
 *
 * <p>Example:
 * <blockquote>
 * <pre>
 *  String testString = "This is a test";
 *  RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
 *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
 *  int primaryOrder = iterator.IGNORABLE;
 *  while (primaryOrder != iterator.NULLORDER) {
 *      int order = iterator.next();
 *      if (order != iterator.IGNORABLE &amp;&amp;
 *          order != iterator.NULLORDER) {
 *          // order is valid, not ignorable and we have not passed the end
 *          // of the iteration, we do something
 *          primaryOrder = CollationElementIterator.primaryOrder(order);
 *          System.out.println("Next primary order 0x" +
 *                             Integer.toHexString(primaryOrder));
 *      }
 *  }
 * </pre>
 * </blockquote>
 *
 * <p>The method next() returns the collation order of the next character based on
 * the comparison level of the collator. The method previous() returns the
 * collation order of the previous character based on the comparison level of
 * the collator. The Collation Element Iterator moves only in one direction
 * between calls to reset(), setOffset(), or setText(). That is, next() and
 * previous() cannot be interleaved. Whenever previous() is to be called after
 * next() or vice versa, reset(), setOffset() or setText() has to be called first
 * to reset the status, shifting the current position to either the end or the start of
 * the string (reset() or setText()), or the specified position (setOffset()).
 * Hence at the next call of next() or previous(), the first or last collation order,
 * or the collation order at the specified position, will be returned. If a change of
 * direction is done without one of these calls, the result is undefined.
 *
 * <p>This class is not subclassable.
 * @see Collator
 * @see RuleBasedCollator
 * @see StringSearch
 * @author Syn Wee Quek
 * @stable ICU 2.8
 */
public final class CollationElementIterator
{
    // Low-level iterator over 64-bit CEs; a fresh one is created by every setText() overload.
    private CollationIterator iter_;  // owned
    // Collator supplying the settings and data used to build iter_; shared with the caller.
    private RuleBasedCollator rbc_;  // aliased
    // Pending 32-bit CE half still to be returned for the current 64-bit CE
    // (0 when nothing is pending). next() stores the continuation half here;
    // previous() stores the first half here.
    private int otherHalf_;
    /**
     * Iteration direction state.
     * &lt;0: backwards; 0: just after reset() (previous() begins from end);
     * 1: just after setOffset(); &gt;1: forward
     */
    private byte dir_;
    /**
     * Stores offsets from expansions and from unsafe-backwards iteration,
     * so that getOffset() returns intermediate offsets for the CEs
     * that are consistent with forward iteration.
     * Lazily allocated by previous().
     */
    private UVector32 offsets_;

    // Source text; read by previous() (for its length), setOffset() and equals().
    private String string_;  // TODO: needed in Java? if so, then add a UCharacterIterator field too?


    /**
     * This constant is returned by the iterator in the methods
     * next() and previous() when the end or the beginning of the
     * source string has been reached, and there are no more valid
     * collation elements to return.
     *
     * <p>See class documentation for an example of use.
     * @stable ICU 2.8
     * @see #next
     * @see #previous
     */
    public static final int NULLORDER = 0xffffffff;

    /**
     * This constant is returned by the iterator in the methods
     * next() and previous() when a collation element result is to be
     * ignored.
     *
     * <p>See class documentation for an example of use.
     * @stable ICU 2.8
     * @see #next
     * @see #previous
     */
    public static final int IGNORABLE = 0;

    /**
     * Returns the primary order of the specified collation element,
     * i.e. its 16 most significant bits. The value is unsigned
     * (range 0..0xffff).
     * @param ce the collation element
     * @return the element's 16-bit primary order.
     * @stable ICU 2.8
     */
    public static final int primaryOrder(int ce) {
        // Unsigned shift so that a set sign bit does not smear into the result.
        final int upperBits = ce >>> 16;
        return upperBits & 0xffff;
    }

    /**
     * Returns the secondary order of the specified collation element,
     * i.e. the 16th to 23rd bits, inclusive, counting from the most
     * significant bit. The value is unsigned (range 0..0xff).
     * @param ce the collation element
     * @return the element's 8-bit secondary order
     * @stable ICU 2.8
     */
    public static final int secondaryOrder(int ce) {
        final int withoutTertiary = ce >>> 8;
        return withoutTertiary & 0xff;
    }

    /**
     * Returns the tertiary order of the specified collation element, i.e. its
     * 8 least significant bits. The value is unsigned (range 0..0xff).
     * @param ce the collation element
     * @return the element's 8-bit tertiary order
     * @stable ICU 2.8
     */
    public static final int tertiaryOrder(int ce) {
        final int lowByteMask = 0xff;
        return ce & lowByteMask;
    }


    /**
     * Builds the first old-style 32-bit CE from the two halves of a 64-bit CE.
     * The result combines the upper 16 bits of {@code p}, bits 31..24 of
     * {@code lower32} (placed in bits 15..8) and bits 15..8 of {@code lower32}
     * (placed in bits 7..0).
     *
     * @param p the upper 32 bits of the 64-bit CE
     * @param lower32 the lower 32 bits of the 64-bit CE
     * @return the first 32-bit CE half
     */
    private static final int getFirstHalf(long p, int lower32) {
        final int upper16 = (int) p & 0xffff0000;
        final int middle8 = (lower32 >> 16) & 0xff00;
        final int low8 = (lower32 >> 8) & 0xff;
        return upper16 | middle8 | low8;
    }

    /**
     * Builds the second old-style 32-bit CE from the two halves of a 64-bit CE.
     * The result combines the lower 16 bits of {@code p} (moved to bits 31..16),
     * bits 23..16 of {@code lower32} (placed in bits 15..8) and the low 6 bits
     * of {@code lower32}. Callers treat a result of 0 as "no second half".
     *
     * @param p the upper 32 bits of the 64-bit CE
     * @param lower32 the lower 32 bits of the 64-bit CE
     * @return the second 32-bit CE half, without the continuation marker
     */
    private static final int getSecondHalf(long p, int lower32) {
        final int upper16 = (int) p << 16;
        final int middle8 = (lower32 >> 8) & 0xff00;
        final int low6 = lower32 & 0x3f;
        return upper16 | middle8 | low6;
    }

    /**
     * Tells whether a 64-bit CE has any of the bits set that end up in the
     * second 32-bit half (see {@code getSecondHalf}).
     *
     * @param ce the 64-bit collation element
     * @return true if any bit selected by the mask 0xffff00ff003f is set
     */
    private static final boolean ceNeedsTwoParts(long ce) {
        final long secondHalfBits = 0xffff00ff003fL;
        final long selected = ce & secondHalfBits;
        return selected != 0;
    }

    /**
     * Shared initialization for the other constructors: remembers the collator
     * and puts every other field into its empty state. The caller must
     * follow up with setText(), since iter_ is left null here.
     *
     * @param collator the RuleBasedCollator to iterate with
     */
    private CollationElementIterator(RuleBasedCollator collator) {
        rbc_ = collator;
        // No text yet, hence no low-level iterator and no recorded offsets.
        iter_ = null;
        offsets_ = null;
        // Nothing pending, direction state "just after reset()".
        otherHalf_ = 0;
        dir_ = 0;
    }

    /**
     * CollationElementIterator constructor. This takes a source
     * string and a RuleBasedCollator. The iterator will walk through
     * the source string based on the rules defined by the
     * collator. If the source string is empty, NULLORDER will be
     * returned on the first call to next().
     *
     * <p>Delegates to the private collator-only constructor and then
     * to {@code setText(String)}.
     *
     * @param source the source string.
     * @param collator the RuleBasedCollator
     * @stable ICU 2.8
     */
    CollationElementIterator(String source, RuleBasedCollator collator) {
        this(collator);
        setText(source);
    }
    // Note: The constructors should take settings & tailoring, not a collator,
    // to avoid circular dependencies.
    // However, for equals() we would need to be able to compare tailoring data for equality
    // without making CollationData or CollationTailoring depend on TailoredSet.
    // (See the implementation of RuleBasedCollator.equals().)
    // That might require creating an intermediate class that would be used
    // by both CollationElementIterator and RuleBasedCollator
    // but only contain the part of RBC.equals() related to data and rules.

    /**
     * CollationElementIterator constructor. This takes a source
     * character iterator and a RuleBasedCollator. The iterator will
     * walk through the source string based on the rules defined by
     * the collator. If the source string is empty, NULLORDER will be
     * returned on the first call to next().
     *
     * <p>Delegates to the private collator-only constructor and then
     * to {@code setText(CharacterIterator)}.
     *
     * @param source the source string iterator.
     * @param collator the RuleBasedCollator
     * @stable ICU 2.8
     */
    CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) {
        this(collator);
        setText(source);
    }

    /**
     * CollationElementIterator constructor. This takes a source
     * UCharacterIterator and a RuleBasedCollator. The iterator will
     * walk through the source string based on the rules defined by
     * the collator. If the source string is empty, NULLORDER will be
     * returned on the first call to next().
     *
     * <p>Delegates to the private collator-only constructor and then
     * to {@code setText(UCharacterIterator)}.
     *
     * @param source the source string iterator.
     * @param collator the RuleBasedCollator
     * @stable ICU 2.8
     */
    CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) {
        this(collator);
        setText(source);
    }

    /**
     * Returns the character offset in the source string
     * corresponding to the next collation element. I.e., getOffset()
     * returns the position in the source string corresponding to the
     * collation element that will be returned by the next call to
     * next() or previous(). This value could be any of:
     * <ul>
     * <li> The index of the <b>first</b> character corresponding to
     * the next collation element. (This means that if
     * <code>setOffset(offset)</code> sets the index in the middle of
     * a contraction, <code>getOffset()</code> returns the index of
     * the first character in the contraction, which may not be equal
     * to the original offset that was set. Hence calling getOffset()
     * immediately after setOffset(offset) does not guarantee that the
     * original offset set will be returned.)
     * <li> If normalization is on, the index of the <b>immediate</b>
     * subsequent character, or composite character with the first
     * character, having a combining class of 0.
     * <li> The length of the source string, if iteration has reached
     * the end.
     * </ul>
     *
     * @return The character offset in the source string corresponding to the
     *         collation element that will be returned by the next call to
     *         next() or previous().
     * @stable ICU 2.8
     */
    public int getOffset() {
        final boolean haveBackwardOffsets =
                dir_ < 0 && offsets_ != null && !offsets_.isEmpty();
        if (!haveBackwardOffsets) {
            // Forward (or fresh) iteration: the low-level iterator knows the offset.
            return iter_.getOffset();
        }
        // CollationIterator.previousCE() decrements the CEs length
        // while it pops CEs from its internal buffer.
        // While in the middle of a 64-bit CE (a half is still pending),
        // report the trailing CE offset, which is one slot further on.
        final int index = iter_.getCEsLength() + (otherHalf_ != 0 ? 1 : 0);
        assert (index < offsets_.size());
        return offsets_.elementAti(index);
    }

    /**
     * Get the next collation element in the source string.
     *
     * <p>This iterator iterates over a sequence of collation elements
     * that were built from the string. Because there isn't
     * necessarily a one-to-one mapping from characters to collation
     * elements, this doesn't mean the same thing as "return the
     * collation element [or ordering priority] of the next character
     * in the string".
     *
     * <p>This function returns the collation element that the
     * iterator is currently pointing to, and then updates the
     * internal pointer to point to the next element.
     *
     * @return the next collation element or NULLORDER if the end of the
     *         iteration has been reached.
     * @throws IllegalStateException if called while iterating backwards, i.e.
     *         after previous() without an intervening reset(), setOffset()
     *         or setText().
     * @stable ICU 2.8
     */
    public int next() {
        if (dir_ < 0) {
            // illegal change of direction
            // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
            throw new IllegalStateException("Illegal change of direction");
        }
        if (dir_ <= 1) {
            // First next() after reset()/setText() (0) or after setOffset() (1):
            // the iter_ is already positioned, just switch to forward mode.
            dir_ = 2;
        } else if (otherHalf_ != 0) {
            // Continuing forward iteration with a pending continuation half.
            final int pending = otherHalf_;
            otherHalf_ = 0;
            return pending;
        }
        // No need to keep all CEs in the buffer when we iterate.
        iter_.clearCEsIfNoneRemaining();
        final long ce = iter_.nextCE();
        if (ce == Collation.NO_CE) {
            return NULLORDER;
        }
        // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
        final long p = ce >>> 32;
        final int lower32 = (int) ce;
        final int continuation = getSecondHalf(p, lower32);
        if (continuation != 0) {
            // Remember the second half for the following call.
            otherHalf_ = continuation | 0xc0; // continuation CE
        }
        return getFirstHalf(p, lower32);
    }

    /**
     * Get the previous collation element in the source string.
     *
     * <p>This iterator iterates over a sequence of collation elements
     * that were built from the string. Because there isn't
     * necessarily a one-to-one mapping from characters to collation
     * elements, this doesn't mean the same thing as "return the
     * collation element [or ordering priority] of the previous
     * character in the string".
     *
     * <p>This function updates the iterator's internal pointer to
     * point to the collation element preceding the one it's currently
     * pointing to and then returns that element, while next() returns
     * the current element and then updates the pointer.
     *
     * @return the previous collation element, or NULLORDER when the start of
     *             the iteration has been reached.
     * @throws IllegalStateException if called while iterating forwards, i.e.
     *         after next() without an intervening reset(), setOffset()
     *         or setText().
     * @stable ICU 2.8
     */
    public int previous() {
        if (dir_ > 1) {
            // illegal change of direction
            // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
            throw new IllegalStateException("Illegal change of direction");
        }
        if (dir_ < 0) {
            // Continuing backwards iteration with a pending first half.
            if (otherHalf_ != 0) {
                final int pending = otherHalf_;
                otherHalf_ = 0;
                return pending;
            }
        } else {
            if (dir_ == 0) {
                // Just after reset()/setText(): backwards iteration begins at the end.
                iter_.resetToOffset(string_.length());
            }
            // dir_ == 1 (after setOffset()) keeps the position that was set.
            dir_ = -1;
        }
        if (offsets_ == null) {
            offsets_ = new UVector32();
        }
        // If we already have expansion CEs, then we also have offsets.
        // Otherwise remember the trailing offset in case we need to
        // write offsets for an artificial expansion.
        final int limitOffset = (iter_.getCEsLength() != 0) ? 0 : iter_.getOffset();
        final long ce = iter_.previousCE(offsets_);
        if (ce == Collation.NO_CE) {
            return NULLORDER;
        }
        // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
        final long p = ce >>> 32;
        final int lower32 = (int) ce;
        final int firstHalf = getFirstHalf(p, lower32);
        final int secondHalf = getSecondHalf(p, lower32);
        if (secondHalf == 0) {
            return firstHalf;
        }
        if (offsets_.isEmpty()) {
            // When we convert a single 64-bit CE into two 32-bit CEs,
            // we need to make this artificial expansion behave like a normal expansion.
            // See CollationIterator.previousCE().
            offsets_.addElement(iter_.getOffset());
            offsets_.addElement(limitOffset);
        }
        // Going backwards, the continuation comes out first and the
        // first half is kept for the following call.
        otherHalf_ = firstHalf;
        return secondHalf | 0xc0; // continuation CE
    }

    /**
     * Resets the cursor to the beginning of the string. The next
     * call to next() or previous() will return the first and last
     * collation element in the string, respectively.
     *
     * <p>If the RuleBasedCollator used by this iterator has had its
     * attributes changed, calling reset() will reinitialize the
     * iterator to use the new attributes.
     * NOTE(review): this method only repositions the existing low-level
     * iterator and clears the direction state; confirm that this is enough
     * to pick up changed attributes.
     *
     * @stable ICU 2.8
     */
    public void reset() {
        iter_.resetToOffset(0);
        // Direction state 0 = "just after reset()"; no half-CE is pending.
        dir_ = 0;
        otherHalf_ = 0;
    }

    /**
     * Sets the iterator to point to the collation element
     * corresponding to the character at the specified offset. The
     * value returned by the next call to next() will be the collation
     * element corresponding to the characters at offset.
     *
     * <p>If offset is in the middle of a contracting character
     * sequence, the iterator is adjusted to the start of the
     * contracting sequence. This means that getOffset() is not
     * guaranteed to return the same value set by this method.
     *
     * <p>If the decomposition mode is on, and offset is in the middle
     * of a decomposable range of source text, the iterator may not
     * return a correct result for the next forwards or backwards
     * iteration.  The user must ensure that the offset is not in the
     * middle of a decomposable range.
     *
     * <p>Offsets that are not strictly inside the source string
     * (i.e. {@code newOffset <= 0} or {@code newOffset >= length}) are
     * passed to the underlying iterator without adjustment.
     *
     * @param newOffset the character offset into the original source string to
     *        set. Note that this is not an offset into the corresponding
     *        sequence of collation elements.
     * @stable ICU 2.8
     */
    public void setOffset(int newOffset) {
        if (0 < newOffset && newOffset < string_.length()) {
            int offset = newOffset;
            // Phase 1: walk backwards until a character is found that the
            // collator does not consider unsafe, or the start of the text.
            do {
                char c = string_.charAt(offset);
                // A lead surrogate is only unsafe if the whole code point is.
                if (!rbc_.isUnsafe(c) ||
                        (Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) {
                    break;
                }
                // Back up to before this unsafe character.
                --offset;
            } while (offset > 0);
            if (offset < newOffset) {
                // We might have backed up more than necessary.
                // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
                // but for text "chu" setOffset(2) should remain at 2
                // although we initially back up to offset 0.
                // Find the last safe offset no greater than newOffset by iterating forward.
                int lastSafeOffset = offset;
                do {
                    iter_.resetToOffset(lastSafeOffset);
                    // Consume CEs until the text offset actually advances.
                    do {
                        iter_.nextCE();
                    } while ((offset = iter_.getOffset()) == lastSafeOffset);
                    // Only accept boundaries that do not overshoot the requested offset.
                    if (offset <= newOffset) {
                        lastSafeOffset = offset;
                    }
                } while (offset < newOffset);
                newOffset = lastSafeOffset;
            }
        }
        iter_.resetToOffset(newOffset);
        otherHalf_ = 0;
        // Direction state 1 = "just after setOffset()": next() or previous() may follow.
        dir_ = 1;
    }

    /**
     * Set a new source string for iteration, and reset the offset
     * to the beginning of the text.
     *
     * <p>A new low-level iterator is created for the string; the variant
     * with FCD checking is used unless the collator settings say not to
     * check FCD.
     *
     * @param source the new source string for iteration.
     * @stable ICU 2.8
     */
    public void setText(String source) {
        string_ = source; // TODO: do we need to remember the source string in a field?
        final boolean numeric = rbc_.settings.readOnly().isNumeric();
        final boolean checkFcd = !rbc_.settings.readOnly().dontCheckFCD();
        if (checkFcd) {
            iter_ = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0);
        } else {
            iter_ = new UTF16CollationIterator(rbc_.data, numeric, string_, 0);
        }
        // Fresh text: direction state "just after reset()", nothing pending.
        dir_ = 0;
        otherHalf_ = 0;
    }

    /**
     * Set a new source string iterator for iteration, and reset the
     * offset to the beginning of the text.
     *
     * <p>The source iterator's integrity will be preserved since a new copy
     * will be created for use. If the source cannot be cloned, the
     * iterator's text contents are used instead.
     *
     * @param source the new source string iterator for iteration.
     * @stable ICU 2.8
     */
    public void setText(UCharacterIterator source) {
        string_ = source.getText(); // TODO: do we need to remember the source string in a field?
        // Note: In C++, we just setText(source.getText()).
        // In Java, we actually operate on a character iterator.
        // (The old code apparently did so only for a CharacterIterator;
        // for a UCharacterIterator it also just used source.getText()).
        // TODO: do we need to remember the cloned iterator in a field?
        final UCharacterIterator copy;
        try {
            copy = (UCharacterIterator) source.clone();
        } catch (CloneNotSupportedException e) {
            // Fall back to ICU 52 behavior of iterating over the text contents
            // of the UCharacterIterator.
            setText(source.getText());
            return;
        }
        copy.setToStart();
        final boolean numeric = rbc_.settings.readOnly().isNumeric();
        final boolean checkFcd = !rbc_.settings.readOnly().dontCheckFCD();
        if (checkFcd) {
            iter_ = new FCDIterCollationIterator(rbc_.data, numeric, copy, 0);
        } else {
            iter_ = new IterCollationIterator(rbc_.data, numeric, copy);
        }
        // Fresh text: direction state "just after reset()", nothing pending.
        dir_ = 0;
        otherHalf_ = 0;
    }

    /**
     * Set a new source string iterator for iteration, and reset the
     * offset to the beginning of the text.
     *
     * <p>The given iterator is wrapped, not cloned, and the wrapper is
     * moved to its start.
     *
     * @param source the new source string iterator for iteration.
     * @stable ICU 2.8
     */
    public void setText(CharacterIterator source) {
        // Note: In C++, we just setText(source.getText()).
        // In Java, we actually operate on a character iterator.
        // TODO: do we need to remember the iterator in a field?
        // TODO: apparently we don't clone a CharacterIterator in Java,
        // we only clone the text for a UCharacterIterator?? see the old code in the constructors
        final UCharacterIterator wrapped = new CharacterIteratorWrapper(source);
        wrapped.setToStart();
        string_ = wrapped.getText(); // TODO: do we need to remember the source string in a field?
        final boolean numeric = rbc_.settings.readOnly().isNumeric();
        final boolean checkFcd = !rbc_.settings.readOnly().dontCheckFCD();
        if (checkFcd) {
            iter_ = new FCDIterCollationIterator(rbc_.data, numeric, wrapped, 0);
        } else {
            iter_ = new IterCollationIterator(rbc_.data, numeric, wrapped);
        }
        // Fresh text: direction state "just after reset()", nothing pending.
        dir_ = 0;
        otherHalf_ = 0;
    }

    /**
     * CE sink that records, for every expansion it is shown, how many
     * old-style 32-bit CE "halves" the expansion produces, keyed by the
     * last half of the expansion. Only the largest count per key is kept.
     */
    private static final class MaxExpSink implements ContractionsAndExpansions.CESink {
        /** Maps the last 32-bit CE half of an expansion to the largest half-count seen for it. */
        private final Map<Integer, Integer> maxExpansions;

        /**
         * @param h the map to fill; it is updated in place by handleExpansion()
         */
        MaxExpSink(Map<Integer, Integer> h) {
            maxExpansions = h;
        }

        /** Single CEs are not expansions, so there is nothing to record. */
        @Override
        public void handleCE(long ce) {
        }

        /**
         * Records the length, in 32-bit halves, of the expansion
         * {@code ces[start .. start+length-1]} under the key of its last half.
         *
         * @param ces array holding the 64-bit CEs
         * @param start index of the first CE of the expansion
         * @param length number of 64-bit CEs in the expansion
         */
        @Override
        public void handleExpansion(long[] ces, int start, int length) {
            if (length <= 1) {
                // We do not need to add single CEs into the map.
                return;
            }
            int count = 0; // number of CE "halves"
            for (int i = 0; i < length; ++i) {
                count += ceNeedsTwoParts(ces[start + i]) ? 2 : 1;
            }
            // last "half" of the last CE
            long ce = ces[start + length - 1];
            long p = ce >>> 32;
            int lower32 = (int) ce;
            int lastHalf = getSecondHalf(p, lower32);
            if (lastHalf == 0) {
                // The last CE fits into one half, so that half is the key.
                lastHalf = getFirstHalf(p, lower32);
                assert (lastHalf != 0);
            } else {
                lastHalf |= 0xc0; // old-style continuation CE
            }
            Integer oldCount = maxExpansions.get(lastHalf);
            if (oldCount == null || count > oldCount) {
                maxExpansions.put(lastHalf, count);
            }
        }
    }

    /**
     * Computes the maximum-expansion table for the given collation data by
     * running a ContractionsAndExpansions pass with a MaxExpSink over it.
     *
     * @param data the collation data to scan
     * @return a new map from the last 32-bit CE half of an expansion to the
     *         maximum number of 32-bit CE halves of any expansion ending with it
     */
    static final Map<Integer, Integer> computeMaxExpansions(CollationData data) {
        Map<Integer, Integer> maxExpansions = new HashMap<>();
        MaxExpSink sink = new MaxExpSink(maxExpansions);
        new ContractionsAndExpansions(null, null, sink, true).forData(data);
        return maxExpansions;
    }

    /**
     * Returns the maximum length of any expansion sequence that ends with
     * the specified collation element. If there is no expansion with this
     * collation element as the last element, returns 1.
     *
     * <p>The lookup uses the max-expansions table of this iterator's
     * collator tailoring.
     *
     * @param ce a collation element returned by previous() or next().
     * @return the maximum length of any expansion sequence ending
     *         with the specified collation element.
     * @stable ICU 2.8
     */
    public int getMaxExpansion(int ce) {
        final int maxLength = getMaxExpansion(rbc_.tailoring.maxExpansions, ce);
        return maxLength;
    }

    /**
     * Looks up the maximum expansion length for a 32-bit collation element.
     *
     * @param maxExpansions map from the last 32-bit CE of an expansion to the
     *        maximum expansion length; may be null, in which case only the
     *        built-in defaults apply
     * @param order the 32-bit collation element to look up
     * @return 1 for {@code order == 0}; otherwise the mapped value if present;
     *         otherwise 2 if {@code order} has both 0xc0 bits set (old-style
     *         continuation CE), else 1
     */
    static int getMaxExpansion(Map<Integer, Integer> maxExpansions, int order) {
        if (order == 0) {
            return 1;
        }
        if (maxExpansions != null) {
            Integer max = maxExpansions.get(order);
            if (max != null) {
                return max;
            }
        }
        // An old-style continuation CE is always the second of two halves.
        return (order & 0xc0) == 0xc0 ? 2 : 1;
    }

    /**
     * Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()),
     * so that equals() treats the two "no direction chosen yet" states alike.
     *
     * @return 0 if dir_ is 1, otherwise dir_ unchanged
     */
    private byte normalizeDir() {
        if (dir_ == 1) {
            return 0;
        }
        return dir_;
    }

    /**
     * Tests whether the argument object is equal to this CollationElementIterator.
     * Iterators are equal if they use equal RuleBasedCollators, have the
     * same pending half-CE and (normalized) direction state, the same source
     * text, and equal underlying iterators, i.e. the same current position
     * in the iteration.
     * @param that object to test for equality with this
     *             CollationElementIterator
     * @return true if {@code that} is a CollationElementIterator equal to this one
     * @stable ICU 2.8
     */
    @Override
    public boolean equals(Object that) {
        if (this == that) {
            return true;
        }
        if (!(that instanceof CollationElementIterator)) {
            return false;
        }
        final CollationElementIterator other = (CollationElementIterator) that;
        return rbc_.equals(other.rbc_)
                && otherHalf_ == other.otherHalf_
                && normalizeDir() == other.normalizeDir()
                && string_.equals(other.string_)
                && iter_.equals(other.iter_);
    }

    /**
     * Mock implementation of hashCode(). This implementation always returns a constant
     * value. When Java assertion is enabled, this method triggers an assertion failure.
     * @return the constant 42 (only reached when assertions are disabled)
     * @stable ICU 2.8
     */
    @Override
    public int hashCode() {
        // Fails loudly under -ea, since hashing these iterators is not supported by design.
        assert false : "hashCode not designed";
        return 42;
    }

    /**
     * Returns the RuleBasedCollator this iterator was constructed with.
     *
     * @return the collator stored at construction time
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RuleBasedCollator getRuleBasedCollator() {
        return this.rbc_;
    }
}