All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.CollationElementIterator Maven / Gradle / Ivy

Go to download

International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.

There is a newer version: 76.1
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;

import java.text.CharacterIterator;
import java.util.HashMap;
import java.util.Map;

import com.ibm.icu.impl.CharacterIteratorWrapper;
import com.ibm.icu.impl.coll.Collation;
import com.ibm.icu.impl.coll.CollationData;
import com.ibm.icu.impl.coll.CollationIterator;
import com.ibm.icu.impl.coll.ContractionsAndExpansions;
import com.ibm.icu.impl.coll.FCDIterCollationIterator;
import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
import com.ibm.icu.impl.coll.IterCollationIterator;
import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.impl.coll.UVector32;

/**
 * CollationElementIterator is an iterator created by
 * a RuleBasedCollator to walk through a string. The return result of
 * each iteration is a 32-bit collation element (CE) that defines the
 * ordering priority of the next character or sequence of characters
 * in the source string.
 *
 * 

For illustration, consider the following in Slovak and in traditional Spanish collation: *

*
 * "ca" -> the first collation element is CE('c') and the second
 *         collation element is CE('a').
 * "cha" -> the first collation element is CE('ch') and the second
 *          collation element is CE('a').
 * 
*
* And in German phonebook collation, *
*
 * Since the character 'æ' is a composed character of 'a' and 'e', the
 * iterator returns two collation elements for the single character 'æ'
 *
 * "æb" -> the first collation element is collation_element('a'), the
 *              second collation element is collation_element('e'), and the
 *              third collation element is collation_element('b').
 * 
*
* *

For collation ordering comparison, the collation element results * can not be compared simply by using basic arithmetic operators, * e.g. <, == or >, further processing has to be done. Details * can be found in the ICU * * User Guide. An example of using the CollationElementIterator * for collation ordering comparison is the class * {@link com.ibm.icu.text.StringSearch}. * *

To construct a CollationElementIterator object, users * call the method getCollationElementIterator() on a * RuleBasedCollator that defines the desired sorting order. * *

Example: *

*
 *  String testString = "This is a test";
 *  RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
 *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
 *  int primaryOrder = iterator.IGNORABLE;
 *  while (primaryOrder != iterator.NULLORDER) {
 *      int order = iterator.next();
 *      if (order != iterator.IGNORABLE &&
 *          order != iterator.NULLORDER) {
 *          // order is valid, not ignorable and we have not passed the end
 *          // of the iteration, we do something
 *          primaryOrder = CollationElementIterator.primaryOrder(order);
 *          System.out.println("Next primary order 0x" +
 *                             Integer.toHexString(primaryOrder));
 *      }
 *  }
 * 
*
*

* The method next() returns the collation order of the next character based on * the comparison level of the collator. The method previous() returns the * collation order of the previous character based on the comparison level of * the collator. The Collation Element Iterator moves only in one direction * between calls to reset(), setOffset(), or setText(). That is, next() and * previous() can not be inter-used. Whenever previous() is to be called after * next() or vice versa, reset(), setOffset() or setText() has to be called first * to reset the status, shifting current position to either the end or the start of * the string (reset() or setText()), or the specified position (setOffset()). * Hence at the next call of next() or previous(), the first or last collation order, * or collation order at the specified position will be returned. If a change of * direction is done without one of these calls, the result is undefined. *

* This class is not subclassable. * @see Collator * @see RuleBasedCollator * @see StringSearch * @author Syn Wee Quek * @stable ICU 2.8 */ public final class CollationElementIterator { private CollationIterator iter_; // owned private RuleBasedCollator rbc_; // aliased private int otherHalf_; /** * <0: backwards; 0: just after reset() (previous() begins from end); * 1: just after setOffset(); >1: forward */ private byte dir_; /** * Stores offsets from expansions and from unsafe-backwards iteration, * so that getOffset() returns intermediate offsets for the CEs * that are consistent with forward iteration. */ private UVector32 offsets_; private String string_; // TODO: needed in Java? if so, then add a UCharacterIterator field too? /** * This constant is returned by the iterator in the methods * next() and previous() when the end or the beginning of the * source string has been reached, and there are no more valid * collation elements to return. * *

See class documentation for an example of use. * @stable ICU 2.8 * @see #next * @see #previous */ public final static int NULLORDER = 0xffffffff; /** * This constant is returned by the iterator in the methods * next() and previous() when a collation element result is to be * ignored. * *

See class documentation for an example of use. * @stable ICU 2.8 * @see #next * @see #previous */ public static final int IGNORABLE = 0; /** * Return the primary order of the specified collation element, * i.e. the first 16 bits. This value is unsigned. * @param ce the collation element * @return the element's 16 bits primary order. * @stable ICU 2.8 */ public final static int primaryOrder(int ce) { return (ce >>> 16) & 0xffff; } /** * Return the secondary order of the specified collation element, * i.e. the 16th to 23th bits, inclusive. This value is unsigned. * @param ce the collation element * @return the element's 8 bits secondary order * @stable ICU 2.8 */ public final static int secondaryOrder(int ce) { return (ce >>> 8) & 0xff; } /** * Return the tertiary order of the specified collation element, i.e. the last * 8 bits. This value is unsigned. * @param ce the collation element * @return the element's 8 bits tertiary order * @stable ICU 2.8 */ public final static int tertiaryOrder(int ce) { return ce & 0xff; } private static final int getFirstHalf(long p, int lower32) { return ((int)p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff); } private static final int getSecondHalf(long p, int lower32) { return ((int)p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); } private static final boolean ceNeedsTwoParts(long ce) { return (ce & 0xffff00ff003fL) != 0; } private CollationElementIterator(RuleBasedCollator collator) { iter_ = null; rbc_ = collator; otherHalf_ = 0; dir_ = 0; offsets_ = null; } /** * CollationElementIterator constructor. This takes a source * string and a RuleBasedCollator. The iterator will walk through * the source string based on the rules defined by the * collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string. 
* @param collator the RuleBasedCollator * @stable ICU 2.8 */ CollationElementIterator(String source, RuleBasedCollator collator) { this(collator); setText(source); } // Note: The constructors should take settings & tailoring, not a collator, // to avoid circular dependencies. // However, for equals() we would need to be able to compare tailoring data for equality // without making CollationData or CollationTailoring depend on TailoredSet. // (See the implementation of RuleBasedCollator.equals().) // That might require creating an intermediate class that would be used // by both CollationElementIterator and RuleBasedCollator // but only contain the part of RBC.equals() related to data and rules. /** * CollationElementIterator constructor. This takes a source * character iterator and a RuleBasedCollator. The iterator will * walk through the source string based on the rules defined by * the collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string iterator. * @param collator the RuleBasedCollator * @stable ICU 2.8 */ CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) { this(collator); setText(source); } /** * CollationElementIterator constructor. This takes a source * character iterator and a RuleBasedCollator. The iterator will * walk through the source string based on the rules defined by * the collator. If the source string is empty, NULLORDER will be * returned on the first call to next(). * * @param source the source string iterator. * @param collator the RuleBasedCollator * @stable ICU 2.8 */ CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) { this(collator); setText(source); } /** * Returns the character offset in the source string * corresponding to the next collation element. 
I.e., getOffset() * returns the position in the source string corresponding to the * collation element that will be returned by the next call to * next() or previous(). This value could be any of: *

    *
  • The index of the first character corresponding to * the next collation element. (This means that if * setOffset(offset) sets the index in the middle of * a contraction, getOffset() returns the index of * the first character in the contraction, which may not be equal * to the original offset that was set. Hence calling getOffset() * immediately after setOffset(offset) does not guarantee that the * original offset set will be returned.) *
  • If normalization is on, the index of the immediate * subsequent character, or composite character with the first * character, having a combining class of 0. *
  • The length of the source string, if iteration has reached * the end. *
* * @return The character offset in the source string corresponding to the * collation element that will be returned by the next call to * next() or previous(). * @stable ICU 2.8 */ public int getOffset() { if (dir_ < 0 && offsets_ != null && !offsets_.isEmpty()) { // CollationIterator.previousCE() decrements the CEs length // while it pops CEs from its internal buffer. int i = iter_.getCEsLength(); if (otherHalf_ != 0) { // Return the trailing CE offset while we are in the middle of a 64-bit CE. ++i; } assert (i < offsets_.size()); return offsets_.elementAti(i); } return iter_.getOffset(); } /** * Get the next collation element in the source string. * *

This iterator iterates over a sequence of collation elements * that were built from the string. Because there isn't * necessarily a one-to-one mapping from characters to collation * elements, this doesn't mean the same thing as "return the * collation element [or ordering priority] of the next character * in the string". * *

This function returns the collation element that the * iterator is currently pointing to, and then updates the * internal pointer to point to the next element. * * @return the next collation element or NULLORDER if the end of the * iteration has been reached. * @stable ICU 2.8 */ public int next() { if (dir_ > 1) { // Continue forward iteration. Test this first. if (otherHalf_ != 0) { int oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 1) { // next() after setOffset() dir_ = 2; } else if (dir_ == 0) { // The iter_ is already reset to the start of the text. dir_ = 2; } else /* dir_ < 0 */{ // illegal change of direction throw new IllegalStateException("Illegal change of direction"); // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status. } // No need to keep all CEs in the buffer when we iterate. iter_.clearCEsIfNoneRemaining(); long ce = iter_.nextCE(); if (ce == Collation.NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. long p = ce >>> 32; int lower32 = (int) ce; int firstHalf = getFirstHalf(p, lower32); int secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { otherHalf_ = secondHalf | 0xc0; // continuation CE } return firstHalf; } /** * Get the previous collation element in the source string. * *

This iterator iterates over a sequence of collation elements * that were built from the string. Because there isn't * necessarily a one-to-one mapping from characters to collation * elements, this doesn't mean the same thing as "return the * collation element [or ordering priority] of the previous * character in the string". * *

This function updates the iterator's internal pointer to * point to the collation element preceding the one it's currently * pointing to and then returns that element, while next() returns * the current element and then updates the pointer. * * @return the previous collation element, or NULLORDER when the start of * the iteration has been reached. * @stable ICU 2.8 */ public int previous() { if (dir_ < 0) { // Continue backwards iteration. Test this first. if (otherHalf_ != 0) { int oh = otherHalf_; otherHalf_ = 0; return oh; } } else if (dir_ == 0) { iter_.resetToOffset(string_.length()); dir_ = -1; } else if (dir_ == 1) { // previous() after setOffset() dir_ = -1; } else /* dir_ > 1 */{ // illegal change of direction throw new IllegalStateException("Illegal change of direction"); // Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status. } if (offsets_ == null) { offsets_ = new UVector32(); } // If we already have expansion CEs, then we also have offsets. // Otherwise remember the trailing offset in case we need to // write offsets for an artificial expansion. int limitOffset = iter_.getCEsLength() == 0 ? iter_.getOffset() : 0; long ce = iter_.previousCE(offsets_); if (ce == Collation.NO_CE) { return NULLORDER; } // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. long p = ce >>> 32; int lower32 = (int) ce; int firstHalf = getFirstHalf(p, lower32); int secondHalf = getSecondHalf(p, lower32); if (secondHalf != 0) { if (offsets_.isEmpty()) { // When we convert a single 64-bit CE into two 32-bit CEs, // we need to make this artificial expansion behave like a normal expansion. // See CollationIterator.previousCE(). offsets_.addElement(iter_.getOffset()); offsets_.addElement(limitOffset); } otherHalf_ = firstHalf; return secondHalf | 0xc0; // continuation CE } return firstHalf; } /** * Resets the cursor to the beginning of the string. 
The next * call to next() or previous() will return the first and last * collation element in the string, respectively. * *

If the RuleBasedCollator used by this iterator has had its * attributes changed, calling reset() will reinitialize the * iterator to use the new attributes. * * @stable ICU 2.8 */ public void reset() { iter_ .resetToOffset(0); otherHalf_ = 0; dir_ = 0; } /** * Sets the iterator to point to the collation element * corresponding to the character at the specified offset. The * value returned by the next call to next() will be the collation * element corresponding to the characters at offset. * *

If offset is in the middle of a contracting character * sequence, the iterator is adjusted to the start of the * contracting sequence. This means that getOffset() is not * guaranteed to return the same value set by this method. * *

If the decomposition mode is on, and offset is in the middle * of a decomposible range of source text, the iterator may not * return a correct result for the next forwards or backwards * iteration. The user must ensure that the offset is not in the * middle of a decomposible range. * * @param newOffset the character offset into the original source string to * set. Note that this is not an offset into the corresponding * sequence of collation elements. * @stable ICU 2.8 */ public void setOffset(int newOffset) { if (0 < newOffset && newOffset < string_.length()) { int offset = newOffset; do { char c = string_.charAt(offset); if (!rbc_.isUnsafe(c) || (Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) { break; } // Back up to before this unsafe character. --offset; } while (offset > 0); if (offset < newOffset) { // We might have backed up more than necessary. // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, // but for text "chu" setOffset(2) should remain at 2 // although we initially back up to offset 0. // Find the last safe offset no greater than newOffset by iterating forward. int lastSafeOffset = offset; do { iter_.resetToOffset(lastSafeOffset); do { iter_.nextCE(); } while ((offset = iter_.getOffset()) == lastSafeOffset); if (offset <= newOffset) { lastSafeOffset = offset; } } while (offset < newOffset); newOffset = lastSafeOffset; } } iter_.resetToOffset(newOffset); otherHalf_ = 0; dir_ = 1; } /** * Set a new source string for iteration, and reset the offset * to the beginning of the text. * * @param source the new source string for iteration. * @stable ICU 2.8 */ public void setText(String source) { string_ = source; // TODO: do we need to remember the source string in a field? 
CollationIterator newIter; boolean numeric = rbc_.settings.readOnly().isNumeric(); if (rbc_.settings.readOnly().dontCheckFCD()) { newIter = new UTF16CollationIterator(rbc_.data, numeric, string_, 0); } else { newIter = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0); } iter_ = newIter; otherHalf_ = 0; dir_ = 0; } /** * Set a new source string iterator for iteration, and reset the * offset to the beginning of the text. * *

The source iterator's integrity will be preserved since a new copy * will be created for use. * @param source the new source string iterator for iteration. * @stable ICU 2.8 */ public void setText(UCharacterIterator source) { string_ = source.getText(); // TODO: do we need to remember the source string in a field? // Note: In C++, we just setText(source.getText()). // In Java, we actually operate on a character iterator. // (The old code apparently did so only for a CharacterIterator; // for a UCharacterIterator it also just used source.getText()). // TODO: do we need to remember the cloned iterator in a field? UCharacterIterator src; try { src = (UCharacterIterator) source.clone(); } catch (CloneNotSupportedException e) { // Fall back to ICU 52 behavior of iterating over the text contents // of the UCharacterIterator. setText(source.getText()); return; } src.setToStart(); CollationIterator newIter; boolean numeric = rbc_.settings.readOnly().isNumeric(); if (rbc_.settings.readOnly().dontCheckFCD()) { newIter = new IterCollationIterator(rbc_.data, numeric, src); } else { newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0); } iter_ = newIter; otherHalf_ = 0; dir_ = 0; } /** * Set a new source string iterator for iteration, and reset the * offset to the beginning of the text. * * @param source the new source string iterator for iteration. * @stable ICU 2.8 */ public void setText(CharacterIterator source) { // Note: In C++, we just setText(source.getText()). // In Java, we actually operate on a character iterator. // TODO: do we need to remember the iterator in a field? // TODO: apparently we don't clone a CharacterIterator in Java, // we only clone the text for a UCharacterIterator?? see the old code in the constructors UCharacterIterator src = new CharacterIteratorWrapper(source); src.setToStart(); string_ = src.getText(); // TODO: do we need to remember the source string in a field? 
CollationIterator newIter; boolean numeric = rbc_.settings.readOnly().isNumeric(); if (rbc_.settings.readOnly().dontCheckFCD()) { newIter = new IterCollationIterator(rbc_.data, numeric, src); } else { newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0); } iter_ = newIter; otherHalf_ = 0; dir_ = 0; } private static final class MaxExpSink implements ContractionsAndExpansions.CESink { MaxExpSink(Map h) { maxExpansions = h; } @Override public void handleCE(long ce) { } @Override public void handleExpansion(long ces[], int start, int length) { if (length <= 1) { // We do not need to add single CEs into the map. return; } int count = 0; // number of CE "halves" for (int i = 0; i < length; ++i) { count += ceNeedsTwoParts(ces[start + i]) ? 2 : 1; } // last "half" of the last CE long ce = ces[start + length - 1]; long p = ce >>> 32; int lower32 = (int) ce; int lastHalf = getSecondHalf(p, lower32); if (lastHalf == 0) { lastHalf = getFirstHalf(p, lower32); assert (lastHalf != 0); } else { lastHalf |= 0xc0; // old-style continuation CE } Integer oldCount = maxExpansions.get(lastHalf); if (oldCount == null || count > oldCount) { maxExpansions.put(lastHalf, count); } } private Map maxExpansions; } static final Map computeMaxExpansions(CollationData data) { Map maxExpansions = new HashMap<>(); MaxExpSink sink = new MaxExpSink(maxExpansions); new ContractionsAndExpansions(null, null, sink, true).forData(data); return maxExpansions; } /** * Returns the maximum length of any expansion sequence that ends with * the specified collation element. If there is no expansion with this * collation element as the last element, returns 1. * * @param ce a collation element returned by previous() or next(). * @return the maximum length of any expansion sequence ending * with the specified collation element. 
* @stable ICU 2.8 */ public int getMaxExpansion(int ce) { return getMaxExpansion(rbc_.tailoring.maxExpansions, ce); } static int getMaxExpansion(Map maxExpansions, int order) { if (order == 0) { return 1; } Integer max; if (maxExpansions != null && (max = maxExpansions.get(order)) != null) { return max; } if ((order & 0xc0) == 0xc0) { // old-style continuation CE return 2; } else { return 1; } } /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */ private byte normalizeDir() { return dir_ == 1 ? 0 : dir_; } /** * Tests that argument object is equals to this CollationElementIterator. * Iterators are equal if the objects uses the same RuleBasedCollator, * the same source text and have the same current position in iteration. * @param that object to test if it is equals to this * CollationElementIterator * @stable ICU 2.8 */ @Override public boolean equals(Object that) { if (that == this) { return true; } if (that instanceof CollationElementIterator) { CollationElementIterator thatceiter = (CollationElementIterator) that; return rbc_.equals(thatceiter.rbc_) && otherHalf_ == thatceiter.otherHalf_ && normalizeDir() == thatceiter.normalizeDir() && string_.equals(thatceiter.string_) && iter_.equals(thatceiter.iter_); } return false; } /** * Mock implementation of hashCode(). This implementation always returns a constant * value. When Java assertion is enabled, this method triggers an assertion failure. * @stable ICU 2.8 */ @Override public int hashCode() { assert false : "hashCode not designed"; return 42; } /** * @internal * @deprecated This API is ICU internal only. */ @Deprecated public RuleBasedCollator getRuleBasedCollator() { return rbc_; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy