com.ibm.icu.text.CollationElementIterator Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 1996-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.text;
import java.text.CharacterIterator;
import java.util.HashMap;
import java.util.Map;
import com.ibm.icu.impl.CharacterIteratorWrapper;
import com.ibm.icu.impl.coll.Collation;
import com.ibm.icu.impl.coll.CollationData;
import com.ibm.icu.impl.coll.CollationIterator;
import com.ibm.icu.impl.coll.ContractionsAndExpansions;
import com.ibm.icu.impl.coll.FCDIterCollationIterator;
import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
import com.ibm.icu.impl.coll.IterCollationIterator;
import com.ibm.icu.impl.coll.UTF16CollationIterator;
import com.ibm.icu.impl.coll.UVector32;
/**
* CollationElementIterator
is an iterator created by
* a RuleBasedCollator to walk through a string. The return result of
* each iteration is a 32-bit collation element (CE) that defines the
* ordering priority of the next character or sequence of characters
* in the source string.
*
* For illustration, consider the following in Slovak and in traditional Spanish collation:
*
*
* "ca" -> the first collation element is CE('c') and the second
* collation element is CE('a').
* "cha" -> the first collation element is CE('ch') and the second
* collation element is CE('a').
*
*
* And in German phonebook collation,
*
*
* Since the character 'æ' is a composed character of 'a' and 'e', the
* iterator returns two collation elements for the single character 'æ'
*
* "æb" -> the first collation element is collation_element('a'), the
* second collation element is collation_element('e'), and the
* third collation element is collation_element('b').
*
*
*
* For collation ordering comparison, the collation element results
* can not be compared simply by using basic arithmetic operators,
* e.g. <, == or >, further processing has to be done. Details
* can be found in the ICU
*
* User Guide. An example of using the CollationElementIterator
* for collation ordering comparison is the class
* {@link com.ibm.icu.text.StringSearch}.
*
*
To construct a CollationElementIterator object, users
* call the method getCollationElementIterator() on a
* RuleBasedCollator that defines the desired sorting order.
*
*
Example:
*
*
* String testString = "This is a test";
* RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
* CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
* int primaryOrder = iterator.IGNORABLE;
* while (primaryOrder != iterator.NULLORDER) {
* int order = iterator.next();
* if (order != iterator.IGNORABLE &&
* order != iterator.NULLORDER) {
* // order is valid, not ignorable and we have not passed the end
* // of the iteration, we do something
* primaryOrder = CollationElementIterator.primaryOrder(order);
* System.out.println("Next primary order 0x" +
* Integer.toHexString(primaryOrder));
* }
* }
*
*
*
* The method next() returns the collation order of the next character based on
* the comparison level of the collator. The method previous() returns the
* collation order of the previous character based on the comparison level of
* the collator. The Collation Element Iterator moves only in one direction
* between calls to reset(), setOffset(), or setText(). That is, next() and
* previous() can not be inter-used. Whenever previous() is to be called after
* next() or vice versa, reset(), setOffset() or setText() has to be called first
* to reset the status, shifting current position to either the end or the start of
* the string (reset() or setText()), or the specified position (setOffset()).
* Hence at the next call of next() or previous(), the first or last collation order,
* or collation order at the specified position will be returned. If a change of
* direction is done without one of these calls, the result is undefined.
*
* This class is not subclassable.
* @see Collator
* @see RuleBasedCollator
* @see StringSearch
* @author Syn Wee Quek
* @stable ICU 2.8
*/
public final class CollationElementIterator
{
private CollationIterator iter_; // owned
private RuleBasedCollator rbc_; // aliased
private int otherHalf_;
/**
* <0: backwards; 0: just after reset() (previous() begins from end);
* 1: just after setOffset(); >1: forward
*/
private byte dir_;
/**
* Stores offsets from expansions and from unsafe-backwards iteration,
* so that getOffset() returns intermediate offsets for the CEs
* that are consistent with forward iteration.
*/
private UVector32 offsets_;
private String string_; // TODO: needed in Java? if so, then add a UCharacterIterator field too?
/**
* This constant is returned by the iterator in the methods
* next() and previous() when the end or the beginning of the
* source string has been reached, and there are no more valid
* collation elements to return.
*
*
See class documentation for an example of use.
* @stable ICU 2.8
* @see #next
* @see #previous */
public final static int NULLORDER = 0xffffffff;
/**
* This constant is returned by the iterator in the methods
* next() and previous() when a collation element result is to be
* ignored.
*
*
See class documentation for an example of use.
* @stable ICU 2.8
* @see #next
* @see #previous */
public static final int IGNORABLE = 0;
/**
* Return the primary order of the specified collation element,
* i.e. the first 16 bits. This value is unsigned.
* @param ce the collation element
* @return the element's 16 bits primary order.
* @stable ICU 2.8
*/
public final static int primaryOrder(int ce) {
return (ce >>> 16) & 0xffff;
}
/**
* Return the secondary order of the specified collation element,
* i.e. the 16th to 23th bits, inclusive. This value is unsigned.
* @param ce the collation element
* @return the element's 8 bits secondary order
* @stable ICU 2.8
*/
public final static int secondaryOrder(int ce) {
return (ce >>> 8) & 0xff;
}
/**
* Return the tertiary order of the specified collation element, i.e. the last
* 8 bits. This value is unsigned.
* @param ce the collation element
* @return the element's 8 bits tertiary order
* @stable ICU 2.8
*/
public final static int tertiaryOrder(int ce) {
return ce & 0xff;
}
private static final int getFirstHalf(long p, int lower32) {
return ((int)p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
}
private static final int getSecondHalf(long p, int lower32) {
return ((int)p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
}
private static final boolean ceNeedsTwoParts(long ce) {
return (ce & 0xffff00ff003fL) != 0;
}
private CollationElementIterator(RuleBasedCollator collator) {
iter_ = null;
rbc_ = collator;
otherHalf_ = 0;
dir_ = 0;
offsets_ = null;
}
/**
* CollationElementIterator constructor. This takes a source
* string and a RuleBasedCollator. The iterator will walk through
* the source string based on the rules defined by the
* collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().
*
* @param source the source string.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(String source, RuleBasedCollator collator) {
this(collator);
setText(source);
}
// Note: The constructors should take settings & tailoring, not a collator,
// to avoid circular dependencies.
// However, for equals() we would need to be able to compare tailoring data for equality
// without making CollationData or CollationTailoring depend on TailoredSet.
// (See the implementation of RuleBasedCollator.equals().)
// That might require creating an intermediate class that would be used
// by both CollationElementIterator and RuleBasedCollator
// but only contain the part of RBC.equals() related to data and rules.
/**
* CollationElementIterator constructor. This takes a source
* character iterator and a RuleBasedCollator. The iterator will
* walk through the source string based on the rules defined by
* the collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().
*
* @param source the source string iterator.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(CharacterIterator source, RuleBasedCollator collator) {
this(collator);
setText(source);
}
/**
* CollationElementIterator constructor. This takes a source
* character iterator and a RuleBasedCollator. The iterator will
* walk through the source string based on the rules defined by
* the collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().
*
* @param source the source string iterator.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) {
this(collator);
setText(source);
}
/**
* Returns the character offset in the source string
* corresponding to the next collation element. I.e., getOffset()
* returns the position in the source string corresponding to the
* collation element that will be returned by the next call to
* next() or previous(). This value could be any of:
*
* - The index of the first character corresponding to
* the next collation element. (This means that if
*
setOffset(offset)
sets the index in the middle of
* a contraction, getOffset()
returns the index of
* the first character in the contraction, which may not be equal
* to the original offset that was set. Hence calling getOffset()
* immediately after setOffset(offset) does not guarantee that the
* original offset set will be returned.)
* - If normalization is on, the index of the immediate
* subsequent character, or composite character with the first
* character, having a combining class of 0.
*
- The length of the source string, if iteration has reached
* the end.
*
*
* @return The character offset in the source string corresponding to the
* collation element that will be returned by the next call to
* next() or previous().
* @stable ICU 2.8
*/
public int getOffset() {
if (dir_ < 0 && offsets_ != null && !offsets_.isEmpty()) {
// CollationIterator.previousCE() decrements the CEs length
// while it pops CEs from its internal buffer.
int i = iter_.getCEsLength();
if (otherHalf_ != 0) {
// Return the trailing CE offset while we are in the middle of a 64-bit CE.
++i;
}
assert (i < offsets_.size());
return offsets_.elementAti(i);
}
return iter_.getOffset();
}
/**
* Get the next collation element in the source string.
*
* This iterator iterates over a sequence of collation elements
* that were built from the string. Because there isn't
* necessarily a one-to-one mapping from characters to collation
* elements, this doesn't mean the same thing as "return the
* collation element [or ordering priority] of the next character
* in the string".
*
*
This function returns the collation element that the
* iterator is currently pointing to, and then updates the
* internal pointer to point to the next element.
*
* @return the next collation element or NULLORDER if the end of the
* iteration has been reached.
* @stable ICU 2.8
*/
public int next() {
if (dir_ > 1) {
// Continue forward iteration. Test this first.
if (otherHalf_ != 0) {
int oh = otherHalf_;
otherHalf_ = 0;
return oh;
}
} else if (dir_ == 1) {
// next() after setOffset()
dir_ = 2;
} else if (dir_ == 0) {
// The iter_ is already reset to the start of the text.
dir_ = 2;
} else /* dir_ < 0 */{
// illegal change of direction
throw new IllegalStateException("Illegal change of direction");
// Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
}
// No need to keep all CEs in the buffer when we iterate.
iter_.clearCEsIfNoneRemaining();
long ce = iter_.nextCE();
if (ce == Collation.NO_CE) {
return NULLORDER;
}
// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
long p = ce >>> 32;
int lower32 = (int) ce;
int firstHalf = getFirstHalf(p, lower32);
int secondHalf = getSecondHalf(p, lower32);
if (secondHalf != 0) {
otherHalf_ = secondHalf | 0xc0; // continuation CE
}
return firstHalf;
}
/**
* Get the previous collation element in the source string.
*
*
This iterator iterates over a sequence of collation elements
* that were built from the string. Because there isn't
* necessarily a one-to-one mapping from characters to collation
* elements, this doesn't mean the same thing as "return the
* collation element [or ordering priority] of the previous
* character in the string".
*
*
This function updates the iterator's internal pointer to
* point to the collation element preceding the one it's currently
* pointing to and then returns that element, while next() returns
* the current element and then updates the pointer.
*
* @return the previous collation element, or NULLORDER when the start of
* the iteration has been reached.
* @stable ICU 2.8
*/
public int previous() {
if (dir_ < 0) {
// Continue backwards iteration. Test this first.
if (otherHalf_ != 0) {
int oh = otherHalf_;
otherHalf_ = 0;
return oh;
}
} else if (dir_ == 0) {
iter_.resetToOffset(string_.length());
dir_ = -1;
} else if (dir_ == 1) {
// previous() after setOffset()
dir_ = -1;
} else /* dir_ > 1 */{
// illegal change of direction
throw new IllegalStateException("Illegal change of direction");
// Java porting note: ICU4C sets U_INVALID_STATE_ERROR to the return status.
}
if (offsets_ == null) {
offsets_ = new UVector32();
}
// If we already have expansion CEs, then we also have offsets.
// Otherwise remember the trailing offset in case we need to
// write offsets for an artificial expansion.
int limitOffset = iter_.getCEsLength() == 0 ? iter_.getOffset() : 0;
long ce = iter_.previousCE(offsets_);
if (ce == Collation.NO_CE) {
return NULLORDER;
}
// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
long p = ce >>> 32;
int lower32 = (int) ce;
int firstHalf = getFirstHalf(p, lower32);
int secondHalf = getSecondHalf(p, lower32);
if (secondHalf != 0) {
if (offsets_.isEmpty()) {
// When we convert a single 64-bit CE into two 32-bit CEs,
// we need to make this artificial expansion behave like a normal expansion.
// See CollationIterator.previousCE().
offsets_.addElement(iter_.getOffset());
offsets_.addElement(limitOffset);
}
otherHalf_ = firstHalf;
return secondHalf | 0xc0; // continuation CE
}
return firstHalf;
}
/**
* Resets the cursor to the beginning of the string. The next
* call to next() or previous() will return the first and last
* collation element in the string, respectively.
*
*
If the RuleBasedCollator used by this iterator has had its
* attributes changed, calling reset() will reinitialize the
* iterator to use the new attributes.
*
* @stable ICU 2.8
*/
public void reset() {
iter_ .resetToOffset(0);
otherHalf_ = 0;
dir_ = 0;
}
/**
* Sets the iterator to point to the collation element
* corresponding to the character at the specified offset. The
* value returned by the next call to next() will be the collation
* element corresponding to the characters at offset.
*
*
If offset is in the middle of a contracting character
* sequence, the iterator is adjusted to the start of the
* contracting sequence. This means that getOffset() is not
* guaranteed to return the same value set by this method.
*
*
If the decomposition mode is on, and offset is in the middle
* of a decomposible range of source text, the iterator may not
* return a correct result for the next forwards or backwards
* iteration. The user must ensure that the offset is not in the
* middle of a decomposible range.
*
* @param newOffset the character offset into the original source string to
* set. Note that this is not an offset into the corresponding
* sequence of collation elements.
* @stable ICU 2.8
*/
public void setOffset(int newOffset) {
if (0 < newOffset && newOffset < string_.length()) {
int offset = newOffset;
do {
char c = string_.charAt(offset);
if (!rbc_.isUnsafe(c) ||
(Character.isHighSurrogate(c) && !rbc_.isUnsafe(string_.codePointAt(offset)))) {
break;
}
// Back up to before this unsafe character.
--offset;
} while (offset > 0);
if (offset < newOffset) {
// We might have backed up more than necessary.
// For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
// but for text "chu" setOffset(2) should remain at 2
// although we initially back up to offset 0.
// Find the last safe offset no greater than newOffset by iterating forward.
int lastSafeOffset = offset;
do {
iter_.resetToOffset(lastSafeOffset);
do {
iter_.nextCE();
} while ((offset = iter_.getOffset()) == lastSafeOffset);
if (offset <= newOffset) {
lastSafeOffset = offset;
}
} while (offset < newOffset);
newOffset = lastSafeOffset;
}
}
iter_.resetToOffset(newOffset);
otherHalf_ = 0;
dir_ = 1;
}
/**
* Set a new source string for iteration, and reset the offset
* to the beginning of the text.
*
* @param source the new source string for iteration.
* @stable ICU 2.8
*/
public void setText(String source) {
string_ = source; // TODO: do we need to remember the source string in a field?
CollationIterator newIter;
boolean numeric = rbc_.settings.readOnly().isNumeric();
if (rbc_.settings.readOnly().dontCheckFCD()) {
newIter = new UTF16CollationIterator(rbc_.data, numeric, string_, 0);
} else {
newIter = new FCDUTF16CollationIterator(rbc_.data, numeric, string_, 0);
}
iter_ = newIter;
otherHalf_ = 0;
dir_ = 0;
}
/**
* Set a new source string iterator for iteration, and reset the
* offset to the beginning of the text.
*
*
The source iterator's integrity will be preserved since a new copy
* will be created for use.
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
public void setText(UCharacterIterator source) {
string_ = source.getText(); // TODO: do we need to remember the source string in a field?
// Note: In C++, we just setText(source.getText()).
// In Java, we actually operate on a character iterator.
// (The old code apparently did so only for a CharacterIterator;
// for a UCharacterIterator it also just used source.getText()).
// TODO: do we need to remember the cloned iterator in a field?
UCharacterIterator src;
try {
src = (UCharacterIterator) source.clone();
} catch (CloneNotSupportedException e) {
// Fall back to ICU 52 behavior of iterating over the text contents
// of the UCharacterIterator.
setText(source.getText());
return;
}
src.setToStart();
CollationIterator newIter;
boolean numeric = rbc_.settings.readOnly().isNumeric();
if (rbc_.settings.readOnly().dontCheckFCD()) {
newIter = new IterCollationIterator(rbc_.data, numeric, src);
} else {
newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
}
iter_ = newIter;
otherHalf_ = 0;
dir_ = 0;
}
/**
* Set a new source string iterator for iteration, and reset the
* offset to the beginning of the text.
*
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
public void setText(CharacterIterator source) {
// Note: In C++, we just setText(source.getText()).
// In Java, we actually operate on a character iterator.
// TODO: do we need to remember the iterator in a field?
// TODO: apparently we don't clone a CharacterIterator in Java,
// we only clone the text for a UCharacterIterator?? see the old code in the constructors
UCharacterIterator src = new CharacterIteratorWrapper(source);
src.setToStart();
string_ = src.getText(); // TODO: do we need to remember the source string in a field?
CollationIterator newIter;
boolean numeric = rbc_.settings.readOnly().isNumeric();
if (rbc_.settings.readOnly().dontCheckFCD()) {
newIter = new IterCollationIterator(rbc_.data, numeric, src);
} else {
newIter = new FCDIterCollationIterator(rbc_.data, numeric, src, 0);
}
iter_ = newIter;
otherHalf_ = 0;
dir_ = 0;
}
private static final class MaxExpSink implements ContractionsAndExpansions.CESink {
MaxExpSink(Map h) {
maxExpansions = h;
}
@Override
public void handleCE(long ce) {
}
@Override
public void handleExpansion(long ces[], int start, int length) {
if (length <= 1) {
// We do not need to add single CEs into the map.
return;
}
int count = 0; // number of CE "halves"
for (int i = 0; i < length; ++i) {
count += ceNeedsTwoParts(ces[start + i]) ? 2 : 1;
}
// last "half" of the last CE
long ce = ces[start + length - 1];
long p = ce >>> 32;
int lower32 = (int) ce;
int lastHalf = getSecondHalf(p, lower32);
if (lastHalf == 0) {
lastHalf = getFirstHalf(p, lower32);
assert (lastHalf != 0);
} else {
lastHalf |= 0xc0; // old-style continuation CE
}
Integer oldCount = maxExpansions.get(lastHalf);
if (oldCount == null || count > oldCount) {
maxExpansions.put(lastHalf, count);
}
}
private Map maxExpansions;
}
static final Map computeMaxExpansions(CollationData data) {
Map maxExpansions = new HashMap<>();
MaxExpSink sink = new MaxExpSink(maxExpansions);
new ContractionsAndExpansions(null, null, sink, true).forData(data);
return maxExpansions;
}
/**
* Returns the maximum length of any expansion sequence that ends with
* the specified collation element. If there is no expansion with this
* collation element as the last element, returns 1.
*
* @param ce a collation element returned by previous() or next().
* @return the maximum length of any expansion sequence ending
* with the specified collation element.
* @stable ICU 2.8
*/
public int getMaxExpansion(int ce) {
return getMaxExpansion(rbc_.tailoring.maxExpansions, ce);
}
static int getMaxExpansion(Map maxExpansions, int order) {
if (order == 0) {
return 1;
}
Integer max;
if (maxExpansions != null && (max = maxExpansions.get(order)) != null) {
return max;
}
if ((order & 0xc0) == 0xc0) {
// old-style continuation CE
return 2;
} else {
return 1;
}
}
/** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
private byte normalizeDir() {
return dir_ == 1 ? 0 : dir_;
}
/**
* Tests that argument object is equals to this CollationElementIterator.
* Iterators are equal if the objects uses the same RuleBasedCollator,
* the same source text and have the same current position in iteration.
* @param that object to test if it is equals to this
* CollationElementIterator
* @stable ICU 2.8
*/
@Override
public boolean equals(Object that) {
if (that == this) {
return true;
}
if (that instanceof CollationElementIterator) {
CollationElementIterator thatceiter = (CollationElementIterator) that;
return rbc_.equals(thatceiter.rbc_)
&& otherHalf_ == thatceiter.otherHalf_
&& normalizeDir() == thatceiter.normalizeDir()
&& string_.equals(thatceiter.string_)
&& iter_.equals(thatceiter.iter_);
}
return false;
}
/**
* Mock implementation of hashCode(). This implementation always returns a constant
* value. When Java assertion is enabled, this method triggers an assertion failure.
* @stable ICU 2.8
*/
@Override
public int hashCode() {
assert false : "hashCode not designed";
return 42;
}
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public RuleBasedCollator getRuleBasedCollator() {
return rbc_;
}
}