// com.ibm.icu.text.ComposedCharIter Maven / Gradle / Ivy
// Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
/**
 * This class has been deprecated since ICU 2.2.
 * One problem is that this class is not designed to return supplementary characters.
 * Use the Normalizer2 and UCharacter classes instead.
 * <p>
 * <code>ComposedCharIter</code> is an iterator class that returns all
 * of the precomposed characters defined in the Unicode standard, along
 * with their decomposed forms. This is often useful when building
 * data tables (<i>e.g.</i> collation tables) which need to treat composed
 * and decomposed characters equivalently.
 * <p>
 * For example, imagine that you have built a collation table with ordering
 * rules for the {@link Normalizer#DECOMP canonically decomposed} forms of all
 * characters used in a particular language. When you process input text using
 * this table, the text must first be decomposed so that it matches the form
 * used in the table. This can impose a performance penalty that may be
 * unacceptable in some situations.
 * <p>
 * You can avoid this problem by ensuring that the collation table contains
 * rules for both the decomposed <i>and</i> composed versions of each character.
 * To do so, use a <code>ComposedCharIter</code> to iterate through all of the
 * composed characters in Unicode. If the decomposition for that character
 * consists solely of characters that are listed in your ruleset, you can
 * add a new rule for the composed character that makes it equivalent to
 * its decomposition sequence.
 * <p>
 * Note that <code>ComposedCharIter</code> iterates over a <em>static</em> table
 * of the composed characters in Unicode. If you want to iterate over the
 * composed characters in a particular string, use {@link Normalizer} instead.
 * <p>
 * When constructing a <code>ComposedCharIter</code> there is one
 * optional feature that was historically documented:
 * <ul>
 * <li>{@link Normalizer#IGNORE_HANGUL} - Do not iterate over the Hangul
 *     characters and their corresponding Jamo decompositions.
 *     This option is off by default (<i>i.e.</i> Hangul processing is enabled)
 *     since the Unicode standard specifies that Hangul to Jamo
 *     is a canonical decomposition.</li>
 * </ul>
 * Note that the current implementation never reads the <code>options</code>
 * constructor argument, so this option has no effect.
 * <p>
 * <code>ComposedCharIter</code> is currently based on version 2.1.8 of the
 * Unicode Standard.
 * It will be updated as later versions of Unicode are released.
 * <p>
 * Only BMP code points in the range U+0001..U+FFFE are examined, and the
 * iterator is single-pass: once exhausted it stays exhausted.
 * @deprecated ICU 2.2
 */
@Deprecated
///CLOVER:OFF
public final class ComposedCharIter {

    /**
     * Constant that indicates the iteration has completed.
     * {@link #next} returns this value when there are no more composed characters
     * over which to iterate.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public static final char DONE = (char) Normalizer.DONE;

    /**
     * Exclusive upper bound of the scan; U+FFFF itself is never examined.
     */
    private static final int SCAN_LIMIT = 0xFFFF;

    /**
     * Construct a new <code>ComposedCharIter</code>. The iterator will return
     * all Unicode characters with canonical decompositions, including Korean
     * Hangul characters.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public ComposedCharIter() {
        this(false, 0);
    }

    /**
     * Constructs a non-default <code>ComposedCharIter</code> with optional behavior.
     *
     * @param compat    <code>false</code> for canonical decompositions only;
     *                  <code>true</code> for both canonical and compatibility
     *                  decompositions.
     *
     * @param options   Optional decomposition features. None are supported, so this is ignored.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public ComposedCharIter(boolean compat, int options) {
        if (compat) {
            n2impl = Norm2AllModes.getNFKCInstance().impl;
        } else {
            n2impl = Norm2AllModes.getNFCInstance().impl;
        }
    }

    /**
     * Determines whether there are any precomposed Unicode characters not yet returned
     * by {@link #next}.
     * <p>
     * Calling this method does not change the value returned by
     * {@link #decomposition}.
     *
     * @return <code>true</code> if a further call to {@link #next} will return a
     *         composed character, <code>false</code> if it will return {@link #DONE}.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public boolean hasNext() {
        if (nextChar == Normalizer.DONE) {
            findNextChar();
        }
        return nextChar != Normalizer.DONE;
    }

    /**
     * Returns the next precomposed Unicode character.
     * Repeated calls to <code>next</code> return all of the precomposed characters defined
     * by Unicode, in ascending order. After all precomposed characters have
     * been returned, {@link #hasNext} will return <code>false</code> and further calls
     * to <code>next</code> will return {@link #DONE}.
     *
     * @return the next composed character, or {@link #DONE} when the iteration is complete.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public char next() {
        if (nextChar == Normalizer.DONE) {
            findNextChar();
        }
        final int result = nextChar;
        // Promote the look-ahead to "current" so decomposition() always describes
        // the character returned here (null, i.e. "", once the iteration is done).
        curDecomp = nextDecomp;
        // Consume the look-ahead; the scan cursor already points past it.
        nextChar = Normalizer.DONE;
        nextDecomp = null;
        return (char) result;
    }

    /**
     * Returns the Unicode decomposition of the current character.
     * This method returns the decomposition of the precomposed character most
     * recently returned by {@link #next}. The resulting decomposition is
     * affected by the settings of the options passed to the constructor.
     *
     * @return the decomposition, or the empty string if {@link #next} has not
     *         been called yet or last returned {@link #DONE}.
     * @deprecated ICU 2.2
     */
    @Deprecated
    public String decomposition() {
        return curDecomp != null ? curDecomp : "";
    }

    /**
     * Advances the scan cursor to the next code point that has a decomposition
     * and stores it, together with that decomposition, as the look-ahead.
     * If the scan reaches the limit, the look-ahead is set to
     * {@link Normalizer#DONE}; the cursor then stays at the limit so later
     * calls return immediately instead of rescanning or wrapping around.
     */
    private void findNextChar() {
        nextChar = Normalizer.DONE;
        nextDecomp = null;
        while (scanPos < SCAN_LIMIT) {
            final int c = scanPos++;
            final String decomp = n2impl.getDecomposition(c);
            if (decomp != null) {
                // c can be decomposed, so it is a composed char; cache the result.
                nextChar = c;
                nextDecomp = decomp;
                return;
            }
        }
    }

    /** Normalization data used to look up decompositions (NFC or NFKC, chosen in the constructor). */
    private final Normalizer2Impl n2impl;
    /** Decomposition of the character most recently returned by next(), or null. */
    private String curDecomp;
    /** Decomposition of the pending look-ahead character, or null when there is none. */
    private String nextDecomp;
    /** First code point that has not been examined yet; only ever increases. */
    private int scanPos = 1;
    /** Look-ahead character found but not yet returned, or Normalizer.DONE when there is none. */
    private int nextChar = Normalizer.DONE;
}