com.ibm.icu.text.CanonicalIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
* Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
*
1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
*
*
Note: the code is intended for use with small strings, and is not suitable for larger ones,
* since it has not been optimized for that situation.
* @author M. Davis
* @stable ICU 2.4
*/
public final class CanonicalIterator {
/**
* Construct a CanonicalIterator object
* @param source string to get results for
* @stable ICU 2.4
*/
public CanonicalIterator(String source) {
Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
nfd = allModes.decomp;
nfcImpl = allModes.impl.ensureCanonIterData();
setSource(source);
}
/**
* Gets the NFD form of the current source we are iterating over.
* @return gets the source: NOTE: it is the NFD form of the source originally passed in
* @stable ICU 2.4
*/
public String getSource() {
return source;
}
/**
* Resets the iterator so that one can start again from the beginning.
* @stable ICU 2.4
*/
public void reset() {
done = false;
for (int i = 0; i < current.length; ++i) {
current[i] = 0;
}
}
/**
* Get the next canonically equivalent string.
*
Warning: The strings are not guaranteed to be in any particular order.
* @return the next string that is canonically equivalent. The value null is returned when
* the iteration is done.
* @stable ICU 2.4
*/
public String next() {
if (done) return null;
// construct return value
buffer.setLength(0); // delete old contents
for (int i = 0; i < pieces.length; ++i) {
buffer.append(pieces[i][current[i]]);
}
String result = buffer.toString();
// find next value for next time
for (int i = current.length - 1; ; --i) {
if (i < 0) {
done = true;
break;
}
current[i]++;
if (current[i] < pieces[i].length) break; // got sequence
current[i] = 0;
}
return result;
}
/**
* Set a new source for this iterator. Allows object reuse.
* @param newSource the source string to iterate against. This allows the same iterator to be used
* while changing the source string, saving object creation.
* @stable ICU 2.4
*/
public void setSource(String newSource) {
source = nfd.normalize(newSource);
done = false;
// catch degenerate case
if (newSource.length() == 0) {
pieces = new String[1][];
current = new int[1];
pieces[0] = new String[]{""};
return;
}
// find the segments
List segmentList = new ArrayList();
int cp;
int start = 0;
// i should be the end of the first code point
// break up the string into segements
int i = UTF16.findOffsetFromCodePoint(source, 1);
for (; i < source.length(); i += Character.charCount(cp)) {
cp = source.codePointAt(i);
if (nfcImpl.isCanonSegmentStarter(cp)) {
segmentList.add(source.substring(start, i)); // add up to i
start = i;
}
}
segmentList.add(source.substring(start, i)); // add last one
// allocate the arrays, and find the strings that are CE to each segment
pieces = new String[segmentList.size()][];
current = new int[segmentList.size()];
for (i = 0; i < pieces.length; ++i) {
if (PROGRESS) System.out.println("SEGMENT");
pieces[i] = getEquivalents(segmentList.get(i));
}
}
/**
* Simple implementation of permutation.
*
Warning: The strings are not guaranteed to be in any particular order.
* @param source the string to find permutations for
* @param skipZeros set to true to skip characters with canonical combining class zero
* @param output the set to add the results to
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public static void permute(String source, boolean skipZeros, Set output) {
// TODO: optimize
//if (PROGRESS) System.out.println("Permute: " + source);
// optimization:
// if zero or one character, just return a set with it
// we check for length < 2 to keep from counting code points all the time
if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
output.add(source);
return;
}
// otherwise iterate through the string, and recursively permute all the other characters
Set subpermute = new HashSet();
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
// optimization:
// if the character is canonical combining class zero,
// don't permute it
if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
continue;
}
// see what the permutations of the characters before and after this one are
subpermute.clear();
permute(source.substring(0,i)
+ source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
// prefix this character to all of them
String chStr = UTF16.valueOf(source, i);
for (String s : subpermute) {
String piece = chStr + s;
//if (PROGRESS) System.out.println(" Piece: " + piece);
output.add(piece);
}
}
}
// FOR TESTING
/*
*@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
*
public static UnicodeSet getSafeStart() {
return (UnicodeSet) SAFE_START.clone();
}
*/
/*
*@return the set of characters whose decompositions start with the given character
*
public static UnicodeSet getStarts(int cp) {
UnicodeSet result = AT_START.get(cp);
if (result == null) result = EMPTY;
return (UnicodeSet) result.clone();
}
*/
// ===================== PRIVATES ==============================
// debug
private static boolean PROGRESS = false; // debug progress
//private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
private static boolean SKIP_ZEROS = true;
// fields
private final Normalizer2 nfd;
private final Normalizer2Impl nfcImpl;
private String source;
private boolean done;
private String[][] pieces;
private int[] current;
// Note: C will need two more fields, since arrays there don't have lengths
// int pieces_length;
// int[] pieces_lengths;
// transient fields
private transient StringBuilder buffer = new StringBuilder();
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
private String[] getEquivalents(String segment) {
Set result = new HashSet();
Set basic = getEquivalents2(segment);
Set permutations = new HashSet();
// now get all the permutations
// add only the ones that are canonically equivalent
// TODO: optimize by not permuting any class zero.
Iterator it = basic.iterator();
while (it.hasNext()) {
String item = it.next();
permutations.clear();
permute(item, SKIP_ZEROS, permutations);
Iterator it2 = permutations.iterator();
while (it2.hasNext()) {
String possible = it2.next();
/*
String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
if (attempt.equals(segment)) {
*/
if (Normalizer.compare(possible, segment,0)==0) {
if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
result.add(possible);
} else {
if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
}
}
}
// convert into a String[] to clean up storage
String[] finalResult = new String[result.size()];
result.toArray(finalResult);
return finalResult;
}
private Set getEquivalents2(String segment) {
Set result = new HashSet();
if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
UnicodeSet starts = new UnicodeSet();
// cycle through all the characters
int cp;
for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
// see if any character is at the start of some decomposition
cp = segment.codePointAt(i);
if (!nfcImpl.getCanonStartSet(cp, starts)) {
continue;
}
// if so, see which decompositions match
for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
int cp2 = iter.codepoint;
Set remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) {
continue;
}
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
for (String item : remainder) {
result.add(prefix + item);
}
}
}
return result;
/*
Set result = new HashSet();
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
// cycle through all the characters
int cp;
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);
NormalizerImpl.getCanonStartSet(c,fillSet)
UnicodeSet starts = AT_START.get(cp);
if (starts == null) continue;
UnicodeSetIterator usi = new UnicodeSetIterator(starts);
// if so, see which decompositions match
while (usi.next()) {
int cp2 = usi.codepoint;
// we know that there are no strings in it
// so we don't have to check CharacterIterator.IS_STRING
Set remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) continue;
// there were some matches, so add all the possibilities to the set.
String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
Iterator it = remainder.iterator();
while (it.hasNext()) {
String item = (String) it.next();
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
result.add(prefix + item);
}
}
}
return result;
*/
}
/**
* See if the decomposition of cp2 is at segment starting at segmentPos
* (with canonical rearrangment!)
* If so, take the remainder, and return the equivalents
*/
private Set extract(int comp, String segment, int segmentPos, StringBuffer buf) {
if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
+ ", " + Utility.hex(segment.substring(segmentPos)));
String decomp = nfcImpl.getDecomposition(comp);
if (decomp == null) {
decomp = UTF16.valueOf(comp);
}
// See if it matches the start of segment (at segmentPos)
boolean ok = false;
int cp;
int decompPos = 0;
int decompCp = UTF16.charAt(decomp,0);
decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
//int decompClass = getClass(decompCp);
buf.setLength(0); // initialize working buffer, shared among callees
for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(segment, i);
if (cp == decompCp) { // if equal, eat another cp from decomp
if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp)));
if (decompPos == decomp.length()) { // done, have all decomp characters!
buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
ok = true;
break;
}
decompCp = UTF16.charAt(decomp, decompPos);
decompPos += UTF16.getCharCount(decompCp);
//decompClass = getClass(decompCp);
} else {
if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp)));
// brute force approach
UTF16.append(buf, cp);
/* TODO: optimize
// since we know that the classes are monotonically increasing, after zero
// e.g. 0 5 7 9 0 3
// we can do an optimization
// there are only a few cases that work: zero, less, same, greater
// if both classes are the same, we fail
// if the decomp class < the segment class, we fail
segClass = getClass(cp);
if (decompClass <= segClass) return null;
*/
}
}
if (!ok) return null; // we failed, characters left over
if (PROGRESS) System.out.println("Matches");
if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
String remainder = buf.toString();
// brute force approach
// to check to make sure result is canonically equivalent
/*
String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
*/
if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
// get the remaining combinations
return getEquivalents2(remainder);
}
/*
// TODO: fix once we have a codepoint interface to get the canonical combining class
// TODO: Need public access to canonical combining class in UCharacter!
private static int getClass(int cp) {
return Normalizer.getClass((char)cp);
}
*/
// ================= BUILDER =========================
// TODO: Flatten this data so it doesn't have to be reconstructed each time!
//private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
static {
SET_WITH_NULL_STRING.add("");
}
// private static UnicodeSet SAFE_START = new UnicodeSet();
// private static CharMap AT_START = new CharMap();
// TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
// Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
// private static int LAST_UNICODE = 0x10FFFF;
/*
static {
buildData();
}
*/
/*
private static void buildData() {
if (PROGRESS) System.out.println("Getting Safe Start");
for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
int cc = UCharacter.getCombiningClass(cp);
if (cc == 0) SAFE_START.add(cp);
// will fix to be really safe below
}
if (PROGRESS) System.out.println();
if (PROGRESS) System.out.println("Getting Containment");
for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
//String istr = UTF16.valueOf(cp);
String decomp = Normalizer.normalize(cp, Normalizer.NFD);
//if (decomp.equals(istr)) continue;
// add each character in the decomposition to canBeIn
int component;
for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
component = UTF16.charAt(decomp, i);
if (i == 0) {
AT_START.add(component, cp);
} else if (UCharacter.getCombiningClass(component) == 0) {
SAFE_START.remove(component);
}
}
}
if (PROGRESS) System.out.println();
}
// the following is just for a map from characters to a set of characters
private static class CharMap {
Map storage = new HashMap();
MutableInt probe = new MutableInt();
boolean converted = false;
public void add(int cp, int whatItIsIn) {
UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
if (result == null) {
result = new UnicodeSet();
storage.put(probe, result);
}
result.add(whatItIsIn);
}
public UnicodeSet get(int cp) {
return (UnicodeSet) storage.get(probe.set(cp));
}
}
private static class MutableInt {
public int contents;
public int hashCode() { return contents; }
public boolean equals(Object other) {
return ((MutableInt)other).contents == contents;
}
// allows chaining
public MutableInt set(int contents) {
this.contents = contents;
return this;
}
}
*/
}