com.ibm.icu.impl.coll.ContractionsAndExpansions
International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.
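ContractionsAndExpansions is an internal implementation class of the ICU4J collation service. Applications normally reach this functionality through the public API; the snippet below is a minimal usage sketch, assuming the RuleBasedCollator.getContractionsAndExpansions(UnicodeSet, UnicodeSet, boolean) wrapper exposed by ICU4J. The demo class name and the choice of locale are illustrative only.

import java.util.Locale;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UnicodeSet;

public class ContractionsAndExpansionsDemo {
    public static void main(String[] args) throws Exception {
        // A tailored collator; the Danish tailoring traditionally contracts "aa".
        RuleBasedCollator coll =
                (RuleBasedCollator) Collator.getInstance(new Locale("da", "DK"));

        UnicodeSet contractions = new UnicodeSet();
        UnicodeSet expansions = new UnicodeSet();
        // addPrefixes = false: do not add prefix-context strings to the sets.
        coll.getContractionsAndExpansions(contractions, expansions, false);

        System.out.println("contractions: " + contractions.toPattern(true));
        System.out.println("expansions:   " + expansions.toPattern(true));
    }
}

With addPrefixes set to true, prefix-context strings (see handlePrefixes below) would be added to both sets as well.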
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* ContractionsAndExpansions.java, ported from collationsets.h/.cpp
*
* C++ version created on: 2013feb09
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import java.util.Iterator;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrie.Entry;
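/**
 * Walks the collation data (tailoring and/or base) and collects its contraction
 * and expansion strings into optional caller-supplied UnicodeSets, optionally
 * reporting the corresponding CEs to a {@link CESink}.
 * Ported from the ICU4C ContractionsAndExpansions in collationsets.h/.cpp.
 */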
public final class ContractionsAndExpansions {
// C++: The following fields are @internal, only public for access by callback.
private CollationData data;
private UnicodeSet contractions;
private UnicodeSet expansions;
private CESink sink;
private boolean addPrefixes;
    private int checkTailored = 0; // 0: no tailoring; -1: collect the tailored set; +1: exclude tailored ranges
private UnicodeSet tailored = new UnicodeSet();
private UnicodeSet ranges;
private StringBuilder unreversedPrefix = new StringBuilder();
private String suffix;
private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
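    /**
     * Optional callback that receives the collation elements of each enumerated
     * mapping: single CEs via handleCE(), multi-CE expansions via handleExpansion().
     */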
public static interface CESink {
void handleCE(long ce);
void handleExpansion(long ces[], int start, int length);
}
public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
contractions = con;
expansions = exp;
sink = s;
addPrefixes = prefixes;
}
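    /**
     * Enumerates all mappings of the given data. For a tailoring (d.base != null),
     * a first pass collects the tailored code points, then a second pass adds
     * base mappings only for code points that are not tailored.
     */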
public void forData(CollationData d) {
// Add all from the data, can be tailoring or base.
if (d.base != null) {
checkTailored = -1;
}
data = d;
        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
Trie2.Range range;
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
}
if (d.base == null) {
return;
}
// Add all from the base data but only for un-tailored code points.
tailored.freeze();
checkTailored = 1;
data = d.base;
trieIterator = data.trie.iterator();
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
}
}
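    // Called for each range of the trie enumeration. Depending on checkTailored,
    // it collects tailored code points (tailoring pass), skips or splits ranges
    // that overlap the tailored set (base pass), or simply forwards to handleCE32().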
private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
if (cne.checkTailored == 0) {
// There is no tailoring.
// No need to collect nor check the tailored set.
} else if (cne.checkTailored < 0) {
// Collect the set of code points with mappings in the tailoring data.
if (ce32 == Collation.FALLBACK_CE32) {
return; // fallback to base, not tailored
} else {
cne.tailored.add(start, end);
}
// checkTailored > 0: Exclude tailored ranges from the base data enumeration.
} else if (start == end) {
if (cne.tailored.contains(start)) {
return;
}
} else if (cne.tailored.containsSome(start, end)) {
if (cne.ranges == null) {
cne.ranges = new UnicodeSet();
}
cne.ranges.set(start, end).removeAll(cne.tailored);
int count = cne.ranges.getRangeCount();
for (int i = 0; i < count; ++i) {
cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
            }
            return; // Skip the final handleCE32(); the un-tailored subranges were handled above.
        }
cne.handleCE32(start, end, ce32);
}
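    /**
     * Enumerates the mapping of a single code point, following the fallback
     * to the base data if necessary.
     */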
public void forCodePoint(CollationData d, int c) {
int ce32 = d.getCE32(c);
if (ce32 == Collation.FALLBACK_CE32) {
d = d.base;
ce32 = d.getCE32(c);
}
data = d;
handleCE32(c, c, ce32);
}
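    // Dispatches on the CE32 tag for [start..end]: reports simple CEs and
    // expansions to the sink, records expansion strings, recurses into prefix
    // and contraction contexts, and re-fetches the underlying CE32 for digit
    // and U+0000 mappings before looping again.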
private void handleCE32(int start, int end, int ce32) {
for (;;) {
if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
// !isSpecialCE32()
if (sink != null) {
sink.handleCE(Collation.ceFromSimpleCE32(ce32));
}
return;
}
switch (Collation.tagFromCE32(ce32)) {
case Collation.FALLBACK_TAG:
return;
case Collation.RESERVED_TAG_3:
case Collation.BUILDER_DATA_TAG:
case Collation.LEAD_SURROGATE_TAG:
// Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
throw new AssertionError(
String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
Collation.tagFromCE32(ce32), ce32));
case Collation.LONG_PRIMARY_TAG:
if (sink != null) {
sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
}
return;
case Collation.LONG_SECONDARY_TAG:
if (sink != null) {
sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
}
return;
case Collation.LATIN_EXPANSION_TAG:
if (sink != null) {
ces[0] = Collation.latinCE0FromCE32(ce32);
ces[1] = Collation.latinCE1FromCE32(ce32);
sink.handleExpansion(ces, 0, 2);
}
// Optimization: If we have a prefix,
// then the relevant strings have been added already.
if (unreversedPrefix.length() == 0) {
addExpansions(start, end);
}
return;
case Collation.EXPANSION32_TAG:
if (sink != null) {
int idx = Collation.indexFromCE32(ce32);
int length = Collation.lengthFromCE32(ce32);
for (int i = 0; i < length; ++i) {
ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
}
sink.handleExpansion(ces, 0, length);
}
// Optimization: If we have a prefix,
// then the relevant strings have been added already.
if (unreversedPrefix.length() == 0) {
addExpansions(start, end);
}
return;
case Collation.EXPANSION_TAG:
if (sink != null) {
int idx = Collation.indexFromCE32(ce32);
int length = Collation.lengthFromCE32(ce32);
sink.handleExpansion(data.ces, idx, length);
}
// Optimization: If we have a prefix,
// then the relevant strings have been added already.
if (unreversedPrefix.length() == 0) {
addExpansions(start, end);
}
return;
case Collation.PREFIX_TAG:
handlePrefixes(start, end, ce32);
return;
case Collation.CONTRACTION_TAG:
handleContractions(start, end, ce32);
return;
case Collation.DIGIT_TAG:
// Fetch the non-numeric-collation CE32 and continue.
ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
break;
case Collation.U0000_TAG:
assert (start == 0 && end == 0);
// Fetch the normal ce32 for U+0000 and continue.
ce32 = data.ce32s[0];
break;
case Collation.HANGUL_TAG:
if (sink != null) {
// TODO: This should be optimized,
// especially if [start..end] is the complete Hangul range. (assert that)
UTF16CollationIterator iter = new UTF16CollationIterator(data);
StringBuilder hangul = new StringBuilder(1);
for (int c = start; c <= end; ++c) {
hangul.setLength(0);
hangul.appendCodePoint(c);
iter.setText(false, hangul, 0);
int length = iter.fetchCEs();
// Ignore the terminating non-CE.
assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
sink.handleExpansion(iter.getCEs(), 0, length - 1);
}
}
// Optimization: If we have a prefix,
// then the relevant strings have been added already.
if (unreversedPrefix.length() == 0) {
addExpansions(start, end);
}
return;
case Collation.OFFSET_TAG:
// Currently no need to send offset CEs to the sink.
return;
case Collation.IMPLICIT_TAG:
// Currently no need to send implicit CEs to the sink.
return;
}
}
}
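    // Handles a PREFIX_TAG CE32: processes the default (no-prefix) mapping first,
    // then, if addPrefixes is set, each prefix-context mapping; prefix mappings
    // also contribute to the contractions and expansions sets.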
private void handlePrefixes(int start, int end, int ce32) {
int index = Collation.indexFromCE32(ce32);
ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
handleCE32(start, end, ce32);
if (!addPrefixes) {
return;
}
CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
while (prefixes.hasNext()) {
Entry e = prefixes.next();
setPrefix(e.chars);
// Prefix/pre-context mappings are special kinds of contractions
// that always yield expansions.
addStrings(start, end, contractions);
addStrings(start, end, expansions);
handleCE32(start, end, e.value);
}
resetPrefix();
}
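    // Handles a CONTRACTION_TAG CE32: processes the default (no-suffix) mapping
    // unless this node exists only under a prefix (CONTRACT_SINGLE_CP_NO_MATCH),
    // then each suffix mapping, adding the contraction strings to the sets.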
void handleContractions(int start, int end, int ce32) {
int index = Collation.indexFromCE32(ce32);
if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
// No match on the single code point.
// We are underneath a prefix, and the default mapping is just
// a fallback to the mappings for a shorter prefix.
assert (unreversedPrefix.length() != 0);
} else {
ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
assert (!Collation.isContractionCE32(ce32));
handleCE32(start, end, ce32);
}
CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
while (suffixes.hasNext()) {
Entry e = suffixes.next();
suffix = e.chars.toString();
addStrings(start, end, contractions);
if (unreversedPrefix.length() != 0) {
addStrings(start, end, expansions);
}
handleCE32(start, end, e.value);
}
suffix = null;
}
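    // Records [start..end] in the expansions set: as plain code points when there
    // is no prefix/suffix context, otherwise as prefix+cp+suffix strings.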
void addExpansions(int start, int end) {
if (unreversedPrefix.length() == 0 && suffix == null) {
if (expansions != null) {
expansions.add(start, end);
}
} else {
addStrings(start, end, expansions);
}
}
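    // Adds unreversedPrefix + code point + suffix to the given set
    // for every code point in [start..end].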
void addStrings(int start, int end, UnicodeSet set) {
if (set == null) {
return;
}
StringBuilder s = new StringBuilder(unreversedPrefix);
do {
s.appendCodePoint(start);
if (suffix != null) {
s.append(suffix);
}
set.add(s);
s.setLength(unreversedPrefix.length());
} while (++start <= end);
}
// Prefixes are reversed in the data structure.
private void setPrefix(CharSequence pfx) {
unreversedPrefix.setLength(0);
unreversedPrefix.append(pfx).reverse();
}
private void resetPrefix() {
unreversedPrefix.setLength(0);
}
}