// com.ibm.icu.impl.coll.ContractionsAndExpansions (source listing; Maven / Gradle / Ivy artifact page)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* ContractionsAndExpansions.java, ported from collationsets.h/.cpp
*
* C++ version created on: 2013feb09
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import java.util.Iterator;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrie.Entry;
/**
 * Walks the CE32 mappings of a {@link CollationData} object and records which code points
 * and strings map to contractions or expansions.
 *
 * <p>Results are added to the caller-supplied {@link UnicodeSet}s (either may be {@code null}
 * to skip that kind of collection). If a {@link CESink} is supplied, the CEs that are
 * encountered are forwarded to it as well.
 *
 * <p>Ported from ICU4C collationsets.h/.cpp.
 */
public final class ContractionsAndExpansions {
    // C++: The following fields are @internal, only public for access by callback.
    // In this Java port they are private.

    /** The data currently being enumerated (tailoring or base). */
    private CollationData data;
    /** Receives contraction strings; may be null. */
    private UnicodeSet contractions;
    /** Receives expansion code points and strings; may be null. */
    private UnicodeSet expansions;
    /** Receives the CEs found during enumeration; may be null. */
    private CESink sink;
    /** Whether prefix (pre-context) mappings are enumerated in addition to the default mapping. */
    private boolean addPrefixes;
    private int checkTailored = 0; // -1: collected tailored +1: exclude tailored
    /** Code points that have a mapping in the tailoring data. */
    private UnicodeSet tailored = new UnicodeSet();
    /** Scratch set, lazily created, for splitting a base range around tailored code points. */
    private UnicodeSet ranges;
    /** The prefix currently being processed, in reading order; empty when there is none. */
    private StringBuilder unreversedPrefix = new StringBuilder();
    /** The contraction suffix currently being processed, or null when there is none. */
    private String suffix;
    /** Scratch buffer for CEs that are converted from CE32 values before going to the sink. */
    private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];

    /** Callback that receives the CEs found while enumerating mappings. */
    public static interface CESink {
        /**
         * Receives a single CE.
         *
         * @param ce the CE
         */
        void handleCE(long ce);

        /**
         * Receives a sequence of CEs.
         *
         * @param ces    array that holds the CEs; only [start, start + length) is meaningful
         * @param start  index of the first CE
         * @param length number of CEs
         */
        void handleExpansion(long[] ces, int start, int length);
    }

    /**
     * @param con      set that receives contraction strings, or null
     * @param exp      set that receives expansion code points and strings, or null
     * @param s        sink that receives the CEs, or null
     * @param prefixes true to also enumerate prefix (pre-context) mappings
     */
    public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
        contractions = con;
        expansions = exp;
        sink = s;
        addPrefixes = prefixes;
    }

    /**
     * Enumerates all mappings in {@code d}. If {@code d} has base data, the base mappings
     * are enumerated afterwards, restricted to code points that {@code d} does not tailor.
     *
     * @param d the collation data, either a tailoring or base data
     */
    public void forData(CollationData d) {
        // Add all from the data, can be tailoring or base.
        if (d.base != null) {
            checkTailored = -1;
        }
        data = d;
        enumerateTrie();
        if (d.base == null) {
            return;
        }
        // Add all from the base data but only for un-tailored code points.
        tailored.freeze();
        checkTailored = 1;
        data = d.base;
        enumerateTrie();
    }

    /** Feeds every range of the current data's trie, up to the lead surrogates, to enumCnERange. */
    private void enumerateTrie() {
        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
        Trie2.Range range;
        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
            enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
        }
    }

    /**
     * Handles one trie range, taking the tailoring state into account.
     *
     * <ul>
     * <li>checkTailored == 0: the range is handled as is.
     * <li>checkTailored &lt; 0: the range is recorded as tailored and handled,
     *     unless it just falls back to the base data.
     * <li>checkTailored &gt; 0: only the part of the range that is not tailored is handled.
     * </ul>
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     * @param ce32  the CE32 value shared by the range
     * @param cne   the enumeration state (a remnant of the C++ callback signature)
     */
    private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
        if (cne.checkTailored == 0) {
            // There is no tailoring.
            // No need to collect nor check the tailored set.
        } else if (cne.checkTailored < 0) {
            // Collect the set of code points with mappings in the tailoring data.
            if (ce32 == Collation.FALLBACK_CE32) {
                return; // fallback to base, not tailored
            }
            cne.tailored.add(start, end);
        } else if (start == end) {
            // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
            if (cne.tailored.contains(start)) {
                return;
            }
        } else if (cne.tailored.containsSome(start, end)) {
            // The range is partly tailored: handle only the un-tailored sub-ranges.
            if (cne.ranges == null) {
                cne.ranges = new UnicodeSet();
            }
            cne.ranges.set(start, end).removeAll(cne.tailored);
            int count = cne.ranges.getRangeCount();
            for (int i = 0; i < count; ++i) {
                cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
            }
            // Must not fall through: handling [start, end] as a whole would process the
            // un-tailored sub-ranges twice and would include the tailored code points.
            return;
        }
        cne.handleCE32(start, end, ce32);
    }

    /**
     * Handles the mapping of a single code point, looking it up in the base data
     * if {@code d} only has a fallback entry for it.
     *
     * @param d the collation data
     * @param c the code point
     */
    public void forCodePoint(CollationData d, int c) {
        int ce32 = d.getCE32(c);
        if (ce32 == Collation.FALLBACK_CE32) {
            d = d.base;
            ce32 = d.getCE32(c);
        }
        data = d;
        handleCE32(c, c, ce32);
    }

    /**
     * Dispatches on the kind of {@code ce32}: sends CEs to the sink, records expansions,
     * and descends into prefix and contraction data. Tags that merely redirect to
     * another CE32 (digit, U+0000) are resolved by looping.
     *
     * @param start first code point that maps to ce32
     * @param end   last code point (inclusive) that maps to ce32
     * @param ce32  the CE32 value to handle
     * @throws AssertionError if ce32 carries a tag that must not occur in runtime data
     */
    private void handleCE32(int start, int end, int ce32) {
        for (;;) {
            if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
                // !isSpecialCE32()
                if (sink != null) {
                    sink.handleCE(Collation.ceFromSimpleCE32(ce32));
                }
                return;
            }
            switch (Collation.tagFromCE32(ce32)) {
            case Collation.FALLBACK_TAG:
                return;
            case Collation.RESERVED_TAG_3:
            case Collation.BUILDER_DATA_TAG:
            case Collation.LEAD_SURROGATE_TAG:
                // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
                throw new AssertionError(
                        String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
                                Collation.tagFromCE32(ce32), ce32));
            case Collation.LONG_PRIMARY_TAG:
                if (sink != null) {
                    sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
                }
                return;
            case Collation.LONG_SECONDARY_TAG:
                if (sink != null) {
                    sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
                }
                return;
            case Collation.LATIN_EXPANSION_TAG:
                if (sink != null) {
                    ces[0] = Collation.latinCE0FromCE32(ce32);
                    ces[1] = Collation.latinCE1FromCE32(ce32);
                    sink.handleExpansion(ces, 0, 2);
                }
                addExpansionsUnlessPrefixed(start, end);
                return;
            case Collation.EXPANSION32_TAG:
                if (sink != null) {
                    int idx = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    // Convert the stored CE32s into the scratch buffer.
                    for (int i = 0; i < length; ++i) {
                        ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
                    }
                    sink.handleExpansion(ces, 0, length);
                }
                addExpansionsUnlessPrefixed(start, end);
                return;
            case Collation.EXPANSION_TAG:
                if (sink != null) {
                    int idx = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    sink.handleExpansion(data.ces, idx, length);
                }
                addExpansionsUnlessPrefixed(start, end);
                return;
            case Collation.PREFIX_TAG:
                handlePrefixes(start, end, ce32);
                return;
            case Collation.CONTRACTION_TAG:
                handleContractions(start, end, ce32);
                return;
            case Collation.DIGIT_TAG:
                // Fetch the non-numeric-collation CE32 and continue.
                ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
                break;
            case Collation.U0000_TAG:
                assert (start == 0 && end == 0);
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = data.ce32s[0];
                break;
            case Collation.HANGUL_TAG:
                if (sink != null) {
                    // TODO: This should be optimized,
                    // especially if [start..end] is the complete Hangul range. (assert that)
                    UTF16CollationIterator iter = new UTF16CollationIterator(data);
                    StringBuilder hangul = new StringBuilder(1);
                    for (int c = start; c <= end; ++c) {
                        hangul.setLength(0);
                        hangul.appendCodePoint(c);
                        iter.setText(false, hangul, 0);
                        int length = iter.fetchCEs();
                        // Ignore the terminating non-CE.
                        assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
                        sink.handleExpansion(iter.getCEs(), 0, length - 1);
                    }
                }
                addExpansionsUnlessPrefixed(start, end);
                return;
            case Collation.OFFSET_TAG:
                // Currently no need to send offset CEs to the sink.
                return;
            case Collation.IMPLICIT_TAG:
                // Currently no need to send implicit CEs to the sink.
                return;
            }
        }
    }

    /**
     * Records [start, end] as expansions unless a prefix is being processed.
     * Optimization: If we have a prefix, then the relevant strings have been added already.
     */
    private void addExpansionsUnlessPrefixed(int start, int end) {
        if (unreversedPrefix.length() == 0) {
            addExpansions(start, end);
        }
    }

    /**
     * Handles a prefix CE32: first the default mapping that applies when no prefix matches,
     * then, if prefixes were requested, the mapping for each prefix in the context data.
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     * @param ce32  a CE32 with the prefix tag
     */
    private void handlePrefixes(int start, int end, int ce32) {
        int index = Collation.indexFromCE32(ce32);
        ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
        handleCE32(start, end, ce32);
        if (!addPrefixes) {
            return;
        }
        CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
        while (prefixes.hasNext()) {
            Entry e = prefixes.next();
            setPrefix(e.chars);
            // Prefix/pre-context mappings are special kinds of contractions
            // that always yield expansions.
            addStrings(start, end, contractions);
            addStrings(start, end, expansions);
            handleCE32(start, end, e.value);
        }
        resetPrefix();
    }

    /**
     * Handles a contraction CE32: first the default mapping that applies when no suffix
     * matches (if there is one), then the mapping for each suffix in the context data.
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     * @param ce32  a CE32 with the contraction tag
     */
    void handleContractions(int start, int end, int ce32) {
        int index = Collation.indexFromCE32(ce32);
        if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
            // No match on the single code point.
            // We are underneath a prefix, and the default mapping is just
            // a fallback to the mappings for a shorter prefix.
            assert (unreversedPrefix.length() != 0);
        } else {
            ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
            assert (!Collation.isContractionCE32(ce32));
            handleCE32(start, end, ce32);
        }
        CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
        while (suffixes.hasNext()) {
            Entry e = suffixes.next();
            suffix = e.chars.toString();
            addStrings(start, end, contractions);
            if (unreversedPrefix.length() != 0) {
                // Under a prefix, the contraction string is recorded as an expansion too.
                addStrings(start, end, expansions);
            }
            handleCE32(start, end, e.value);
        }
        suffix = null;
    }

    /**
     * Records the range as expansions: as plain code points when neither a prefix nor a
     * suffix is active, otherwise as prefix + code point + suffix strings.
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     */
    void addExpansions(int start, int end) {
        if (unreversedPrefix.length() == 0 && suffix == null) {
            if (expansions != null) {
                expansions.add(start, end);
            }
        } else {
            addStrings(start, end, expansions);
        }
    }

    /**
     * Adds one string per code point in [start, end] to {@code set}, each built from the
     * current prefix, the code point, and the current suffix (if any).
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     * @param set   the set to add to; nothing happens if it is null
     */
    void addStrings(int start, int end, UnicodeSet set) {
        if (set == null) {
            return;
        }
        StringBuilder s = new StringBuilder(unreversedPrefix);
        do {
            s.appendCodePoint(start);
            if (suffix != null) {
                s.append(suffix);
            }
            set.add(s);
            // Truncate back to just the prefix for the next code point.
            s.setLength(unreversedPrefix.length());
        } while (++start <= end);
    }

    // Prefixes are reversed in the data structure.
    /** Stores {@code pfx}, un-reversed, as the current prefix. */
    private void setPrefix(CharSequence pfx) {
        unreversedPrefix.setLength(0);
        unreversedPrefix.append(pfx).reverse();
    }

    /** Clears the current prefix. */
    private void resetPrefix() {
        unreversedPrefix.setLength(0);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy