com.ibm.icu.impl.coll.CollationData Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationData.java, ported from collationdata.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Trie2_32;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUException;
/**
* Collation data container.
* Immutable data created by a CollationDataBuilder, or loaded from a file,
* or deserialized from API-provided binary data.
*
* Includes data for the collation base (root/default), aliased if this is not the base.
*/
public final class CollationData {
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
// parallel with the ranges, and resetting ranges that are indexed.
// The reordering builder code could clone the resulting template array.
static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
CollationData(Normalizer2Impl nfc) {
nfcImpl = nfc;
}
public int getCE32(int c) {
return trie.get(c);
}
int getCE32FromSupplementary(int c) {
return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
}
boolean isDigit(int c) {
return c < 0x660 ? c <= 0x39 && 0x30 <= c :
Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
}
public boolean isUnsafeBackward(int c, boolean numeric) {
return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
}
public boolean isCompressibleLeadByte(int b) {
return compressibleBytes[b];
}
public boolean isCompressiblePrimary(long p) {
return isCompressibleLeadByte((int)p >>> 24);
}
/**
* Returns the CE32 from two contexts words.
* Access to the defaultCE32 for contraction and prefix matching.
*/
int getCE32FromContexts(int index) {
return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
}
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
* Requires that ce32 is special.
*/
int getIndirectCE32(int ce32) {
assert(Collation.isSpecialCE32(ce32));
int tag = Collation.tagFromCE32(ce32);
if(tag == Collation.DIGIT_TAG) {
// Fetch the non-numeric-collation CE32.
ce32 = ce32s[Collation.indexFromCE32(ce32)];
} else if(tag == Collation.LEAD_SURROGATE_TAG) {
ce32 = Collation.UNASSIGNED_CE32;
} else if(tag == Collation.U0000_TAG) {
// Fetch the normal ce32 for U+0000.
ce32 = ce32s[0];
}
return ce32;
}
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
* if ce32 is special.
*/
int getFinalCE32(int ce32) {
if(Collation.isSpecialCE32(ce32)) {
ce32 = getIndirectCE32(ce32);
}
return ce32;
}
/**
* Computes a CE from c's ce32 which has the OFFSET_TAG.
*/
long getCEFromOffsetCE32(int c, int ce32) {
long dataCE = ces[Collation.indexFromCE32(ce32)];
return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
}
/**
* Returns the single CE that c maps to.
* Throws UnsupportedOperationException if c does not map to a single CE.
*/
long getSingleCE(int c) {
CollationData d;
int ce32 = getCE32(c);
if(ce32 == Collation.FALLBACK_CE32) {
d = base;
ce32 = base.getCE32(c);
} else {
d = this;
}
while(Collation.isSpecialCE32(ce32)) {
switch(Collation.tagFromCE32(ce32)) {
case Collation.LATIN_EXPANSION_TAG:
case Collation.BUILDER_DATA_TAG:
case Collation.PREFIX_TAG:
case Collation.CONTRACTION_TAG:
case Collation.HANGUL_TAG:
case Collation.LEAD_SURROGATE_TAG:
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
case Collation.FALLBACK_TAG:
case Collation.RESERVED_TAG_3:
throw new AssertionError(String.format(
"unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
case Collation.LONG_PRIMARY_TAG:
return Collation.ceFromLongPrimaryCE32(ce32);
case Collation.LONG_SECONDARY_TAG:
return Collation.ceFromLongSecondaryCE32(ce32);
case Collation.EXPANSION32_TAG:
if(Collation.lengthFromCE32(ce32) == 1) {
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
break;
} else {
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
}
case Collation.EXPANSION_TAG: {
if(Collation.lengthFromCE32(ce32) == 1) {
return d.ces[Collation.indexFromCE32(ce32)];
} else {
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
}
}
case Collation.DIGIT_TAG:
// Fetch the non-numeric-collation CE32 and continue.
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
break;
case Collation.U0000_TAG:
assert(c == 0);
// Fetch the normal ce32 for U+0000 and continue.
ce32 = d.ce32s[0];
break;
case Collation.OFFSET_TAG:
return d.getCEFromOffsetCE32(c, ce32);
case Collation.IMPLICIT_TAG:
return Collation.unassignedCEFromCodePoint(c);
}
}
return Collation.ceFromSimpleCE32(ce32);
}
/**
* Returns the FCD16 value for code point c. c must be >= 0.
*/
int getFCD16(int c) {
return nfcImpl.getFCD16(c);
}
/**
* Returns the first primary for the script's reordering group.
* @return the primary with only the first primary lead byte of the group
* (not necessarily an actual root collator primary weight),
* or 0 if the script is unknown
*/
long getFirstPrimaryForGroup(int script) {
int index = getScriptIndex(script);
return index == 0 ? 0 : (long)scriptStarts[index] << 16;
}
/**
* Returns the last primary for the script's reordering group.
* @return the last primary of the group
* (not an actual root collator primary weight),
* or 0 if the script is unknown
*/
public long getLastPrimaryForGroup(int script) {
int index = getScriptIndex(script);
if(index == 0) {
return 0;
}
long limit = scriptStarts[index + 1];
return (limit << 16) - 1;
}
/**
* Finds the reordering group which contains the primary weight.
* @return the first script of the group, or -1 if the weight is beyond the last group
*/
public int getGroupForPrimary(long p) {
p >>= 16;
if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
return -1;
}
int index = 1;
while(p >= scriptStarts[index + 1]) { ++index; }
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
return i;
}
}
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
if(scriptsIndex[numScripts + i] == index) {
return Collator.ReorderCodes.FIRST + i;
}
}
return -1;
}
private int getScriptIndex(int script) {
if(script < 0) {
return 0;
} else if(script < numScripts) {
return scriptsIndex[script];
} else if(script < Collator.ReorderCodes.FIRST) {
return 0;
} else {
script -= Collator.ReorderCodes.FIRST;
if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
return scriptsIndex[numScripts + script];
} else {
return 0;
}
}
}
public int[] getEquivalentScripts(int script) {
int index = getScriptIndex(script);
if(index == 0) { return EMPTY_INT_ARRAY; }
if(script >= Collator.ReorderCodes.FIRST) {
// Special groups have no aliases.
return new int[] { script };
}
int length = 0;
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
++length;
}
}
int[] dest = new int[length];
if(length == 1) {
dest[0] = script;
return dest;
}
length = 0;
for(int i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
dest[length++] = i;
}
}
return dest;
}
/**
* Writes the permutation of primary-weight ranges
* for the given reordering of scripts and groups.
* The caller checks for illegal arguments and
* takes care of [DEFAULT] and memory allocation.
*
* Each list element will be a (limit, offset) pair as described
* for the CollationSettings.reorderRanges.
* The list will be empty if no ranges are reordered.
*/
void makeReorderRanges(int[] reorder, UVector32 ranges) {
makeReorderRanges(reorder, false, ranges);
}
private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
ranges.removeAllElements();
int length = reorder.length;
if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
return;
}
// Maps each script-or-group range to a new lead byte.
short[] table = new short[scriptStarts.length - 1]; // C++: uint8_t[]
{
// Set "don't care" values for reserved ranges.
int index = scriptsIndex[
numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
if(index != 0) {
table[index] = 0xff;
}
index = scriptsIndex[
numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
if(index != 0) {
table[index] = 0xff;
}
}
// Never reorder special low and high primary lead bytes.
assert(scriptStarts.length >= 2);
assert(scriptStarts[0] == 0);
int lowStart = scriptStarts[1];
assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
int highLimit = scriptStarts[scriptStarts.length - 1];
assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
// Get the set of special reorder codes in the input list.
// This supports a fixed number of special reorder codes;
// it works for data with codes beyond Collator.ReorderCodes.LIMIT.
int specials = 0;
for(int i = 0; i < length; ++i) {
int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
specials |= 1 << reorderCode;
}
}
// Start the reordering with the special low reorder codes that do not occur in the input.
for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
int index = scriptsIndex[numScripts + i];
if(index != 0 && (specials & (1 << i)) == 0) {
lowStart = addLowScriptRange(table, index, lowStart);
}
}
// Skip the reserved range before Latin if Latin is the first script,
// so that we do not move it unnecessarily.
int skippedReserved = 0;
if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
int index = scriptsIndex[UScript.LATIN];
assert(index != 0);
int start = scriptStarts[index];
assert(lowStart <= start);
skippedReserved = start - lowStart;
lowStart = start;
}
// Reorder according to the input scripts, continuing from the bottom of the primary range.
boolean hasReorderToEnd = false;
for(int i = 0; i < length;) {
int script = reorder[i++];
if(script == UScript.UNKNOWN) {
// Put the remaining scripts at the top.
hasReorderToEnd = true;
while(i < length) {
script = reorder[--length];
if(script == UScript.UNKNOWN) { // Must occur at most once.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate UScript.UNKNOWN");
}
if(script == Collator.ReorderCodes.DEFAULT) {
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
highLimit = addHighScriptRange(table, index, highLimit);
}
break;
}
if(script == Collator.ReorderCodes.DEFAULT) {
// The default code must be the only one in the list, and that is handled by the caller.
// Otherwise it must not be used.
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
lowStart = addLowScriptRange(table, index, lowStart);
}
// Put all remaining scripts into the middle.
for(int i = 1; i < scriptStarts.length - 1; ++i) {
int leadByte = table[i];
if(leadByte != 0) { continue; }
int start = scriptStarts[i];
if(!hasReorderToEnd && start > lowStart) {
// No need to move this script.
lowStart = start;
}
lowStart = addLowScriptRange(table, i, lowStart);
}
if(lowStart > highLimit) {
if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
// Try not skipping the before-Latin reserved range.
makeReorderRanges(reorder, true, ranges);
return;
}
// We need more primary lead bytes than available, despite the reserved ranges.
throw new ICUException(
"setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
}
// Turn lead bytes into a list of (limit, offset) pairs.
// Encode each pair in one list element:
// Upper 16 bits = limit, lower 16 = signed lead byte offset.
int offset = 0;
for(int i = 1;; ++i) {
int nextOffset = offset;
while(i < scriptStarts.length - 1) {
int newLeadByte = table[i];
if(newLeadByte == 0xff) {
// "Don't care" lead byte for reserved range, continue with current offset.
} else {
nextOffset = newLeadByte - (scriptStarts[i] >> 8);
if(nextOffset != offset) { break; }
}
++i;
}
if(offset != 0 || i < scriptStarts.length - 1) {
ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
}
if(i == scriptStarts.length - 1) { break; }
offset = nextOffset;
}
}
private int addLowScriptRange(short[] table, int index, int lowStart) {
int start = scriptStarts[index];
if((start & 0xff) < (lowStart & 0xff)) {
lowStart += 0x100;
}
table[index] = (short)(lowStart >> 8);
int limit = scriptStarts[index + 1];
lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
return lowStart;
}
private int addHighScriptRange(short[] table, int index, int highLimit) {
int limit = scriptStarts[index + 1];
if((limit & 0xff) > (highLimit & 0xff)) {
highLimit -= 0x100;
}
int start = scriptStarts[index];
highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
table[index] = (short)(highLimit >> 8);
return highLimit;
}
private static String scriptCodeString(int script) {
// Do not use the script name here: We do not want to depend on that data.
return (script < Collator.ReorderCodes.FIRST) ?
Integer.toString(script) : "0x" + Integer.toHexString(script);
}
private static final int[] EMPTY_INT_ARRAY = new int[0];
/** @see jamoCE32s */
static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
/** Main lookup trie. */
Trie2_32 trie;
/**
* Array of CE32 values.
* At index 0 there must be CE32(U+0000)
* to support U+0000's special-tag for NUL-termination handling.
*/
int[] ce32s;
/** Array of CE values for expansions and OFFSET_TAG. */
long[] ces;
/** Array of prefix and contraction-suffix matching data. */
String contexts;
/** Base collation data, or null if this data itself is a base. */
public CollationData base;
/**
* Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
* They are normally simple CE32s, rarely expansions.
* For fast handling of HANGUL_TAG.
*/
int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
public Normalizer2Impl nfcImpl;
/** The single-byte primary weight (xx000000) for numeric collation. */
long numericPrimary = 0x12000000;
/** 256 flags for which primary-weight lead bytes are compressible. */
public boolean[] compressibleBytes;
/**
* Set of code points that are unsafe for starting string comparison after an identical prefix,
* or in backwards CE iteration.
*/
UnicodeSet unsafeBackwardSet;
/**
* Fast Latin table for common-Latin-text string comparisons.
* Data structure see class CollationFastLatin.
*/
public char[] fastLatinTable;
/**
* Header portion of the fastLatinTable.
* In C++, these are one array, and the header is skipped for mapping characters.
* In Java, two arrays work better.
*/
char[] fastLatinTableHeader;
/**
* Data for scripts and reordering groups.
* Uses include building a reordering permutation table and
* providing script boundaries to AlphabeticIndex.
*/
int numScripts;
/**
* The length of scriptsIndex is numScripts+16.
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
* 16 special reorder codes (not all used) are mapped starting at numScripts.
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
* There are special codes at the end for reorder-reserved primary ranges.
*
*
Multiple scripts may share a range and index, for example Hira & Kana.
*/
char[] scriptsIndex;
/**
* Start primary weight (top 16 bits only) for a group/script/reserved range
* indexed by scriptsIndex.
* The first range (separators & terminators) and the last range (trailing weights)
* are not reorderable, and no scriptsIndex entry points to them.
*/
char[] scriptStarts;
/**
* Collation elements in the root collator.
* Used by the CollationRootElements class. The data structure is described there.
* null in a tailoring.
*/
public long[] rootElements;
}