com.ibm.icu.impl.coll.CollationData Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationData.java, ported from collationdata.h/.cpp
*
* C++ version created on: 2010oct27
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Trie2_32;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UnicodeSet;
/**
* Collation data container.
* Immutable data created by a CollationDataBuilder, or loaded from a file,
* or deserialized from API-provided binary data.
*
* Includes data for the collation base (root/default), aliased if this is not the base.
*/
public final class CollationData {
CollationData(Normalizer2Impl nfc) {
nfcImpl = nfc;
}
public int getCE32(int c) {
return trie.get(c);
}
int getCE32FromSupplementary(int c) {
return trie.get(c); // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
}
boolean isDigit(int c) {
return c < 0x660 ? c <= 0x39 && 0x30 <= c :
Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
}
public boolean isUnsafeBackward(int c, boolean numeric) {
return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
}
public boolean isCompressibleLeadByte(int b) {
return compressibleBytes[b];
}
public boolean isCompressiblePrimary(long p) {
return isCompressibleLeadByte((int)p >>> 24);
}
/**
* Returns the CE32 from two contexts words.
* Access to the defaultCE32 for contraction and prefix matching.
*/
int getCE32FromContexts(int index) {
return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
}
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
* Requires that ce32 is special.
*/
int getIndirectCE32(int ce32) {
assert(Collation.isSpecialCE32(ce32));
int tag = Collation.tagFromCE32(ce32);
if(tag == Collation.DIGIT_TAG) {
// Fetch the non-numeric-collation CE32.
ce32 = ce32s[Collation.indexFromCE32(ce32)];
} else if(tag == Collation.LEAD_SURROGATE_TAG) {
ce32 = Collation.UNASSIGNED_CE32;
} else if(tag == Collation.U0000_TAG) {
// Fetch the normal ce32 for U+0000.
ce32 = ce32s[0];
}
return ce32;
}
/**
* Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
* if ce32 is special.
*/
int getFinalCE32(int ce32) {
if(Collation.isSpecialCE32(ce32)) {
ce32 = getIndirectCE32(ce32);
}
return ce32;
}
/**
* Computes a CE from c's ce32 which has the OFFSET_TAG.
*/
long getCEFromOffsetCE32(int c, int ce32) {
long dataCE = ces[Collation.indexFromCE32(ce32)];
return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
}
/**
* Returns the single CE that c maps to.
* Throws UnsupportedOperationException if c does not map to a single CE.
*/
long getSingleCE(int c) {
CollationData d;
int ce32 = getCE32(c);
if(ce32 == Collation.FALLBACK_CE32) {
d = base;
ce32 = base.getCE32(c);
} else {
d = this;
}
while(Collation.isSpecialCE32(ce32)) {
switch(Collation.tagFromCE32(ce32)) {
case Collation.LATIN_EXPANSION_TAG:
case Collation.BUILDER_DATA_TAG:
case Collation.PREFIX_TAG:
case Collation.CONTRACTION_TAG:
case Collation.HANGUL_TAG:
case Collation.LEAD_SURROGATE_TAG:
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
case Collation.FALLBACK_TAG:
case Collation.RESERVED_TAG_3:
throw new AssertionError(String.format(
"unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
case Collation.LONG_PRIMARY_TAG:
return Collation.ceFromLongPrimaryCE32(ce32);
case Collation.LONG_SECONDARY_TAG:
return Collation.ceFromLongSecondaryCE32(ce32);
case Collation.EXPANSION32_TAG:
if(Collation.lengthFromCE32(ce32) == 1) {
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
break;
} else {
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
}
case Collation.EXPANSION_TAG: {
if(Collation.lengthFromCE32(ce32) == 1) {
return d.ces[Collation.indexFromCE32(ce32)];
} else {
throw new UnsupportedOperationException(String.format(
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
c, ce32));
}
}
case Collation.DIGIT_TAG:
// Fetch the non-numeric-collation CE32 and continue.
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
break;
case Collation.U0000_TAG:
assert(c == 0);
// Fetch the normal ce32 for U+0000 and continue.
ce32 = d.ce32s[0];
break;
case Collation.OFFSET_TAG:
return d.getCEFromOffsetCE32(c, ce32);
case Collation.IMPLICIT_TAG:
return Collation.unassignedCEFromCodePoint(c);
}
}
return Collation.ceFromSimpleCE32(ce32);
}
/**
* Returns the FCD16 value for code point c. c must be >= 0.
*/
int getFCD16(int c) {
return nfcImpl.getFCD16(c);
}
/**
* Returns the first primary for the script's reordering group.
* @return the primary with only the first primary lead byte of the group
* (not necessarily an actual root collator primary weight),
* or 0 if the script is unknown
*/
long getFirstPrimaryForGroup(int script) {
int index = findScript(script);
if(index < 0) {
return 0;
}
long head = scripts[index];
return (head & 0xff00) << 16;
}
/**
* Returns the last primary for the script's reordering group.
* @return the last primary of the group
* (not an actual root collator primary weight),
* or 0 if the script is unknown
*/
public long getLastPrimaryForGroup(int script) {
int index = findScript(script);
if(index < 0) {
return 0;
}
int head = scripts[index];
long lastByte = head & 0xff;
return ((lastByte + 1) << 24) - 1;
}
/**
* Finds the reordering group which contains the primary weight.
* @return the first script of the group, or -1 if the weight is beyond the last group
*/
public int getGroupForPrimary(long p) {
p >>= 24; // Reordering groups are distinguished by primary lead bytes.
for(int i = 0; i < scripts.length; i = i + 2 + scripts[i + 1]) {
int lastByte = scripts[i] & 0xff;
if(p <= lastByte) {
return scripts[i + 2];
}
}
return -1;
}
private int findScript(int script) {
if(script < 0 || 0xffff < script) { return -1; }
for(int i = 0; i < scripts.length;) {
int limit = i + 2 + scripts[i + 1];
for(int j = i + 2; j < limit; ++j) {
if(script == scripts[j]) { return i; }
}
i = limit;
}
return -1;
}
public int[] getEquivalentScripts(int script) {
int i = findScript(script);
if(i < 0) { return EMPTY_INT_ARRAY; }
int length = scripts[i + 1];
assert(length != 0);
int dest[] = new int[length];
i += 2;
dest[0] = scripts[i++];
for(int j = 1; j < length; ++j) {
script = scripts[i++];
// Sorted insertion.
for(int k = j;; --k) {
// Invariant: dest[k] is free to receive either script or dest[k - 1].
if(k > 0 && script < dest[k - 1]) {
dest[k] = dest[k - 1];
} else {
dest[k] = script;
break;
}
}
}
return dest;
}
/**
* Writes the permutation table for the given reordering of scripts and groups,
* mapping from default-order primary-weight lead bytes to reordered lead bytes.
* The caller checks for illegal arguments and
* takes care of [DEFAULT] and memory allocation.
*/
public void makeReorderTable(int[] reorder, byte[] table) {
int length = reorder.length;
// Initialize the table.
// Never reorder special low and high primary lead bytes.
int lowByte;
for(lowByte = 0; lowByte <= Collation.MERGE_SEPARATOR_BYTE; ++lowByte) {
table[lowByte] = (byte)lowByte;
}
// lowByte == 03
int highByte;
for(highByte = 0xff; highByte >= Collation.TRAIL_WEIGHT_BYTE; --highByte) {
table[highByte] = (byte)highByte;
}
// highByte == FE
// Set intermediate bytes to 0 to indicate that they have not been set yet.
for(int i = lowByte; i <= highByte; ++i) {
table[i] = 0;
}
// Get the set of special reorder codes in the input list.
// This supports up to 32 special reorder codes;
// it works for data with codes beyond Collator.ReorderCodes.LIMIT.
int specials = 0;
for(int i = 0; i < length; ++i) {
int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
if(0 <= reorderCode && reorderCode <= 31) {
specials |= 1 << reorderCode;
}
}
// Start the reordering with the special low reorder codes that do not occur in the input.
for(int i = 0;; i += 3) {
if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
int reorderCode = scripts[i + 2] - Collator.ReorderCodes.FIRST;
if(reorderCode < 0) { break; } // Went beyond special reorder codes.
if((specials & (1 << reorderCode)) == 0) {
int head = scripts[i];
int firstByte = head >> 8;
int lastByte = head & 0xff;
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
}
}
// Reorder according to the input scripts, continuing from the bottom of the bytes range.
for(int i = 0; i < length;) {
int script = reorder[i++];
if(script == UScript.UNKNOWN) {
// Put the remaining scripts at the top.
while(i < length) {
script = reorder[--length];
if(script == UScript.UNKNOWN) { // Must occur at most once.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate UScript.UNKNOWN");
}
if(script == Collator.ReorderCodes.DEFAULT) {
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = findScript(script);
if(index < 0) { continue; }
int head = scripts[index];
int firstByte = head >> 8;
int lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
do { table[lastByte--] = (byte)highByte--; } while(firstByte <= lastByte);
}
break;
}
if(script == Collator.ReorderCodes.DEFAULT) {
// The default code must be the only one in the list, and that is handled by the caller.
// Otherwise it must not be used.
throw new IllegalArgumentException(
"setReorderCodes(): UScript.DEFAULT together with other scripts");
}
int index = findScript(script);
if(index < 0) { continue; }
int head = scripts[index];
int firstByte = head >> 8;
int lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
throw new IllegalArgumentException(
"setReorderCodes(): duplicate or equivalent script " +
scriptCodeString(script));
}
do { table[firstByte++] = (byte)lowByte++; } while(firstByte <= lastByte);
}
// Put all remaining scripts into the middle.
// Avoid table[0] which must remain 0.
for(int i = 1; i <= 0xff; ++i) {
if(table[i] == 0) { table[i] = (byte)lowByte++; }
}
assert(lowByte == highByte + 1);
}
private static String scriptCodeString(int script) {
// Do not use the script name here: We do not want to depend on that data.
return (script < Collator.ReorderCodes.FIRST) ?
Integer.toString(script) : "0x" + Integer.toHexString(script);
}
private static final int[] EMPTY_INT_ARRAY = new int[0];
/** @see jamoCE32s */
static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
/** Main lookup trie. */
Trie2_32 trie;
/**
* Array of CE32 values.
* At index 0 there must be CE32(U+0000)
* to support U+0000's special-tag for NUL-termination handling.
*/
int[] ce32s;
/** Array of CE values for expansions and OFFSET_TAG. */
long[] ces;
/** Array of prefix and contraction-suffix matching data. */
String contexts;
/** Base collation data, or null if this data itself is a base. */
public CollationData base;
/**
* Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
* They are normally simple CE32s, rarely expansions.
* For fast handling of HANGUL_TAG.
*/
int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
public Normalizer2Impl nfcImpl;
/** The single-byte primary weight (xx000000) for numeric collation. */
long numericPrimary = 0x12000000;
/** 256 flags for which primary-weight lead bytes are compressible. */
public boolean[] compressibleBytes;
/**
* Set of code points that are unsafe for starting string comparison after an identical prefix,
* or in backwards CE iteration.
*/
UnicodeSet unsafeBackwardSet;
/**
* Fast Latin table for common-Latin-text string comparisons.
* Data structure see class CollationFastLatin.
*/
public char[] fastLatinTable;
/**
* Header portion of the fastLatinTable.
* In C++, these are one array, and the header is skipped for mapping characters.
* In Java, two arrays work better.
*/
char[] fastLatinTableHeader;
/**
* Data for scripts and reordering groups.
* Uses include building a reordering permutation table and
* providing script boundaries to AlphabeticIndex.
*
* This data is a sorted list of primary-weight lead byte ranges (reordering groups),
* each with a list of pairs sorted in base collation order;
* each pair contains a script/reorder code and the lowest primary weight for that script.
*
* Data structure:
* - Each reordering group is encoded in n+2 16-bit integers.
* - First integer:
* Bits 15..8: First byte of the reordering group's range.
* Bits 7..0: Last byte of the reordering group's range.
* - Second integer:
* Length n of the list of script/reordering codes.
* - Each further integer is a script or reordering code.
*/
char[] scripts;
/**
* Collation elements in the root collator.
* Used by the CollationRootElements class. The data structure is described there.
* null in a tailoring.
*/
public long[] rootElements;
}