com.ibm.icu.impl.IntTrieBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
/**
* Builder class to manipulate and generate a trie.
* This is useful for ICU data in primitive types.
* Provides a compact way to store information that is indexed by Unicode
* values, such as character properties, types, keyboard values, etc. This is
* very useful when you have a block of Unicode data that contains significant
* values while the rest of the Unicode data is unused in the application or
* when you have a lot of redundance, such as where all 21,000 Han ideographs
* have the same value. However, lookup is much faster than a hash table.
* A trie of any primitive data type serves two purposes:
*
* - Fast access of the indexed values.
*
- Smaller memory footprint.
*
* This is a direct port from the ICU4C version
* @author Syn Wee Quek
*/
public class IntTrieBuilder extends TrieBuilder
{
// public constructor ----------------------------------------------
/**
* Copy constructor
*/
public IntTrieBuilder(IntTrieBuilder table)
{
super(table);
m_data_ = new int[m_dataCapacity_];
System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
m_initialValue_ = table.m_initialValue_;
m_leadUnitValue_ = table.m_leadUnitValue_;
}
/**
* Constructs a build table
* @param aliasdata data to be filled into table
* @param maxdatalength maximum data length allowed in table
* @param initialvalue inital data value
* @param latin1linear is latin 1 to be linear
*/
public IntTrieBuilder(int aliasdata[], int maxdatalength,
int initialvalue, int leadunitvalue,
boolean latin1linear)
{
super();
if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear
&& maxdatalength < 1024)) {
throw new IllegalArgumentException(
"Argument maxdatalength is too small");
}
if (aliasdata != null) {
m_data_ = aliasdata;
}
else {
m_data_ = new int[maxdatalength];
}
// preallocate and reset the first data block (block index 0)
int j = DATA_BLOCK_LENGTH;
if (latin1linear) {
// preallocate and reset the first block (number 0) and Latin-1
// (U+0000..U+00ff) after that made sure above that
// maxDataLength >= 1024
// set indexes to point to consecutive data blocks
int i = 0;
do {
// do this at least for trie->index[0] even if that block is
// only partly used for Latin-1
m_index_[i ++] = j;
j += DATA_BLOCK_LENGTH;
} while (i < (256 >> SHIFT_));
}
m_dataLength_ = j;
// reset the initially allocated blocks to the initial value
Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
m_initialValue_ = initialvalue;
m_leadUnitValue_ = leadunitvalue;
m_dataCapacity_ = maxdatalength;
m_isLatin1Linear_ = latin1linear;
m_isCompacted_ = false;
}
// public methods -------------------------------------------------------
/*public final void print()
{
int i = 0;
int oldvalue = m_index_[i];
int count = 0;
System.out.println("index length " + m_indexLength_
+ " --------------------------");
while (i < m_indexLength_) {
if (m_index_[i] != oldvalue) {
System.out.println("index has " + count + " counts of "
+ Integer.toHexString(oldvalue));
count = 0;
oldvalue = m_index_[i];
}
count ++;
i ++;
}
System.out.println("index has " + count + " counts of "
+ Integer.toHexString(oldvalue));
i = 0;
oldvalue = m_data_[i];
count = 0;
System.out.println("data length " + m_dataLength_
+ " --------------------------");
while (i < m_dataLength_) {
if (m_data_[i] != oldvalue) {
if ((oldvalue & 0xf1000000) == 0xf1000000) {
int temp = oldvalue & 0xffffff;
temp += 0x320;
oldvalue = 0xf1000000 | temp;
}
if ((oldvalue & 0xf2000000) == 0xf2000000) {
int temp = oldvalue & 0xffffff;
temp += 0x14a;
oldvalue = 0xf2000000 | temp;
}
System.out.println("data has " + count + " counts of "
+ Integer.toHexString(oldvalue));
count = 0;
oldvalue = m_data_[i];
}
count ++;
i ++;
}
if ((oldvalue & 0xf1000000) == 0xf1000000) {
int temp = oldvalue & 0xffffff;
temp += 0x320;
oldvalue = 0xf1000000 | temp;
}
if ((oldvalue & 0xf2000000) == 0xf2000000) {
int temp = oldvalue & 0xffffff;
temp += 0x14a;
oldvalue = 0xf2000000 | temp;
}
System.out.println("data has " + count + " counts of "
+ Integer.toHexString(oldvalue));
}
*/
/**
* Gets a 32 bit data from the table data
* @param ch codepoint which data is to be retrieved
* @return the 32 bit data
*/
public int getValue(int ch)
{
// valid, uncompacted trie and valid c?
if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
return 0;
}
int block = m_index_[ch >> SHIFT_];
return m_data_[Math.abs(block) + (ch & MASK_)];
}
/**
* Get a 32 bit data from the table data
* @param ch code point for which data is to be retrieved.
* @param inBlockZero Output parameter, inBlockZero[0] returns true if the
* char maps into block zero, otherwise false.
* @return the 32 bit data value.
*/
public int getValue(int ch, boolean [] inBlockZero)
{
// valid, uncompacted trie and valid c?
if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
if (inBlockZero != null) {
inBlockZero[0] = true;
}
return 0;
}
int block = m_index_[ch >> SHIFT_];
if (inBlockZero != null) {
inBlockZero[0] = (block == 0);
}
return m_data_[Math.abs(block) + (ch & MASK_)];
}
/**
* Sets a 32 bit data in the table data
* @param ch codepoint which data is to be set
* @param value to set
* @return true if the set is successful, otherwise
* if the table has been compacted return false
*/
public boolean setValue(int ch, int value)
{
// valid, uncompacted trie and valid c?
if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
return false;
}
int block = getDataBlock(ch);
if (block < 0) {
return false;
}
m_data_[block + (ch & MASK_)] = value;
return true;
}
/**
* Serializes the build table with 32 bit data
* @param datamanipulate builder raw fold method implementation
* @param triedatamanipulate result trie fold method
* @return a new trie
*/
public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
Trie.DataManipulate triedatamanipulate)
{
if (datamanipulate == null) {
throw new IllegalArgumentException("Parameters can not be null");
}
// fold and compact if necessary, also checks that indexLength is
// within limits
if (!m_isCompacted_) {
// compact once without overlap to improve folding
compact(false);
// fold the supplementary part of the index array
fold(datamanipulate);
// compact again with overlap for minimum data array length
compact(true);
m_isCompacted_ = true;
}
// is dataLength within limits?
if (m_dataLength_ >= MAX_DATA_LENGTH_) {
throw new ArrayIndexOutOfBoundsException("Data length too small");
}
char index[] = new char[m_indexLength_];
int data[] = new int[m_dataLength_];
// write the index (stage 1) array and the 32-bit data (stage 2) array
// write 16-bit index values shifted right by INDEX_SHIFT_
for (int i = 0; i < m_indexLength_; i ++) {
index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
}
// write 32-bit data values
System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
options |= OPTIONS_DATA_IS_32_BIT_;
if (m_isLatin1Linear_) {
options |= OPTIONS_LATIN1_IS_LINEAR_;
}
return new IntTrie(index, data, m_initialValue_, options,
triedatamanipulate);
}
/**
* Serializes the build table to an output stream.
*
* Compacts the build-time trie after all values are set, and then
* writes the serialized form onto an output stream.
*
* After this, this build-time Trie can only be serialized again and/or closed;
* no further values can be added.
*
* This function is the rough equivalent of utrie_seriaize() in ICU4C.
*
* @param os the output stream to which the seriaized trie will be written.
* If nul, the function still returns the size of the serialized Trie.
* @param reduceTo16Bits If true, reduce the data size to 16 bits. The resulting
* serialized form can then be used to create a CharTrie.
* @param datamanipulate builder raw fold method implementation
* @return the number of bytes written to the output stream.
*/
public int serialize(OutputStream os, boolean reduceTo16Bits,
TrieBuilder.DataManipulate datamanipulate) throws IOException {
if (datamanipulate == null) {
throw new IllegalArgumentException("Parameters can not be null");
}
// fold and compact if necessary, also checks that indexLength is
// within limits
if (!m_isCompacted_) {
// compact once without overlap to improve folding
compact(false);
// fold the supplementary part of the index array
fold(datamanipulate);
// compact again with overlap for minimum data array length
compact(true);
m_isCompacted_ = true;
}
// is dataLength within limits?
int length;
if (reduceTo16Bits) {
length = m_dataLength_ + m_indexLength_;
} else {
length = m_dataLength_;
}
if (length >= MAX_DATA_LENGTH_) {
throw new ArrayIndexOutOfBoundsException("Data length too small");
}
// struct UTrieHeader {
// int32_t signature;
// int32_t options (a bit field)
// int32_t indexLength
// int32_t dataLength
length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;
if(reduceTo16Bits) {
length+=2*m_dataLength_;
} else {
length+=4*m_dataLength_;
}
if (os == null) {
// No output stream. Just return the length of the serialized Trie, in bytes.
return length;
}
DataOutputStream dos = new DataOutputStream(os);
dos.writeInt(Trie.HEADER_SIGNATURE_);
int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<>> Trie.INDEX_STAGE_2_SHIFT_;
dos.writeChar(v);
}
/* write 16-bit data values */
for(int i=0; i>> Trie.INDEX_STAGE_2_SHIFT_;
dos.writeChar(v);
}
/* write 32-bit data values */
for(int i=0; i UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE
|| limit > (UCharacter.MAX_VALUE + 1) || start > limit) {
return false;
}
if (start == limit) {
return true; // nothing to do
}
if ((start & MASK_) != 0) {
// set partial block at [start..following block boundary[
int block = getDataBlock(start);
if (block < 0) {
return false;
}
int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
if (nextStart <= limit) {
fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,
value, overwrite);
start = nextStart;
}
else {
fillBlock(block, start & MASK_, limit & MASK_,
value, overwrite);
return true;
}
}
// number of positions in the last, partial block
int rest = limit & MASK_;
// round down limit to a block boundary
limit &= ~MASK_;
// iterate over all-value blocks
int repeatBlock = 0;
if (value == m_initialValue_) {
// repeatBlock = 0; assigned above
}
else {
repeatBlock = -1;
}
while (start < limit) {
// get index value
int block = m_index_[start >> SHIFT_];
if (block > 0) {
// already allocated, fill in value
fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
}
else if (m_data_[-block] != value && (block == 0 || overwrite)) {
// set the repeatBlock instead of the current block 0 or range
// block
if (repeatBlock >= 0) {
m_index_[start >> SHIFT_] = -repeatBlock;
}
else {
// create and set and fill the repeatBlock
repeatBlock = getDataBlock(start);
if (repeatBlock < 0) {
return false;
}
// set the negative block number to indicate that it is a
// repeat block
m_index_[start >> SHIFT_] = -repeatBlock;
fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
}
}
start += DATA_BLOCK_LENGTH;
}
if (rest > 0) {
// set partial block at [last block boundary..limit[
int block = getDataBlock(start);
if (block < 0) {
return false;
}
fillBlock(block, 0, rest, value, overwrite);
}
return true;
}
// protected data member ------------------------------------------------
protected int m_data_[];
protected int m_initialValue_;
// private data member ------------------------------------------------
private int m_leadUnitValue_;
// private methods ------------------------------------------------------
private int allocDataBlock()
{
int newBlock = m_dataLength_;
int newTop = newBlock + DATA_BLOCK_LENGTH;
if (newTop > m_dataCapacity_) {
// out of memory in the data array
return -1;
}
m_dataLength_ = newTop;
return newBlock;
}
/**
* No error checking for illegal arguments.
* @param ch codepoint to look for
* @return -1 if no new data block available (out of memory in data array)
*/
private int getDataBlock(int ch)
{
ch >>= SHIFT_;
int indexValue = m_index_[ch];
if (indexValue > 0) {
return indexValue;
}
// allocate a new data block
int newBlock = allocDataBlock();
if (newBlock < 0) {
// out of memory in the data array
return -1;
}
m_index_[ch] = newBlock;
// copy-on-write for a block from a setRange()
System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,
DATA_BLOCK_LENGTH << 2);
return newBlock;
}
/**
* Compact a folded build-time trie.
* The compaction
* - removes blocks that are identical with earlier ones
* - overlaps adjacent blocks as much as possible (if overlap == true)
* - moves blocks in steps of the data granularity
* - moves and overlaps blocks that overlap with multiple values in the overlap region
*
* It does not
* - try to move and overlap blocks that are not already adjacent
* @param overlap flag
*/
private void compact(boolean overlap)
{
if (m_isCompacted_) {
return; // nothing left to do
}
// compaction
// initialize the index map with "block is used/unused" flags
findUnusedBlocks();
// if Latin-1 is preallocated and linear, then do not compact Latin-1
// data
int overlapStart = DATA_BLOCK_LENGTH;
if (m_isLatin1Linear_ && SHIFT_ <= 8) {
overlapStart += 256;
}
int newStart = DATA_BLOCK_LENGTH;
int i;
for (int start = newStart; start < m_dataLength_;) {
// start: index of first entry of current block
// newStart: index where the current block is to be moved
// (right after current end of already-compacted data)
// skip blocks that are not used
if (m_map_[start >>> SHIFT_] < 0) {
// advance start to the next block
start += DATA_BLOCK_LENGTH;
// leave newStart with the previous block!
continue;
}
// search for an identical block
if (start >= overlapStart) {
i = findSameDataBlock(m_data_, newStart, start,
overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
if (i >= 0) {
// found an identical block, set the other block's index
// value for the current block
m_map_[start >>> SHIFT_] = i;
// advance start to the next block
start += DATA_BLOCK_LENGTH;
// leave newStart with the previous block!
continue;
}
}
// see if the beginning of this block can be overlapped with the
// end of the previous block
if(overlap && start>=overlapStart) {
/* look for maximum overlap (modulo granularity) with the previous, adjacent block */
for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;
i>0 && !equal_int(m_data_, newStart-i, start, i);
i-=DATA_GRANULARITY_) {}
} else {
i=0;
}
if (i > 0) {
// some overlap
m_map_[start >>> SHIFT_] = newStart - i;
// move the non-overlapping indexes to their new positions
start += i;
for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
m_data_[newStart ++] = m_data_[start ++];
}
}
else if (newStart < start) {
// no overlap, just move the indexes to their new positions
m_map_[start >>> SHIFT_] = newStart;
for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
m_data_[newStart ++] = m_data_[start ++];
}
}
else { // no overlap && newStart==start
m_map_[start >>> SHIFT_] = start;
newStart += DATA_BLOCK_LENGTH;
start = newStart;
}
}
// now adjust the index (stage 1) table
for (i = 0; i < m_indexLength_; ++ i) {
m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
}
m_dataLength_ = newStart;
}
/**
* Find the same data block
* @param data array
* @param dataLength
* @param otherBlock
* @param step
*/
private static final int findSameDataBlock(int data[], int dataLength,
int otherBlock, int step)
{
// ensure that we do not even partially get past dataLength
dataLength -= DATA_BLOCK_LENGTH;
for (int block = 0; block <= dataLength; block += step) {
if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {
return block;
}
}
return -1;
}
/**
* Fold the normalization data for supplementary code points into
* a compact area on top of the BMP-part of the trie index,
* with the lead surrogates indexing this compact area.
*
* Duplicate the index values for lead surrogates:
* From inside the BMP area, where some may be overridden with folded values,
* to just after the BMP area, where they can be retrieved for
* code point lookups.
* @param manipulate fold implementation
*/
private final void fold(DataManipulate manipulate)
{
int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
int index[] = m_index_;
// copy the lead surrogate indexes into a temporary array
System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
SURROGATE_BLOCK_COUNT_);
// set all values for lead surrogate code *units* to leadUnitValue
// so that by default runtime lookups will find no data for associated
// supplementary code points, unless there is data for such code points
// which will result in a non-zero folding value below that is set for
// the respective lead units
// the above saved the indexes for surrogate code *points*
// fill the indexes with simplified code from utrie_setRange32()
int block = 0;
if (m_leadUnitValue_ == m_initialValue_) {
// leadUnitValue == initialValue, use all-initial-value block
// block = 0; if block here left empty
}
else {
// create and fill the repeatBlock
block = allocDataBlock();
if (block < 0) {
// data table overflow
throw new IllegalStateException("Internal error: Out of memory space");
}
fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
// negative block number to indicate that it is a repeat block
block = -block;
}
for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
m_index_[c] = block;
}
// Fold significant index values into the area just after the BMP
// indexes.
// In case the first lead surrogate has significant data,
// its index block must be used first (in which case the folding is a
// no-op).
// Later all folded index blocks are moved up one to insert the copied
// lead surrogate indexes.
int indexLength = BMP_INDEX_LENGTH_;
// search for any index (stage 1) entries for supplementary code points
for (int c = 0x10000; c < 0x110000;) {
if (index[c >> SHIFT_] != 0) {
// there is data, treat the full block for a lead surrogate
c &= ~0x3ff;
// is there an identical index block?
block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
// get a folded value for [c..c+0x400[ and,
// if different from the value for the lead surrogate code
// point, set it for the lead surrogate code unit
int value = manipulate.getFoldedValue(c,
block + SURROGATE_BLOCK_COUNT_);
if (value != getValue(UTF16.getLeadSurrogate(c))) {
if (!setValue(UTF16.getLeadSurrogate(c), value)) {
// data table overflow
throw new ArrayIndexOutOfBoundsException(
"Data table overflow");
}
// if we did not find an identical index block...
if (block == indexLength) {
// move the actual index (stage 1) entries from the
// supplementary position to the new one
System.arraycopy(index, c >> SHIFT_, index, indexLength,
SURROGATE_BLOCK_COUNT_);
indexLength += SURROGATE_BLOCK_COUNT_;
}
}
c += 0x400;
}
else {
c += DATA_BLOCK_LENGTH;
}
}
// index array overflow?
// This is to guarantee that a folding offset is of the form
// UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
// If the index is too large, then n>=1024 and more than 10 bits are
// necessary.
// In fact, it can only ever become n==1024 with completely unfoldable
// data and the additional block of duplicated values for lead
// surrogates.
if (indexLength >= MAX_INDEX_LENGTH_) {
throw new ArrayIndexOutOfBoundsException("Index table overflow");
}
// make space for the lead surrogate index block and insert it between
// the BMP indexes and the folded ones
System.arraycopy(index, BMP_INDEX_LENGTH_, index,
BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
indexLength - BMP_INDEX_LENGTH_);
System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
SURROGATE_BLOCK_COUNT_);
indexLength += SURROGATE_BLOCK_COUNT_;
m_indexLength_ = indexLength;
}
/**
* @internal
*/
private void fillBlock(int block, int start, int limit, int value,
boolean overwrite)
{
limit += block;
block += start;
if (overwrite) {
while (block < limit) {
m_data_[block ++] = value;
}
}
else {
while (block < limit) {
if (m_data_[block] == m_initialValue_) {
m_data_[block] = value;
}
++ block;
}
}
}
}