All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.impl.IntTrieBuilder Maven / Gradle / Ivy

There is a newer version: 4.15.0-HBase-1.5
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and   *
* others. All Rights Reserved.                                               *
******************************************************************************
*/

package com.ibm.icu.impl;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;

/**
 * Builder class to manipulate and generate a trie.
 * This is useful for ICU data in primitive types.
 * Provides a compact way to store information that is indexed by Unicode 
 * values, such as character properties, types, keyboard values, etc. This is 
 * very useful when you have a block of Unicode data that contains significant 
 * values while the rest of the Unicode data is unused in the application or 
 * when you have a lot of redundance, such as where all 21,000 Han ideographs 
 * have the same value.  However, lookup is much faster than a hash table.
 * A trie of any primitive data type serves two purposes:
 * 
    *
  • Fast access of the indexed values. *
  • Smaller memory footprint. *
* This is a direct port from the ICU4C version * @author Syn Wee Quek */ public class IntTrieBuilder extends TrieBuilder { // public constructor ---------------------------------------------- /** * Copy constructor */ public IntTrieBuilder(IntTrieBuilder table) { super(table); m_data_ = new int[m_dataCapacity_]; System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_); m_initialValue_ = table.m_initialValue_; m_leadUnitValue_ = table.m_leadUnitValue_; } /** * Constructs a build table * @param aliasdata data to be filled into table * @param maxdatalength maximum data length allowed in table * @param initialvalue inital data value * @param latin1linear is latin 1 to be linear */ public IntTrieBuilder(int aliasdata[], int maxdatalength, int initialvalue, int leadunitvalue, boolean latin1linear) { super(); if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear && maxdatalength < 1024)) { throw new IllegalArgumentException( "Argument maxdatalength is too small"); } if (aliasdata != null) { m_data_ = aliasdata; } else { m_data_ = new int[maxdatalength]; } // preallocate and reset the first data block (block index 0) int j = DATA_BLOCK_LENGTH; if (latin1linear) { // preallocate and reset the first block (number 0) and Latin-1 // (U+0000..U+00ff) after that made sure above that // maxDataLength >= 1024 // set indexes to point to consecutive data blocks int i = 0; do { // do this at least for trie->index[0] even if that block is // only partly used for Latin-1 m_index_[i ++] = j; j += DATA_BLOCK_LENGTH; } while (i < (256 >> SHIFT_)); } m_dataLength_ = j; // reset the initially allocated blocks to the initial value Arrays.fill(m_data_, 0, m_dataLength_, initialvalue); m_initialValue_ = initialvalue; m_leadUnitValue_ = leadunitvalue; m_dataCapacity_ = maxdatalength; m_isLatin1Linear_ = latin1linear; m_isCompacted_ = false; } // public methods ------------------------------------------------------- /*public final void print() { int i = 0; int oldvalue = m_index_[i]; int count = 0; System.out.println("index length " + m_indexLength_ + " --------------------------"); while (i < m_indexLength_) { if (m_index_[i] != oldvalue) { System.out.println("index has " + count + " counts of " + Integer.toHexString(oldvalue)); count = 0; oldvalue = m_index_[i]; } count ++; i ++; } System.out.println("index has " + count + " counts of " + Integer.toHexString(oldvalue)); i = 0; oldvalue = m_data_[i]; count = 0; System.out.println("data length " + m_dataLength_ + " --------------------------"); while (i < m_dataLength_) { if (m_data_[i] != oldvalue) { if ((oldvalue & 0xf1000000) == 0xf1000000) { int temp = oldvalue & 0xffffff; temp += 0x320; oldvalue = 0xf1000000 | temp; } if ((oldvalue & 0xf2000000) == 0xf2000000) { int temp = oldvalue & 0xffffff; temp += 0x14a; oldvalue = 0xf2000000 | temp; } System.out.println("data has " + count + " counts of " + Integer.toHexString(oldvalue)); count = 0; oldvalue = m_data_[i]; } count ++; i ++; } if ((oldvalue & 0xf1000000) == 0xf1000000) { int temp = oldvalue & 0xffffff; temp += 0x320; oldvalue = 0xf1000000 | temp; } if ((oldvalue & 0xf2000000) == 0xf2000000) { int temp = oldvalue & 0xffffff; temp += 0x14a; oldvalue = 0xf2000000 | temp; } System.out.println("data has " + count + " counts of " + Integer.toHexString(oldvalue)); } */ /** * Gets a 32 bit data from the table data * @param ch codepoint which data is to be retrieved * @return the 32 bit data */ public int getValue(int ch) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { return 0; } int block = m_index_[ch >> SHIFT_]; return m_data_[Math.abs(block) + (ch & MASK_)]; } /** * Get a 32 bit data from the table data * @param ch code point for which data is to be retrieved. * @param inBlockZero Output parameter, inBlockZero[0] returns true if the * char maps into block zero, otherwise false. * @return the 32 bit data value. */ public int getValue(int ch, boolean [] inBlockZero) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { if (inBlockZero != null) { inBlockZero[0] = true; } return 0; } int block = m_index_[ch >> SHIFT_]; if (inBlockZero != null) { inBlockZero[0] = (block == 0); } return m_data_[Math.abs(block) + (ch & MASK_)]; } /** * Sets a 32 bit data in the table data * @param ch codepoint which data is to be set * @param value to set * @return true if the set is successful, otherwise * if the table has been compacted return false */ public boolean setValue(int ch, int value) { // valid, uncompacted trie and valid c? if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) { return false; } int block = getDataBlock(ch); if (block < 0) { return false; } m_data_[block + (ch & MASK_)] = value; return true; } /** * Serializes the build table with 32 bit data * @param datamanipulate builder raw fold method implementation * @param triedatamanipulate result trie fold method * @return a new trie */ public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate, Trie.DataManipulate triedatamanipulate) { if (datamanipulate == null) { throw new IllegalArgumentException("Parameters can not be null"); } // fold and compact if necessary, also checks that indexLength is // within limits if (!m_isCompacted_) { // compact once without overlap to improve folding compact(false); // fold the supplementary part of the index array fold(datamanipulate); // compact again with overlap for minimum data array length compact(true); m_isCompacted_ = true; } // is dataLength within limits? if (m_dataLength_ >= MAX_DATA_LENGTH_) { throw new ArrayIndexOutOfBoundsException("Data length too small"); } char index[] = new char[m_indexLength_]; int data[] = new int[m_dataLength_]; // write the index (stage 1) array and the 32-bit data (stage 2) array // write 16-bit index values shifted right by INDEX_SHIFT_ for (int i = 0; i < m_indexLength_; i ++) { index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_); } // write 32-bit data values System.arraycopy(m_data_, 0, data, 0, m_dataLength_); int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_); options |= OPTIONS_DATA_IS_32_BIT_; if (m_isLatin1Linear_) { options |= OPTIONS_LATIN1_IS_LINEAR_; } return new IntTrie(index, data, m_initialValue_, options, triedatamanipulate); } /** * Serializes the build table to an output stream. * * Compacts the build-time trie after all values are set, and then * writes the serialized form onto an output stream. * * After this, this build-time Trie can only be serialized again and/or closed; * no further values can be added. * * This function is the rough equivalent of utrie_seriaize() in ICU4C. * * @param os the output stream to which the seriaized trie will be written. * If nul, the function still returns the size of the serialized Trie. * @param reduceTo16Bits If true, reduce the data size to 16 bits. The resulting * serialized form can then be used to create a CharTrie. * @param datamanipulate builder raw fold method implementation * @return the number of bytes written to the output stream. */ public int serialize(OutputStream os, boolean reduceTo16Bits, TrieBuilder.DataManipulate datamanipulate) throws IOException { if (datamanipulate == null) { throw new IllegalArgumentException("Parameters can not be null"); } // fold and compact if necessary, also checks that indexLength is // within limits if (!m_isCompacted_) { // compact once without overlap to improve folding compact(false); // fold the supplementary part of the index array fold(datamanipulate); // compact again with overlap for minimum data array length compact(true); m_isCompacted_ = true; } // is dataLength within limits? int length; if (reduceTo16Bits) { length = m_dataLength_ + m_indexLength_; } else { length = m_dataLength_; } if (length >= MAX_DATA_LENGTH_) { throw new ArrayIndexOutOfBoundsException("Data length too small"); } // struct UTrieHeader { // int32_t signature; // int32_t options (a bit field) // int32_t indexLength // int32_t dataLength length = Trie.HEADER_LENGTH_ + 2*m_indexLength_; if(reduceTo16Bits) { length+=2*m_dataLength_; } else { length+=4*m_dataLength_; } if (os == null) { // No output stream. Just return the length of the serialized Trie, in bytes. return length; } DataOutputStream dos = new DataOutputStream(os); dos.writeInt(Trie.HEADER_SIGNATURE_); int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<>> Trie.INDEX_STAGE_2_SHIFT_; dos.writeChar(v); } /* write 16-bit data values */ for(int i=0; i>> Trie.INDEX_STAGE_2_SHIFT_; dos.writeChar(v); } /* write 32-bit data values */ for(int i=0; i UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE || limit > (UCharacter.MAX_VALUE + 1) || start > limit) { return false; } if (start == limit) { return true; // nothing to do } if ((start & MASK_) != 0) { // set partial block at [start..following block boundary[ int block = getDataBlock(start); if (block < 0) { return false; } int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_; if (nextStart <= limit) { fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH, value, overwrite); start = nextStart; } else { fillBlock(block, start & MASK_, limit & MASK_, value, overwrite); return true; } } // number of positions in the last, partial block int rest = limit & MASK_; // round down limit to a block boundary limit &= ~MASK_; // iterate over all-value blocks int repeatBlock = 0; if (value == m_initialValue_) { // repeatBlock = 0; assigned above } else { repeatBlock = -1; } while (start < limit) { // get index value int block = m_index_[start >> SHIFT_]; if (block > 0) { // already allocated, fill in value fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite); } else if (m_data_[-block] != value && (block == 0 || overwrite)) { // set the repeatBlock instead of the current block 0 or range // block if (repeatBlock >= 0) { m_index_[start >> SHIFT_] = -repeatBlock; } else { // create and set and fill the repeatBlock repeatBlock = getDataBlock(start); if (repeatBlock < 0) { return false; } // set the negative block number to indicate that it is a // repeat block m_index_[start >> SHIFT_] = -repeatBlock; fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true); } } start += DATA_BLOCK_LENGTH; } if (rest > 0) { // set partial block at [last block boundary..limit[ int block = getDataBlock(start); if (block < 0) { return false; } fillBlock(block, 0, rest, value, overwrite); } return true; } // protected data member ------------------------------------------------ protected int m_data_[]; protected int m_initialValue_; // private data member ------------------------------------------------ private int m_leadUnitValue_; // private methods ------------------------------------------------------ private int allocDataBlock() { int newBlock = m_dataLength_; int newTop = newBlock + DATA_BLOCK_LENGTH; if (newTop > m_dataCapacity_) { // out of memory in the data array return -1; } m_dataLength_ = newTop; return newBlock; } /** * No error checking for illegal arguments. * @param ch codepoint to look for * @return -1 if no new data block available (out of memory in data array) */ private int getDataBlock(int ch) { ch >>= SHIFT_; int indexValue = m_index_[ch]; if (indexValue > 0) { return indexValue; } // allocate a new data block int newBlock = allocDataBlock(); if (newBlock < 0) { // out of memory in the data array return -1; } m_index_[ch] = newBlock; // copy-on-write for a block from a setRange() System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock, DATA_BLOCK_LENGTH << 2); return newBlock; } /** * Compact a folded build-time trie. * The compaction * - removes blocks that are identical with earlier ones * - overlaps adjacent blocks as much as possible (if overlap == true) * - moves blocks in steps of the data granularity * - moves and overlaps blocks that overlap with multiple values in the overlap region * * It does not * - try to move and overlap blocks that are not already adjacent * @param overlap flag */ private void compact(boolean overlap) { if (m_isCompacted_) { return; // nothing left to do } // compaction // initialize the index map with "block is used/unused" flags findUnusedBlocks(); // if Latin-1 is preallocated and linear, then do not compact Latin-1 // data int overlapStart = DATA_BLOCK_LENGTH; if (m_isLatin1Linear_ && SHIFT_ <= 8) { overlapStart += 256; } int newStart = DATA_BLOCK_LENGTH; int i; for (int start = newStart; start < m_dataLength_;) { // start: index of first entry of current block // newStart: index where the current block is to be moved // (right after current end of already-compacted data) // skip blocks that are not used if (m_map_[start >>> SHIFT_] < 0) { // advance start to the next block start += DATA_BLOCK_LENGTH; // leave newStart with the previous block! continue; } // search for an identical block if (start >= overlapStart) { i = findSameDataBlock(m_data_, newStart, start, overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH); if (i >= 0) { // found an identical block, set the other block's index // value for the current block m_map_[start >>> SHIFT_] = i; // advance start to the next block start += DATA_BLOCK_LENGTH; // leave newStart with the previous block! continue; } } // see if the beginning of this block can be overlapped with the // end of the previous block if(overlap && start>=overlapStart) { /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_; i>0 && !equal_int(m_data_, newStart-i, start, i); i-=DATA_GRANULARITY_) {} } else { i=0; } if (i > 0) { // some overlap m_map_[start >>> SHIFT_] = newStart - i; // move the non-overlapping indexes to their new positions start += i; for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) { m_data_[newStart ++] = m_data_[start ++]; } } else if (newStart < start) { // no overlap, just move the indexes to their new positions m_map_[start >>> SHIFT_] = newStart; for (i = DATA_BLOCK_LENGTH; i > 0; -- i) { m_data_[newStart ++] = m_data_[start ++]; } } else { // no overlap && newStart==start m_map_[start >>> SHIFT_] = start; newStart += DATA_BLOCK_LENGTH; start = newStart; } } // now adjust the index (stage 1) table for (i = 0; i < m_indexLength_; ++ i) { m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_]; } m_dataLength_ = newStart; } /** * Find the same data block * @param data array * @param dataLength * @param otherBlock * @param step */ private static final int findSameDataBlock(int data[], int dataLength, int otherBlock, int step) { // ensure that we do not even partially get past dataLength dataLength -= DATA_BLOCK_LENGTH; for (int block = 0; block <= dataLength; block += step) { if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) { return block; } } return -1; } /** * Fold the normalization data for supplementary code points into * a compact area on top of the BMP-part of the trie index, * with the lead surrogates indexing this compact area. * * Duplicate the index values for lead surrogates: * From inside the BMP area, where some may be overridden with folded values, * to just after the BMP area, where they can be retrieved for * code point lookups. * @param manipulate fold implementation */ private final void fold(DataManipulate manipulate) { int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_]; int index[] = m_index_; // copy the lead surrogate indexes into a temporary array System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0, SURROGATE_BLOCK_COUNT_); // set all values for lead surrogate code *units* to leadUnitValue // so that by default runtime lookups will find no data for associated // supplementary code points, unless there is data for such code points // which will result in a non-zero folding value below that is set for // the respective lead units // the above saved the indexes for surrogate code *points* // fill the indexes with simplified code from utrie_setRange32() int block = 0; if (m_leadUnitValue_ == m_initialValue_) { // leadUnitValue == initialValue, use all-initial-value block // block = 0; if block here left empty } else { // create and fill the repeatBlock block = allocDataBlock(); if (block < 0) { // data table overflow throw new IllegalStateException("Internal error: Out of memory space"); } fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true); // negative block number to indicate that it is a repeat block block = -block; } for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) { m_index_[c] = block; } // Fold significant index values into the area just after the BMP // indexes. // In case the first lead surrogate has significant data, // its index block must be used first (in which case the folding is a // no-op). // Later all folded index blocks are moved up one to insert the copied // lead surrogate indexes. int indexLength = BMP_INDEX_LENGTH_; // search for any index (stage 1) entries for supplementary code points for (int c = 0x10000; c < 0x110000;) { if (index[c >> SHIFT_] != 0) { // there is data, treat the full block for a lead surrogate c &= ~0x3ff; // is there an identical index block? block = findSameIndexBlock(index, indexLength, c >> SHIFT_); // get a folded value for [c..c+0x400[ and, // if different from the value for the lead surrogate code // point, set it for the lead surrogate code unit int value = manipulate.getFoldedValue(c, block + SURROGATE_BLOCK_COUNT_); if (value != getValue(UTF16.getLeadSurrogate(c))) { if (!setValue(UTF16.getLeadSurrogate(c), value)) { // data table overflow throw new ArrayIndexOutOfBoundsException( "Data table overflow"); } // if we did not find an identical index block... if (block == indexLength) { // move the actual index (stage 1) entries from the // supplementary position to the new one System.arraycopy(index, c >> SHIFT_, index, indexLength, SURROGATE_BLOCK_COUNT_); indexLength += SURROGATE_BLOCK_COUNT_; } } c += 0x400; } else { c += DATA_BLOCK_LENGTH; } } // index array overflow? // This is to guarantee that a folding offset is of the form // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. // If the index is too large, then n>=1024 and more than 10 bits are // necessary. // In fact, it can only ever become n==1024 with completely unfoldable // data and the additional block of duplicated values for lead // surrogates. if (indexLength >= MAX_INDEX_LENGTH_) { throw new ArrayIndexOutOfBoundsException("Index table overflow"); } // make space for the lead surrogate index block and insert it between // the BMP indexes and the folded ones System.arraycopy(index, BMP_INDEX_LENGTH_, index, BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_, indexLength - BMP_INDEX_LENGTH_); System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_, SURROGATE_BLOCK_COUNT_); indexLength += SURROGATE_BLOCK_COUNT_; m_indexLength_ = indexLength; } /** * @internal */ private void fillBlock(int block, int start, int limit, int value, boolean overwrite) { limit += block; block += start; if (overwrite) { while (block < limit) { m_data_[block ++] = value; } } else { while (block < limit) { if (m_data_[block] == m_initialValue_) { m_data_[block] = value; } ++ block; } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy