com.ibm.icu.impl.IntTrieBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support
There is a newer version: 76.1
Show newest version
/*
******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and   *
* others. All Rights Reserved.                                               *
******************************************************************************
*/

package com.ibm.icu.impl;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;

/**
 * Builder class to manipulate and generate a trie.
 * This is useful for ICU data in primitive types.
 * Provides a compact way to store information that is indexed by Unicode 
 * values, such as character properties, types, keyboard values, etc. This is 
 * very useful when you have a block of Unicode data that contains significant 
 * values while the rest of the Unicode data is unused in the application or 
 * when you have a lot of redundance, such as where all 21,000 Han ideographs 
 * have the same value.  However, lookup is much faster than a hash table.
 * A trie of any primitive data type serves two purposes:
 * 
 *     Fast access of the indexed values.
 *     
Smaller memory footprint.
 * 
 * This is a direct port from the ICU4C version
 * @author             Syn Wee Quek
 */
public class IntTrieBuilder extends TrieBuilder
{
    // public constructor ----------------------------------------------
                
    /**
     * Copy constructor
     */
    public IntTrieBuilder(IntTrieBuilder table)
    {
        super(table);
        m_data_ = new int[m_dataCapacity_];
        System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
        m_initialValue_ = table.m_initialValue_;
        m_leadUnitValue_ = table.m_leadUnitValue_;
    }
    
    /**
     * Constructs a build table
     * @param aliasdata data to be filled into table
     * @param maxdatalength maximum data length allowed in table
     * @param initialvalue inital data value
     * @param latin1linear is latin 1 to be linear
     */
    public IntTrieBuilder(int aliasdata[], int maxdatalength, 
                          int initialvalue, int leadunitvalue, 
                          boolean latin1linear) 
    {
        super();
        if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear 
                                                  && maxdatalength < 1024)) {
            throw new IllegalArgumentException(
                                               "Argument maxdatalength is too small");
        }
            
        if (aliasdata != null) {
            m_data_ = aliasdata;
        } 
        else {
            m_data_ = new int[maxdatalength];
        }
        
        // preallocate and reset the first data block (block index 0)
        int j = DATA_BLOCK_LENGTH;
        
        if (latin1linear) {
            // preallocate and reset the first block (number 0) and Latin-1 
            // (U+0000..U+00ff) after that made sure above that 
            // maxDataLength >= 1024
            // set indexes to point to consecutive data blocks
            int i = 0;
            do {
                // do this at least for trie->index[0] even if that block is 
                // only partly used for Latin-1
                m_index_[i ++] = j;
                j += DATA_BLOCK_LENGTH;
            } while (i < (256 >> SHIFT_));
        }
        
        m_dataLength_ = j;
        // reset the initially allocated blocks to the initial value
        Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
        m_initialValue_ = initialvalue;
        m_leadUnitValue_ = leadunitvalue;
        m_dataCapacity_ = maxdatalength;
        m_isLatin1Linear_ = latin1linear;
        m_isCompacted_ = false;
    }

    // public methods -------------------------------------------------------
     
    /*public final void print()
      {
      int i = 0;
      int oldvalue = m_index_[i];
      int count = 0;
      System.out.println("index length " + m_indexLength_ 
      + " --------------------------");
      while (i < m_indexLength_) {
      if (m_index_[i] != oldvalue) {
      System.out.println("index has " + count + " counts of " 
      + Integer.toHexString(oldvalue));
      count = 0;
      oldvalue = m_index_[i];
      }
      count ++;
      i ++;
      }
      System.out.println("index has " + count + " counts of " 
      + Integer.toHexString(oldvalue));
      i = 0;
      oldvalue = m_data_[i];
      count = 0;
      System.out.println("data length " + m_dataLength_ 
      + " --------------------------");
      while (i < m_dataLength_) {
      if (m_data_[i] != oldvalue) {
      if ((oldvalue & 0xf1000000) == 0xf1000000) {
      int temp = oldvalue & 0xffffff; 
      temp += 0x320;
      oldvalue = 0xf1000000 | temp;
      }
      if ((oldvalue & 0xf2000000) == 0xf2000000) {
      int temp = oldvalue & 0xffffff; 
      temp += 0x14a;
      oldvalue = 0xf2000000 | temp;
      }
      System.out.println("data has " + count + " counts of " 
      + Integer.toHexString(oldvalue));
      count = 0;
      oldvalue = m_data_[i];
      }
      count ++;
      i ++;
      }
      if ((oldvalue & 0xf1000000) == 0xf1000000) {
      int temp = oldvalue & 0xffffff; 
      temp += 0x320;
      oldvalue = 0xf1000000 | temp;
      }
      if ((oldvalue & 0xf2000000) == 0xf2000000) {
      int temp = oldvalue & 0xffffff; 
      temp += 0x14a;
      oldvalue = 0xf2000000 | temp;
      }
      System.out.println("data has " + count + " counts of " 
      + Integer.toHexString(oldvalue));
      }
    */   
    /**
     * Gets a 32 bit data from the table data
     * @param ch codepoint which data is to be retrieved
     * @return the 32 bit data
     */
    public int getValue(int ch) 
    {
        // valid, uncompacted trie and valid c?
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
            return 0;
        }
    
        int block = m_index_[ch >> SHIFT_];
        return m_data_[Math.abs(block) + (ch & MASK_)];
    }
    
    /**
     * Get a 32 bit data from the table data
     * @param ch  code point for which data is to be retrieved.
     * @param inBlockZero  Output parameter, inBlockZero[0] returns true if the
     *                      char maps into block zero, otherwise false.
     * @return the 32 bit data value.
     */
    public int getValue(int ch, boolean [] inBlockZero) 
    {
        // valid, uncompacted trie and valid c?
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
            if (inBlockZero != null) {
                inBlockZero[0] = true;
            }
            return 0;
        }
    
        int block = m_index_[ch >> SHIFT_];
        if (inBlockZero != null) {
            inBlockZero[0] = (block == 0);
        }
        return m_data_[Math.abs(block) + (ch & MASK_)];
    }
    
    
    /**
     * Sets a 32 bit data in the table data
     * @param ch codepoint which data is to be set
     * @param value to set
     * @return true if the set is successful, otherwise 
     *              if the table has been compacted return false
     */
    public boolean setValue(int ch, int value) 
    {
        // valid, uncompacted trie and valid c? 
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
            return false;
        }
    
        int block = getDataBlock(ch);
        if (block < 0) {
            return false;
        }
    
        m_data_[block + (ch & MASK_)] = value;
        return true;
    }
    
    /**
     * Serializes the build table with 32 bit data
     * @param datamanipulate builder raw fold method implementation
     * @param triedatamanipulate result trie fold method
     * @return a new trie
     */
    public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate, 
                             Trie.DataManipulate triedatamanipulate)
    {
        if (datamanipulate == null) {
            throw new IllegalArgumentException("Parameters can not be null");
        }
        // fold and compact if necessary, also checks that indexLength is 
        // within limits 
        if (!m_isCompacted_) {
            // compact once without overlap to improve folding
            compact(false);
            // fold the supplementary part of the index array
            fold(datamanipulate);
            // compact again with overlap for minimum data array length
            compact(true);
            m_isCompacted_ = true;
        }
        // is dataLength within limits? 
        if (m_dataLength_ >= MAX_DATA_LENGTH_) {
            throw new ArrayIndexOutOfBoundsException("Data length too small");
        }
    
        char index[] = new char[m_indexLength_];
        int data[] = new int[m_dataLength_];
        // write the index (stage 1) array and the 32-bit data (stage 2) array
        // write 16-bit index values shifted right by INDEX_SHIFT_ 
        for (int i = 0; i < m_indexLength_; i ++) {
            index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
        }
        // write 32-bit data values
        System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
        
        int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
        options |= OPTIONS_DATA_IS_32_BIT_;
        if (m_isLatin1Linear_) {
            options |= OPTIONS_LATIN1_IS_LINEAR_;
        }
        return new IntTrie(index, data, m_initialValue_, options, 
                           triedatamanipulate);
    }
    
    
    /**
     * Serializes the build table to an output stream.
     * 
     * Compacts the build-time trie after all values are set, and then
     * writes the serialized form onto an output stream.
     * 
     * After this, this build-time Trie can only be serialized again and/or closed;
     * no further values can be added.
     * 
     * This function is the rough equivalent of utrie_seriaize() in ICU4C.
     * 
     * @param os the output stream to which the seriaized trie will be written.
     *         If nul, the function still returns the size of the serialized Trie.
     * @param reduceTo16Bits If true, reduce the data size to 16 bits.  The resulting
     *         serialized form can then be used to create a CharTrie.
     * @param datamanipulate builder raw fold method implementation
     * @return the number of bytes written to the output stream.
     */
     public int serialize(OutputStream os, boolean reduceTo16Bits,
            TrieBuilder.DataManipulate datamanipulate)  throws IOException {
         if (datamanipulate == null) {
             throw new IllegalArgumentException("Parameters can not be null");
         }

         // fold and compact if necessary, also checks that indexLength is 
         // within limits 
         if (!m_isCompacted_) {
             // compact once without overlap to improve folding
             compact(false);
             // fold the supplementary part of the index array
             fold(datamanipulate);
             // compact again with overlap for minimum data array length
             compact(true);
             m_isCompacted_ = true;
         }
         
         // is dataLength within limits? 
         int length;
         if (reduceTo16Bits) {
             length = m_dataLength_ + m_indexLength_;
         } else {
             length = m_dataLength_;
         }
         if (length >= MAX_DATA_LENGTH_) {
             throw new ArrayIndexOutOfBoundsException("Data length too small");
         }
         
         //  struct UTrieHeader {
         //      int32_t   signature;
         //      int32_t   options  (a bit field)
         //      int32_t   indexLength
         //      int32_t   dataLength
         length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;
         if(reduceTo16Bits) {
             length+=2*m_dataLength_;
         } else {
             length+=4*m_dataLength_;
         }
         
         if (os == null) {
             // No output stream.  Just return the length of the serialized Trie, in bytes.
             return length;
         }

         DataOutputStream dos = new DataOutputStream(os);
         dos.writeInt(Trie.HEADER_SIGNATURE_);  
         
         int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<>> Trie.INDEX_STAGE_2_SHIFT_;
                 dos.writeChar(v);
             }
             
             /* write 16-bit data values */
             for(int i=0; i>> Trie.INDEX_STAGE_2_SHIFT_;
                 dos.writeChar(v);
             }
             
             /* write 32-bit data values */
             for(int i=0; i UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE
            || limit > (UCharacter.MAX_VALUE + 1) || start > limit) {
            return false;
        }
            
        if (start == limit) {
            return true; // nothing to do
        }
        
        if ((start & MASK_) != 0) {
            // set partial block at [start..following block boundary[
            int block = getDataBlock(start);
            if (block < 0) {
                return false;
            }
        
            int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
            if (nextStart <= limit) {
                fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,
                          value, overwrite);
                start = nextStart;
            } 
            else {
                fillBlock(block, start & MASK_, limit & MASK_,
                          value, overwrite);
                return true;
            }
        }
        
        // number of positions in the last, partial block
        int rest = limit & MASK_;
        
        // round down limit to a block boundary 
        limit &= ~MASK_;
        
        // iterate over all-value blocks 
        int repeatBlock = 0;
        if (value == m_initialValue_) {
            // repeatBlock = 0; assigned above
        } 
        else {
            repeatBlock = -1;
        }
        while (start < limit) {
            // get index value 
            int block = m_index_[start >> SHIFT_];
            if (block > 0) {
                // already allocated, fill in value
                fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
            } 
            else if (m_data_[-block] != value && (block == 0 || overwrite)) {
                // set the repeatBlock instead of the current block 0 or range 
                // block 
                if (repeatBlock >= 0) {
                    m_index_[start >> SHIFT_] = -repeatBlock;
                } 
                else {
                    // create and set and fill the repeatBlock
                    repeatBlock = getDataBlock(start);
                    if (repeatBlock < 0) {
                        return false;
                    }
        
                    // set the negative block number to indicate that it is a 
                    // repeat block
                    m_index_[start >> SHIFT_] = -repeatBlock;
                    fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
                }
            }
        
            start += DATA_BLOCK_LENGTH;
        }
        
        if (rest > 0) {
            // set partial block at [last block boundary..limit[
            int block = getDataBlock(start);
            if (block < 0) {
                return false;
            }
            fillBlock(block, 0, rest, value, overwrite);
        }
        
        return true;
    }
    
    // protected data member ------------------------------------------------
                
    protected int m_data_[];
    protected int m_initialValue_;  
    
    //  private data member ------------------------------------------------
        
    private int m_leadUnitValue_;  
     
    // private methods ------------------------------------------------------
   
    private int allocDataBlock() 
    {
        int newBlock = m_dataLength_;
        int newTop = newBlock + DATA_BLOCK_LENGTH;
        if (newTop > m_dataCapacity_) {
            // out of memory in the data array
            return -1;
        }
        m_dataLength_ = newTop;
        return newBlock;
    }

    /**
     * No error checking for illegal arguments.
     * @param ch codepoint to look for
     * @return -1 if no new data block available (out of memory in data array)
     */
    private int getDataBlock(int ch) 
    {
        ch >>= SHIFT_;
        int indexValue = m_index_[ch];
        if (indexValue > 0) {
            return indexValue;
        }
    
        // allocate a new data block
        int newBlock = allocDataBlock();
        if (newBlock < 0) {
            // out of memory in the data array 
            return -1;
        }
        m_index_[ch] = newBlock;
    
        // copy-on-write for a block from a setRange()
        System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,  
                         DATA_BLOCK_LENGTH << 2);
        return newBlock;
    }
    
    /**
     * Compact a folded build-time trie.
     * The compaction
     * - removes blocks that are identical with earlier ones
     * - overlaps adjacent blocks as much as possible (if overlap == true)
     * - moves blocks in steps of the data granularity
     * - moves and overlaps blocks that overlap with multiple values in the overlap region
     *
     * It does not
     * - try to move and overlap blocks that are not already adjacent
     * @param overlap flag
     */
    private void compact(boolean overlap) 
    {
        if (m_isCompacted_) {
            return; // nothing left to do
        }
    
        // compaction
        // initialize the index map with "block is used/unused" flags
        findUnusedBlocks();
        
        // if Latin-1 is preallocated and linear, then do not compact Latin-1 
        // data
        int overlapStart = DATA_BLOCK_LENGTH;
        if (m_isLatin1Linear_ && SHIFT_ <= 8) {
            overlapStart += 256;
        }
       
        int newStart = DATA_BLOCK_LENGTH;
        int i;
        for (int start = newStart; start < m_dataLength_;) {
            // start: index of first entry of current block
            // newStart: index where the current block is to be moved
            //           (right after current end of already-compacted data)
            // skip blocks that are not used 
            if (m_map_[start >>> SHIFT_] < 0) {
                // advance start to the next block 
                start += DATA_BLOCK_LENGTH;
                // leave newStart with the previous block!
                continue;
            }
            // search for an identical block
            if (start >= overlapStart) {
                i = findSameDataBlock(m_data_, newStart, start,
                                          overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
                if (i >= 0) {
                    // found an identical block, set the other block's index 
                    // value for the current block
                    m_map_[start >>> SHIFT_] = i;
                    // advance start to the next block
                    start += DATA_BLOCK_LENGTH;
                    // leave newStart with the previous block!
                    continue;
                }
            }
            // see if the beginning of this block can be overlapped with the 
            // end of the previous block
            if(overlap && start>=overlapStart) {
                /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
                for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;
                    i>0 && !equal_int(m_data_, newStart-i, start, i);
                    i-=DATA_GRANULARITY_) {}
            } else {
                i=0;
            }
            if (i > 0) {
                // some overlap
                m_map_[start >>> SHIFT_] = newStart - i;
                // move the non-overlapping indexes to their new positions
                start += i;
                for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
                    m_data_[newStart ++] = m_data_[start ++];
                }
            } 
            else if (newStart < start) {
                // no overlap, just move the indexes to their new positions
                m_map_[start >>> SHIFT_] = newStart;
                for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
                    m_data_[newStart ++] = m_data_[start ++];
                }
            } 
            else { // no overlap && newStart==start
                m_map_[start >>> SHIFT_] = start;
                newStart += DATA_BLOCK_LENGTH;
                start = newStart;
            }
        }
        // now adjust the index (stage 1) table
        for (i = 0; i < m_indexLength_; ++ i) {
            m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
        }
        m_dataLength_ = newStart;
    }

    /**
     * Find the same data block
     * @param data array
     * @param dataLength
     * @param otherBlock
     * @param step
     */
    private static final int findSameDataBlock(int data[], int dataLength,
                                               int otherBlock, int step) 
    {
        // ensure that we do not even partially get past dataLength
        dataLength -= DATA_BLOCK_LENGTH;

        for (int block = 0; block <= dataLength; block += step) {
            if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {
                return block;
            }
        }
        return -1;
    }
    
    /**
     * Fold the normalization data for supplementary code points into
     * a compact area on top of the BMP-part of the trie index,
     * with the lead surrogates indexing this compact area.
     *
     * Duplicate the index values for lead surrogates:
     * From inside the BMP area, where some may be overridden with folded values,
     * to just after the BMP area, where they can be retrieved for
     * code point lookups.
     * @param manipulate fold implementation
     */
    private final void fold(DataManipulate manipulate) 
    {
        int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
        int index[] = m_index_;
        // copy the lead surrogate indexes into a temporary array
        System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0, 
                         SURROGATE_BLOCK_COUNT_);
        
        // set all values for lead surrogate code *units* to leadUnitValue
        // so that by default runtime lookups will find no data for associated
        // supplementary code points, unless there is data for such code points
        // which will result in a non-zero folding value below that is set for
        // the respective lead units
        // the above saved the indexes for surrogate code *points*
        // fill the indexes with simplified code from utrie_setRange32()
        int block = 0;
        if (m_leadUnitValue_ == m_initialValue_) {
            // leadUnitValue == initialValue, use all-initial-value block
            // block = 0; if block here left empty
        } 
        else {
            // create and fill the repeatBlock
            block = allocDataBlock();
            if (block < 0) {
                // data table overflow
                throw new IllegalStateException("Internal error: Out of memory space");
            }
            fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
            // negative block number to indicate that it is a repeat block
            block = -block; 
        }
        for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
            m_index_[c] = block;
        }

        // Fold significant index values into the area just after the BMP 
        // indexes.
        // In case the first lead surrogate has significant data,
        // its index block must be used first (in which case the folding is a 
        // no-op).
        // Later all folded index blocks are moved up one to insert the copied
        // lead surrogate indexes.
        int indexLength = BMP_INDEX_LENGTH_;
        // search for any index (stage 1) entries for supplementary code points 
        for (int c = 0x10000; c < 0x110000;) {
            if (index[c >> SHIFT_] != 0) {
                // there is data, treat the full block for a lead surrogate
                c &= ~0x3ff;
                // is there an identical index block?
                block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
                
                // get a folded value for [c..c+0x400[ and,
                // if different from the value for the lead surrogate code 
                // point, set it for the lead surrogate code unit

                int value = manipulate.getFoldedValue(c, 
                                                      block + SURROGATE_BLOCK_COUNT_);
                if (value != getValue(UTF16.getLeadSurrogate(c))) {
                    if (!setValue(UTF16.getLeadSurrogate(c), value)) {
                        // data table overflow 
                        throw new ArrayIndexOutOfBoundsException(
                                                                 "Data table overflow");
                    }
                    // if we did not find an identical index block...
                    if (block == indexLength) {
                        // move the actual index (stage 1) entries from the 
                        // supplementary position to the new one
                        System.arraycopy(index, c >> SHIFT_, index, indexLength,
                                         SURROGATE_BLOCK_COUNT_);
                        indexLength += SURROGATE_BLOCK_COUNT_;
                    }
                }
                c += 0x400;
            } 
            else {
                c += DATA_BLOCK_LENGTH;
            }
        }
        
        // index array overflow?
        // This is to guarantee that a folding offset is of the form
        // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
        // If the index is too large, then n>=1024 and more than 10 bits are 
        // necessary.
        // In fact, it can only ever become n==1024 with completely unfoldable 
        // data and the additional block of duplicated values for lead 
        // surrogates.
        if (indexLength >= MAX_INDEX_LENGTH_) {
            throw new ArrayIndexOutOfBoundsException("Index table overflow");
        }
        // make space for the lead surrogate index block and insert it between 
        // the BMP indexes and the folded ones
        System.arraycopy(index, BMP_INDEX_LENGTH_, index, 
                         BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
                         indexLength - BMP_INDEX_LENGTH_);
        System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
                         SURROGATE_BLOCK_COUNT_);
        indexLength += SURROGATE_BLOCK_COUNT_;
        m_indexLength_ = indexLength;
    }
    
    /**
     * @internal
     */
    private void fillBlock(int block, int start, int limit, int value, 
                           boolean overwrite) 
    {
        limit += block;
        block += start;
        if (overwrite) {
            while (block < limit) {
                m_data_[block ++] = value;
            }
        } 
        else {
            while (block < limit) {
                if (m_data_[block] == m_initialValue_) {
                    m_data_[block] = value;
                }
                ++ block;
            }
        }
    }
}