com.ibm.icu.impl.Trie Maven / Gradle / Ivy
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
******************************************************************************
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.ByteBuffer;
import java.util.Arrays;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
/**
* A trie is a kind of compressed, serializable table of values
* associated with Unicode code points (0..0x10ffff).
* This class defines the basic structure of a trie and provides methods
* to retrieve the offsets to the actual data.
* Data will be the form of an array of basic types, char or int.
* The actual data format will have to be specified by the user in the
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.
* This trie implementation is optimized for getting offset while walking
* forward through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* fromLead() and fromOffsetTrail() methods.
* The fromBMP() method are a little more complicated; they get offsets even
* for lead surrogate codepoints, while the fromLead() method get special
* "folded" offsets for lead surrogate code units if there is relevant data
* associated with them.
* From such a folded offsets, an offset needs to be extracted to supply
* to the fromOffsetTrail() methods.
* To handle such supplementary codepoints, some offset information are kept
* in the data.
* Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
* that offset from the folded value for the lead surrogate unit.
* For examples of use, see com.ibm.icu.impl.CharTrie or
* com.ibm.icu.impl.IntTrie.
* @author synwee
* @see com.ibm.icu.impl.CharTrie
* @see com.ibm.icu.impl.IntTrie
* @since release 2.1, Jan 01 2002
*/
public abstract class Trie
{
// public class declaration ----------------------------------------
/**
* Character data in com.ibm.impl.Trie have different user-specified format
* for different purposes.
* This interface specifies methods to be implemented in order for
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
* the data.
*/
public static interface DataManipulate
{
/**
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
* data
* the index array offset of the indexes for that lead surrogate.
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
public int getFoldingOffset(int value);
}
// default implementation
private static class DefaultGetFoldingOffset implements DataManipulate {
@Override
public int getFoldingOffset(int value) {
return value;
}
}
// public methods --------------------------------------------------
/**
* Determines if this trie has a linear latin 1 array
* @return true if this trie has a linear latin 1 array, false otherwise
*/
public final boolean isLatin1Linear()
{
return m_isLatin1Linear_;
}
/**
* Checks if the argument Trie has the same data as this Trie.
* Attributes are checked but not the index data.
* @param other Trie to check
* @return true if the argument Trie has the same data as this Trie, false
* otherwise
*/
///CLOVER:OFF
@Override
public boolean equals(Object other)
{
if (other == this) {
return true;
}
if (!(other instanceof Trie)) {
return false;
}
Trie othertrie = (Trie)other;
return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
&& m_options_ == othertrie.m_options_
&& m_dataLength_ == othertrie.m_dataLength_
&& Arrays.equals(m_index_, othertrie.m_index_);
}
@Override
public int hashCode() {
assert false : "hashCode not designed";
return 42;
}
///CLOVER:ON
/**
* Gets the serialized data file size of the Trie. This is used during
* trie data reading for size checking purposes.
* @return size size of serialized trie data file in terms of the number
* of bytes
*/
public int getSerializedDataSize()
{
// includes signature, option, dataoffset and datalength output
int result = (4 << 2);
result += (m_dataOffset_ << 1);
if (isCharTrie()) {
result += (m_dataLength_ << 1);
}
else if (isIntTrie()) {
result += (m_dataLength_ << 2);
}
return result;
}
// protected constructor -------------------------------------------
/**
* Trie constructor for CharTrie use.
* @param bytes data of an ICU data file, containing the trie
* @param dataManipulate object containing the information to parse the
* trie data
*/
protected Trie(ByteBuffer bytes, DataManipulate dataManipulate)
{
// Magic number to authenticate the data.
int signature = bytes.getInt();
m_options_ = bytes.getInt();
if (!checkHeader(signature)) {
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
}
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_dataOffset_ = bytes.getInt();
m_dataLength_ = bytes.getInt();
unserialize(bytes);
}
/**
* Trie constructor
* @param index array to be used for index
* @param options used by the trie
* @param dataManipulate object containing the information to parse the
* trie data
*/
protected Trie(char index[], int options, DataManipulate dataManipulate)
{
m_options_ = options;
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_index_ = index;
m_dataOffset_ = m_index_.length;
}
// protected data members ------------------------------------------
/**
* Lead surrogate code points' index displacement in the index array.
* 0x10000-0xd800=0x2800
* 0x2800 >> INDEX_STAGE_1_SHIFT_
*/
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
* Shift size for shifting right the input index. 1..9
*/
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by
* DATA_GRANULARITY.
* 0..INDEX_STAGE_1_SHIFT
*/
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<>INDEX_STAGE_1_SHIFT_
*/
protected static final int SURROGATE_BLOCK_COUNT=(1<>INDEX_STAGE_1_SHIFT_;
/**
* Surrogate mask to use when shifting offset to retrieve supplementary
* values
*/
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
* Index or UTF16 characters
*/
protected char m_index_[];
/**
* Internal TrieValue which handles the parsing of the data value.
* This class is to be implemented by the user
*/
protected DataManipulate m_dataManipulate_;
/**
* Start index of the data portion of the trie. CharTrie combines
* index and data into a char array, so this is used to indicate the
* initial offset to the data portion.
* Note this index always points to the initial value.
*/
protected int m_dataOffset_;
/**
* Length of the data array
*/
protected int m_dataLength_;
// protected methods -----------------------------------------------
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
*/
protected abstract int getSurrogateOffset(char lead, char trail);
/**
* Gets the value at the argument index
* @param index value at index will be retrieved
* @return 32 bit value
*/
protected abstract int getValue(int index);
/**
* Gets the default initial value
* @return 32 bit value
*/
protected abstract int getInitialValue();
/**
* Gets the offset to the data which the index ch after variable offset
* points to.
* Note for locating a non-supplementary character data offset, calling
*
* getRawOffset(0, ch);
*
* will do. Otherwise if it is a supplementary character formed by
* surrogates lead and trail. Then we would have to call getRawOffset()
* with getFoldingIndexOffset(). See getSurrogateOffset().
* @param offset index offset which ch is to start from
* @param ch index to be used after offset
* @return offset to the data
*/
protected final int getRawOffset(int offset, char ch)
{
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
<< INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
}
/**
* Gets the offset to data which the BMP character points to
* Treats a lead surrogate as a normal code point.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
: getRawOffset(0, ch);
// using a getRawOffset(ch) makes no diff
}
/**
* Gets the offset to the data which this lead surrogate character points
* to.
* Data at the returned offset may contain folding offset information for
* the next trailing surrogate character.
* @param ch lead surrogate character
* @return offset to data
*/
protected final int getLeadOffset(char ch)
{
return getRawOffset(0, ch);
}
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
* Gets the offset to data which the codepoint points to
* @param ch codepoint
* @return offset to data
*/
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
if (ch < 0) {
return -1;
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
return getRawOffset(0, (char)ch);
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
// BMP codepoint
return getBMPOffset((char)ch);
} else if (ch <= UCharacter.MAX_VALUE) {
// look at the construction of supplementary characters
// trail forms the ends of it.
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
// return -1 if there is an error, in this case we return
return -1;
}
}
/**
* Parses the byte buffer and creates the trie index with it.
* The position of the input ByteBuffer must be right after the trie header.
* This is overwritten by the child classes.
* @param bytes buffer containing trie data
*/
protected void unserialize(ByteBuffer bytes)
{
m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
}
/**
* Determines if this is a 32 bit trie
* @return true if options specifies this is a 32 bit trie
*/
protected final boolean isIntTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
}
/**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie
*/
protected final boolean isCharTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
}
// private data members --------------------------------------------
// struct UTrieHeader {
// int32_t signature;
// int32_t options (a bit field)
// int32_t indexLength
// int32_t dataLength
/**
* Size of Trie header in bytes
*/
protected static final int HEADER_LENGTH_ = 4 * 4;
/**
* Latin 1 option mask
*/
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
*/
protected static final int HEADER_SIGNATURE_ = 0x54726965;
/**
* Header option formatting
*/
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
* Flag indicator for Latin quick access data block
*/
private boolean m_isLatin1Linear_;
/**
*
Trie options field.
* options bit field:
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH
* 8 0 = 16-bit data, 1=32-bit data
* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT
* 3..0 INDEX_STAGE_2_SHIFT // 1..9
*/
private int m_options_;
// private methods ---------------------------------------------------
/**
* Authenticates raw data header.
* Checking the header information, signature and options.
* @param signature This contains the options and type of a Trie
* @return true if the header is authenticated valid
*/
private final boolean checkHeader(int signature)
{
// check the signature
// Trie in big-endian US-ASCII (0x54726965).
// Magic number to authenticate the data.
if (signature != HEADER_SIGNATURE_) {
return false;
}
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
INDEX_STAGE_1_SHIFT_ ||
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
HEADER_OPTIONS_SHIFT_MASK_)
!= INDEX_STAGE_2_SHIFT_) {
return false;
}
return true;
}
}