com.ibm.icu.impl.Trie2_32 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j Show documentation
Show all versions of icu4j Show documentation
International Component for Unicode for Java (ICU4J) is a mature, widely used Java library
providing Unicode and Globalization support
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
/**
* @author aheninger
*
* A read-only Trie2, holding 32 bit data values.
*
* A Trie2 is a highly optimized data structure for mapping from Unicode
* code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
*
* See class Trie2 for descriptions of the API for accessing the contents of a trie.
*
* The fundamental data access methods are declared final in this class, with
* the intent that applications might gain a little extra performance, when compared
* with calling the same methods via the abstract UTrie2 base class.
*/
public class Trie2_32 extends Trie2 {
/**
* Internal constructor, not for general use.
*/
Trie2_32() {
}
/**
* Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
* The serialized format is identical between ICU4C and ICU4J, so this function
* will work with serialized Trie2s from either.
*
* The serialized Trie2 in the bytes may be in either little or big endian byte order.
* This allows using serialized Tries from ICU4C without needing to consider the
* byte order of the system that created them.
*
* @param bytes a byte buffer to the serialized form of a UTrie2.
* @return An unserialized Trie_32, ready for use.
* @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
* @throws IOException if a read error occurs in the buffer.
* @throws ClassCastException if the bytes contains a serialized Trie2_16
*/
public static Trie2_32 createFromSerialized(ByteBuffer bytes) throws IOException {
return (Trie2_32) Trie2.createFromSerialized(bytes);
}
/**
* Get the value for a code point as stored in the Trie2.
*
* @param codePoint the code point
* @return the value
*/
@Override
public final int get(int codePoint) {
int value;
int ix;
if (codePoint >= 0) {
if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
// Ordinary BMP code point, excluding leading surrogates.
// BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
// 32 bit data is stored in the index array itself.
ix = index[codePoint >> UTRIE2_SHIFT_2];
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
value = data32[ix];
return value;
}
if (codePoint <= 0xffff) {
// Lead Surrogate Code Point. A Separate index section is stored for
// lead surrogate code units and code points.
// The main index has the code unit data.
// For this function, we need the code point data.
// Note: this expression could be refactored for slightly improved efficiency, but
// surrogate code points will be so rare in practice that it's not worth it.
ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
value = data32[ix];
return value;
}
if (codePoint < highStart) {
// Supplemental code point, use two-level lookup.
ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
ix = index[ix];
ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
ix = index[ix];
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
value = data32[ix];
return value;
}
if (codePoint <= 0x10ffff) {
value = data32[highValueIndex];
return value;
}
}
// Fall through. The code point is outside of the legal range of 0..0x10ffff.
return errorValue;
}
/**
* Get a Trie2 value for a UTF-16 code unit.
*
* This function returns the same value as get() if the input
* character is outside of the lead surrogate range
*
* There are two values stored in a Trie2 for inputs in the lead
* surrogate range. This function returns the alternate value,
* while Trie2.get() returns the main value.
*
* @param codeUnit a 16 bit code unit or lead surrogate value.
* @return the value
*/
@Override
public int getFromU16SingleLead(char codeUnit){
int value;
int ix;
ix = index[codeUnit >> UTRIE2_SHIFT_2];
ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
value = data32[ix];
return value;
}
/**
* Serialize a Trie2_32 onto an OutputStream.
*
* A Trie2 can be serialized multiple times.
* The serialized data is compatible with ICU4C UTrie2 serialization.
* Trie2 serialization is unrelated to Java object serialization.
*
* @param os the stream to which the serialized Trie2 data will be written.
* @return the number of bytes written.
* @throw IOException on an error writing to the OutputStream.
*/
public int serialize(OutputStream os) throws IOException {
DataOutputStream dos = new DataOutputStream(os);
int bytesWritten = 0;
bytesWritten += serializeHeader(dos);
for (int i=0; i= limit) {
break;
}
if (cp < 0x0d800 || (cp > 0x0dbff && cp <= 0x0ffff)) {
// Ordinary BMP code point, excluding leading surrogates.
// BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
// 16 bit data is stored in the index array itself.
index2Block = 0;
block = index[cp >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT;
} else if (cp < 0xffff) {
// Lead Surrogate Code Point, 0xd800 <= cp < 0xdc00
index2Block = UTRIE2_LSCP_INDEX_2_OFFSET;
block = index[index2Block + ((cp - 0xd800) >> UTRIE2_SHIFT_2)] << UTRIE2_INDEX_SHIFT;
} else if (cp < highStart) {
// Supplemental code point, use two-level lookup.
int ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (cp >> UTRIE2_SHIFT_1);
index2Block = index[ix];
block = index[index2Block + ((cp >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK)] << UTRIE2_INDEX_SHIFT;
} else {
// Code point above highStart.
if (value == data32[highValueIndex]) {
cp = limit;
}
break;
}
if (index2Block == index2NullOffset) {
if (value != initialValue) {
break;
}
cp += UTRIE2_CP_PER_INDEX_1_ENTRY;
} else if (block == dataNullOffset) {
// The block at dataNullOffset has all values == initialValue.
// Because Trie2 iteration always proceeds in ascending order, we will always
// encounter a null block at its beginning, and can skip over
// a number of code points equal to the length of the block.
if (value != initialValue) {
break;
}
cp += UTRIE2_DATA_BLOCK_LENGTH;
} else {
// Current position refers to an ordinary data block.
// Walk over the data entries, checking the values.
int startIx = block + (cp & UTRIE2_DATA_MASK);
int limitIx = block + UTRIE2_DATA_BLOCK_LENGTH;
for (int ix = startIx; ix limit) {
cp = limit;
}
return cp - 1;
}
}