com.ibm.icu.util.BytesTrie Maven / Gradle / Ivy
Show all versions of icu4j Show documentation
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* created on: 2010nov23
* created by: Markus W. Scherer
* ported from ICU4C bytestrie.h/.cpp
*/
package com.ibm.icu.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.NoSuchElementException;
/**
* Light-weight, non-const reader class for a BytesTrie.
* Traverses a byte-serialized data structure with minimal state,
* for mapping byte sequences to non-negative integer values.
*
* This class is not intended for public subclassing.
*
* @stable ICU 4.8
* @author Markus W. Scherer
*/
public final class BytesTrie implements Cloneable, Iterable {
/**
 * Creates a BytesTrie reader over a serialized trie.
 *
 * <p>The array must hold a copy of a byte sequence produced by the BytesTrieBuilder,
 * and the offset must point at the first byte of that sequence.
 * The BytesTrie object will not read more bytes than
 * the BytesTrieBuilder generated in the corresponding build() call.
 *
 * <p>The array is shared, not copied/cloned; it must not be modified while
 * the BytesTrie object is in use.
 *
 * @param trieBytes Bytes array that contains the serialized trie.
 * @param offset Root offset of the trie in the array.
 * @stable ICU 4.8
 */
public BytesTrie(byte[] trieBytes, int offset) {
    bytes_=trieBytes;
    root_=offset;
    pos_=offset;  // Traversal starts at the root.
    remainingMatchLength_=-1;  // Negative: not inside a linear-match node.
}
/**
 * Creates a copy of this trie reader together with its current state.
 * The byte array is not duplicated; the copy shares it with this object.
 * @return A shallow clone of this trie.
 * @stable ICU 4.8
 */
@Override
public Object clone() throws CloneNotSupportedException {
    // The field-by-field copy made by Object.clone() is exactly the shallow copy we want.
    Object shallowCopy=super.clone();
    return shallowCopy;
}
/**
 * Puts this trie back into its initial state: positioned at the root,
 * and not inside a linear-match node.
 * @return this
 * @stable ICU 4.8
 */
public BytesTrie reset() {
    remainingMatchLength_=-1;
    pos_=root_;
    return this;
}
/**
 * Snapshot holder for a BytesTrie's state. A trie writes its state into
 * such an object via saveState() and can later be moved back to that state
 * via resetToState().
 * @stable ICU 4.8
 */
public static final class State {
    // Copies of the corresponding BytesTrie fields, filled in by saveState().
    private byte[] bytes;
    private int root;
    private int pos;
    private int remainingMatchLength;

    /**
     * Constructs an empty State.
     * @stable ICU 4.8
     */
    public State() {}
}
/**
 * Copies the current state of this trie into the given State object.
 * @param state The State object to hold the trie's state.
 * @return this
 * @see #resetToState
 * @stable ICU 4.8
 */
public BytesTrie saveState(State state) /*const*/ {
    // Identity of the trie (array + root), checked again by resetToState().
    state.bytes=bytes_;
    state.root=root_;
    // Traversal position.
    state.pos=pos_;
    state.remainingMatchLength=remainingMatchLength_;
    return this;
}
/**
 * Moves this trie back to a previously saved state.
 * @param state The State object which holds a saved trie state.
 * @return this
 * @throws IllegalArgumentException if the state object contains no state,
 * or the state of a different trie
 * @see #saveState
 * @see #reset
 * @stable ICU 4.8
 */
public BytesTrie resetToState(State state) {
    // The state is only usable if it was saved from a trie over the same array and root.
    boolean compatible=bytes_==state.bytes && bytes_!=null && root_==state.root;
    if(!compatible) {
        throw new IllegalArgumentException("incompatible trie state");
    }
    pos_=state.pos;
    remainingMatchLength_=state.remainingMatchLength;
    return this;
}
/**
 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods.
 * @stable ICU 4.8
 */
public enum Result {
    /**
     * The input unit(s) did not continue a matching string.
     * After current()/next() have returned NO_MATCH,
     * every further call to current()/next() returns NO_MATCH as well,
     * until the trie is reset to its original state or to a saved state.
     * @stable ICU 4.8
     */
    NO_MATCH,
    /**
     * The input unit(s) continued a matching string,
     * but the string so far has no value.
     * (It is a prefix of a longer string.)
     * @stable ICU 4.8
     */
    NO_VALUE,
    /**
     * The input unit(s) continued a matching string,
     * and the string so far has a value, which getValue() will return.
     * No further input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    FINAL_VALUE,
    /**
     * The input unit(s) continued a matching string,
     * and the string so far has a value, which getValue() will return.
     * Another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    INTERMEDIATE_VALUE;

    // Note: The declaration order of the constants (and therefore their ordinal() values)
    // is unchanged. The predicates below compare against the constants directly.

    /**
     * Same as (result!=NO_MATCH).
     * @return true if the input bytes/units so far are part of a matching string/byte sequence.
     * @stable ICU 4.8
     */
    public boolean matches() {
        return !(this==NO_MATCH);
    }

    /**
     * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE).
     * @return true if there is a value for the input bytes/units so far.
     * @see #getValue
     * @stable ICU 4.8
     */
    public boolean hasValue() {
        return this==FINAL_VALUE || this==INTERMEDIATE_VALUE;
    }

    /**
     * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE).
     * @return true if another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    public boolean hasNext() {
        return this==NO_VALUE || this==INTERMEDIATE_VALUE;
    }
}
/**
 * Reports, without consuming input, whether the byte sequence so far matches,
 * whether it has a value, and whether another input byte can continue
 * a matching byte sequence.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result current() /*const*/ {
    int index=pos_;
    if(index<0) {
        // A negative position means that matching has stopped.
        return Result.NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still inside a linear-match node: the trie byte is not inspected.
        return Result.NO_VALUE;
    }
    int lead=bytes_[index]&0xff;
    if(lead<kMinValueLead) {
        return Result.NO_VALUE;
    }
    // Value node: bit 0 of the lead byte selects the result.
    return valueResults_[lead&kValueIsFinal];
}
/**
 * Starts over at the root and traverses the trie for this one input byte.
 * Equivalent to reset().next(inByte).
 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
 * Values below -0x100 and above 0xff will never match.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result first(int inByte) {
    remainingMatchLength_=-1;
    // Shift negative values up by 0x100.
    int adjusted=inByte<0 ? inByte+0x100 : inByte;
    return nextImpl(root_, adjusted);
}
/**
 * Continues the traversal from the current state with this input byte.
 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
 * Values below -0x100 and above 0xff will never match.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result next(int inByte) {
    int index=pos_;
    if(index<0) {
        return Result.NO_MATCH;
    }
    // Shift negative values up by 0x100.
    int adjusted=inByte<0 ? inByte+0x100 : inByte;
    int remaining=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(remaining<0) {
        // Not inside a linear-match node: read a new node.
        return nextImpl(index, adjusted);
    }
    // Remaining part of a linear-match node.
    if(adjusted!=(bytes_[index]&0xff)) {
        stop();
        return Result.NO_MATCH;
    }
    ++index;
    --remaining;
    remainingMatchLength_=remaining;
    pos_=index;
    if(remaining>=0) {
        // More bytes of this linear-match node are pending.
        return Result.NO_VALUE;
    }
    // The linear-match node is finished; check whether a value node follows.
    int lead=bytes_[index]&0xff;
    if(lead>=kMinValueLead) {
        return valueResults_[lead&kValueIsFinal];
    }
    return Result.NO_VALUE;
}
/**
* Traverses the trie from the current state for this byte sequence.
* Equivalent to
* <pre>
* Result result=current();
* for(each c in s)
*   if(!result.hasNext()) return Result.NO_MATCH;
*   result=next(c);
* return result;
* </pre>
* @param s Contains a string or byte sequence.
* @param sIndex The start index of the byte sequence in s.
* @param sLimit The (exclusive) end index of the byte sequence in s.
* @return The match/value Result.
* @stable ICU 4.8
*/
public Result next(byte[] s, int sIndex, int sLimit) {
if(sIndex>=sLimit) {
// Empty input.
return current();
}
int pos=pos_;
if(pos<0) {
return Result.NO_MATCH;
}
int length=remainingMatchLength_; // Actual remaining match length minus 1.
for(;;) {
// Fetch the next input byte, if there is one.
// Continue a linear-match node.
byte inByte;
for(;;) {
if(sIndex==sLimit) {
remainingMatchLength_=length;
pos_=pos;
int node;
return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ?
valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
}
inByte=s[sIndex++];
if(length<0) {
remainingMatchLength_=length;
break;
}
if(inByte!=bytes_[pos]) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
}
for(;;) {
int node=bytes_[pos++]&0xff;
if(node=kMinValueLead);
return readValue(bytes_, pos, leadByte>>1);
}
/**
 * Checks whether every byte sequence reachable from the current state
 * maps to one and the same value, and returns that value if so.
 * @return The unique value in bits 32..1 with bit 0 set,
 * if all byte sequences reachable from the current state
 * map to the same value; otherwise returns 0.
 * @stable ICU 4.8
 */
public long getUniqueValue() /*const*/ {
    int index=pos_;
    if(index<0) {
        return 0;
    }
    // Skip the rest of a pending linear-match node.
    // (remainingMatchLength_ is -1 when there is none, which adds 0.)
    int searchStart=index+remainingMatchLength_+1;
    long found=findUniqueValue(bytes_, searchStart, 0);
    // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
    long signExtended=(found<<31)>>31;
    return signExtended;
}
/**
* Finds each byte which continues the byte sequence from the current state.
* That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
* @param out Each next byte is 0-extended to a char and appended to this object.
* (Only uses the out.append(c) method.)
* @return The number of bytes which continue the byte sequence from here.
* @stable ICU 4.8
*/
public int getNextBytes(Appendable out) /*const*/ {
int pos=pos_;
if(pos<0) {
return 0;
}
if(remainingMatchLength_>=0) {
append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node.
return 1;
}
int node=bytes_[pos++]&0xff;
if(node>=kMinValueLead) {
if((node&kValueIsFinal)!=0) {
return 0;
} else {
pos=skipValue(pos, node);
node=bytes_[pos++]&0xff;
assert(node {
// Sets up an iterator over trieBytes starting at offset.
// remainingMatchLength is the pending linear-match length minus 1 (negative if none);
// maxStringLength is the length limit, which is only enforced when it is >0.
private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) {
    bytes_=trieBytes;
    initialPos_=offset;
    pos_=offset;
    initialRemainingMatchLength_=remainingMatchLength;
    remainingMatchLength_=remainingMatchLength;
    maxLength_=maxStringLength;
    entry_=new Entry(maxStringLength!=0 ? maxStringLength : 32);
    if(remainingMatchLength<0) {
        // No pending linear-match node, nothing to pre-fill.
        return;
    }
    // Pending linear-match node, append remaining bytes to entry_.
    int pending=remainingMatchLength+1;
    if(maxStringLength>0 && pending>maxStringLength) {
        pending=maxStringLength;  // This will leave remainingMatchLength>=0 as a signal.
    }
    entry_.append(bytes_, pos_, pending);
    pos_+=pending;
    remainingMatchLength_-=pending;
}
/**
 * Rewinds this iterator to the state it had right after construction.
 * @return this
 * @stable ICU 4.8
 */
public Iterator reset() {
    int pending=initialRemainingMatchLength_+1;  // Remaining match length.
    if(maxLength_>0) {
        pending=Math.min(pending, maxLength_);
    }
    pos_=initialPos_;
    remainingMatchLength_=initialRemainingMatchLength_;
    // Keep only the bytes of the initially pending linear-match node.
    entry_.truncateString(pending);
    pos_+=pending;
    remainingMatchLength_-=pending;
    stack_.clear();
    return this;
}
/**
 * Tells whether next() can deliver another element: either the current
 * position is still valid, or there are saved branch edges on the stack.
 * @return true if there are more elements.
 * @stable ICU 4.8
 */
public boolean hasNext() /*const*/ {
    if(pos_>=0) {
        return true;
    }
    return !stack_.isEmpty();
}
/**
* Finds the next (byte sequence, value) pair if there is one.
*
* If the byte sequence is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @return An Entry with the string and value of the next element.
* @throws NoSuchElementException - iteration has no more elements.
* @stable ICU 4.8
*/
public Entry next() {
int pos=pos_;
if(pos<0) {
if(stack_.isEmpty()) {
throw new NoSuchElementException();
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
long top=stack_.remove(stack_.size()-1);
int length=(int)top;
pos=(int)(top>>32);
entry_.truncateString(length&0xffff);
length>>>=16;
if(length>1) {
pos=branchNext(pos, length);
if(pos<0) {
return entry_; // Reached a final value.
}
} else {
entry_.append(bytes_[pos++]);
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining bytes.
return truncateAndStop();
}
for(;;) {
int node=bytes_[pos++]&0xff;
if(node>=kMinValueLead) {
// Deliver value for the byte sequence so far.
boolean isFinal=(node&kValueIsFinal)!=0;
entry_.value=readValue(bytes_, pos, node>>1);
if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) {
pos_=-1;
} else {
pos_=skipValue(pos, node);
}
return entry_;
}
if(maxLength_>0 && entry_.length==maxLength_) {
return truncateAndStop();
}
if(node0 && entry_.length+length>maxLength_) {
entry_.append(bytes_, pos, maxLength_-entry_.length);
return truncateAndStop();
}
entry_.append(bytes_, pos, length);
pos+=length;
}
}
}
/**
 * Not supported by this iterator; calling it always fails.
 * @throws UnsupportedOperationException (always)
 * @stable ICU 4.8
 */
public void remove() {
    throw new UnsupportedOperationException();
}
// Ends the current path and hands back the entry with the placeholder value -1.
private Entry truncateAndStop() {
    entry_.value=-1;  // no real value for str
    pos_=-1;
    return entry_;
}
// Descends into a branch node with the given number of remaining entries, starting at pos.
// Pushes the state for each edge that still has to be visited onto stack_,
// appends the first key byte to entry_, and returns the position to continue from,
// or -1 if that key byte leads to a final value (which is then stored in entry_.value).
private int branchNext(int pos, int length) {
    while(length>kMaxBranchLinearSubNodeLength) {
        int deltaPos=pos+1;  // ignore the comparison byte
        // Push state for the greater-or-equal edge.
        long resumePos=skipDelta(bytes_, deltaPos);
        int upperHalf=length-(length>>1);
        stack_.add((resumePos<<32)|(upperHalf<<16)|entry_.length);
        // Follow the less-than edge.
        length>>=1;
        pos=jumpByDelta(bytes_, deltaPos);
    }
    // List of key-value pairs where values are either final values or jump deltas.
    // Read the first (key, value) pair.
    byte key=bytes_[pos];
    int lead=bytes_[pos+1]&0xff;
    int valuePos=pos+2;
    int value=readValue(bytes_, valuePos, lead>>1);
    int afterValue=skipValue(valuePos, lead);
    // Remember where the remaining pairs of this list start.
    stack_.add(((long)afterValue<<32)|((length-1)<<16)|entry_.length);
    entry_.append(key);
    if((lead&kValueIsFinal)==0) {
        // Non-final: the value is a jump delta relative to the position after it.
        return afterValue+value;
    }
    pos_=-1;
    entry_.value=value;
    return -1;
}
// The trie bytes that were passed to the constructor.
private byte[] bytes_;
// Index of the next trie byte to read; negative once the current path has ended
// (hasNext() then only looks at the stack).
private int pos_;
// Start position, restored by reset().
private int initialPos_;
// Remaining length of a pending linear-match node, minus 1. Negative if there is none.
private int remainingMatchLength_;
// Start value of remainingMatchLength_, restored by reset().
private int initialRemainingMatchLength_;
// Length limit for delivered byte sequences; the limit checks only apply when this is >0.
private int maxLength_;
// The one Entry object that is filled in and returned for each element.
private Entry entry_;
// The stack stores longs for backtracking to another
// outbound edge of a branch node.
// Each long has the offset from bytes_ in bits 62..32,
// the entry_.stringLength() from before the node in bits 15..0,
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
// but the code looks more confusing that way.)
private ArrayList stack_=new ArrayList();
}
// Marks the traversal as stopped: a negative position means "no more matches".
private void stop() {
    pos_=-1;
}
// Reads a compact 32-bit integer.
// pos is already after the leadByte, and the lead byte is already shifted right by 1.
private static int readValue(byte[] bytes, int pos, int leadByte) {
int value;
if(leadByte=kMinValueLead);
if(leadByte>=(kMinTwoByteValueLead<<1)) {
if(leadByte<(kMinThreeByteValueLead<<1)) {
++pos;
} else if(leadByte<(kFourByteValueLead<<1)) {
pos+=2;
} else {
pos+=3+((leadByte>>1)&1);
}
}
return pos;
}
// Skips a whole value whose lead byte is at bytes[pos]:
// reads the lead byte and delegates to skipValue(pos after the lead byte, lead byte).
private static int skipValue(byte[] bytes, int pos) {
    return skipValue(pos+1, bytes[pos]&0xff);
}
// Reads a jump delta and jumps.
private static int jumpByDelta(byte[] bytes, int pos) {
int delta=bytes[pos++]&0xff;
if(delta=kMinTwoByteDeltaLead) {
if(deltakMaxBranchLinearSubNodeLength) {
if(inByte<(bytes_[pos++]&0xff)) {
length>>=1;
pos=jumpByDelta(bytes_, pos);
} else {
length=length-(length>>1);
pos=skipDelta(bytes_, pos);
}
}
// Drop down to linear search for the last few bytes.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(inByte==(bytes_[pos++]&0xff)) {
Result result;
int node=bytes_[pos]&0xff;
assert(node>=kMinValueLead);
if((node&kValueIsFinal)!=0) {
// Leave the final value for getValue() to read.
result=Result.FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// int delta=readValue(pos, node>>1);
node>>=1;
int delta;
if(node=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
}
pos_=pos;
return result;
}
--length;
pos=skipValue(bytes_, pos);
} while(length>1);
if(inByte==(bytes_[pos++]&0xff)) {
pos_=pos;
int node=bytes_[pos]&0xff;
return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
} else {
stop();
return Result.NO_MATCH;
}
}
// Requires remainingLength_<0.
private Result nextImpl(int pos, int inByte) {
for(;;) {
int node=bytes_[pos++]&0xff;
if(node=kMinValueLead) ?
valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
} else {
// No match.
break;
}
} else if((node&kValueIsFinal)!=0) {
// No further matching bytes.
break;
} else {
// Skip intermediate value.
pos=skipValue(pos, node);
// The next node must not also be a value node.
assert((bytes_[pos]&0xff)kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue);
if(uniqueValue==0) {
return 0;
}
length=length-(length>>1);
pos=skipDelta(bytes, pos);
}
do {
++pos; // ignore a comparison byte
// handle its value
int node=bytes[pos++]&0xff;
boolean isFinal=(node&kValueIsFinal)!=0;
int value=readValue(bytes, pos, node>>1);
pos=skipValue(pos, node);
if(isFinal) {
if(uniqueValue!=0) {
if(value!=(int)(uniqueValue>>1)) {
return 0;
}
} else {
uniqueValue=((long)value<<1)|1;
}
} else {
uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue);
if(uniqueValue==0) {
return 0;
}
}
} while(--length>1);
// ignore the last comparison byte
return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
}
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead byte.
// uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
// Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) {
for(;;) {
int node=bytes[pos++]&0xff;
if(node>>33);
} else if(node>1);
if(uniqueValue!=0) {
if(value!=(int)(uniqueValue>>1)) {
return 0;
}
} else {
uniqueValue=((long)value<<1)|1;
}
if(isFinal) {
return uniqueValue;
}
pos=skipValue(pos, node);
}
}
}
// Helper functions for getNextBytes().
// getNextBytes() when pos is on a branch node:
// appends every key byte of the branch with the given length to out.
private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) {
    while(length>kMaxBranchLinearSubNodeLength) {
        int deltaPos=pos+1;  // ignore the comparison byte
        int lowerHalf=length>>1;
        // Recurse into the less-than half, then continue with the other half here.
        getNextBranchBytes(bytes, jumpByDelta(bytes, deltaPos), lowerHalf, out);
        length-=lowerHalf;
        pos=skipDelta(bytes, deltaPos);
    }
    // Linear list: each key byte except the last one is followed by a value to skip.
    int remaining=length;
    do {
        append(out, bytes[pos]&0xff);
        pos=skipValue(bytes, pos+1);
        --remaining;
    } while(remaining>1);
    append(out, bytes[pos]&0xff);
}
// Appends c, cast to a char, to out.
// An IOException from the Appendable is rethrown wrapped in an ICUUncheckedIOException.
private static void append(Appendable out, int c) {
    char unit=(char)c;
    try {
        out.append(unit);
    } catch(IOException ioe) {
        throw new ICUUncheckedIOException(ioe);
    }
}
// BytesTrie data structure
//
// The trie consists of a series of byte-serialized nodes for incremental
// string/byte sequence matching. The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead byte ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further bytes.
//
// Node types:
// - Value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/byte sequence so far.
// One node bit indicates whether the value is final or whether
// matching continues with the next node.
// - Linear-match node: Matches a number of bytes.
// - Branch node: Branches to other nodes according to the current input byte.
// The node byte is the length of the branch (number of bytes to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison byte.
// If one of the key bytes matches, then the value is either a final value for
// the string/byte sequence so far, or a "jump" delta to the next node.
// If the last byte matches, then matching continues with the next node.
// (Values have the same encoding as value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one byte and one "jump" delta.
// If the input byte is less than the sub-node byte, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Node lead byte values.
// The trailing hex comments below give the values that the constant expressions evaluate to.
// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next byte.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
/*package*/ static final int kMaxBranchLinearSubNodeLength=5;
// 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
/*package*/ static final int kMinLinearMatch=0x10;
/*package*/ static final int kMaxLinearMatchLength=0x10;
// 20..ff: Variable-length value node.
// If odd, the value is final. (Otherwise, intermediate value or jump delta.)
// Then shift-right by 1 bit.
// The remaining lead byte value indicates the number of following bytes (0..4)
// and contains the value's top bits.
/*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
// It is a final value if bit 0 is set.
private static final int kValueIsFinal=1;
// Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
// (skipValue() compares the unshifted lead byte against these thresholds shifted left by 1.)
/*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10
/*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
// Shifted lead values from here on are followed by one more byte.
/*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
/*package*/ static final int kMaxTwoByteValue=0x1aff;
// Shifted lead values from here on are followed by two more bytes.
/*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
// This shifted lead value is followed by three more bytes.
/*package*/ static final int kFourByteValueLead=0x7e;
// A little more than Unicode code points. (0x11ffff)
/*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
// This shifted lead value is followed by four more bytes.
/*package*/ static final int kFiveByteValueLead=0x7f;
// Compact delta integers.
// NOTE(review): presumably these lead-byte thresholds select the encoded length of a
// jump delta in the same way as the value thresholds above — confirm against jumpByDelta()/skipDelta().
/*package*/ static final int kMaxOneByteDelta=0xbf;
/*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
/*package*/ static final int kMinThreeByteDeltaLead=0xf0;
/*package*/ static final int kFourByteDeltaLead=0xfe;
/*package*/ static final int kFiveByteDeltaLead=0xff;
/*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
/*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
// Fixed value referencing the BytesTrie bytes.
// Set once by the constructor; shared with clones and compared by resetToState().
private byte[] bytes_;
// Offset of the trie's root node in bytes_; reset() returns pos_ to this value.
private int root_;
// Iterator variables.
// (This is the traversal state that saveState()/resetToState() copy.)
// Index of next trie byte to read. Negative if no more matches.
private int pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
private int remainingMatchLength_;
};