// com.ibm.icu.util.CharsTrie
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2011-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* created on: 2011jan06
* created by: Markus W. Scherer
* ported from ICU4C ucharstrie.h/.cpp
*/
package com.ibm.icu.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.BytesTrie.Result;
/**
* Light-weight, non-const reader class for a CharsTrie.
* Traverses a char-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*
* This class is not intended for public subclassing.
*
* @stable ICU 4.8
* @author Markus W. Scherer
*/
public final class CharsTrie implements Cloneable, Iterable {
/**
 * Creates a CharsTrie reader positioned at the root of the serialized trie.
 *
 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
 * with the offset indicating the first char of that sequence.
 * The CharsTrie object will not read more chars than
 * the CharsTrieBuilder generated in the corresponding build() call.
 *
 * <p>The CharSequence is not copied/cloned and must not be modified while
 * the CharsTrie object is in use.
 *
 * @param trieChars CharSequence that contains the serialized trie.
 * @param offset Root offset of the trie in the CharSequence.
 * @stable ICU 4.8
 */
public CharsTrie(CharSequence trieChars, int offset) {
    this.chars_=trieChars;
    // Traversal starts at the root, outside of any linear-match node.
    this.root_=offset;
    this.pos_=offset;
    this.remainingMatchLength_=-1;
}
/**
 * Creates a copy of this trie reader including its traversal state.
 * The underlying char sequence is shared, not duplicated.
 * @return A shallow clone of this trie.
 * @stable ICU 4.8
 */
@Override
public Object clone() throws CloneNotSupportedException {
    // Object.clone() copies the fields as-is, which is exactly the shallow copy wanted here.
    Object shallowCopy=super.clone();
    return shallowCopy;
}
/**
 * Returns this trie to the state it had right after construction:
 * positioned at the root and not inside a linear-match node.
 * @return this
 * @stable ICU 4.8
 */
public CharsTrie reset() {
    remainingMatchLength_=-1;
    pos_=root_;
    return this;
}
/**
 * CharsTrie state object: a snapshot filled in by saveState()
 * and used by resetToState() to return the trie to that point later.
 * @stable ICU 4.8
 */
public static final class State {
    // Trie data and root offset of the trie the snapshot came from;
    // resetToState() compares them against the trie's own fields.
    private CharSequence chars;
    private int root;
    // Saved traversal position and pending linear-match length.
    private int pos;
    private int remainingMatchLength;

    /**
     * Constructs an empty State.
     * @stable ICU 4.8
     */
    public State() {}
}
/**
 * Copies this trie's data reference, root offset, position and pending
 * linear-match length into the given State object.
 * @param state The State object to hold the trie's state.
 * @return this
 * @see #resetToState
 * @stable ICU 4.8
 */
public CharsTrie saveState(State state) /*const*/ {
    // Identity of the trie, checked again by resetToState().
    state.chars=this.chars_;
    state.root=this.root_;
    // Traversal position.
    state.pos=this.pos_;
    state.remainingMatchLength=this.remainingMatchLength_;
    return this;
}
/**
 * Restores the traversal position recorded in a saved state.
 * @param state The State object which holds a saved trie state.
 * @return this
 * @throws IllegalArgumentException if the state object contains no state,
 *         or the state of a different trie
 * @see #saveState
 * @see #reset
 * @stable ICU 4.8
 */
public CharsTrie resetToState(State state) {
    // The snapshot must refer to the very same, non-null char sequence and the same root.
    if(chars_!=state.chars || chars_==null || root_!=state.root) {
        throw new IllegalArgumentException("incompatible trie state");
    }
    pos_=state.pos;
    remainingMatchLength_=state.remainingMatchLength;
    return this;
}
/**
 * Reports the match status at the current position without consuming input:
 * whether the string so far matches, whether it has a value,
 * and whether another input char can continue a matching string.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result current() /*const*/ {
    final int index=pos_;
    if(index<0) {
        // Traversal was stopped earlier.
        return Result.NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still inside a linear-match node: no value can be read here.
        return Result.NO_VALUE;
    }
    final int lead=chars_.charAt(index);
    if(lead<kMinValueLead) {
        return Result.NO_VALUE;
    }
    // Bit 15 of the lead unit selects between intermediate and final value.
    return valueResults_[lead>>15];
}
/**
 * Starts a fresh traversal at the root and consumes one input char.
 * Equivalent to reset().next(inUnit).
 * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result first(int inUnit) {
    // Leave any pending linear-match node before matching from the root.
    remainingMatchLength_=-1;
    final Result outcome=nextImpl(root_, inUnit);
    return outcome;
}
/**
 * Starts a fresh traversal at the root and consumes the
 * one or two UTF-16 code units for this input code point.
 * Equivalent to reset().nextForCodePoint(cp).
 * @param cp A Unicode code point 0..0x10ffff.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result firstForCodePoint(int cp) {
    if(cp<=0xffff) {
        // Single code unit.
        return first(cp);
    }
    // Supplementary code point: the lead surrogate must allow a continuation
    // before the trail surrogate is tried.
    if(!first(UTF16.getLeadSurrogate(cp)).hasNext()) {
        return Result.NO_MATCH;
    }
    return next(UTF16.getTrailSurrogate(cp));
}
/**
 * Consumes one input char, continuing from the current state.
 * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result next(int inUnit) {
    final int start=pos_;
    if(start<0) {
        return Result.NO_MATCH;
    }
    final int pending=remainingMatchLength_; // Actual remaining match length minus 1.
    if(pending<0) {
        // Not inside a linear-match node: evaluate the node at the current position.
        return nextImpl(start, inUnit);
    }
    // Remaining part of a linear-match node.
    if(inUnit!=chars_.charAt(start)) {
        stop();
        return Result.NO_MATCH;
    }
    final int after=start+1;
    final int left=pending-1;
    remainingMatchLength_=left;
    pos_=after;
    if(left>=0) {
        // More units of the linear-match node are still pending.
        return Result.NO_VALUE;
    }
    final int lead=chars_.charAt(after);
    return lead>=kMinValueLead ? valueResults_[lead>>15] : Result.NO_VALUE;
}
/**
 * Consumes the one or two UTF-16 code units for this input code point,
 * continuing from the current state.
 * @param cp A Unicode code point 0..0x10ffff.
 * @return The match/value Result.
 * @stable ICU 4.8
 */
public Result nextForCodePoint(int cp) {
    if(cp<=0xffff) {
        // Single code unit.
        return next(cp);
    }
    // Supplementary code point: only try the trail surrogate
    // if the lead surrogate left room for a continuation.
    if(!next(UTF16.getLeadSurrogate(cp)).hasNext()) {
        return Result.NO_MATCH;
    }
    return next(UTF16.getTrailSurrogate(cp));
}
/**
* Traverses the trie from the current state for this string.
* Equivalent to
*
* Result result=current();
* for(each c in s)
* if(!result.hasNext()) return Result.NO_MATCH;
* result=next(c);
* return result;
*
* @param s Contains a string.
* @param sIndex The start index of the string in s.
* @param sLimit The (exclusive) end index of the string in s.
* @return The match/value Result.
* @stable ICU 4.8
*/
public Result next(CharSequence s, int sIndex, int sLimit) {
if(sIndex>=sLimit) {
// Empty input.
return current();
}
int pos=pos_;
if(pos<0) {
return Result.NO_MATCH;
}
int length=remainingMatchLength_; // Actual remaining match length minus 1.
for(;;) {
// Fetch the next input unit, if there is one.
// Continue a linear-match node.
char inUnit;
for(;;) {
if(sIndex==sLimit) {
remainingMatchLength_=length;
pos_=pos;
int node;
return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
valueResults_[node>>15] : Result.NO_VALUE;
}
inUnit=s.charAt(sIndex++);
if(length<0) {
remainingMatchLength_=length;
break;
}
if(inUnit!=chars_.charAt(pos)) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
}
int node=chars_.charAt(pos++);
for(;;) {
if(node=kMinValueLead);
return (leadUnit&kValueIsFinal)!=0 ?
readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
}
/**
 * Determines whether all strings reachable from the current state
 * map to the same value, and if so, returns that value.
 * @return The unique value in bits 32..1 with bit 0 set,
 *         if all strings reachable from the current state
 *         map to the same value; otherwise returns 0.
 * @stable ICU 4.8
 */
public long getUniqueValue() /*const*/ {
    final int index=pos_;
    if(index<0) {
        return 0;
    }
    // Skip the rest of a pending linear-match node.
    final int nodeStart=index+remainingMatchLength_+1;
    final long packed=findUniqueValue(chars_, nodeStart, 0);
    // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
    return (packed<<31)>>31;
}
/**
* Finds each char which continues the string from the current state.
* That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
* @param out Each next char is appended to this object.
* (Only uses the out.append(c) method.)
* @return The number of chars which continue the string from here.
* @stable ICU 4.8
*/
public int getNextChars(Appendable out) /*const*/ {
int pos=pos_;
if(pos<0) {
return 0;
}
if(remainingMatchLength_>=0) {
append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node.
return 1;
}
int node=chars_.charAt(pos++);
if(node>=kMinValueLead) {
if((node&kValueIsFinal)!=0) {
return 0;
} else {
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
if(node {
// Sets up an iterator over the trie data starting at offset.
// remainingMatchLength is the pending linear-match length minus 1 (negative if none);
// maxStringLength caps the length of the delivered strings when it is greater than 0.
private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
    chars_=trieChars;
    initialPos_=offset;
    pos_=offset;
    initialRemainingMatchLength_=remainingMatchLength;
    remainingMatchLength_=remainingMatchLength;
    maxLength_=maxStringLength;
    if(remainingMatchLength<0) {
        // Not inside a linear-match node: nothing to pre-load into str_.
        return;
    }
    // Pending linear-match node, append remaining units to str_.
    int pendingUnits=remainingMatchLength+1;
    if(maxStringLength>0 && pendingUnits>maxStringLength) {
        pendingUnits=maxStringLength; // This will leave remainingMatchLength_>=0 as a signal.
    }
    str_.append(trieChars, offset, offset+pendingUnits);
    pos_=offset+pendingUnits;
    remainingMatchLength_=remainingMatchLength-pendingUnits;
}
/**
 * Rewinds this iterator to the position and string prefix it started with.
 * @return this
 * @stable ICU 4.8
 */
public Iterator reset() {
    final int startPos=initialPos_;
    final int startRemaining=initialRemainingMatchLength_;
    // Units of a pending linear-match node that belong to the initial string prefix,
    // limited to the maximum string length if one is set.
    int prefixLength=startRemaining+1;
    if(maxLength_>0 && prefixLength>maxLength_) {
        prefixLength=maxLength_;
    }
    skipValue_=false;
    // str_ still begins with the initial prefix, so truncating restores it.
    str_.setLength(prefixLength);
    pos_=startPos+prefixLength;
    remainingMatchLength_=startRemaining-prefixLength;
    stack_.clear();
    return this;
}
/**
 * Tells whether next() can deliver another element: either the current
 * position is still valid or there are saved branch edges to return to.
 * @return true if there are more elements.
 * @stable ICU 4.8
 */
@Override
public boolean hasNext() /*const*/ {
    if(pos_>=0) {
        return true;
    }
    return !stack_.isEmpty();
}
/**
* Finds the next (string, value) pair if there is one.
*
* If the string is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @return An Entry with the string and value of the next element.
* @throws NoSuchElementException - iteration has no more elements.
* @stable ICU 4.8
*/
@Override
public Entry next() {
int pos=pos_;
if(pos<0) {
if(stack_.isEmpty()) {
throw new NoSuchElementException();
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
long top=stack_.remove(stack_.size()-1);
int length=(int)top;
pos=(int)(top>>32);
str_.setLength(length&0xffff);
length>>>=16;
if(length>1) {
pos=branchNext(pos, length);
if(pos<0) {
return entry_; // Reached a final value.
}
} else {
str_.append(chars_.charAt(pos++));
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining units.
return truncateAndStop();
}
for(;;) {
int node=chars_.charAt(pos++);
if(node>=kMinValueLead) {
if(skipValue_) {
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
skipValue_=false;
} else {
// Deliver value for the string so far.
boolean isFinal=(node&kValueIsFinal)!=0;
if(isFinal) {
entry_.value=readValue(chars_, pos, node&0x7fff);
} else {
entry_.value=readNodeValue(chars_, pos, node);
}
if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
pos_=-1;
} else {
// We cannot skip the value right here because it shares its
// lead unit with a match node which we have to evaluate
// next time.
// Instead, keep pos_ on the node lead unit itself.
pos_=pos-1;
skipValue_=true;
}
entry_.chars=str_;
return entry_;
}
}
if(maxLength_>0 && str_.length()==maxLength_) {
return truncateAndStop();
}
if(node0 && str_.length()+length>maxLength_) {
str_.append(chars_, pos, pos+maxLength_-str_.length());
return truncateAndStop();
}
str_.append(chars_, pos, pos+length);
pos+=length;
}
}
}
/**
 * Iterator.remove() is not supported; this iterator only reads the trie data.
 * @throws UnsupportedOperationException (always)
 * @stable ICU 4.8
 */
@Override
public void remove() {
throw new UnsupportedOperationException();
}
// Delivers the string collected so far with the placeholder value -1
// and marks the current position as finished.
private Entry truncateAndStop() {
    entry_.value=-1; // no real value for str
    // We reset entry_.chars every time we return entry_
    // just because the caller might have modified the Entry.
    entry_.chars=str_;
    pos_=-1;
    return entry_;
}
// Descends into the first edge of a branch (sub-)node and pushes the state
// needed to come back for its remaining edges.
// Returns the position of the next node to read, or -1 if the edge carries
// a final value (then entry_ has been filled in and pos_ set to -1).
private int branchNext(int pos, int length) {
    // str_ is only appended to after all pushes, so its length is constant until then.
    final int prefixLength=str_.length();
    while(length>kMaxBranchLinearSubNodeLength) {
        final int deltaPos=pos+1; // ignore the comparison unit
        // Push state for the greater-or-equal edge.
        final long greaterOrEqualEdge=
            ((long)skipDelta(chars_, deltaPos)<<32)|((length-(length>>1))<<16)|prefixLength;
        stack_.add(greaterOrEqualEdge);
        // Follow the less-than edge.
        length>>=1;
        pos=jumpByDelta(chars_, deltaPos);
    }
    // List of key-value pairs where values are either final values or jump deltas.
    // Read the first (key, value) pair.
    final char key=chars_.charAt(pos);
    int lead=chars_.charAt(pos+1);
    pos+=2;
    final boolean isFinal=(lead&kValueIsFinal)!=0;
    lead&=0x7fff;
    final int value=readValue(chars_, pos, lead);
    pos=skipValue(pos, lead);
    // Remember where the remaining pairs of this list start.
    stack_.add(((long)pos<<32)|((length-1)<<16)|prefixLength);
    str_.append(key);
    if(!isFinal) {
        // The value is a jump delta to the next node.
        return pos+value;
    }
    pos_=-1;
    entry_.chars=str_;
    entry_.value=value;
    return -1;
}
// Trie data being iterated over; shared with the trie, not copied.
private CharSequence chars_;
// Current read position in chars_; negative when the next element must come from stack_.
private int pos_;
// Start position, kept for reset().
private int initialPos_;
// Pending linear-match length minus 1; negative if not inside a linear-match node.
private int remainingMatchLength_;
// Start value of remainingMatchLength_, kept for reset().
private int initialRemainingMatchLength_;
private boolean skipValue_; // Skip intermediate value which was already delivered.
// String for the current element; entries returned by the iterator refer to this builder.
private StringBuilder str_=new StringBuilder();
// Maximum length of delivered strings; only applied when greater than 0.
private int maxLength_;
// Single Entry instance that is filled in and returned for every element.
private Entry entry_=new Entry();
// The stack stores longs for backtracking to another
// outbound edge of a branch node.
// Each long has the offset in chars_ in bits 62..32,
// the str_.length() from before the node in bits 15..0,
// and the remaining branch length in bits 31..16.
// (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
// but the code looks more confusing that way.)
// The element type must be Long: the popped element is assigned directly to a long,
// which does not compile with a raw ArrayList.
private ArrayList<Long> stack_=new ArrayList<Long>();
}
// Marks the traversal as failed: the public traversal methods return
// Result.NO_MATCH (getUniqueValue() returns 0) while pos_ is negative,
// until the trie is reset.
private void stop() {
pos_=-1;
}
// Reads a compact 32-bit integer.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
private static int readValue(CharSequence chars, int pos, int leadUnit) {
int value;
if(leadUnit=kMinTwoUnitValueLead) {
if(leadUnit>6)-1;
} else if(leadUnit=kMinTwoUnitNodeValueLead) {
if(leadUnit=kMinTwoUnitDeltaLead) {
if(delta==kThreeUnitDeltaLead) {
delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
pos+=2;
} else {
delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
}
}
return pos+delta;
}
// Returns the position just behind the compact delta that starts at pos.
// A delta occupies one, two or three units depending on its lead unit.
private static int skipDelta(CharSequence chars, int pos) {
    final int lead=chars.charAt(pos);
    if(lead<kMinTwoUnitDeltaLead) {
        // The lead unit is the whole delta.
        return pos+1;
    }
    // Lead unit plus two more units, or lead unit plus one more unit.
    return lead==kThreeUnitDeltaLead ? pos+3 : pos+2;
}
// Lookup table indexed with (node>>15) of a 16-bit node lead unit:
// index 0 (bit 15 clear) is an intermediate value, index 1 (bit 15 set, kValueIsFinal) a final value.
private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
// Handles a branch node for both next(unit) and next(string).
private Result branchNext(int pos, int length, int inUnit) {
// Branch according to the current unit.
if(length==0) {
length=chars_.charAt(pos++);
}
++length;
// The length of the branch is the number of units to select from.
// The data structure encodes a binary search.
while(length>kMaxBranchLinearSubNodeLength) {
if(inUnit>=1;
pos=jumpByDelta(chars_, pos);
} else {
length=length-(length>>1);
pos=skipDelta(chars_, pos);
}
}
// Drop down to linear search for the last few units.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(inUnit==chars_.charAt(pos++)) {
Result result;
int node=chars_.charAt(pos);
if((node&kValueIsFinal)!=0) {
// Leave the final value for getValue() to read.
result=Result.FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// int delta=readValue(pos, node);
int delta;
if(node=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
}
pos_=pos;
return result;
}
--length;
pos=skipValue(chars_, pos);
} while(length>1);
if(inUnit==chars_.charAt(pos++)) {
pos_=pos;
int node=chars_.charAt(pos);
return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
} else {
stop();
return Result.NO_MATCH;
}
}
// Requires remainingLength_<0.
private Result nextImpl(int pos, int inUnit) {
int node=chars_.charAt(pos++);
for(;;) {
if(node=kMinValueLead) ?
valueResults_[node>>15] : Result.NO_VALUE;
} else {
// No match.
break;
}
} else if((node&kValueIsFinal)!=0) {
// No further matching units.
break;
} else {
// Skip intermediate value.
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
stop();
return Result.NO_MATCH;
}
// Helper functions for getUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
// uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
// On return, if not 0, then bits 63..33 contain the updated non-negative pos.
private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
                                              long uniqueValue) {
    // Binary-search part of the branch: check the less-than half recursively,
    // then continue with the greater-or-equal half.
    while(length>kMaxBranchLinearSubNodeLength) {
        final int deltaPos=pos+1; // ignore the comparison unit
        final int lowerHalf=length>>1;
        uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, deltaPos), lowerHalf, uniqueValue);
        if(uniqueValue==0) {
            return 0;
        }
        length-=lowerHalf;
        pos=skipDelta(chars, deltaPos);
    }
    // Linear list of (key, value) pairs; the last key has no value of its own.
    do {
        // Ignore a comparison unit and read the lead unit of its value.
        final int rawLead=chars.charAt(pos+1);
        pos+=2;
        final int lead=rawLead&0x7fff;
        final int value=readValue(chars, pos, lead);
        pos=skipValue(pos, lead);
        if((rawLead&kValueIsFinal)==0) {
            // The value is a jump delta: examine the node it leads to.
            uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
            if(uniqueValue==0) {
                return 0;
            }
        } else if(uniqueValue==0) {
            // First value seen: record it with the "have a value" bit 0 set.
            uniqueValue=((long)value<<1)|1;
        } else if(value!=(int)(uniqueValue>>1)) {
            // A second, different value: not unique.
            return 0;
        }
    } while(--length>1);
    // ignore the last comparison unit
    return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
}
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead unit.
// uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
// Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
int node=chars.charAt(pos++);
for(;;) {
if(node>>33);
node=chars.charAt(pos++);
} else if(node>1)) {
return 0;
}
} else {
uniqueValue=((long)value<<1)|1;
}
if(isFinal) {
return uniqueValue;
}
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
}
// Helper functions for getNextChars().
// getNextChars() when pos is on a branch node:
// appends every comparison unit of the branch to out.
private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
    // Binary-search part: collect the less-than half recursively,
    // then continue with the greater-or-equal half.
    while(length>kMaxBranchLinearSubNodeLength) {
        final int deltaPos=pos+1; // ignore the comparison unit
        final int lowerHalf=length>>1;
        getNextBranchChars(chars, jumpByDelta(chars, deltaPos), lowerHalf, out);
        length-=lowerHalf;
        pos=skipDelta(chars, deltaPos);
    }
    // Linear part: each key except the last is followed by a value that is skipped.
    do {
        append(out, chars.charAt(pos));
        pos=skipValue(chars, pos+1);
    } while(--length>1);
    append(out, chars.charAt(pos));
}
// Appends c, narrowed to a char, to out.
// An IOException from the Appendable is rethrown as an unchecked ICUUncheckedIOException.
private static void append(Appendable out, int c) {
    final char unit=(char)c;
    try {
        out.append(unit);
    } catch(IOException ioe) {
        throw new ICUUncheckedIOException(ioe);
    }
}
// CharsTrie data structure
//
// The trie consists of a series of char-serialized nodes for incremental
// Unicode string/char sequence matching. (char=16-bit unsigned integer)
// The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead unit ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further units.
//
// Node types:
// - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/char sequence so far.
// - Match node, optionally with an intermediate value in a different compact format.
// The value, if present, is for the string/char sequence so far.
//
// Aside from the value, which uses the node lead unit's high bits:
//
// - Linear-match node: Matches a number of units.
// - Branch node: Branches to other nodes according to the current input unit.
// The node unit is the length of the branch (number of units to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison unit.
// If one of the key units matches, then the value is either a final value for
// the string so far, or a "jump" delta to the next node.
// If the last unit matches, then matching continues with the next node.
// (Values have the same encoding as final-value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one unit and one "jump" delta.
// If the input unit is less than the sub-node unit, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Match-node lead unit values, after masking off intermediate-value bits:
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next unit.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
// Branch (sub-)nodes longer than this are traversed by binary search,
// shorter ones by linear search over their (key, value) pairs.
/*package*/ static final int kMaxBranchLinearSubNodeLength=5;
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
/*package*/ static final int kMinLinearMatch=0x30;
// 0x10 = 16 units.
/*package*/ static final int kMaxLinearMatchLength=0x10;
// Match-node lead unit bits 14..6 for the optional intermediate value.
// If these bits are 0, then there is no intermediate value.
// Otherwise, see the *NodeValue* constants below.
// Lead units at or above kMinValueLead carry a value;
// kNodeTypeMask keeps only the low 6 node-type bits of a lead unit.
/*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
/*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f
// A final-value node has bit 15 set.
/*package*/ static final int kValueIsFinal=0x8000;
// Compact value: After testing and masking off bit 15, use the following thresholds.
/*package*/ static final int kMaxOneUnitValue=0x3fff;
/*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
/*package*/ static final int kThreeUnitValueLead=0x7fff;
/*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
// Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
/*package*/ static final int kMaxOneUnitNodeValue=0xff;
/*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
/*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
/*package*/ static final int kMaxTwoUnitNodeValue=
((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
// Compact delta integers.
// A lead unit below kMinTwoUnitDeltaLead is a complete one-unit delta;
// kThreeUnitDeltaLead is followed by two more units, any other lead unit by one more.
/*package*/ static final int kMaxOneUnitDelta=0xfbff;
/*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
/*package*/ static final int kThreeUnitDeltaLead=0xffff;
/*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
// Fixed value referencing the CharsTrie words.
// The sequence is shared with the caller and with clones, not copied.
private CharSequence chars_;
// Offset of the trie's root node in chars_; reset() returns pos_ to it.
private int root_;
// Iterator variables.
// Index in chars_ of the next trie unit to read. Negative (-1) if no more matches.
private int pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
private int remainingMatchLength_;
}