All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.util.CharsTrie Maven / Gradle / Ivy

There is a newer version: 2.12.15
Show newest version
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
*   Copyright (C) 2011-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   created on: 2011jan06
*   created by: Markus W. Scherer
*   ported from ICU4C ucharstrie.h/.cpp
*/

package com.ibm.icu.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.NoSuchElementException;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.BytesTrie.Result;

/**
 * Light-weight, non-const reader class for a CharsTrie.
 * Traverses a char-serialized data structure with minimal state,
 * for mapping strings (16-bit-unit sequences) to non-negative integer values.
 *
 * 

This class is not intended for public subclassing. * * @stable ICU 4.8 * @author Markus W. Scherer */ public final class CharsTrie implements Cloneable, Iterable { /** * Constructs a CharsTrie reader instance. * *

The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder, * with the offset indicating the first char of that sequence. * The CharsTrie object will not read more chars than * the CharsTrieBuilder generated in the corresponding build() call. * *

The CharSequence is not copied/cloned and must not be modified while * the CharsTrie object is in use. * * @param trieChars CharSequence that contains the serialized trie. * @param offset Root offset of the trie in the CharSequence. * @stable ICU 4.8 */ public CharsTrie(CharSequence trieChars, int offset) { chars_=trieChars; pos_=root_=offset; remainingMatchLength_=-1; } /** * Clones this trie reader object and its state, * but not the char array which will be shared. * @return A shallow clone of this trie. * @stable ICU 4.8 */ @Override public Object clone() throws CloneNotSupportedException { return super.clone(); // A shallow copy is just what we need. } /** * Resets this trie to its initial state. * @return this * @stable ICU 4.8 */ public CharsTrie reset() { pos_=root_; remainingMatchLength_=-1; return this; } /** * CharsTrie state object, for saving a trie's current state * and resetting the trie back to this state later. * @stable ICU 4.8 */ public static final class State { /** * Constructs an empty State. * @stable ICU 4.8 */ public State() {} private CharSequence chars; private int root; private int pos; private int remainingMatchLength; } /** * Saves the state of this trie. * @param state The State object to hold the trie's state. * @return this * @see #resetToState * @stable ICU 4.8 */ public CharsTrie saveState(State state) /*const*/ { state.chars=chars_; state.root=root_; state.pos=pos_; state.remainingMatchLength=remainingMatchLength_; return this; } /** * Resets this trie to the saved state. * @param state The State object which holds a saved trie state. * @return this * @throws IllegalArgumentException if the state object contains no state, * or the state of a different trie * @see #saveState * @see #reset * @stable ICU 4.8 */ public CharsTrie resetToState(State state) { if(chars_==state.chars && chars_!=null && root_==state.root) { pos_=state.pos; remainingMatchLength_=state.remainingMatchLength; } else { throw new IllegalArgumentException("incompatible trie state"); } return this; } /** * Determines whether the string so far matches, whether it has a value, * and whether another input char can continue a matching string. * @return The match/value Result. * @stable ICU 4.8 */ public Result current() /*const*/ { int pos=pos_; if(pos<0) { return Result.NO_MATCH; } else { int node; return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? valueResults_[node>>15] : Result.NO_VALUE; } } /** * Traverses the trie from the initial state for this input char. * Equivalent to reset().next(inUnit). * @param inUnit Input char value. Values below 0 and above 0xffff will never match. * @return The match/value Result. * @stable ICU 4.8 */ public Result first(int inUnit) { remainingMatchLength_=-1; return nextImpl(root_, inUnit); } /** * Traverses the trie from the initial state for the * one or two UTF-16 code units for this input code point. * Equivalent to reset().nextForCodePoint(cp). * @param cp A Unicode code point 0..0x10ffff. * @return The match/value Result. * @stable ICU 4.8 */ public Result firstForCodePoint(int cp) { return cp<=0xffff ? first(cp) : (first(UTF16.getLeadSurrogate(cp)).hasNext() ? next(UTF16.getTrailSurrogate(cp)) : Result.NO_MATCH); } /** * Traverses the trie from the current state for this input char. * @param inUnit Input char value. Values below 0 and above 0xffff will never match. * @return The match/value Result. * @stable ICU 4.8 */ public Result next(int inUnit) { int pos=pos_; if(pos<0) { return Result.NO_MATCH; } int length=remainingMatchLength_; // Actual remaining match length minus 1. if(length>=0) { // Remaining part of a linear-match node. if(inUnit==chars_.charAt(pos++)) { remainingMatchLength_=--length; pos_=pos; int node; return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? valueResults_[node>>15] : Result.NO_VALUE; } else { stop(); return Result.NO_MATCH; } } return nextImpl(pos, inUnit); } /** * Traverses the trie from the current state for the * one or two UTF-16 code units for this input code point. * @param cp A Unicode code point 0..0x10ffff. * @return The match/value Result. * @stable ICU 4.8 */ public Result nextForCodePoint(int cp) { return cp<=0xffff ? next(cp) : (next(UTF16.getLeadSurrogate(cp)).hasNext() ? next(UTF16.getTrailSurrogate(cp)) : Result.NO_MATCH); } /** * Traverses the trie from the current state for this string. * Equivalent to *

     * Result result=current();
     * for(each c in s)
     *   if(!result.hasNext()) return Result.NO_MATCH;
     *   result=next(c);
     * return result;
     * 
* @param s Contains a string. * @param sIndex The start index of the string in s. * @param sLimit The (exclusive) end index of the string in s. * @return The match/value Result. * @stable ICU 4.8 */ public Result next(CharSequence s, int sIndex, int sLimit) { if(sIndex>=sLimit) { // Empty input. return current(); } int pos=pos_; if(pos<0) { return Result.NO_MATCH; } int length=remainingMatchLength_; // Actual remaining match length minus 1. for(;;) { // Fetch the next input unit, if there is one. // Continue a linear-match node. char inUnit; for(;;) { if(sIndex==sLimit) { remainingMatchLength_=length; pos_=pos; int node; return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? valueResults_[node>>15] : Result.NO_VALUE; } inUnit=s.charAt(sIndex++); if(length<0) { remainingMatchLength_=length; break; } if(inUnit!=chars_.charAt(pos)) { stop(); return Result.NO_MATCH; } ++pos; --length; } int node=chars_.charAt(pos++); for(;;) { if(node=kMinValueLead); return (leadUnit&kValueIsFinal)!=0 ? readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); } /** * Determines whether all strings reachable from the current state * map to the same value, and if so, returns that value. * @return The unique value in bits 32..1 with bit 0 set, * if all strings reachable from the current state * map to the same value; otherwise returns 0. * @stable ICU 4.8 */ public long getUniqueValue() /*const*/ { int pos=pos_; if(pos<0) { return 0; } // Skip the rest of a pending linear-match node. long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. return (uniqueValue<<31)>>31; } /** * Finds each char which continues the string from the current state. * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. * @param out Each next char is appended to this object. * (Only uses the out.append(c) method.) * @return The number of chars which continue the string from here. * @stable ICU 4.8 */ public int getNextChars(Appendable out) /*const*/ { int pos=pos_; if(pos<0) { return 0; } if(remainingMatchLength_>=0) { append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. return 1; } int node=chars_.charAt(pos++); if(node>=kMinValueLead) { if((node&kValueIsFinal)!=0) { return 0; } else { pos=skipNodeValue(pos, node); node&=kNodeTypeMask; } } if(node { private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { chars_=trieChars; pos_=initialPos_=offset; remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; maxLength_=maxStringLength; int length=remainingMatchLength_; // Actual remaining match length minus 1. if(length>=0) { // Pending linear-match node, append remaining bytes to str_. ++length; if(maxLength_>0 && length>maxLength_) { length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. } str_.append(chars_, pos_, pos_+length); pos_+=length; remainingMatchLength_-=length; } } /** * Resets this iterator to its initial state. * @return this * @stable ICU 4.8 */ public Iterator reset() { pos_=initialPos_; remainingMatchLength_=initialRemainingMatchLength_; skipValue_=false; int length=remainingMatchLength_+1; // Remaining match length. if(maxLength_>0 && length>maxLength_) { length=maxLength_; } str_.setLength(length); pos_+=length; remainingMatchLength_-=length; stack_.clear(); return this; } /** * @return true if there are more elements. * @stable ICU 4.8 */ @Override public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } /** * Finds the next (string, value) pair if there is one. * * If the string is truncated to the maximum length and does not * have a real value, then the value is set to -1. * In this case, this "not a real value" is indistinguishable from * a real value of -1. * @return An Entry with the string and value of the next element. * @throws NoSuchElementException - iteration has no more elements. * @stable ICU 4.8 */ @Override public Entry next() { int pos=pos_; if(pos<0) { if(stack_.isEmpty()) { throw new NoSuchElementException(); } // Pop the state off the stack and continue with the next outbound edge of // the branch node. long top=stack_.remove(stack_.size()-1); int length=(int)top; pos=(int)(top>>32); str_.setLength(length&0xffff); length>>>=16; if(length>1) { pos=branchNext(pos, length); if(pos<0) { return entry_; // Reached a final value. } } else { str_.append(chars_.charAt(pos++)); } } if(remainingMatchLength_>=0) { // We only get here if we started in a pending linear-match node // with more than maxLength remaining units. return truncateAndStop(); } for(;;) { int node=chars_.charAt(pos++); if(node>=kMinValueLead) { if(skipValue_) { pos=skipNodeValue(pos, node); node&=kNodeTypeMask; skipValue_=false; } else { // Deliver value for the string so far. boolean isFinal=(node&kValueIsFinal)!=0; if(isFinal) { entry_.value=readValue(chars_, pos, node&0x7fff); } else { entry_.value=readNodeValue(chars_, pos, node); } if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { pos_=-1; } else { // We cannot skip the value right here because it shares its // lead unit with a match node which we have to evaluate // next time. // Instead, keep pos_ on the node lead unit itself. pos_=pos-1; skipValue_=true; } entry_.chars=str_; return entry_; } } if(maxLength_>0 && str_.length()==maxLength_) { return truncateAndStop(); } if(node0 && str_.length()+length>maxLength_) { str_.append(chars_, pos, pos+maxLength_-str_.length()); return truncateAndStop(); } str_.append(chars_, pos, pos+length); pos+=length; } } } /** * Iterator.remove() is not supported. * @throws UnsupportedOperationException (always) * @stable ICU 4.8 */ @Override public void remove() { throw new UnsupportedOperationException(); } private Entry truncateAndStop() { pos_=-1; // We reset entry_.chars every time we return entry_ // just because the caller might have modified the Entry. entry_.chars=str_; entry_.value=-1; // no real value for str return entry_; } private int branchNext(int pos, int length) { while(length>kMaxBranchLinearSubNodeLength) { ++pos; // ignore the comparison unit // Push state for the greater-or-equal edge. stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); // Follow the less-than edge. length>>=1; pos=jumpByDelta(chars_, pos); } // List of key-value pairs where values are either final values or jump deltas. // Read the first (key, value) pair. char trieUnit=chars_.charAt(pos++); int node=chars_.charAt(pos++); boolean isFinal=(node&kValueIsFinal)!=0; int value=readValue(chars_, pos, node&=0x7fff); pos=skipValue(pos, node); stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); str_.append(trieUnit); if(isFinal) { pos_=-1; entry_.chars=str_; entry_.value=value; return -1; } else { return pos+value; } } private CharSequence chars_; private int pos_; private int initialPos_; private int remainingMatchLength_; private int initialRemainingMatchLength_; private boolean skipValue_; // Skip intermediate value which was already delivered. private StringBuilder str_=new StringBuilder(); private int maxLength_; private Entry entry_=new Entry(); // The stack stores longs for backtracking to another // outbound edge of a branch node. // Each long has the offset in chars_ in bits 62..32, // the str_.length() from before the node in bits 15..0, // and the remaining branch length in bits 31..16. // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, // but the code looks more confusing that way.) private ArrayList stack_=new ArrayList(); } private void stop() { pos_=-1; } // Reads a compact 32-bit integer. // pos is already after the leadUnit, and the lead unit has bit 15 reset. private static int readValue(CharSequence chars, int pos, int leadUnit) { int value; if(leadUnit=kMinTwoUnitValueLead) { if(leadUnit>6)-1; } else if(leadUnit=kMinTwoUnitNodeValueLead) { if(leadUnit=kMinTwoUnitDeltaLead) { if(delta==kThreeUnitDeltaLead) { delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); pos+=2; } else { delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); } } return pos+delta; } private static int skipDelta(CharSequence chars, int pos) { int delta=chars.charAt(pos++); if(delta>=kMinTwoUnitDeltaLead) { if(delta==kThreeUnitDeltaLead) { pos+=2; } else { ++pos; } } return pos; } private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; // Handles a branch node for both next(unit) and next(string). private Result branchNext(int pos, int length, int inUnit) { // Branch according to the current unit. if(length==0) { length=chars_.charAt(pos++); } ++length; // The length of the branch is the number of units to select from. // The data structure encodes a binary search. while(length>kMaxBranchLinearSubNodeLength) { if(inUnit>=1; pos=jumpByDelta(chars_, pos); } else { length=length-(length>>1); pos=skipDelta(chars_, pos); } } // Drop down to linear search for the last few units. // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 // and divides length by 2. do { if(inUnit==chars_.charAt(pos++)) { Result result; int node=chars_.charAt(pos); if((node&kValueIsFinal)!=0) { // Leave the final value for getValue() to read. result=Result.FINAL_VALUE; } else { // Use the non-final value as the jump delta. ++pos; // int delta=readValue(pos, node); int delta; if(node=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; } pos_=pos; return result; } --length; pos=skipValue(chars_, pos); } while(length>1); if(inUnit==chars_.charAt(pos++)) { pos_=pos; int node=chars_.charAt(pos); return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; } else { stop(); return Result.NO_MATCH; } } // Requires remainingLength_<0. private Result nextImpl(int pos, int inUnit) { int node=chars_.charAt(pos++); for(;;) { if(node=kMinValueLead) ? valueResults_[node>>15] : Result.NO_VALUE; } else { // No match. break; } } else if((node&kValueIsFinal)!=0) { // No further matching units. break; } else { // Skip intermediate value. pos=skipNodeValue(pos, node); node&=kNodeTypeMask; } } stop(); return Result.NO_MATCH; } // Helper functions for getUniqueValue(). // Recursively finds a unique value (or whether there is not a unique one) // from a branch. // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). // On return, if not 0, then bits 63..33 contain the updated non-negative pos. private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, long uniqueValue) { while(length>kMaxBranchLinearSubNodeLength) { ++pos; // ignore the comparison unit uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); if(uniqueValue==0) { return 0; } length=length-(length>>1); pos=skipDelta(chars, pos); } do { ++pos; // ignore a comparison unit // handle its value int node=chars.charAt(pos++); boolean isFinal=(node&kValueIsFinal)!=0; node&=0x7fff; int value=readValue(chars, pos, node); pos=skipValue(pos, node); if(isFinal) { if(uniqueValue!=0) { if(value!=(int)(uniqueValue>>1)) { return 0; } } else { uniqueValue=((long)value<<1)|1; } } else { uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); if(uniqueValue==0) { return 0; } } } while(--length>1); // ignore the last comparison byte return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); } // Recursively finds a unique value (or whether there is not a unique one) // starting from a position on a node lead unit. // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { int node=chars.charAt(pos++); for(;;) { if(node>>33); node=chars.charAt(pos++); } else if(node>1)) { return 0; } } else { uniqueValue=((long)value<<1)|1; } if(isFinal) { return uniqueValue; } pos=skipNodeValue(pos, node); node&=kNodeTypeMask; } } } // Helper functions for getNextChars(). // getNextChars() when pos is on a branch node. private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) { while(length>kMaxBranchLinearSubNodeLength) { ++pos; // ignore the comparison unit getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out); length=length-(length>>1); pos=skipDelta(chars, pos); } do { append(out, chars.charAt(pos++)); pos=skipValue(chars, pos); } while(--length>1); append(out, chars.charAt(pos)); } private static void append(Appendable out, int c) { try { out.append((char)c); } catch(IOException e) { throw new ICUUncheckedIOException(e); } } // CharsTrie data structure // // The trie consists of a series of char-serialized nodes for incremental // Unicode string/char sequence matching. (char=16-bit unsigned integer) // The root node is at the beginning of the trie data. // // Types of nodes are distinguished by their node lead unit ranges. // After each node, except a final-value node, another node follows to // encode match values or continue matching further units. // // Node types: // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. // The value is for the string/char sequence so far. // - Match node, optionally with an intermediate value in a different compact format. // The value, if present, is for the string/char sequence so far. // // Aside from the value, which uses the node lead unit's high bits: // // - Linear-match node: Matches a number of units. // - Branch node: Branches to other nodes according to the current input unit. // The node unit is the length of the branch (number of units to select from) // minus 1. It is followed by a sub-node: // - If the length is at most kMaxBranchLinearSubNodeLength, then // there are length-1 (key, value) pairs and then one more comparison unit. // If one of the key units matches, then the value is either a final value for // the string so far, or a "jump" delta to the next node. // If the last unit matches, then matching continues with the next node. // (Values have the same encoding as final-value nodes.) // - If the length is greater than kMaxBranchLinearSubNodeLength, then // there is one unit and one "jump" delta. // If the input unit is less than the sub-node unit, then "jump" by delta to // the next sub-node which will have a length of length/2. // (The delta has its own compact encoding.) // Otherwise, skip the "jump" delta to the next sub-node // which will have a length of length-length/2. // Match-node lead unit values, after masking off intermediate-value bits: // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise // the length is one more than the next unit. // For a branch sub-node with at most this many entries, we drop down // to a linear search. /*package*/ static final int kMaxBranchLinearSubNodeLength=5; // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. /*package*/ static final int kMinLinearMatch=0x30; /*package*/ static final int kMaxLinearMatchLength=0x10; // Match-node lead unit bits 14..6 for the optional intermediate value. // If these bits are 0, then there is no intermediate value. // Otherwise, see the *NodeValue* constants below. /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f // A final-value node has bit 15 set. /*package*/ static final int kValueIsFinal=0x8000; // Compact value: After testing and masking off bit 15, use the following thresholds. /*package*/ static final int kMaxOneUnitValue=0x3fff; /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 /*package*/ static final int kThreeUnitValueLead=0x7fff; /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. /*package*/ static final int kMaxOneUnitNodeValue=0xff; /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0; /*package*/ static final int kMaxTwoUnitNodeValue= ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff // Compact delta integers. /*package*/ static final int kMaxOneUnitDelta=0xfbff; /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 /*package*/ static final int kThreeUnitDeltaLead=0xffff; /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff // Fixed value referencing the CharsTrie words. private CharSequence chars_; private int root_; // Iterator variables. // Pointer to next trie unit to read. NULL if no more matches. private int pos_; // Remaining length of a linear-match node, minus 1. Negative if not in such a node. private int remainingMatchLength_; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy