com.ibm.icu.util.StringTrieBuilder Maven / Gradle / Ivy
Show all versions of icu4j Show documentation
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* created on: 2011jan05
* created by: Markus W. Scherer
* ported from ICU4C stringtriebuilder.h/.cpp
*/
package com.ibm.icu.util;
import java.util.ArrayList;
import java.util.HashMap;
/**
* Base class for string trie builder classes.
*
* This class is not intended for public subclassing.
*
* @author Markus W. Scherer
* @stable ICU 4.8
*/
public abstract class StringTrieBuilder {
/**
* Build options for BytesTrieBuilder and CharsTrieBuilder.
* The option is passed to buildImpl(), which selects the internal
* building state from it.
* @stable ICU 4.8
*/
public enum Option {
/**
* Builds a trie quickly.
* Duplicate nodes other than final values are not looked up while building,
* so the serialization can be larger than with {@link #SMALL}.
* @stable ICU 4.8
*/
FAST,
/**
* Builds a trie more slowly, attempting to generate
* a shorter but equivalent serialization.
* This build option also uses more memory.
*
* <p>This option can be effective when many integer values are the same
* and string/byte sequence suffixes can be shared.
* Runtime speed is not expected to improve.
* @stable ICU 4.8
*/
SMALL
}
/**
* Constructor for subclasses; the body is empty and performs no work itself.
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected StringTrieBuilder() {}
/**
 * Adds one (string, value) pair to the dynamic trie that is being assembled.
 * The first pair creates the root via createSuffixNode(); every later pair is
 * merged into the existing root, which may be replaced by the node it returns.
 *
 * @param s the string to add; its length must not exceed 0xffff
 * @param value the value associated with s
 * @throws IllegalStateException if the builder is no longer in the ADDING state
 * @throws IndexOutOfBoundsException if s is longer than 0xffff
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
protected void addImpl(CharSequence s, int value) {
    // Pairs can only be added before build() has been attempted.
    if (state != State.ADDING) {
        throw new IllegalStateException("Cannot add (string, value) pairs after build().");
    }
    // The length limit comes from iterator internals and the builder's recursion depth.
    if (s.length() > 0xffff) {
        throw new IndexOutOfBoundsException("The maximum string length is 0xffff.");
    }
    root = (root == null) ? createSuffixNode(s, 0, value) : root.add(this, s, 0, value);
}
/**
 * Turns the dynamically built trie into its serialized form.
 *
 * <p>Does nothing if the trie has already been built. When called in the ADDING
 * state it switches to BUILDING_FAST or BUILDING_SMALL depending on the option,
 * then registers, numbers and writes the nodes, and finally enters the BUILT state.
 *
 * @param buildOption Option.FAST selects the fast build; any other value selects
 *                    the small build
 * @throws IndexOutOfBoundsException if no (string, value) pairs were added
 * @throws IllegalStateException if an earlier build attempt left the builder in
 *                               one of the BUILDING_* states
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
protected final void buildImpl(Option buildOption) {
    switch (state) {
    case BUILT:
        // The serialized form already exists.
        return;
    case BUILDING_FAST:
    case BUILDING_SMALL:
        // We can only still be "building" if a previous attempt did not finish.
        throw new IllegalStateException("Builder failed and must be clear()ed.");
    case ADDING:
        if (root == null) {
            throw new IndexOutOfBoundsException("No (string, value) pairs were added.");
        }
        // FAST makes registerNode() hand back its input instead of looking for
        // duplicates, which is quicker but can produce a larger serialization.
        // Either way, linear-match nodes are still split to their maximum length
        // and dynamic branch nodes are still converted to runtime-equivalent nodes;
        // only the HashMap lookups for nodes other than final values are skipped.
        state = (buildOption == Option.FAST) ? State.BUILDING_FAST : State.BUILDING_SMALL;
        break;
    }
    // Three stages:
    // 1. addImpl() has already produced a fully dynamic trie.
    // 2. root.register() converts it into nodes that correspond 1:1
    //    to the runtime data structure.
    // 3. root.markRightEdgesFirst() and root.write() emit the serialized form.
    root = root.register(this);
    root.markRightEdgesFirst(-1);
    root.write(this);
    state = State.BUILT;
}
/**
 * Resets the builder so that new (string, value) pairs can be added:
 * drops the root node, empties the node registry and the string storage,
 * and returns to the ADDING state.
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
protected void clearImpl() {
    root = null;
    nodes.clear();
    strings.setLength(0);
    state = State.ADDING;
}
/**
 * Returns the canonical registered node equivalent to newNode.
 * While BUILDING_FAST no lookup is done and newNode itself is returned.
 * Otherwise newNode is added to the registry unless an equivalent node
 * is already there, in which case that earlier node is returned.
 * @param newNode Input node. The builder takes ownership.
 * @return newNode if it is the first of its kind (or when building fast),
 *         otherwise the previously registered equivalent node.
 */
private final Node registerNode(Node newNode) {
    if (state != State.BUILDING_FAST) {
        // BUILDING_SMALL: share equivalent nodes.
        Node registered = nodes.get(newNode);
        if (registered != null) {
            return registered;
        }
        // get() found nothing, so put() must not displace an equivalent entry;
        // if it did, get() missed it and newNode would be a stray duplicate.
        Node displaced = nodes.put(newNode, newNode);
        assert displaced == null;
    }
    return newNode;
}
/**
 * Returns the single registered final-value node for the given value,
 * creating and registering it only if none exists yet.
 * The reusable lookupFinalValueNode serves as the probe key so that no node
 * is allocated for a duplicate value.
 * @param value A final value.
 * @return A registered ValueNode carrying the given value.
 */
private final ValueNode registerFinalValue(int value) {
    // Final values are registered in every mode: while ADDING it is not yet
    // known whether the build will be fast or small.
    lookupFinalValueNode.setFinalValue(value);
    Node existing = nodes.get(lookupFinalValueNode);
    if (existing == null) {
        ValueNode created = new ValueNode(value);
        // get() found nothing, so put() must not displace an equivalent entry.
        Node displaced = nodes.put(created, created);
        assert displaced == null;
        return created;
    }
    return (ValueNode) existing;
}
private static abstract class Node {
// Starts with offset 0; markRightEdgesFirst() treats 0 as "not numbered yet".
public Node() {
offset=0;
}
// hashCode() and equals() for use with registerNode() and the nodes hash.
// Nodes are used as keys of that map, so every subclass must supply a hash
// that is consistent with its equals().
@Override
public abstract int hashCode() /*const*/;
/**
 * Base-class equality: a node equals itself and any other non-null object of
 * exactly the same runtime class. Subclasses call this first and then compare
 * their own fields after casting.
 * @param other the object to compare with; may be null
 * @return true if other is this node or a non-null object of the same class
 */
@Override
public boolean equals(Object other) {
    // The null check keeps equals(null) returning false, as Object.equals requires,
    // instead of failing inside other.getClass().
    return this == other || (other != null && this.getClass() == other.getClass());
}
/**
* Recursive method for adding a new (string, value) pair.
* Matches the remaining part of s from start,
* and adds a new node where there is a mismatch.
* This base implementation changes nothing and returns this node.
* @param builder the builder that owns this node
* @param s the string being added
* @param start index into s of the first unit that is still to be matched
* @param sValue the value to associate with s
* @return this or a replacement Node
*/
public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
return this;
}
/**
* Recursive method for registering unique nodes,
* after all (string, value) pairs have been added.
* Final-value nodes are pre-registered while add()ing (string, value) pairs.
* Other nodes created while add()ing registerNode() themselves later
* and might replace themselves with new types of nodes for write()ing.
* This base implementation does no registration and returns this node.
* @param builder the builder whose registry is to be used
* @return The registered version of this node which implements write().
*/
public Node register(StringTrieBuilder builder) { return this; }
/**
* Traverses the Node graph and numbers branch edges, with rightmost edges first.
* This is to avoid writing a duplicate node twice.
*
* Branch nodes in this trie data structure are not symmetric.
* Most branch edges "jump" to other nodes but the rightmost branch edges
* just continue without a jump.
* Therefore, write() must write the rightmost branch edge last
* (trie units are written backwards), and must write it at that point even if
* it is a duplicate of a node previously written elsewhere.
*
* This function visits and marks right branch edges first.
* Edges are numbered with increasingly negative values because we share the
* offset field which gets positive values when nodes are written.
* A branch edge also remembers the first number for any of its edges.
*
* When a further-left branch edge has a number in the range of the rightmost
* edge's numbers, then it will be written as part of the required right edge
* and we can avoid writing it first.
*
* After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
* edge numbers.
*
* @param edgeNumber The first edge number for this node and its sub-nodes.
* @return An edge number that is at least the maximum-negative
* of the input edge number and the numbers of this node and all of its sub-nodes.
*/
public int markRightEdgesFirst(int edgeNumber) {
// offset==0 means this node has not been numbered yet; a node that was
// already numbered keeps the number it received first.
if(offset==0) {
offset=edgeNumber;
}
// This base implementation has no sub-nodes, so the number is passed back unchanged.
return edgeNumber;
}
// Serializes this node through the builder.
// write() must set the offset to a positive value.
public abstract void write(StringTrieBuilder builder);
// See markRightEdgesFirst.
public final void writeUnlessInsideRightEdge(int firstRight, int lastRight,
StringTrieBuilder builder) {
// Note: Edge numbers are negative, lastRight<=firstRight.
// If offset>0 then this node and its sub-nodes have been written already
// and we need not write them again.
// If this node is part of the unwritten right branch edge,
// then we wait until that is written.
if(offset<0 && (offset0 ? this : next;
// C++: if(length==0) { delete this; }
result=branchNode;
} else if(i==limit-1) {
// Mismatch on last character, keep this node for the prefix.
--length;
thisSuffixNode=next;
next=branchNode;
result=this;
} else {
// Mismatch on intermediate character, keep this node for the prefix.
int prefixLength=i-stringOffset;
++i; // Suffix start offset (after thisChar).
thisSuffixNode=new LinearMatchNode(
strings, i, length-(prefixLength+1), next);
length=prefixLength;
next=branchNode;
result=this;
}
ValueNode newSuffixNode=builder.createSuffixNode(s, start+1, sValue);
branchNode.add(thisChar, thisSuffixNode);
branchNode.add(newChar, newSuffixNode);
return result;
}
}
// s matches all of this node's characters.
next=next.add(builder, s, start, sValue);
return this;
}
/**
 * Registers this linear-match node after registering its continuation.
 * A sequence longer than the builder's maximum linear-match length is cut into
 * chunks: trailing chunks are split off into new registered nodes until this
 * node's own length fits. If this node carries a value but the builder's match
 * nodes cannot hold values, the value is moved into an IntermediateValueNode
 * placed in front of this node.
 * @param builder the builder that provides the limits and the node registry
 * @return the registered node that replaces this one for writing
 */
@Override
public Node register(StringTrieBuilder builder) {
    next = next.register(builder);
    // Peel maximum-length chunks off the end until the remainder fits.
    final int chunkLimit = builder.getMaxLinearMatchLength();
    while (length > chunkLimit) {
        length -= chunkLimit;
        // The split-off chunk starts right after the units this node keeps.
        LinearMatchNode tail =
            new LinearMatchNode(strings, stringOffset + length, chunkLimit, next);
        tail.setHashCode();
        next = builder.registerNode(tail);
    }
    // Decide whether the value must live in a separate node in front of this one.
    final boolean detachValue = hasValue && !builder.matchNodesCanHaveValues();
    final int detachedValue = value;
    if (detachValue) {
        value = 0;
        hasValue = false;
    }
    // The hash must reflect the final fields, just before registerNode(this).
    setHashCode();
    Node result = this;
    if (detachValue) {
        result = new IntermediateValueNode(detachedValue, builder.registerNode(this));
    }
    return builder.registerNode(result);
}
/**
 * Numbers the continuation node's edges and adopts the resulting edge number
 * as this node's own, unless this node has been numbered before.
 * @param edgeNumber the first edge number available to this node
 * @return the edge number after visiting the continuation, or edgeNumber
 *         unchanged if this node was already numbered
 */
@Override
public int markRightEdgesFirst(int edgeNumber) {
    if (offset != 0) {
        // Already visited through another edge.
        return edgeNumber;
    }
    final int marked = next.markRightEdgesFirst(edgeNumber);
    offset = marked;
    return marked;
}
/**
 * Writes the continuation node first, then this node's string units, then the
 * lead unit that combines the optional value with the linear-match node type.
 * The offset is set to what writeValueAndType() returns.
 * @param builder the builder that receives the serialized units
 */
@Override
public void write(StringTrieBuilder builder) {
    next.write(builder);
    builder.write(stringOffset, length);
    // The node type is the minimum linear-match lead value plus (length - 1).
    final int nodeType = builder.getMinLinearMatch() + length - 1;
    offset = builder.writeValueAndType(hasValue, value, nodeType);
}
// Must be called just before registerNode(this).
private void setHashCode() /*const*/ {
hash=(0x333333*37+length)*37+next.hashCode();
if(hasValue) {
hash=hash*37+value;
}
for(int i=stringOffset, limit=stringOffset+length; ibuilder.getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
int middle=start+length/2;
return builder.registerNode(
new SplitBranchNode(
chars.charAt(middle),
register(builder, start, middle),
register(builder, middle, limit)));
}
ListBranchNode listNode=new ListBranchNode(length);
do {
char c=chars.charAt(start);
Node node=equal.get(start);
if(node.getClass()==ValueNode.class) {
// Final value.
listNode.add(c, ((ValueNode)node).value);
} else {
listNode.add(c, node.register(builder));
}
} while(++start equal=new ArrayList();
}
/**
 * Common base of the branch node types. It stores the hash code that the
 * subclasses compute for themselves, plus the first edge number that their
 * markRightEdgesFirst() implementations record.
 */
private static abstract class BranchNode extends Node {
    /** Hash code maintained by the subclass; returned by hashCode(). */
    protected int hash;
    /** First edge number of this branch; see markRightEdgesFirst(). */
    protected int firstEdgeNumber;

    public BranchNode() {}

    @Override
    public int hashCode() /*const*/ {
        return hash;
    }
}
private static final class ListBranchNode extends BranchNode {
/**
 * Creates an empty list branch node with room for the given number of units.
 * The hash is seeded from the capacity; each add() call folds its unit in.
 * @param capacity the number of unit slots to allocate
 */
public ListBranchNode(int capacity) {
    this.hash = 0x444444 * 37 + capacity;
    // Three parallel arrays, one slot per unit.
    this.equal = new Node[capacity];
    this.values = new int[capacity];
    this.units = new char[capacity];
}
@Override
public boolean equals(Object other) {
if(this==other) {
return true;
}
if(!super.equals(other)) {
return false;
}
ListBranchNode o=(ListBranchNode)other;
for(int i=0; i0);
offset=edgeNumber;
}
return edgeNumber;
}
/**
* Serializes this list branch node and its sub-nodes through the builder.
* Sub-nodes left of the last unit are written first (unless they will be
* written as part of the rightmost edge), then the rightmost entry, then the
* remaining unit-value pairs from right to left. The offset ends up as the
* return value of the last builder.write(unit) call.
*
* Note: the first loop decrements unitNumber before indexing, so a node
* holding a single unit would read equal[-1]; the code relies on length
* being at least 2.
* @param builder the builder that receives the serialized units
*/
@Override
public void write(StringTrieBuilder builder) {
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int unitNumber=length-1;
Node rightEdge=equal[unitNumber];
// A null right edge means the last unit has a final value and thus no sub-node;
// then the right-edge range collapses to firstEdgeNumber.
int rightEdgeNumber= rightEdge==null ? firstEdgeNumber : rightEdge.getOffset();
do {
--unitNumber;
if(equal[unitNumber]!=null) {
equal[unitNumber].writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==null) {
builder.writeValueAndFinal(values[unitNumber], true);
} else {
rightEdge.write(builder);
}
offset=builder.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int value;
boolean isFinal;
if(equal[unitNumber]==null) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=true;
} else {
// Write the delta to the start position of the sub-node.
// The sub-node must have been written by now, which gives it a positive offset.
assert(equal[unitNumber].getOffset()>0);
value=offset-equal[unitNumber].getOffset();
isFinal=false;
}
builder.writeValueAndFinal(value, isFinal);
offset=builder.write(units[unitNumber]);
}
}
/**
 * Appends a unit whose string ends here with a final value.
 * The sub-node slot stays null, which marks the entry as "has final value".
 * @param c the unit; stored as a char
 * @param value the final value for the string ending with this unit
 */
public void add(int c, int value) {
    final int slot = length;
    units[slot] = (char) c;
    equal[slot] = null;
    values[slot] = value;
    length = slot + 1;
    // Fold the unit and its value into the running hash.
    hash = (hash * 37 + c) * 37 + value;
}
/**
 * Appends a unit that leads to another match node.
 * The value slot is set to 0 because the entry has no final value of its own.
 * @param c the unit; stored as a char
 * @param node the sub-node reached through this unit
 */
public void add(int c, Node node) {
    final int slot = length;
    units[slot] = (char) c;
    equal[slot] = node;
    values[slot] = 0;
    length = slot + 1;
    // Fold the unit and the sub-node's hash into the running hash.
    hash = (hash * 37 + c) * 37 + node.hashCode();
}
// Note: We could try to reduce memory allocations
// by replacing these per-node arrays with per-builder ArrayLists and
// (for units) a StringBuilder (or even use its strings for the units too).
// It remains to be seen whether that would improve performance.
private Node[] equal; // null means "has final value".
private int length;
private int[] values;
private char[] units;
}
/**
 * Branch node that splits on a middle unit into a less-than sub-node,
 * which is reached by a jump, and a greater-or-equal sub-node, which is
 * written directly after this node's own units and needs no jump.
 */
private static final class SplitBranchNode extends BranchNode {
    private char unit;
    private Node lessThan;
    private Node greaterOrEqual;

    /**
     * @param middleUnit the unit on which the branch splits
     * @param lessThanNode the sub-node for the less-than side
     * @param greaterOrEqualNode the sub-node for the greater-or-equal side
     */
    public SplitBranchNode(char middleUnit, Node lessThanNode, Node greaterOrEqualNode) {
        int h = 0x555555 * 37 + middleUnit;
        h = h * 37 + lessThanNode.hashCode();
        h = h * 37 + greaterOrEqualNode.hashCode();
        hash = h;
        unit = middleUnit;
        lessThan = lessThanNode;
        greaterOrEqual = greaterOrEqualNode;
    }

    /**
     * Two split nodes are equal when they have the same class, the same unit
     * and refer to the very same sub-node objects.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (super.equals(other)) {
            SplitBranchNode that = (SplitBranchNode) other;
            // Sub-nodes are compared by identity.
            return unit == that.unit
                && lessThan == that.lessThan
                && greaterOrEqual == that.greaterOrEqual;
        }
        return false;
    }

    /** Returns the hash computed in the constructor (via BranchNode). */
    @Override
    public int hashCode() {
        return super.hashCode();
    }

    /**
     * Numbers the greater-or-equal side first, then the less-than side with the
     * next more negative number, and adopts the final number as this node's offset.
     */
    @Override
    public int markRightEdgesFirst(int edgeNumber) {
        if (offset != 0) {
            // Already numbered.
            return edgeNumber;
        }
        firstEdgeNumber = edgeNumber;
        final int afterRight = greaterOrEqual.markRightEdgesFirst(edgeNumber);
        final int afterLeft = lessThan.markRightEdgesFirst(afterRight - 1);
        offset = afterLeft;
        return afterLeft;
    }

    /**
     * Writes the less-than branch (unless it lies inside the right edge), then the
     * greater-or-equal branch, then the jump delta and the split unit of this node.
     */
    @Override
    public void write(StringTrieBuilder builder) {
        // The less-than branch goes first.
        final int lastRight = greaterOrEqual.getOffset();
        lessThan.writeUnlessInsideRightEdge(firstEdgeNumber, lastRight, builder);
        // The greater-or-equal branch goes last because no jump is needed for it.
        greaterOrEqual.write(builder);
        // Now this node itself: the delta to the less-than branch, then the unit.
        assert lessThan.getOffset() > 0;
        builder.writeDeltaTo(lessThan.getOffset());
        offset = builder.write(unit);
    }
}
/**
 * Head of a branch: writes the actual node lead unit (with an optional value)
 * in front of the branch sub-node it wraps.
 */
private static final class BranchHeadNode extends ValueNode {
    /** The length passed to the constructor; determines the lead-unit encoding. */
    private int length;
    /** A branch sub-node. */
    private Node next;

    /**
     * @param len the length value to encode for this branch
     * @param subNode the branch sub-node that follows this head
     */
    public BranchHeadNode(int len, Node subNode) {
        length = len;
        next = subNode;
    }

    @Override
    public int hashCode() /*const*/ {
        final int seeded = 0x666666 * 37 + length;
        return seeded * 37 + next.hashCode();
    }

    /**
     * Equal when the superclass considers the nodes equal, the lengths match
     * and both heads refer to the very same sub-node object.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (super.equals(other)) {
            BranchHeadNode that = (BranchHeadNode) other;
            return length == that.length && next == that.next;
        }
        return false;
    }

    /** Numbers the sub-node's edges and adopts the result, unless already numbered. */
    @Override
    public int markRightEdgesFirst(int edgeNumber) {
        if (offset != 0) {
            return edgeNumber;
        }
        final int marked = next.markRightEdgesFirst(edgeNumber);
        offset = marked;
        return marked;
    }

    /**
     * Writes the sub-node and then the head. If the length is at most the
     * builder's minimum linear-match value, length-1 goes into the lead unit;
     * otherwise length-1 is written as a separate unit and the lead unit carries 0.
     */
    @Override
    public void write(StringTrieBuilder builder) {
        next.write(builder);
        final int leadNode;
        if (length <= builder.getMinLinearMatch()) {
            leadNode = length - 1;
        } else {
            builder.write(length - 1);
            leadNode = 0;
        }
        offset = builder.writeValueAndType(hasValue, value, leadNode);
    }
}
private ValueNode createSuffixNode(CharSequence s, int start, int sValue) {
ValueNode node=registerFinalValue(sValue);
if(start nodes=new HashMap();
private ValueNode lookupFinalValueNode=new ValueNode();
}