All downloads are free. The search and download functionality uses the official Maven repository.

com.ibm.icu.util.StringTrieBuilder Maven / Gradle / Ivy

Go to download

International Components for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and globalization support.

There is a newer version: 76.1
Show newest version
/*
*******************************************************************************
*   Copyright (C) 2011-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   created on: 2011jan05
*   created by: Markus W. Scherer
*   ported from ICU4C stringtriebuilder.h/.cpp
*/
package com.ibm.icu.util;

import java.util.ArrayList;
import java.util.HashMap;

/**
 * Base class for string trie builder classes.
 *
 * 

This class is not intended for public subclassing. * * @author Markus W. Scherer * @stable ICU 4.8 */ public abstract class StringTrieBuilder { /** * Build options for BytesTrieBuilder and CharsTrieBuilder. * @stable ICU 4.8 */ public enum Option { /** * Builds a trie quickly. * @stable ICU 4.8 */ FAST, /** * Builds a trie more slowly, attempting to generate * a shorter but equivalent serialization. * This build option also uses more memory. * *

This option can be effective when many integer values are the same * and string/byte sequence suffixes can be shared. * Runtime speed is not expected to improve. * @stable ICU 4.8 */ SMALL } /** * @internal * @deprecated This API is ICU internal only. */ protected StringTrieBuilder() {} /** * @internal * @deprecated This API is ICU internal only. */ protected void addImpl(CharSequence s, int value) { if(state!=State.ADDING) { // Cannot add elements after building. throw new IllegalStateException("Cannot add (string, value) pairs after build()."); } if(s.length()>0xffff) { // Too long: Limited by iterator internals, and by builder recursion depth. throw new IndexOutOfBoundsException("The maximum string length is 0xffff."); } if(root==null) { root=createSuffixNode(s, 0, value); } else { root=root.add(this, s, 0, value); } } /** * @internal * @deprecated This API is ICU internal only. */ protected final void buildImpl(Option buildOption) { switch(state) { case ADDING: if(root==null) { throw new IndexOutOfBoundsException("No (string, value) pairs were added."); } if(buildOption==Option.FAST) { state=State.BUILDING_FAST; // Building "fast" is somewhat faster (25..50% in some test) // because it makes registerNode() return the input node // rather than checking for duplicates. // As a result, we sometimes write larger trie serializations. // // In either case we need to fix-up linear-match nodes (for their maximum length) // and branch nodes (turning dynamic branch nodes into trees of // runtime-equivalent nodes), but the HashMap/hashCode()/equals() are omitted for // nodes other than final values. } else { state=State.BUILDING_SMALL; } break; case BUILDING_FAST: case BUILDING_SMALL: // Building must have failed. throw new IllegalStateException("Builder failed and must be clear()ed."); case BUILT: return; // Nothing more to do. } // Implementation note: // We really build three versions of the trie. // The first is a fully dynamic trie, built successively by addImpl(). 
// Then we call root.register() to turn it into a tree of nodes // which is 1:1 equivalent to the runtime data structure. // Finally, root.markRightEdgesFirst() and root.write() write that serialized form. root=root.register(this); root.markRightEdgesFirst(-1); root.write(this); state=State.BUILT; } /** * @internal * @deprecated This API is ICU internal only. */ protected void clearImpl() { strings.setLength(0); nodes.clear(); root=null; state=State.ADDING; } /** * Makes sure that there is only one unique node registered that is * equivalent to newNode, unless BUILDING_FAST. * @param newNode Input node. The builder takes ownership. * @return newNode if it is the first of its kind, or * an equivalent node if newNode is a duplicate. */ private final Node registerNode(Node newNode) { if(state==State.BUILDING_FAST) { return newNode; } // BUILDING_SMALL Node oldNode=nodes.get(newNode); if(oldNode!=null) { return oldNode; } // If put() returns a non-null value from an equivalent, previously // registered node, then get() failed to find that and we will leak newNode. oldNode=nodes.put(newNode, newNode); assert(oldNode==null); return newNode; } /** * Makes sure that there is only one unique FinalValueNode registered * with this value. * Avoids creating a node if the value is a duplicate. * @param value A final value. * @return A FinalValueNode with the given value. */ private final ValueNode registerFinalValue(int value) { // We always register final values because while ADDING // we do not know yet whether we will build fast or small. lookupFinalValueNode.setFinalValue(value); Node oldNode=nodes.get(lookupFinalValueNode); if(oldNode!=null) { return (ValueNode)oldNode; } ValueNode newNode=new ValueNode(value); // If put() returns a non-null value from an equivalent, previously // registered node, then get() failed to find that and we will leak newNode. 
oldNode=nodes.put(newNode, newNode); assert(oldNode==null); return newNode; } private static abstract class Node { public Node() { offset=0; } // hashCode() and equals() for use with registerNode() and the nodes hash. @Override public abstract int hashCode() /*const*/; // Base class equals() compares the actual class types. @Override public boolean equals(Object other) { return this==other || this.getClass()==other.getClass(); } /** * Recursive method for adding a new (string, value) pair. * Matches the remaining part of s from start, * and adds a new node where there is a mismatch. * @return this or a replacement Node */ public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) { return this; } /** * Recursive method for registering unique nodes, * after all (string, value) pairs have been added. * Final-value nodes are pre-registered while add()ing (string, value) pairs. * Other nodes created while add()ing registerNode() themselves later * and might replace themselves with new types of nodes for write()ing. * @return The registered version of this node which implements write(). */ public Node register(StringTrieBuilder builder) { return this; } /** * Traverses the Node graph and numbers branch edges, with rightmost edges first. * This is to avoid writing a duplicate node twice. * * Branch nodes in this trie data structure are not symmetric. * Most branch edges "jump" to other nodes but the rightmost branch edges * just continue without a jump. * Therefore, write() must write the rightmost branch edge last * (trie units are written backwards), and must write it at that point even if * it is a duplicate of a node previously written elsewhere. * * This function visits and marks right branch edges first. * Edges are numbered with increasingly negative values because we share the * offset field which gets positive values when nodes are written. * A branch edge also remembers the first number for any of its edges. 
* * When a further-left branch edge has a number in the range of the rightmost * edge's numbers, then it will be written as part of the required right edge * and we can avoid writing it first. * * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative * edge numbers. * * @param edgeNumber The first edge number for this node and its sub-nodes. * @return An edge number that is at least the maximum-negative * of the input edge number and the numbers of this node and all of its sub-nodes. */ public int markRightEdgesFirst(int edgeNumber) { if(offset==0) { offset=edgeNumber; } return edgeNumber; } // write() must set the offset to a positive value. public abstract void write(StringTrieBuilder builder); // See markRightEdgesFirst. public final void writeUnlessInsideRightEdge(int firstRight, int lastRight, StringTrieBuilder builder) { // Note: Edge numbers are negative, lastRight<=firstRight. // If offset>0 then this node and its sub-nodes have been written already // and we need not write them again. // If this node is part of the unwritten right branch edge, // then we wait until that is written. if(offset<0 && (offset0 ? this : next; // C++: if(length==0) { delete this; } result=branchNode; } else if(i==limit-1) { // Mismatch on last character, keep this node for the prefix. --length; thisSuffixNode=next; next=branchNode; result=this; } else { // Mismatch on intermediate character, keep this node for the prefix. int prefixLength=i-stringOffset; ++i; // Suffix start offset (after thisChar). thisSuffixNode=new LinearMatchNode( strings, i, length-(prefixLength+1), next); length=prefixLength; next=branchNode; result=this; } ValueNode newSuffixNode=builder.createSuffixNode(s, start+1, sValue); branchNode.add(thisChar, thisSuffixNode); branchNode.add(newChar, newSuffixNode); return result; } } // s matches all of this node's characters. 
next=next.add(builder, s, start, sValue); return this; } @Override public Node register(StringTrieBuilder builder) { next=next.register(builder); // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. int maxLinearMatchLength=builder.getMaxLinearMatchLength(); while(length>maxLinearMatchLength) { int nextOffset=stringOffset+length-maxLinearMatchLength; length-=maxLinearMatchLength; LinearMatchNode suffixNode= new LinearMatchNode(strings, nextOffset, maxLinearMatchLength, next); suffixNode.setHashCode(); next=builder.registerNode(suffixNode); } Node result; if(hasValue && !builder.matchNodesCanHaveValues()) { int intermediateValue=value; value=0; hasValue=false; setHashCode(); result=new IntermediateValueNode(intermediateValue, builder.registerNode(this)); } else { setHashCode(); result=this; } return builder.registerNode(result); } @Override public int markRightEdgesFirst(int edgeNumber) { if(offset==0) { offset=edgeNumber=next.markRightEdgesFirst(edgeNumber); } return edgeNumber; } @Override public void write(StringTrieBuilder builder) { next.write(builder); builder.write(stringOffset, length); offset=builder.writeValueAndType(hasValue, value, builder.getMinLinearMatch()+length-1); } // Must be called just before registerNode(this). private void setHashCode() /*const*/ { hash=(0x333333*37+length)*37+next.hashCode(); if(hasValue) { hash=hash*37+value; } for(int i=stringOffset, limit=stringOffset+length; ibuilder.getMaxBranchLinearSubNodeLength()) { // Branch on the middle unit. int middle=start+length/2; return builder.registerNode( new SplitBranchNode( chars.charAt(middle), register(builder, start, middle), register(builder, middle, limit))); } ListBranchNode listNode=new ListBranchNode(length); do { char c=chars.charAt(start); Node node=equal.get(start); if(node.getClass()==ValueNode.class) { // Final value. 
listNode.add(c, ((ValueNode)node).value); } else { listNode.add(c, node.register(builder)); } } while(++start equal=new ArrayList(); } private static abstract class BranchNode extends Node { public BranchNode() {} @Override public int hashCode() /*const*/ { return hash; } protected int hash; protected int firstEdgeNumber; } private static final class ListBranchNode extends BranchNode { public ListBranchNode(int capacity) { hash=0x444444*37+capacity; equal=new Node[capacity]; values=new int[capacity]; units=new char[capacity]; } @Override public boolean equals(Object other) { if(this==other) { return true; } if(!super.equals(other)) { return false; } ListBranchNode o=(ListBranchNode)other; for(int i=0; i0); offset=edgeNumber; } return edgeNumber; } @Override public void write(StringTrieBuilder builder) { // Write the sub-nodes in reverse order: The jump lengths are deltas from // after their own positions, so if we wrote the minUnit sub-node first, // then its jump delta would be larger. // Instead we write the minUnit sub-node last, for a shorter delta. int unitNumber=length-1; Node rightEdge=equal[unitNumber]; int rightEdgeNumber= rightEdge==null ? firstEdgeNumber : rightEdge.getOffset(); do { --unitNumber; if(equal[unitNumber]!=null) { equal[unitNumber].writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder); } } while(unitNumber>0); // The maxUnit sub-node is written as the very last one because we do // not jump for it at all. unitNumber=length-1; if(rightEdge==null) { builder.writeValueAndFinal(values[unitNumber], true); } else { rightEdge.write(builder); } offset=builder.write(units[unitNumber]); // Write the rest of this node's unit-value pairs. while(--unitNumber>=0) { int value; boolean isFinal; if(equal[unitNumber]==null) { // Write the final value for the one string ending with this unit. value=values[unitNumber]; isFinal=true; } else { // Write the delta to the start position of the sub-node. 
assert(equal[unitNumber].getOffset()>0); value=offset-equal[unitNumber].getOffset(); isFinal=false; } builder.writeValueAndFinal(value, isFinal); offset=builder.write(units[unitNumber]); } } // Adds a unit with a final value. public void add(int c, int value) { units[length]=(char)c; equal[length]=null; values[length]=value; ++length; hash=(hash*37+c)*37+value; } // Adds a unit which leads to another match node. public void add(int c, Node node) { units[length]=(char)c; equal[length]=node; values[length]=0; ++length; hash=(hash*37+c)*37+node.hashCode(); } // Note: We could try to reduce memory allocations // by replacing these per-node arrays with per-builder ArrayLists and // (for units) a StringBuilder (or even use its strings for the units too). // It remains to be seen whether that would improve performance. private Node[] equal; // null means "has final value". private int length; private int[] values; private char[] units; } private static final class SplitBranchNode extends BranchNode { public SplitBranchNode(char middleUnit, Node lessThanNode, Node greaterOrEqualNode) { hash=((0x555555*37+middleUnit)*37+ lessThanNode.hashCode())*37+greaterOrEqualNode.hashCode(); unit=middleUnit; lessThan=lessThanNode; greaterOrEqual=greaterOrEqualNode; } @Override public boolean equals(Object other) { if(this==other) { return true; } if(!super.equals(other)) { return false; } SplitBranchNode o=(SplitBranchNode)other; return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual; } @Override public int hashCode() { return super.hashCode(); } @Override public int markRightEdgesFirst(int edgeNumber) { if(offset==0) { firstEdgeNumber=edgeNumber; edgeNumber=greaterOrEqual.markRightEdgesFirst(edgeNumber); offset=edgeNumber=lessThan.markRightEdgesFirst(edgeNumber-1); } return edgeNumber; } @Override public void write(StringTrieBuilder builder) { // Encode the less-than branch first. 
lessThan.writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual.getOffset(), builder); // Encode the greater-or-equal branch last because we do not jump for it at all. greaterOrEqual.write(builder); // Write this node. assert(lessThan.getOffset()>0); builder.writeDeltaTo(lessThan.getOffset()); // less-than offset=builder.write(unit); } private char unit; private Node lessThan; private Node greaterOrEqual; } // Branch head node, for writing the actual node lead unit. private static final class BranchHeadNode extends ValueNode { public BranchHeadNode(int len, Node subNode) { length=len; next=subNode; } @Override public int hashCode() /*const*/ { return (0x666666*37+length)*37+next.hashCode(); } @Override public boolean equals(Object other) { if(this==other) { return true; } if(!super.equals(other)) { return false; } BranchHeadNode o=(BranchHeadNode)other; return length==o.length && next==o.next; } @Override public int markRightEdgesFirst(int edgeNumber) { if(offset==0) { offset=edgeNumber=next.markRightEdgesFirst(edgeNumber); } return edgeNumber; } @Override public void write(StringTrieBuilder builder) { next.write(builder); if(length<=builder.getMinLinearMatch()) { offset=builder.writeValueAndType(hasValue, value, length-1); } else { builder.write(length-1); offset=builder.writeValueAndType(hasValue, value, 0); } } private int length; private Node next; // A branch sub-node. } private ValueNode createSuffixNode(CharSequence s, int start, int sValue) { ValueNode node=registerFinalValue(sValue); if(start nodes=new HashMap(); private ValueNode lookupFinalValueNode=new ValueNode(); }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy