All downloads are free. The search and download features use the official Maven repository.

org.apache.xerces.util.SymbolTable Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.util;

/**
 * This class is a symbol table implementation that guarantees that
 * strings used as identifiers are unique references. Multiple calls
 * to addSymbol will always return the same string
 * reference.
 * 

* The symbol table performs the same task as String.intern() * with the following differences: *

    *
  • * A new string object does not need to be created in order to * retrieve a unique reference. Symbols can be added by using * a series of characters in a character array. *
  • *
  • * Users of the symbol table can provide their own symbol hashing * implementation. For example, a simple string hashing algorithm * may fail to produce a balanced set of hashcodes for symbols * that are mostly unique. Strings with similar leading * characters are especially prone to this poor hashing behavior. *
  • *
* * An instance of SymbolTable has two parameters that affect its * performance: initial capacity and load factor. The * capacity is the number of buckets in the SymbolTable, and the * initial capacity is simply the capacity at the time the SymbolTable * is created. Note that the SymbolTable is open: in the case of a "hash * collision", a single bucket stores multiple entries, which must be searched * sequentially. The load factor is a measure of how full the SymbolTable * is allowed to get before its capacity is automatically increased. * When the number of entries in the SymbolTable exceeds the product of the load * factor and the current capacity, the capacity is increased by calling the * rehash method.

* * Generally, the default load factor (.75) offers a good tradeoff between * time and space costs. Higher values decrease the space overhead but * increase the time cost to look up an entry (which is reflected in most * SymbolTable operations, including addSymbol and containsSymbol).

* * The initial capacity controls a tradeoff between wasted space and the * need for rehash operations, which are time-consuming. * No rehash operations will ever occur if the initial * capacity is greater than the maximum number of entries the * Hashtable will contain divided by its load factor. However, * setting the initial capacity too high can waste space.

* * If many entries are to be made into a SymbolTable, * creating it with a sufficiently large capacity may allow the * entries to be inserted more efficiently than letting it perform * automatic rehashing as needed to grow the table.

* @see SymbolHash * * @author Andy Clark * @author John Kim, IBM * * @version $Id: SymbolTable.java 1358351 2012-07-06 19:04:57Z mrglavas $ */ public class SymbolTable { // // Constants // /** Default table size. */ protected static final int TABLE_SIZE = 101; /** Maximum hash collisions per bucket for a table with load factor == 1. */ protected static final int MAX_HASH_COLLISIONS = 40; protected static final int MULTIPLIERS_SIZE = 1 << 5; protected static final int MULTIPLIERS_MASK = MULTIPLIERS_SIZE - 1; // // Data // /** Buckets. */ protected Entry[] fBuckets = null; /** actual table size **/ protected int fTableSize; /** The total number of entries in the hash table. */ protected transient int fCount; /** The table is rehashed when its size exceeds this threshold. (The * value of this field is (int)(capacity * loadFactor).) */ protected int fThreshold; /** The load factor for the SymbolTable. */ protected float fLoadFactor; /** * A new hash function is selected and the table is rehashed when * the number of keys in the bucket exceeds this threshold. */ protected final int fCollisionThreshold; /** * Array of randomly selected hash function multipliers or null * if the default String.hashCode() function should be used. */ protected int[] fHashMultipliers; // // Constructors // /** * Constructs a new, empty SymbolTable with the specified initial * capacity and the specified load factor. * * @param initialCapacity the initial capacity of the SymbolTable. * @param loadFactor the load factor of the SymbolTable. * @throws IllegalArgumentException if the initial capacity is less * than zero, or if the load factor is nonpositive. 
*/ public SymbolTable(int initialCapacity, float loadFactor) { if (initialCapacity < 0) { throw new IllegalArgumentException("Illegal Capacity: " + initialCapacity); } if (loadFactor <= 0 || Float.isNaN(loadFactor)) { throw new IllegalArgumentException("Illegal Load: " + loadFactor); } if (initialCapacity == 0) { initialCapacity = 1; } fLoadFactor = loadFactor; fTableSize = initialCapacity; fBuckets = new Entry[fTableSize]; fThreshold = (int)(fTableSize * loadFactor); fCollisionThreshold = (int)(MAX_HASH_COLLISIONS * loadFactor); fCount = 0; } /** * Constructs a new, empty SymbolTable with the specified initial capacity * and default load factor, which is 0.75. * * @param initialCapacity the initial capacity of the hashtable. * @throws IllegalArgumentException if the initial capacity is less * than zero. */ public SymbolTable(int initialCapacity) { this(initialCapacity, 0.75f); } /** * Constructs a new, empty SymbolTable with a default initial capacity (101) * and load factor, which is 0.75. */ public SymbolTable() { this(TABLE_SIZE, 0.75f); } // // Public methods // /** * Adds the specified symbol to the symbol table and returns a * reference to the unique symbol. If the symbol already exists, * the previous symbol reference is returned instead, in order * guarantee that symbol references remain unique. * * @param symbol The new symbol. 
*/ public String addSymbol(String symbol) { // search for identical symbol int collisionCount = 0; int bucket = hash(symbol) % fTableSize; for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) { if (entry.symbol.equals(symbol)) { return entry.symbol; } ++collisionCount; } return addSymbol0(symbol, bucket, collisionCount); } // addSymbol(String):String private String addSymbol0(String symbol, int bucket, int collisionCount) { if (fCount >= fThreshold) { // Rehash the table if the threshold is exceeded rehash(); bucket = hash(symbol) % fTableSize; } else if (collisionCount >= fCollisionThreshold) { // Select a new hash function and rehash the table if // the collision threshold is exceeded. rebalance(); bucket = hash(symbol) % fTableSize; } // create new entry Entry entry = new Entry(symbol, fBuckets[bucket]); fBuckets[bucket] = entry; ++fCount; return entry.symbol; } // addSymbol0(String,int,int):String /** * Adds the specified symbol to the symbol table and returns a * reference to the unique symbol. If the symbol already exists, * the previous symbol reference is returned instead, in order * guarantee that symbol references remain unique. * * @param buffer The buffer containing the new symbol. * @param offset The offset into the buffer of the new symbol. * @param length The length of the new symbol in the buffer. 
*/ public String addSymbol(char[] buffer, int offset, int length) { // search for identical symbol int collisionCount = 0; int bucket = hash(buffer, offset, length) % fTableSize; OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) { if (length == entry.characters.length) { for (int i = 0; i < length; i++) { if (buffer[offset + i] != entry.characters[i]) { ++collisionCount; continue OUTER; } } return entry.symbol; } ++collisionCount; } return addSymbol0(buffer, offset, length, bucket, collisionCount); } // addSymbol(char[],int,int):String private String addSymbol0(char[] buffer, int offset, int length, int bucket, int collisionCount) { if (fCount >= fThreshold) { // Rehash the table if the threshold is exceeded rehash(); bucket = hash(buffer, offset, length) % fTableSize; } else if (collisionCount >= fCollisionThreshold) { // Select a new hash function and rehash the table if // the collision threshold is exceeded. rebalance(); bucket = hash(buffer, offset, length) % fTableSize; } // add new entry Entry entry = new Entry(buffer, offset, length, fBuckets[bucket]); fBuckets[bucket] = entry; ++fCount; return entry.symbol; } // addSymbol0(char[],int,int,int,int):String /** * Returns a hashcode value for the specified symbol. The value * returned by this method must be identical to the value returned * by the hash(char[],int,int) method when called * with the character array that comprises the symbol string. * * @param symbol The symbol to hash. */ public int hash(String symbol) { if (fHashMultipliers == null) { return symbol.hashCode() & 0x7FFFFFFF; } return hash0(symbol); } // hash(String):int private int hash0(String symbol) { int code = 0; final int length = symbol.length(); final int[] multipliers = fHashMultipliers; for (int i = 0; i < length; ++i) { code = code * multipliers[i & MULTIPLIERS_MASK] + symbol.charAt(i); } return code & 0x7FFFFFFF; } // hash0(String):int /** * Returns a hashcode value for the specified symbol information. 
* The value returned by this method must be identical to the value * returned by the hash(String) method when called * with the string object created from the symbol information. * * @param buffer The character buffer containing the symbol. * @param offset The offset into the character buffer of the start * of the symbol. * @param length The length of the symbol. */ public int hash(char[] buffer, int offset, int length) { if (fHashMultipliers == null) { int code = 0; for (int i = 0; i < length; ++i) { code = code * 31 + buffer[offset + i]; } return code & 0x7FFFFFFF; } return hash0(buffer, offset, length); } // hash(char[],int,int):int private int hash0(char[] buffer, int offset, int length) { int code = 0; final int[] multipliers = fHashMultipliers; for (int i = 0; i < length; ++i) { code = code * multipliers[i & MULTIPLIERS_MASK] + buffer[offset + i]; } return code & 0x7FFFFFFF; } // hash0(char[],int,int):int /** * Increases the capacity of and internally reorganizes this * SymbolTable, in order to accommodate and access its entries more * efficiently. This method is called automatically when the * number of keys in the SymbolTable exceeds this hashtable's capacity * and load factor. */ protected void rehash() { rehashCommon(fBuckets.length * 2 + 1); } /** * Randomly selects a new hash function and reorganizes this SymbolTable * in order to more evenly distribute its entries across the table. This * method is called automatically when the number keys in one of the * SymbolTable's buckets exceeds the given collision threshold. 
*/ protected void rebalance() { if (fHashMultipliers == null) { fHashMultipliers = new int[MULTIPLIERS_SIZE]; } PrimeNumberSequenceGenerator.generateSequence(fHashMultipliers); rehashCommon(fBuckets.length); } private void rehashCommon(final int newCapacity) { int oldCapacity = fBuckets.length; Entry[] oldTable = fBuckets; Entry[] newTable = new Entry[newCapacity]; fThreshold = (int)(newCapacity * fLoadFactor); fBuckets = newTable; fTableSize = fBuckets.length; for (int i = oldCapacity ; i-- > 0 ;) { for (Entry old = oldTable[i] ; old != null ; ) { Entry e = old; old = old.next; int index = hash(e.symbol) % newCapacity; e.next = newTable[index]; newTable[index] = e; } } } /** * Returns true if the symbol table already contains the specified * symbol. * * @param symbol The symbol to look for. */ public boolean containsSymbol(String symbol) { // search for identical symbol int bucket = hash(symbol) % fTableSize; int length = symbol.length(); OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) { if (length == entry.characters.length) { for (int i = 0; i < length; i++) { if (symbol.charAt(i) != entry.characters[i]) { continue OUTER; } } return true; } } return false; } // containsSymbol(String):boolean /** * Returns true if the symbol table already contains the specified * symbol. * * @param buffer The buffer containing the symbol to look for. * @param offset The offset into the buffer. * @param length The length of the symbol in the buffer. 
*/ public boolean containsSymbol(char[] buffer, int offset, int length) { // search for identical symbol int bucket = hash(buffer, offset, length) % fTableSize; OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) { if (length == entry.characters.length) { for (int i = 0; i < length; i++) { if (buffer[offset + i] != entry.characters[i]) { continue OUTER; } } return true; } } return false; } // containsSymbol(char[],int,int):boolean // // Classes // /** * This class is a symbol table entry. Each entry acts as a node * in a linked list. */ protected static final class Entry { // // Data // /** Symbol. */ public final String symbol; /** * Symbol characters. This information is duplicated here for * comparison performance. */ public final char[] characters; /** The next entry. */ public Entry next; // // Constructors // /** * Constructs a new entry from the specified symbol and next entry * reference. */ public Entry(String symbol, Entry next) { this.symbol = symbol.intern(); characters = new char[symbol.length()]; symbol.getChars(0, characters.length, characters, 0); this.next = next; } /** * Constructs a new entry from the specified symbol information and * next entry reference. */ public Entry(char[] ch, int offset, int length, Entry next) { characters = new char[length]; System.arraycopy(ch, offset, characters, 0, length); symbol = new String(characters).intern(); this.next = next; } } // class Entry } // class SymbolTable





© 2015 - 2024 Weber Informatics LLC | Privacy Policy