com.fasterxml.aalto.out.WNameTable Maven / Gradle / Ivy
Show all versions of aalto-xml Show documentation
/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.out;
import javax.xml.stream.XMLStreamException;
import com.fasterxml.aalto.util.NameTable;
/**
* This is a symbol table implementation used for storing byte-based
* WName
s.
*/
public final class WNameTable
extends NameTable
{
final static int MIN_HASH_SIZE = 16;
final static int INITIAL_COLLISION_LEN = 32;
/**
* Bucket index is 8 bits, and value 0 is reserved to represent
* 'empty' status.
*/
final static int LAST_VALID_BUCKET = 0xFE;
/*
/**********************************************************************
/* Related objects
/**********************************************************************
*/
final WNameFactory mNameFactory;
/**
* Parent reference is needed to be able to merge new symbols
* if and as necessary
*/
final WNameTable mParent;
/*
/**********************************************************************
/* Main table state
/**********************************************************************
*/
// // // First, global information
/**
* Total number of WNames in the symbol table
*/
private int mCount;
// // // Then information regarding primary hash array and its
// // // matching WName array
/**
* Mask used to truncate 32-bit hash value to current hash array
* size; essentially, hash array size - 1 (since hash array sizes
* are 2^N).
*/
private int mMainHashMask;
/**
* Array of 2^N size, which contains combination
* of 24-bits of hash (0 to indicate 'empty' slot),
* and 8-bit collision bucket index (0 to indicate empty
* collision bucket chain; otherwise subtract one from index)
*/
private int[] mMainHash;
/**
* Array that contains WName
instances matching
* entries in mMainHash
. Contains nulls for unused
* entries.
*/
private WName[] mMainNames;
// // // Then the collision/spill-over area info
/**
* Array of heads of collision bucket chains; size dynamically
*/
private Bucket[] mCollList;
/**
* Total number of WNames in collision buckets (included in
* mCount
along with primary entries)
*/
private int mCollCount;
/**
* Index of the first unused collision bucket entry (== size of
* the used portion of collision list): less than
* or equal to 0xFF (255), since max number of entries is 255
* (8-bit, minus 0 used as 'empty' marker)
*/
private int mCollEnd;
// // // Info regarding pending rehashing...
/**
* This flag is set if, after adding a new entry, it is deemed
* that a rehash is warranted if any more entries are to be added.
*/
private transient boolean mNeedRehash;
/*
/**********************************************************************
/* Sharing, versioning
/**********************************************************************
*/
// // // Which of the buffers may be shared (and are copy-on-write)?
/**
* Flag that indicates whether underlying data structures for
* the main hash area are shared or not. If they are, then they
* need to be handled in copy-on-write way, i.e. if they need
* to be modified, a copy needs to be made first; at this point
* it will not be shared any more, and can be modified.
*
* This flag needs to be checked both when adding new main entries,
* and when adding new collision list queues (i.e. creating a new
* collision list head entry)
*/
private boolean mMainHashShared;
private boolean mMainNamesShared;
/**
* Flag that indicates whether underlying data structures for
* the collision list are shared or not. If they are, then they
* need to be handled in copy-on-write way, i.e. if they need
* to be modified, a copy needs to be made first; at this point
* it will not be shared any more, and can be modified.
*
* This flag needs to be checked when adding new collision entries.
*/
private boolean mCollListShared;
/*
/**********************************************************************
/* Construction, merging
/**********************************************************************
*/
protected WNameTable(int hashSize)
{
mNameFactory = null;
mParent = null;
/* Sanity check: let's now allow hash sizes below certain
* min. value
*/
if (hashSize < MIN_HASH_SIZE) {
hashSize = MIN_HASH_SIZE;
} else {
/* Also; size must be 2^N; otherwise hash algorithm won't
* work... so let's just pad it up, if so
*/
if ((hashSize & (hashSize - 1)) != 0) { // only true if it's 2^N
int curr = MIN_HASH_SIZE;
while (curr < hashSize) {
curr += curr;
}
//System.out.println("WARNING: hashSize "+hashSize+" illegal; padding up to "+curr);
hashSize = curr;
}
}
mCount = 0;
mMainHashShared = false;
mMainNamesShared = false;
mMainHashMask = hashSize - 1;
mMainHash = new int[hashSize];
mMainNames = new WName[hashSize];
mCollListShared = true; // just since it'll need to be allocated
mCollList = null;
mCollEnd = 0;
mNeedRehash = false;
}
/**
* Constructor used when creating a child instance
*/
private WNameTable(WNameTable parent, WNameFactory f)
{
mParent = parent;
mNameFactory = f;
// First, let's copy the state as is:
mCount = parent.mCount;
mMainHashMask = parent.mMainHashMask;
mMainHash = parent.mMainHash;
mMainNames = parent.mMainNames;
mCollList = parent.mCollList;
mCollCount = parent.mCollCount;
mCollEnd = parent.mCollEnd;
mNeedRehash = false;
// And consider all shared, so far:
mMainHashShared = true;
mMainNamesShared = true;
mCollListShared = true;
}
protected synchronized WNameTable createChild(WNameFactory f)
{
return new WNameTable(this, f);
}
public boolean mergeToParent()
{
boolean changed = mParent.mergeFromChild(this);
/* Plus, as an added safety measure, let's mark child buffers
* as shared, just in case it might still be used:
*/
markAsShared();
return changed;
}
private synchronized boolean mergeFromChild(WNameTable child)
{
// Only makes sense if child has more entries
if (child.mCount <= mCount) {
return false;
}
//System.out.print("["+mCount+"->"+child.mCount+"/"+mMainHash.length+"]");
mCount = child.mCount;
mMainHashMask = child.mMainHashMask;
mMainHash = child.mMainHash;
mMainNames = child.mMainNames;
mCollList = child.mCollList;
mCollCount = child.mCollCount;
mCollEnd = child.mCollEnd;
return true;
}
public void markAsShared()
{
mMainHashShared = true;
mMainNamesShared = true;
mCollListShared = true;
}
/**
* Method used by test code, to reset state of the name table.
*/
public void nuke() {
mMainHash = null;
mMainNames = null;
mCollList = null;
}
/*
/**********************************************************************
/* API, accessors
/**********************************************************************
*/
@Override
public int size() { return mCount; }
/**
* Method called to check to quickly see if a child symbol table
* may have gotten additional entries. Used for checking to see
* if a child table should be merged into shared table.
*/
@Override
public boolean maybeDirty() {
return !mMainHashShared;
}
public WName findSymbol(String localName)
throws XMLStreamException
{
int hash = localName.hashCode();
int ix = (hash & mMainHashMask);
int val = mMainHash[ix];
/* High 24 bits of the value are low 24 bits of hash (low 8 bits
* are bucket index)... match?
*/
if ((((val >> 8) ^ hash) << 8) == 0) { // match
// Ok, but do we have an actual match?
WName wname = mMainNames[ix];
if (wname != null) {
if (wname.hasName(localName)) {
return wname;
}
}
}
if (val != 0) { // 0 == empty slot
// Maybe a spill-over?
val &= 0xFF;
if (val > 0) { // 0 means 'empty'
val -= 1; // to convert from 1-based to 0...
Bucket bucket = mCollList[val];
if (bucket != null) {
WName name = bucket.find(localName);
if (name != null) {
return name;
}
}
}
}
// Nope, no match. Have to construct (and add) one
WName name = mNameFactory.constructName(localName);
addSymbol(name);
return name;
}
/**
* Finds and returns name matching the specified symbol, if such
* name already exists in the table; or if not, creates name object,
* adds to the table, and returns it.
*/
public WName findSymbol(String prefix, String localName)
throws XMLStreamException
{
int hash = localName.hashCode() ^ prefix.hashCode();
int ix = (hash & mMainHashMask);
int val = mMainHash[ix];
/* High 24 bits of the value are low 24 bits of hash (low 8 bits
* are bucket index)... match?
*/
if ((((val >> 8) ^ hash) << 8) == 0) { // match
// Ok, but do we have an actual match?
WName wname = mMainNames[ix];
if (wname != null) {
if (wname.hasName(prefix, localName)) {
return wname;
}
}
}
if (val != 0) { // 0 == empty slot
// Maybe a spill-over?
val &= 0xFF;
if (val > 0) { // 0 means 'empty'
val -= 1; // to convert from 1-based to 0...
Bucket bucket = mCollList[val];
if (bucket != null) {
WName name = bucket.find(prefix, localName);
if (name != null) {
return name;
}
}
}
}
// Nope, no match. Have to construct (and add) one
WName name = mNameFactory.constructName(prefix, localName);
addSymbol(name);
return name;
}
/*
/**********************************************************************
/* Standard methods
/**********************************************************************
*/
@Override
public String toString()
{
StringBuilder sb = new StringBuilder();
sb.append("[WNameTable, size: ");
sb.append(mCount);
sb.append('/');
sb.append(mMainHash.length);
sb.append(", ");
sb.append(mCollCount);
sb.append(" coll; avg length: ");
/* Average length: minimum of 1 for all (1 == primary hit);
* and then 1 per each traversal for collisions/buckets
*/
//int maxDist = 1;
int pathCount = mCount;
for (int i = 0; i < mCollEnd; ++i) {
int spillLen = mCollList[i].length();
for (int j = 1; j <= spillLen; ++j) {
pathCount += j;
}
}
double avgLength;
if (mCount == 0) {
avgLength = 0.0;
} else {
avgLength = (double) pathCount / (double) mCount;
}
// let's round up a bit (two 2 decimal places)
//avgLength -= (avgLength % 0.01);
sb.append(avgLength);
sb.append(']');
return sb.toString();
}
// Not really a std method... but commonly useful
public String toDebugString()
{
StringBuilder sb = new StringBuilder();
sb.append("[WNameTable, size: ");
sb.append(mCount);
sb.append('/');
sb.append(mMainHash.length);
sb.append(" -> ");
for (int i = 0; i < mMainHash.length; ++i) {
sb.append("\n#");
sb.append(i);
sb.append(": 0x");
sb.append(Integer.toHexString(mMainHash[i]));
sb.append(" == ");
WName name = mMainNames[i];
if (name == null) {
sb.append("null");
} else {
sb.append('"');
sb.append(name.toString());
sb.append('"');
}
}
sb.append("\nSpill(");
sb.append(mCollEnd);
sb.append("):");
for (int i = 0; i < mCollEnd; ++i) {
Bucket bucket = mCollList[i];
sb.append("\nsp#");
sb.append(i);
sb.append(": ");
sb.append(bucket.toDebugString());
}
return sb.toString();
}
/*
/**********************************************************************
/* Internal methods
/**********************************************************************
*/
private void addSymbol(WName symbol)
{
if (mMainHashShared) { // always have to modify main entry
unshareMain();
}
// First, do we need to rehash?
if (mNeedRehash) {
rehash();
}
int hash = symbol.hashCode();
++mCount;
/* Ok, enough about set up: now we need to find the slot to add
* symbol in:
*/
int ix = (hash & mMainHashMask);
if (mMainNames[ix] == null) { // primary empty?
mMainHash[ix] = (hash << 8);
if (mMainNamesShared) {
unshareNames();
}
mMainNames[ix] = symbol;
} else { // nope, it's a collision, need to spill over
/* How about spill-over area... do we already know the bucket
* (is the case if it's not the first collision)
*/
if (mCollListShared) {
unshareCollision(); // also allocates if list was null
}
++mCollCount;
int entryValue = mMainHash[ix];
int bucket = entryValue & 0xFF;
if (bucket == 0) { // first spill over?
if (mCollEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket
bucket = mCollEnd;
++mCollEnd;
// need to expand?
if (bucket >= mCollList.length) {
expandCollision();
}
} else { // nope, have to share... let's find shortest?
bucket = findBestBucket();
}
// Need to mark the entry... and the spill index is 1-based
mMainHash[ix] = (entryValue & ~0xFF) | (bucket + 1);
} else {
--bucket; // 1-based index in value
}
// And then just need to link the new bucket entry in
mCollList[bucket] = new Bucket(symbol, mCollList[bucket]);
}
/* Ok. Now, do we need a rehash next time? Need to have at least
* 50% fill rate no matter what:
*/
{
int hashSize = mMainHash.length;
if (mCount > (hashSize >> 1)) {
int hashQuarter = (hashSize >> 2);
/* And either strictly above 75% (the usual) or
* just 50%, and collision count >= 25% of total hash size
*/
if (mCount > (hashSize - hashQuarter)) {
mNeedRehash = true;
} else if (mCollCount >= hashQuarter) {
mNeedRehash = true;
}
}
}
}
private void rehash()
{
mNeedRehash = false;
// Note: since we'll make copies, no need to unshare, can just mark as such:
mMainNamesShared = false;
/* And then we can first deal with the main hash area. Since we
* are expanding linearly (double up), we know there'll be no
* collisions during this phase.
*/
int symbolsSeen = 0; // let's do a sanity check
int[] oldMainHash = mMainHash;
int len = oldMainHash.length;
mMainHash = new int[len + len];
mMainHashMask = (len + len - 1);
WName[] oldNames = mMainNames;
mMainNames = new WName[len + len];
for (int i = 0; i < len; ++i) {
WName symbol = oldNames[i];
if (symbol != null) {
++symbolsSeen;
int hash = symbol.hashCode();
int ix = (hash & mMainHashMask);
mMainNames[ix] = symbol;
mMainHash[ix] = hash << 8; // will clear spill index
}
}
/* And then the spill area. This may cause collisions, although
* not necessarily as many as there were earlier. Let's allocate
* same amount of space, however
*/
int oldEnd = mCollEnd;
if (oldEnd == 0) { // no prior collisions...
return;
}
mCollCount = 0;
mCollEnd = 0;
mCollListShared = false;
Bucket[] oldBuckets = mCollList;
mCollList = new Bucket[oldBuckets.length];
for (int i = 0; i < oldEnd; ++i) {
for (Bucket curr = oldBuckets[i]; curr != null; curr = curr.mNext) {
++symbolsSeen;
WName symbol = curr.mName;
int hash = symbol.hashCode();
int ix = (hash & mMainHashMask);
int val = mMainHash[ix];
if (mMainNames[ix] == null) { // no primary entry?
mMainHash[ix] = (hash << 8);
mMainNames[ix] = symbol;
} else { // nope, it's a collision, need to spill over
++mCollCount;
int bucket = val & 0xFF;
if (bucket == 0) { // first spill over?
if (mCollEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket
bucket = mCollEnd;
++mCollEnd;
// need to expand?
if (bucket >= mCollList.length) {
expandCollision();
}
} else { // nope, have to share... let's find shortest?
bucket = findBestBucket();
}
// Need to mark the entry... and the spill index is 1-based
mMainHash[ix] = (val & ~0xFF) | (bucket + 1);
} else {
--bucket; // 1-based index in value
}
// And then just need to link the new bucket entry in
mCollList[bucket] = new Bucket(symbol, mCollList[bucket]);
}
} // for (... buckets in the chain ...)
} // for (... list of bucket heads ... )
if (symbolsSeen != mCount) { // sanity check
throw new Error("Internal error: count after rehash "+symbolsSeen+"; should be "+mCount);
}
}
/**
* Method called to find the best bucket to spill a WName over to:
* usually the first bucket that has only one entry, but in general
* first one of the buckets with least number of entries
*/
private int findBestBucket()
{
Bucket[] buckets = mCollList;
int bestCount = Integer.MAX_VALUE;
int bestIx = -1;
for (int i = 0, len = mCollEnd; i < len; ++i) {
int count = buckets[i].length();
if (count < bestCount) {
if (count == 1) { // best possible
return i;
}
bestCount = count;
bestIx = i;
}
}
return bestIx;
}
/**
* Method that needs to be called, if the main hash structure
* is (may be) shared. This happens every time something is added,
* even if addition is to the collision list (since collision list
* index comes from lowest 8 bits of the primary hash entry)
*/
private void unshareMain()
{
int[] old = mMainHash;
int len = mMainHash.length;
mMainHash = new int[len];
System.arraycopy(old, 0, mMainHash, 0, len);
mMainHashShared = false;
}
private void unshareCollision()
{
Bucket[] old = mCollList;
if (old == null) {
mCollList = new Bucket[INITIAL_COLLISION_LEN];
} else {
int len = old.length;
mCollList = new Bucket[len];
System.arraycopy(old, 0, mCollList, 0, len);
}
mCollListShared = false;
}
private void unshareNames()
{
WName[] old = mMainNames;
int len = old.length;
mMainNames = new WName[len];
System.arraycopy(old, 0, mMainNames, 0, len);
mMainNamesShared = false;
}
private void expandCollision()
{
Bucket[] old = mCollList;
int len = old.length;
mCollList = new Bucket[len+len];
System.arraycopy(old, 0, mCollList, 0, len);
}
/*
/**********************************************************************
/* Helper classes
/**********************************************************************
*/
final static class Bucket
{
final WName mName;
final Bucket mNext;
Bucket(WName name, Bucket next)
{
mName = name;
mNext = next;
}
public int length()
{
int len = 1;
for (Bucket curr = mNext; curr != null; curr = curr.mNext) {
++len;
}
return len;
}
public WName find(String localName)
{
if (mName.hasName(localName)) {
return mName;
}
for (Bucket curr = mNext; curr != null; curr = curr.mNext) {
WName currName = curr.mName;
if (currName.hasName(localName)) {
return currName;
}
}
return null;
}
public WName find(String prefix, String localName)
{
if (mName.hasName(prefix, localName)) {
return mName;
}
for (Bucket curr = mNext; curr != null; curr = curr.mNext) {
WName currName = curr.mName;
if (currName.hasName(prefix, localName)) {
return currName;
}
}
return null;
}
public String toDebugString()
{
StringBuilder sb = new StringBuilder();
sb.append("[Bucket(");
sb.append(length());
sb.append("): ");
for (Bucket curr = this; curr != null; curr = curr.mNext) {
sb.append('"');
sb.append(curr.mName.toString());
sb.append("\" -> ");
}
sb.append("NULL]");
return sb.toString();
}
}
}