All downloads are free. Search and download functionality uses the official Maven repository.

com.fasterxml.jackson.core.sym.BytesToNameCanonicalizer Maven / Gradle / Ivy

The newest version!
package com.fasterxml.jackson.core.sym;

import java.util.Arrays;
import java.util.BitSet;
import java.util.concurrent.atomic.AtomicReference;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.util.InternCache;

/**
 * A caching symbol table implementation used for canonicalizing JSON field
 * names (as {@link Name}s which are constructed directly from a byte-based
 * input source).
 * Complications arise from trying to do efficient reuse and merging of
 * symbol tables, to be able to make use of usually shared vocabulary
 * of subsequent parsing runs.
 *
 * @author Tatu Saloranta
 */
public final class BytesToNameCanonicalizer
{
    private static final int DEFAULT_T_SIZE = 64;

    /**
     * Let's not expand symbol tables past some maximum size;
     * this should protected against OOMEs caused by large documents
     * with unique (~= random) names.
     */
    private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem
    
    /**
     * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k;
     * this corresponds to 64k main hash index. This should allow for enough distinct
     * names for almost any case.
     */
    private final static int MAX_ENTRIES_FOR_REUSE = 6000;

    /**
     * Also: to thwart attacks based on hash collisions (which may or may not
     * be cheap to calculate), we will need to detect "too long"
     * collision chains.
     *

* Note: longest chain we have been able to produce without malicious * intent has been 10 (with "com.fasterxml.jackson.core.sym.TestSymbolTables"); * our setting should be reasonable here. Also note that overflow * chains are shared between multiple primary cells, which could cause * problems for lower values. *

* Also note that value was lowered from 255 (2.3 and earlier) to 100 for 2.4, * but raised again to 200 for 2.5.2 (as per [core#187]) * * @since 2.1 */ private final static int MAX_COLL_CHAIN_LENGTH = 200; /** * No point in trying to construct tiny tables, just need to resize soon. */ final static int MIN_HASH_SIZE = 16; /** * We will also need to define initial size for collision list, * when copying it. */ final static int INITIAL_COLLISION_LEN = 32; /** * Bucket index is 8 bits, and value 0 is reserved to represent * 'empty' status. */ final static int LAST_VALID_BUCKET = 0xFE; /* /********************************************************** /* Linkage, needed for merging symbol tables /********************************************************** */ /** * Reference to the root symbol table, for child tables, so * that they can merge table information back as necessary. */ final protected BytesToNameCanonicalizer _parent; /** * Member that is only used by the root table instance: root * passes immutable state into child instances, and children * may return new state if they add entries to the table. * Child tables do NOT use the reference. */ final protected AtomicReference _tableInfo; /** * Seed value we use as the base to make hash codes non-static between * different runs, but still stable for lifetime of a single symbol table * instance. * This is done for security reasons, to avoid potential DoS attack via * hash collisions. * * @since 2.1 */ final private int _seed; /* /********************************************************** /* Configuration /********************************************************** */ /** * Whether canonical symbol Strings are to be intern()ed before added * to the table or not. *

* NOTE: non-final to allow disabling intern()ing in case of excessive * collisions. */ protected boolean _intern; /** * Flag that indicates whether we should throw an exception if enough * hash collisions are detected (true); or just worked around (false). * * @since 2.4 */ protected final boolean _failOnDoS; /* /********************************************************** /* Main table state /********************************************************** */ // // // First, global information /** * Total number of Names in the symbol table; * only used for child tables. */ protected int _count; /** * We need to keep track of the longest collision list; this is needed * both to indicate problems with attacks and to allow flushing for * other cases. * * @since 2.1 */ protected int _longestCollisionList; // // // Then information regarding primary hash array and its // // // matching Name array /** * Mask used to truncate 32-bit hash value to current hash array * size; essentially, hash array size - 1 (since hash array sizes * are 2^N). */ protected int _hashMask; /** * Array of 2^N size, which contains combination * of 24-bits of hash (0 to indicate 'empty' slot), * and 8-bit collision bucket index (0 to indicate empty * collision bucket chain; otherwise subtract one from index) */ protected int[] _hash; /** * Array that contains Name instances matching * entries in _mainHash. Contains nulls for unused * entries. 
*/ protected Name[] _mainNames; // // // Then the collision/spill-over area info /** * Array of heads of collision bucket chains; size dynamically */ protected Bucket[] _collList; /** * Total number of Names in collision buckets (included in * _count along with primary entries) */ protected int _collCount; /** * Index of the first unused collision bucket entry (== size of * the used portion of collision list): less than * or equal to 0xFF (255), since max number of entries is 255 * (8-bit, minus 0 used as 'empty' marker) */ protected int _collEnd; // // // Info regarding pending rehashing... /** * This flag is set if, after adding a new entry, it is deemed * that a rehash is warranted if any more entries are to be added. */ private transient boolean _needRehash; /* /********************************************************** /* Sharing, versioning /********************************************************** */ // // // Which of the buffers may be shared (and are copy-on-write)? /** * Flag that indicates whether underlying data structures for * the main hash area are shared or not. If they are, then they * need to be handled in copy-on-write way, i.e. if they need * to be modified, a copy needs to be made first; at this point * it will not be shared any more, and can be modified. *

* This flag needs to be checked both when adding new main entries, * and when adding new collision list queues (i.e. creating a new * collision list head entry) */ private boolean _hashShared; private boolean _namesShared; /** * Flag that indicates whether underlying data structures for * the collision list are shared or not. If they are, then they * need to be handled in copy-on-write way, i.e. if they need * to be modified, a copy needs to be made first; at this point * it will not be shared any more, and can be modified. *

* This flag needs to be checked when adding new collision entries. */ private boolean _collListShared; /* /********************************************************** /* Bit of DoS detection goodness /********************************************************** */ /** * Lazily constructed structure that is used to keep track of * collision buckets that have overflowed once: this is used * to detect likely attempts at denial-of-service attacks that * uses hash collisions. * * @since 2.4 */ protected BitSet _overflows; /* /********************************************************** /* Life-cycle: constructors /********************************************************** */ /** * Constructor used for creating per-JsonFactory "root" * symbol tables: ones used for merging and sharing common symbols * * @param sz Initial hash area size * @param intern Whether Strings contained should be {@link String#intern}ed * @param seed Random seed valued used to make it more difficult to cause * collisions (used for collision-based DoS attacks). */ private BytesToNameCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) { _parent = null; _seed = seed; _intern = intern; _failOnDoS = failOnDoS; // Sanity check: let's now allow hash sizes below certain minimum value if (sz < MIN_HASH_SIZE) { sz = MIN_HASH_SIZE; } else { /* Also; size must be 2^N; otherwise hash algorithm won't * work... 
so let's just pad it up, if so */ if ((sz & (sz - 1)) != 0) { // only true if it's 2^N int curr = MIN_HASH_SIZE; while (curr < sz) { curr += curr; } sz = curr; } } _tableInfo = new AtomicReference(initTableInfo(sz)); } /** * Constructor used when creating a child instance */ private BytesToNameCanonicalizer(BytesToNameCanonicalizer parent, boolean intern, int seed, boolean failOnDoS, TableInfo state) { _parent = parent; _seed = seed; _intern = intern; _failOnDoS = failOnDoS; _tableInfo = null; // not used by child tables // Then copy shared state _count = state.count; _hashMask = state.mainHashMask; _hash = state.mainHash; _mainNames = state.mainNames; _collList = state.collList; _collCount = state.collCount; _collEnd = state.collEnd; _longestCollisionList = state.longestCollisionList; // and then set other state to reflect sharing status _needRehash = false; _hashShared = true; _namesShared = true; _collListShared = true; } /* public TableInfo(int count, int mainHashMask, int[] mainHash, Name[] mainNames, Bucket[] collList, int collCount, int collEnd, int longestCollisionList) */ private TableInfo initTableInfo(int sz) { return new TableInfo(0, // count sz - 1, // mainHashMask new int[sz], // mainHash new Name[sz], // mainNames null, // collList 0, // collCount, 0, // collEnd 0 // longestCollisionList ); } /* /********************************************************** /* Life-cycle: factory methods, merging /********************************************************** */ /** * Factory method to call to create a symbol table instance with a * randomized seed value. */ public static BytesToNameCanonicalizer createRoot() { /* [Issue-21]: Need to use a variable seed, to thwart hash-collision * based attacks. 
*/ long now = System.currentTimeMillis(); // ensure it's not 0; and might as well require to be odd so: int seed = (((int) now) + ((int) (now >>> 32))) | 1; return createRoot(seed); } /** * Factory method that should only be called from unit tests, where seed * value should remain the same. */ protected static BytesToNameCanonicalizer createRoot(int seed) { return new BytesToNameCanonicalizer(DEFAULT_T_SIZE, true, seed, true); } /** * Factory method used to create actual symbol table instance to * use for parsing. */ public BytesToNameCanonicalizer makeChild(int flags) { return new BytesToNameCanonicalizer(this, JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags), _seed, JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags), _tableInfo.get()); } @Deprecated // since 2.4 public BytesToNameCanonicalizer makeChild(boolean canonicalize, boolean intern) { return new BytesToNameCanonicalizer(this, intern, _seed, true, // JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW _tableInfo.get()); } /** * Method called by the using code to indicate it is done * with this instance. This lets instance merge accumulated * changes into parent (if need be), safely and efficiently, * and without calling code having to know about parent * information */ public void release() { // we will try to merge if child table has new entries if (_parent != null && maybeDirty()) { _parent.mergeChild(new TableInfo(this)); /* Let's also mark this instance as dirty, so that just in * case release was too early, there's no corruption of possibly shared data. */ _hashShared = true; _namesShared = true; _collListShared = true; } } private void mergeChild(TableInfo childState) { final int childCount = childState.count; TableInfo currState = _tableInfo.get(); /* Should usually grow; but occasionally could also shrink if * (but only if) collision list overflow ends up clearing * some collision lists. 
*/ if (childCount == currState.count) { return; } /* One caveat: let's try to avoid problems with * degenerate cases of documents with generated "random" * names: for these, symbol tables would bloat indefinitely. * One way to do this is to just purge tables if they grow * too large, and that's what we'll do here. */ if (childCount > MAX_ENTRIES_FOR_REUSE) { /* Should there be a way to get notified about this * event, to log it or such? (as it's somewhat abnormal * thing to happen) */ // At any rate, need to clean up the tables childState = initTableInfo(DEFAULT_T_SIZE); } _tableInfo.compareAndSet(currState, childState); } /* /********************************************************** /* API, accessors /********************************************************** */ public int size() { if (_tableInfo != null) { // root table return _tableInfo.get().count; } // nope, child table return _count; } /** * @since 2.1 */ public int bucketCount() { return _hash.length; } /** * Method called to check to quickly see if a child symbol table * may have gotten additional entries. Used for checking to see * if a child table should be merged into shared table. */ public boolean maybeDirty() { return !_hashShared; } /** * @since 2.1 */ public int hashSeed() { return _seed; } /** * Method mostly needed by unit tests; calculates number of * entries that are in collision list. Value can be at most * ({@link #size} - 1), but should usually be much lower, ideally 0. * * @since 2.1 */ public int collisionCount() { return _collCount; } /** * Method mostly needed by unit tests; calculates length of the * longest collision chain. 
This should typically be a low number, * but may be up to {@link #size} - 1 in the pathological case * * @since 2.1 */ public int maxCollisionLength() { return _longestCollisionList; } /* /********************************************************** /* Public API, accessing symbols: /********************************************************** */ public static Name getEmptyName() { return Name1.getEmptyName(); } /** * Finds and returns name matching the specified symbol, if such * name already exists in the table. * If not, will return null. *

* Note: separate methods to optimize common case of * short element/attribute names (4 or less ascii characters) * * @param q1 int32 containing first 4 bytes of the name; * if the whole name less than 4 bytes, padded with zero bytes * in front (zero MSBs, ie. right aligned) * * @return Name matching the symbol passed (or constructed for * it) */ public Name findName(int q1) { int hash = calcHash(q1); int ix = (hash & _hashMask); int val = _hash[ix]; /* High 24 bits of the value are low 24 bits of hash (low 8 bits * are bucket index)... match? */ if ((((val >> 8) ^ hash) << 8) == 0) { // match // Ok, but do we have an actual match? Name name = _mainNames[ix]; if (name == null) { // main slot empty; can't find return null; } if (name.equals(q1)) { return name; } } else if (val == 0) { // empty slot? no match return null; } // Maybe a spill-over? val &= 0xFF; if (val > 0) { // 0 means 'empty' val -= 1; // to convert from 1-based to 0... Bucket bucket = _collList[val]; if (bucket != null) { return bucket.find(hash, q1, 0); } } // Nope, no match whatsoever return null; } /** * Finds and returns name matching the specified symbol, if such * name already exists in the table. * If not, will return null. *

* Note: separate methods to optimize common case of relatively * short element/attribute names (8 or less ascii characters) * * @param q1 int32 containing first 4 bytes of the name. * @param q2 int32 containing bytes 5 through 8 of the * name; if less than 8 bytes, padded with up to 3 zero bytes * in front (zero MSBs, ie. right aligned) * * @return Name matching the symbol passed (or constructed for it) */ public Name findName(int q1, int q2) { int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2); int ix = (hash & _hashMask); int val = _hash[ix]; /* High 24 bits of the value are low 24 bits of hash (low 8 bits * are bucket index)... match? */ if ((((val >> 8) ^ hash) << 8) == 0) { // match // Ok, but do we have an actual match? Name name = _mainNames[ix]; if (name == null) { // main slot empty; can't find return null; } if (name.equals(q1, q2)) { return name; } } else if (val == 0) { // empty slot? no match return null; } // Maybe a spill-over? val &= 0xFF; if (val > 0) { // 0 means 'empty' val -= 1; // to convert from 1-based to 0... Bucket bucket = _collList[val]; if (bucket != null) { return bucket.find(hash, q1, q2); } } // Nope, no match whatsoever return null; } /** * Finds and returns name matching the specified symbol, if such * name already exists in the table; or if not, creates name object, * adds to the table, and returns it. *

* Note: this is the general purpose method that can be called for * names of any length. However, if name is less than 9 bytes long, * it is preferable to call the version optimized for short * names. * * @param q Array of int32s, each of which contain 4 bytes of * encoded name * @param qlen Number of int32s, starting from index 0, in quads * parameter * * @return Name matching the symbol passed (or constructed for it) */ public Name findName(int[] q, int qlen) { if (qlen < 3) { // another sanity check return findName(q[0], (qlen < 2) ? 0 : q[1]); } int hash = calcHash(q, qlen); // (for rest of comments regarding logic, see method above) int ix = (hash & _hashMask); int val = _hash[ix]; if ((((val >> 8) ^ hash) << 8) == 0) { Name name = _mainNames[ix]; if (name == null // main slot empty; no collision list then either || name.equals(q, qlen)) { // should be match, let's verify return name; } } else if (val == 0) { // empty slot? no match return null; } val &= 0xFF; if (val > 0) { // 0 means 'empty' val -= 1; // to convert from 1-based to 0... Bucket bucket = _collList[val]; if (bucket != null) { return bucket.find(hash, q, qlen); } } return null; } /* /********************************************************** /* API, mutators /********************************************************** */ public Name addName(String name, int q1, int q2) { if (_intern) { name = InternCache.instance.intern(name); } int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2); Name symbol = constructName(hash, name, q1, q2); _addSymbol(hash, symbol); return symbol; } public Name addName(String name, int[] q, int qlen) { if (_intern) { name = InternCache.instance.intern(name); } int hash; if (qlen < 3) { hash = (qlen == 1) ? 
calcHash(q[0]) : calcHash(q[0], q[1]); } else { hash = calcHash(q, qlen); } Name symbol = constructName(hash, name, q, qlen); _addSymbol(hash, symbol); return symbol; } /* /********************************************************** /* Helper methods /********************************************************** */ /* Note on hash calculation: we try to make it more difficult to * generate collisions automatically; part of this is to avoid * simple "multiply-add" algorithm (like JDK String.hashCode()), * and add bit of shifting. And other part is to make this * non-linear, at least for shorter symbols. */ // JDK uses 31; other fine choices are 33 and 65599, let's use 33 // as it seems to give fewest collisions for us // (see [http://www.cse.yorku.ca/~oz/hash.html] for details) private final static int MULT = 33; private final static int MULT2 = 65599; private final static int MULT3 = 31; public int calcHash(int q1) { int hash = q1 ^ _seed; hash += (hash >>> 15); // to xor hi- and low- 16-bits hash ^= (hash >>> 9); // as well as lowest 2 bytes return hash; } public int calcHash(int q1, int q2) { // For two quads, let's change algorithm a bit, to spice // things up (can do bit more processing anyway) int hash = q1; hash ^= (hash >>> 15); // try mixing first and second byte pairs first hash += (q2 * MULT); // then add second quad hash ^= _seed; hash += (hash >>> 7); // and shuffle some more // 26-Mar-2015, tatu: As per [core#187] need bit more shuffling. This may // seem like a magical number (and in a way, it is), but it was the sweet // spot for some reason (5 and 3 work ok but converges for 4, for tested case) hash ^= (hash >>> 4); return hash; } public int calcHash(int[] q, int qlen) { // Note: may be called for qlen < 3; but has at least one int if (qlen < 3) { throw new IllegalArgumentException(); } /* And then change handling again for "multi-quad" case; mostly * to make calculation of collisions less fun. 
For example, * add seed bit later in the game, and switch plus/xor around, * use different shift lengths. */ int hash = q[0] ^ _seed; hash += (hash >>> 9); hash *= MULT; hash += q[1]; hash *= MULT2; hash += (hash >>> 15); hash ^= q[2]; hash += (hash >>> 17); for (int i = 3; i < qlen; ++i) { hash = (hash * MULT3) ^ q[i]; // for longer entries, mess a bit in-between too hash += (hash >>> 3); hash ^= (hash << 7); } // and finally shuffle some more once done hash += (hash >>> 15); // to get high-order bits to mix more hash ^= (hash << 9); // as well as lowest 2 bytes return hash; } // Method only used by unit tests protected static int[] calcQuads(byte[] wordBytes) { int blen = wordBytes.length; int[] result = new int[(blen + 3) / 4]; for (int i = 0; i < blen; ++i) { int x = wordBytes[i] & 0xFF; if (++i < blen) { x = (x << 8) | (wordBytes[i] & 0xFF); if (++i < blen) { x = (x << 8) | (wordBytes[i] & 0xFF); if (++i < blen) { x = (x << 8) | (wordBytes[i] & 0xFF); } } } result[i >> 2] = x; } return result; } /* /********************************************************** /* Standard methods /********************************************************** */ /* @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("[BytesToNameCanonicalizer, size: "); sb.append(_count); sb.append('/'); sb.append(_mainHash.length); sb.append(", "); sb.append(_collCount); sb.append(" coll; avg length: "); // Average length: minimum of 1 for all (1 == primary hit); // and then 1 per each traversal for collisions/buckets //int maxDist = 1; int pathCount = _count; for (int i = 0; i < _collEnd; ++i) { int spillLen = _collList[i].length(); for (int j = 1; j <= spillLen; ++j) { pathCount += j; } } double avgLength; if (_count == 0) { avgLength = 0.0; } else { avgLength = (double) pathCount / (double) _count; } // let's round up a bit (two 2 decimal places) //avgLength -= (avgLength % 0.01); sb.append(avgLength); sb.append(']'); return sb.toString(); } */ /* 
/********************************************************** /* Internal methods /********************************************************** */ private void _addSymbol(int hash, Name symbol) { if (_hashShared) { // always have to modify main entry unshareMain(); } // First, do we need to rehash? if (_needRehash) { rehash(); } ++_count; /* Ok, enough about set up: now we need to find the slot to add * symbol in: */ int ix = (hash & _hashMask); if (_mainNames[ix] == null) { // primary empty? _hash[ix] = (hash << 8); if (_namesShared) { unshareNames(); } _mainNames[ix] = symbol; } else { // nope, it's a collision, need to spill over /* How about spill-over area... do we already know the bucket * (is the case if it's not the first collision) */ if (_collListShared) { unshareCollision(); // also allocates if list was null } ++_collCount; int entryValue = _hash[ix]; int bucket = entryValue & 0xFF; if (bucket == 0) { // first spill over? if (_collEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket bucket = _collEnd; ++_collEnd; // need to expand? if (bucket >= _collList.length) { expandCollision(); } } else { // nope, have to share... let's find shortest? bucket = findBestBucket(); } // Need to mark the entry... and the spill index is 1-based _hash[ix] = (entryValue & ~0xFF) | (bucket + 1); } else { --bucket; // 1-based index in value } // And then just need to link the new bucket entry in Bucket newB = new Bucket(symbol, _collList[bucket]); int collLen = newB.length; if (collLen > MAX_COLL_CHAIN_LENGTH) { /* 23-May-2014, tatu: Instead of throwing an exception right away, let's handle * in bit smarter way. */ _handleSpillOverflow(bucket, newB); } else { _collList[bucket] = newB; // but, be careful wrt attacks _longestCollisionList = Math.max(newB.length, _longestCollisionList); } } /* Ok. Now, do we need a rehash next time? 
Need to have at least * 50% fill rate no matter what: */ { int hashSize = _hash.length; if (_count > (hashSize >> 1)) { int hashQuarter = (hashSize >> 2); /* And either strictly above 75% (the usual) or * just 50%, and collision count >= 25% of total hash size */ if (_count > (hashSize - hashQuarter)) { _needRehash = true; } else if (_collCount >= hashQuarter) { _needRehash = true; } } } } private void _handleSpillOverflow(int bindex, Bucket newBucket) { if (_overflows == null) { _overflows = new BitSet(); _overflows.set(bindex); } else { if (_overflows.get(bindex)) { // Has happened once already, so not a coincident... if (_failOnDoS) { reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH); } // but even if we don't fail, we will stop intern()ing _intern = false; } else { _overflows.set(bindex); } } // regardless, if we get this far, clear up the bucket, adjust size appropriately. _collList[bindex] = null; _count -= (newBucket.length); // we could calculate longest; but for now just mark as invalid _longestCollisionList = -1; } private void rehash() { _needRehash = false; // Note: since we'll make copies, no need to unshare, can just mark as such: _namesShared = false; /* And then we can first deal with the main hash area. Since we * are expanding linearly (double up), we know there'll be no * collisions during this phase. 
*/ int[] oldMainHash = _hash; int len = oldMainHash.length; int newLen = len+len; /* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by * large documents with unique (or mostly so) names */ if (newLen > MAX_T_SIZE) { nukeSymbols(); return; } _hash = new int[newLen]; _hashMask = (newLen - 1); Name[] oldNames = _mainNames; _mainNames = new Name[newLen]; int symbolsSeen = 0; // let's do a sanity check for (int i = 0; i < len; ++i) { Name symbol = oldNames[i]; if (symbol != null) { ++symbolsSeen; int hash = symbol.hashCode(); int ix = (hash & _hashMask); _mainNames[ix] = symbol; _hash[ix] = hash << 8; // will clear spill index } } /* And then the spill area. This may cause collisions, although * not necessarily as many as there were earlier. Let's allocate * same amount of space, however */ int oldEnd = _collEnd; if (oldEnd == 0) { // no prior collisions... _longestCollisionList = 0; return; } _collCount = 0; _collEnd = 0; _collListShared = false; int maxColl = 0; Bucket[] oldBuckets = _collList; _collList = new Bucket[oldBuckets.length]; for (int i = 0; i < oldEnd; ++i) { for (Bucket curr = oldBuckets[i]; curr != null; curr = curr.next) { ++symbolsSeen; Name symbol = curr.name; int hash = symbol.hashCode(); int ix = (hash & _hashMask); int val = _hash[ix]; if (_mainNames[ix] == null) { // no primary entry? _hash[ix] = (hash << 8); _mainNames[ix] = symbol; } else { // nope, it's a collision, need to spill over ++_collCount; int bucket = val & 0xFF; if (bucket == 0) { // first spill over? if (_collEnd <= LAST_VALID_BUCKET) { // yup, still unshared bucket bucket = _collEnd; ++_collEnd; // need to expand? if (bucket >= _collList.length) { expandCollision(); } } else { // nope, have to share... let's find shortest? bucket = findBestBucket(); } // Need to mark the entry... 
and the spill index is 1-based _hash[ix] = (val & ~0xFF) | (bucket + 1); } else { --bucket; // 1-based index in value } // And then just need to link the new bucket entry in Bucket newB = new Bucket(symbol, _collList[bucket]); _collList[bucket] = newB; maxColl = Math.max(maxColl, newB.length); } } // for (... buckets in the chain ...) } // for (... list of bucket heads ... ) _longestCollisionList = maxColl; if (symbolsSeen != _count) { // sanity check throw new RuntimeException("Internal error: count after rehash "+symbolsSeen+"; should be "+_count); } } /** * Helper method called to empty all shared symbols, but to leave * arrays allocated */ private void nukeSymbols() { _count = 0; _longestCollisionList = 0; Arrays.fill(_hash, 0); Arrays.fill(_mainNames, null); Arrays.fill(_collList, null); _collCount = 0; _collEnd = 0; } /** * Method called to find the best bucket to spill a Name over to: * usually the first bucket that has only one entry, but in general * first one of the buckets with least number of entries */ private int findBestBucket() { Bucket[] buckets = _collList; int bestCount = Integer.MAX_VALUE; int bestIx = -1; for (int i = 0, len = _collEnd; i < len; ++i) { Bucket b = buckets[i]; // [#145] may become null due to long overflow chain if (b == null) { return i; } int count = b.length; if (count < bestCount) { if (count == 1) { // best possible return i; } bestCount = count; bestIx = i; } } return bestIx; } /** * Method that needs to be called, if the main hash structure * is (may be) shared. 
This happens every time something is added, * even if addition is to the collision list (since collision list * index comes from lowest 8 bits of the primary hash entry) */ private void unshareMain() { final int[] old = _hash; _hash = Arrays.copyOf(old, old.length); _hashShared = false; } private void unshareCollision() { Bucket[] old = _collList; if (old == null) { _collList = new Bucket[INITIAL_COLLISION_LEN]; } else { _collList = Arrays.copyOf(old, old.length); } _collListShared = false; } private void unshareNames() { final Name[] old = _mainNames; _mainNames = Arrays.copyOf(old, old.length); _namesShared = false; } private void expandCollision() { final Bucket[] old = _collList; _collList = Arrays.copyOf(old, old.length * 2); } /* /********************************************************** /* Constructing name objects /********************************************************** */ private static Name constructName(int hash, String name, int q1, int q2) { if (q2 == 0) { // one quad only? 
return new Name1(name, hash, q1); } return new Name2(name, hash, q1, q2); } private static Name constructName(int hash, String name, int[] quads, int qlen) { if (qlen < 4) { // Need to check for 3 quad one, can do others too switch (qlen) { case 1: return new Name1(name, hash, quads[0]); case 2: return new Name2(name, hash, quads[0], quads[1]); case 3: return new Name3(name, hash, quads[0], quads[1], quads[2]); default: } } return NameN.construct(name, hash, quads, qlen); } /* /********************************************************** /* Other helper methods /********************************************************** */ /** * @since 2.1 */ protected void reportTooManyCollisions(int maxLen) { throw new IllegalStateException("Longest collision chain in symbol table (of size "+_count +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions"); } /* /********************************************************** /* Helper classes /********************************************************** */ /** * Immutable value class used for sharing information as efficiently * as possible, by only require synchronization of reference manipulation * but not access to contents. 
* * @since 2.1 */ private final static class TableInfo { public final int count; public final int mainHashMask; public final int[] mainHash; public final Name[] mainNames; public final Bucket[] collList; public final int collCount; public final int collEnd; public final int longestCollisionList; public TableInfo(int count, int mainHashMask, int[] mainHash, Name[] mainNames, Bucket[] collList, int collCount, int collEnd, int longestCollisionList) { this.count = count; this.mainHashMask = mainHashMask; this.mainHash = mainHash; this.mainNames = mainNames; this.collList = collList; this.collCount = collCount; this.collEnd = collEnd; this.longestCollisionList = longestCollisionList; } public TableInfo(BytesToNameCanonicalizer src) { count = src._count; mainHashMask = src._hashMask; mainHash = src._hash; mainNames = src._mainNames; collList = src._collList; collCount = src._collCount; collEnd = src._collEnd; longestCollisionList = src._longestCollisionList; } } final private static class Bucket { public final Name name; public final Bucket next; public final int hash; public final int length; Bucket(Name name, Bucket next) { this.name = name; this.next = next; length = (next == null) ? 1 : next.length+1; hash = name.hashCode(); } public Name find(int h, int firstQuad, int secondQuad) { if (hash == h) { if (name.equals(firstQuad, secondQuad)) { return name; } } for (Bucket curr = next; curr != null; curr = curr.next) { if (curr.hash == h) { Name currName = curr.name; if (currName.equals(firstQuad, secondQuad)) { return currName; } } } return null; } public Name find(int h, int[] quads, int qlen) { if (hash == h) { if (name.equals(quads, qlen)) { return name; } } for (Bucket curr = next; curr != null; curr = curr.next) { if (curr.hash == h) { Name currName = curr.name; if (currName.equals(quads, qlen)) { return currName; } } } return null; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy