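/*
 * com.fasterxml.aalto.util.UriCanonicalizer (Maven / Gradle / Ivy)
 *
 * The lines below are navigation text captured from the artifact listing page
 * this source was retrieved from; they are not part of the program:
 *   Go to download
 *   Show more of this group / Show more artifacts with this name
 *   Show all versions of aalto-xml / Show documentation
 *
 * Artifact description: aalto-xml, an ultra-high performance non-blocking
 * XML processor (Stax/Stax2, SAX/SAX2).
 * Note: there is a newer version of this artifact: 1.3.3
 */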
package com.fasterxml.aalto.util;

import java.util.*;

/**
 * This class is used for canonicalization of namespace URIs.
 * It acts as a layer above {@link String#intern()}, trying to reduce
 * calls to the somewhat slow <code>intern()</code> method, and to do that
 * as efficiently as possible considering that the Strings in question are
 * often longer than names in xml documents.
 *<p>
 * Results are cached in a bounded, access-ordered map, so that documents
 * using a very large number of distinct URIs can not grow the cache without
 * limit. The single public entry point, {@link #canonicalizeURI}, is
 * <code>synchronized</code>.
 */
public final class UriCanonicalizer
{
    /**
     * Cache from URI characters to the interned URI String. Created
     * lazily on the first call to {@link #canonicalizeURI}, so an instance
     * that is never used does not allocate the map.
     */
    private BoundedHashMap mURIs = null;

    public UriCanonicalizer() { }

    /**
     * Returns the canonical (interned) String for the URI held in the
     * first <code>len</code> characters of <code>ch</code>.
     *<p>
     * The method is synchronized: the cache is access-ordered, so even a
     * successful lookup reorders entries and therefore modifies the map.
     *
     * @param ch Buffer that contains the URI characters, starting at
     *   index 0. The buffer is only read during the call; it is not
     *   retained, so the caller is free to reuse it afterwards.
     * @param len Number of characters of the URI; zero yields the
     *   interned empty String
     *
     * @return Interned String with the same characters; repeated calls
     *   with equal contents return the same instance
     */
    public synchronized String canonicalizeURI(char[] ch, int len)
    {
        /* The probe key wraps caller's buffer directly, so that a cache
         * hit requires no copying or String construction.
         */
        CanonicalKey key = new CanonicalKey(ch, len);
        if (mURIs == null) {
            mURIs = new BoundedHashMap();
        } else {
            String result = mURIs.get(key);
            if (result != null) {
                return result;
            }
        }
        /* Key we have is not yet stable, as the underlying array
         * is shared and mutable. So a private copy is needed before
         * the key can be stored in the map:
         */
        key = key.safeClone();
        // Also, now we should intern() the URI
        String uri = new String(ch, 0, len).intern();
        mURIs.put(key, uri);
        return uri;
    }

    /*
    ///////////////////////////////////////////////////
    // Helper classes
    ///////////////////////////////////////////////////
     */

    /**
     * We'll use a bounded map, which should work well for most normal
     * cases, but avoid excesses for degenerate cases (unique URIs
     * used as identifiers etc).
     *<p>
     * The map is constructed in access order, so the "eldest" entry
     * handed to {@link #removeEldestEntry} is the least recently
     * used one.
     */
    @SuppressWarnings("serial")
    final static class BoundedHashMap
        extends LinkedHashMap<CanonicalKey, String>
    {
        /**
         * Let's create cache big enough to usually have enough space for
         * all/most entries for normal cases, but that won't grow
         * indefinitely for degenerate cases
         */
        private final static int DEFAULT_SIZE = 64;

        /**
         * Load factor passed to the underlying map; also used for
         * deriving {@link #MAX_SIZE}.
         */
        private final static float LOAD_FACTOR = 0.7f;

        /**
         * Size at which the least recently used entry gets evicted;
         * evaluates to 716.
         * NOTE(review): value looks like it was chosen to keep the map
         * within a 1024-slot table at the load factor used -- confirm.
         */
        private final static int MAX_SIZE = (int) (1023 * LOAD_FACTOR);

        public BoundedHashMap()
        {
            // 'true' -> access order (as opposed to insertion order)
            super(DEFAULT_SIZE, LOAD_FACTOR, true);
        }

        /**
         * Called by the map after an insertion. Since the check is done
         * against the size that already includes the new entry, the map
         * retains at most <code>MAX_SIZE - 1</code> entries.
         *
         * @return True if the eldest entry is to be dropped
         */
        @Override
        public boolean removeEldestEntry(Map.Entry<CanonicalKey, String> entry)
        {
            return (size() >= MAX_SIZE);
        }
    }

    /**
     * Map key over the first {@link #mLength} characters of a character
     * array, with a pre-computed hash code. Instances created for lookups
     * share the caller's buffer; instances stored in the map must own
     * their array (see {@link #safeClone}).
     */
    final static class CanonicalKey
    {
        /**
         * Array containing characters of the canonicalized String.
         */
        final char[] mChars;

        /**
         * Length of canonicalized String
         */
        final int mLength;

        /**
         * Hash of the URI string, calculated using fast(er) hash
         * function (compared to regular String).
         */
        final int mHash;

        /**
         * Creates a key that refers to (does not copy) the given buffer,
         * and calculates its hash code.
         */
        public CanonicalKey(char[] buffer, int len)
        {
            mChars = buffer;
            mLength = len;
            mHash = calcKeyHash(buffer, len);
        }

        /**
         * Creates a key that refers to (does not copy) the given buffer,
         * using a hash code that caller has already calculated.
         */
        public CanonicalKey(char[] buffer, int len, int hashCode)
        {
            mChars = buffer;
            mLength = len;
            mHash = hashCode;
        }

        /**
         * @return Key with same contents and hash code, but backed by
         *   a freshly allocated array of exactly {@link #mLength}
         *   characters, so that later changes to the original buffer
         *   do not affect it
         */
        public CanonicalKey safeClone()
        {
            char[] newBuf = new char[mLength];
            System.arraycopy(mChars, 0, newBuf, 0, mLength);
            return new CanonicalKey(newBuf, mLength, mHash);
        }

        /**
         * Calculates hash code for the first <code>len</code> characters
         * of <code>buffer</code>. Only those characters affect the result,
         * which keeps the value consistent with {@link #equals}.
         *
         * @return 0 if there are no characters; for up to 8 characters,
         *   same value as <code>String.hashCode()</code>; for longer
         *   input, value based on length and a sample of characters
         */
        public static int calcKeyHash(char[] buffer, int len)
        {
            /* Empty URI: nothing to hash. Must not read buffer[0], which
             * either does not exist, or is not part of the key (and would
             * give equal keys differing hash codes).
             */
            if (len <= 0) {
                return 0;
            }

            /* Short URIs are not common, but if they were to
             * happen, let's just use regular String.hashCode();
             * it's good one, and for short strings, fast enough
             */
            if (len <= 8) {
                int hash = buffer[0];
                // For these, let's use regular hashing method
                for (int i = 1; i < len; ++i) {
                    hash = (hash * 31) + buffer[i];
                }
                return hash;
            }

            /* Ok, longer. So first let's use length xored with first char;
             * usually first 4 will just be "http" anyways (and could
             * just be skipped for good?)
             */
            int hash = len ^ buffer[0];

            /* Then latter chars separated by larger and larger
             * spaces. The idea is to severely limit time needed
             * to calc hash code as URIs can get quite long.
             * But let's ignore last 4 chars, for now (we'll use them
             * all after the loop)
             */
            int ix = 2; // start from 3rd char (buffer[2])
            int dist = 2; // and skip 1 char first
            int end = (len - 4);

            while (ix < end) {
                hash = (hash * 31) + buffer[ix];
                ix += dist;
                ++dist; // will skip progressively longer spans
            }

            /* And then last 4 chars... Parentheses spell out how the
             * expressions have always been evaluated ('+' binds tighter
             * than '^'), so calculated values remain as they were.
             */
            hash = (hash * 31) ^ ((buffer[end] << 2) + buffer[end+1]);
            hash = ((hash * 31) + (buffer[end+2] << 2)) ^ buffer[end+3];
            return hash;
        }

        @Override
        public String toString() { return "{URI, hash: 0x"+Integer.toHexString(mHash)+"}"; }

        @Override
        public int hashCode() { return mHash; }

        /**
         * Keys are equal if they have the same length and the same
         * characters within that length; anything in the backing arrays
         * beyond {@link #mLength} is ignored.
         */
        @Override
        public boolean equals(Object o)
        {
            if (o == this) return true;
            if (o == null) return false;
            if (o.getClass() != getClass()) return false;

            CanonicalKey other = (CanonicalKey) o;
            if (other.mLength != mLength) return false;
            char[] c1 = mChars;
            char[] c2 = other.mChars;
            for (int i = 0, len = mLength; i < len; ++i) {
                if (c1[i] != c2[i]) {
                    return false;
                }
            }
            return true;
        }

    }
}