com.ctc.wstx.util.WordSet Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of woodstox-core Show documentation
Woodstox is a high-performance XML processor that implements Stax (JSR-173), SAX2 and Stax2 APIs
There is a newer version: 7.1.0
package com.ctc.wstx.util;

import java.util.*;

/**
 * An efficient (both memory and time) implementation of a Set used to
 * verify that a given
 * word is contained within the set. The general usage pattern is expected
 * to be such that most checks are positive, ie. that the word indeed
 * is contained in the set.
 *
 * Performance of the set is comparable to that of {@link java.util.TreeSet}
 * for Strings, ie. 2-3x slower than {@link java.util.HashSet} when
 * using pre-constructed Strings. This is generally result of algorithmic
 * complexity of structures; Word and Tree sets are roughly logarithmic
 * to the whole data, whereas Hash set is linear to the length of key.
 * However:
 *<ul>
 * <li>WordSet can use char arrays as keys, without constructing Strings.
 *     In cases where there is no (need for) Strings, WordSet seems to be
 *     about twice as fast, even without considering additional GC caused
 *     by temporary String instances.
 *  </li>
 * <li>WordSet is more compact in its memory presentation; if Strings are
 *    shared its size is comparable to optimally filled HashSet, and if
 *    no such Strings exist, it is much more compact (relatively speaking)
 *  </li>
 *</ul>
 *
 * Although this is an efficient set for specific set of usage patterns,
 * one restriction is that the full set of words to include has to be
 * known before constructing the set. Also, the size of the set is
 * limited to total word content of about 20k characters; factory method
 * does verify the limit and indicates if an instance can not be created.
 */
public final class WordSet
{
    /**
     * Marker character. Inside a branch node an entry of two nulls marks
     * a "runt" (a word that ends exactly at that node), and a null link
     * marks "word ends after this character, nothing follows".
     * As a consequence words containing the NUL character can not be
     * represented reliably.
     */
    final static char CHAR_NULL = (char) 0;

    /**
     * Offset added to numbers to mark 'negative' numbers. Asymmetric,
     * since range of negative markers needed is smaller than positive
     * numbers...
     *<p>
     * A node whose first char is at or above this value is a leaf: the
     * value minus the offset is the number of characters that follow and
     * that must match the rest of the input exactly.
     */
    final static int NEGATIVE_OFFSET = 0xC000;

    /**
     * This is actually just a guess; but in general linear search should
     * be faster for short sequences (definitely for 4 or less; maybe up
     * to 8 or less?)
     */
    final static int MIN_BINARY_SEARCH = 7;

    /**
     * Compressed presentation of the word set. Layout, as produced by
     * {@link Builder}:
     *<ul>
     * <li>branch node: <code>[count][char][link][char][link]...</code>,
     *   with <code>count</code> entries sorted by char; each link is the
     *   index of the node to continue from, or {@link #CHAR_NULL}
     *  </li>
     * <li>leaf node: <code>[NEGATIVE_OFFSET + n][n chars]</code>
     *  </li>
     *</ul>
     * The root node is always at index 0.
     */
    final char[] mData;

    /*
    ////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////
     */

    private WordSet(char[] data) {
        mData = data;
    }

    /**
     * Factory method for building a set out of given words.
     *
     * @param wordSet Words to include; may be empty
     *
     * @return Set that contains exactly the given words
     *
     * @throws IllegalArgumentException if the words can not be encoded
     *   in the compressed structure (too much content, or a too long
     *   unique word tail)
     */
    public static WordSet constructSet(TreeSet<String> wordSet)
    {
        return new WordSet(new Builder(wordSet).construct());
    }

    /**
     * Factory method that builds just the compressed structure; result
     * can be passed to the static <code>contains</code> methods.
     *
     * @param wordSet Words to include; may be empty
     *
     * @return Compressed structure that encodes the given words
     *
     * @throws IllegalArgumentException if the words can not be encoded
     *   in the compressed structure
     */
    public static char[] constructRaw(TreeSet<String> wordSet)
    {
        return new Builder(wordSet).construct();
    }

    /*
    ////////////////////////////////////////////////
    // Public API
    ////////////////////////////////////////////////
     */

    /**
     * @param buf Buffer that contains the word to check
     * @param start Index of the first character of the word
     * @param end Index of the character after the last character of the word
     *
     * @return True if the word is contained in this set
     */
    public boolean contains(char[] buf, int start, int end) {
        return contains(mData, buf, start, end);
    }

    /**
     * Checks whether characters <code>str[start]</code> (inclusive)
     * to <code>str[end]</code> (exclusive) form a word encoded in
     * the given compressed structure.
     *
     * @param data Compressed structure, as returned by {@link #constructRaw}
     * @param str Buffer that contains the word to check
     * @param start Index of the first character of the word
     * @param end Index of the character after the last character of the word
     *
     * @return True if the word is encoded in <code>data</code>
     */
    public static boolean contains(char[] data, char[] str, int start, int end)
    {
        int ptr = 0; // pointer to compressed set data

        do {
            int left = end-start;

            // End of input? Then current node must start with the runt entry
            if (left == 0) {
                return (data[ptr+1] == CHAR_NULL);
            }

            int count = data[ptr++];

            // Leaf? If so, rest of the input has to match it exactly
            if (count >= NEGATIVE_OFFSET) {
                if (left != (count - NEGATIVE_OFFSET)) {
                    return false;
                }
                while (start < end) {
                    if (data[ptr] != str[start]) {
                        return false;
                    }
                    ++ptr;
                    ++start;
                }
                return true;
            }

            // Branch: need to find the entry to follow, if any
            int link = findLink(data, ptr, count, str[start++]);
            if (link < 0) {
                return false;
            }
            ptr = link;
        } while (ptr != 0);

        // Null link means the word ends here; input must end as well
        return (start == end);
    }

    /**
     * @param str Word to check
     *
     * @return True if the word is contained in this set
     */
    public boolean contains(String str) {
        return contains(mData, str);
    }

    /**
     * Checks whether given String is a word encoded in the given
     * compressed structure.
     *
     * @param data Compressed structure, as returned by {@link #constructRaw}
     * @param str Word to check
     *
     * @return True if the word is encoded in <code>data</code>
     */
    public static boolean contains(char[] data, String str)
    {
        int ptr = 0; // pointer to compressed set data
        int start = 0;
        int end = str.length();

        do {
            int left = end-start;

            // End of input? Then current node must start with the runt entry
            if (left == 0) {
                return (data[ptr+1] == CHAR_NULL);
            }

            int count = data[ptr++];

            // Leaf? If so, rest of the input has to match it exactly
            if (count >= NEGATIVE_OFFSET) {
                if (left != (count - NEGATIVE_OFFSET)) {
                    return false;
                }
                while (start < end) {
                    if (data[ptr] != str.charAt(start)) {
                        return false;
                    }
                    ++ptr;
                    ++start;
                }
                return true;
            }

            // Branch: need to find the entry to follow, if any
            int link = findLink(data, ptr, count, str.charAt(start++));
            if (link < 0) {
                return false;
            }
            ptr = link;
        } while (ptr != 0);

        // Null link means the word ends here; input must end as well
        return (start == end);
    }

    /*
    ////////////////////////////////////////////////
    // Private methods
    ////////////////////////////////////////////////
     */

    /**
     * Looks up the entry for character <code>c</code> within a single
     * branch node. Only the <code>count</code> entries of the node are
     * ever inspected: a branch can consist of a single entry (words
     * sharing the next character), so probing a second entry blindly
     * would read into the following node.
     *
     * @param data Compressed structure
     * @param ptr Index of the first entry of the branch node
     * @param count Number of (char, link) entries in the node
     * @param c Character to find
     *
     * @return Link stored for the character (0 for the null link), or
     *   -1 if the node has no entry for the character
     */
    private static int findLink(char[] data, int ptr, int count, char c)
    {
        // Linear or binary search?
        if (count < MIN_BINARY_SEARCH) {
            for (int branchEnd = ptr + (count << 1); ptr < branchEnd; ptr += 2) {
                if (data[ptr] == c) {
                    return data[ptr+1];
                }
            }
            return -1;
        }

        int low = 0;
        int high = count-1;

        while (low <= high) {
            int mid = (low + high) >>> 1;
            int ix = ptr + (mid << 1);
            int diff = data[ix] - c;
            if (diff > 0) { // entry is 'higher', need to go down
                high = mid-1;
            } else if (diff < 0) { // lower, need to go up
                low = mid+1;
            } else { // match
                return data[ix+1];
            }
        }
        return -1;
    }

    /*
    ////////////////////////////////////////////////
    // Helper classes
    ////////////////////////////////////////////////
     */

    private final static class Builder
    {
        /**
         * Longest word tail a leaf can hold; the length is stored added
         * to {@link #NEGATIVE_OFFSET} in a single char.
         */
        final static int MAX_LEAF_LENGTH = Character.MAX_VALUE - NEGATIVE_OFFSET;

        /**
         * Words to encode, in natural (char value) order
         */
        final String[] mWords;

        char[] mData;

        /**
         * Number of characters currently used from mData
         */
        int mSize;

        public Builder(TreeSet<String> wordSet) {
            int wordCount = wordSet.size();
            mWords = new String[wordCount];
            wordSet.toArray(mWords);
            /* Grouping and binary search rely on natural String order;
             * the set may have been built with a custom Comparator, so
             * let's make sure (cheap for already sorted input)
             */
            Arrays.sort(mWords);

            /* Let's guess approximate size we should need, assuming
             * average word length of 6 characters, and 100% overhead
             * in structure:
             */
            int size = wordCount * 12;
            if (size < 256) {
                size = 256;
            }
            mData = new char[size];
        }

        /**
         * @return Raw character data that contains compressed structure
         *   of the word set
         *
         * @throws IllegalArgumentException if the words do not fit in
         *   the structure
         */
        public char[] construct()
        {
            if (mWords.length == 0) {
                /* Nothing to match: a leaf that expects zero characters
                 * rejects all non-empty input, and the non-null char
                 * after it makes the runt check fail for empty input.
                 */
                return new char[] { (char) NEGATIVE_OFFSET, (char) 1 };
            }
            // Let's check degenerate case of 1 word:
            if (mWords.length == 1) {
                constructLeaf(0, 0);
                if (mWords[0].length() == 0) {
                    /* Leaf is just the header; runt check for empty input
                     * reads the char after it, so it has to exist and be null
                     */
                    mData[mSize++] = CHAR_NULL;
                }
            } else {
                constructBranch(0, 0, mWords.length);
            }

            char[] result = new char[mSize];
            System.arraycopy(mData, 0, result, 0, mSize);
            return result;
        }

        /**
         * Method that is called recursively to build the data
         * representation for a branch, ie. part of word set tree
         * that still has more than one ending
         *
         * @param charIndex Index of the character in words to consider
         *   for this round
         * @param start Index of the first word to be processed
         * @param end Index of the word after last word to be processed
         *   (so that number of words is end - start)
         */
        private void constructBranch(int charIndex, int start, int end)
        {
            // First, need to add placeholder for branch count:
            if (mSize >= mData.length) {
                expand(1);
            }
            mData[mSize++] = 0; // placeholder!
            /* structStart will point to second char of first entry
             * (which will temporarily have entry count, eventually 'link'
             * to continuation)
             */
            int structStart = mSize + 1;
            int groupCount = 0;
            int groupStart = start;
            String[] words = mWords;

            /* First thing we need to do is a special check for the
             * first entry -- it may be "runt" word, one that has no
             * more chars but also has a longer version ("id" vs.
             * "identifier"). If there is such a word, it'll always
             * be first in alphabetic ordering:
             */
            if (words[groupStart].length() == charIndex) { // yup, got one:
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                /* Nulls mark both imaginary branching null char and
                 * "missing link" to the rest
                 */
                mData[mSize++] = CHAR_NULL;
                mData[mSize++] = CHAR_NULL;

                // Ok, let's then ignore that entry
                ++groupStart;
                ++groupCount;
            }

            // Ok, then, let's find the ('real') groupings:
            while (groupStart < end) {
                // Inner loop, let's find the group:
                char c = words[groupStart].charAt(charIndex);
                int j = groupStart+1;
                while (j < end && words[j].charAt(charIndex) == c) {
                    ++j;
                }
                int groupSize = j - groupStart;
                if (groupSize > Character.MAX_VALUE) {
                    throw tooLarge("group of "+groupSize+" words sharing a prefix");
                }
                /* Ok, let's store the char in there, along with count;
                 * count will be needed in second pass, and will then get
                 * overwritten with actual link later on
                 */
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                mData[mSize++] = c;
                mData[mSize++] = (char) groupSize;
                groupStart = j;
                ++groupCount;
            }

            /* Ok, groups found; need to loop through them, recursively
             * calling branch and/or leaf methods
             */
            // first let's output the header, ie. group count:
            if (groupCount >= NEGATIVE_OFFSET) {
                // would be indistinguishable from a leaf header
                throw tooLarge("branch with "+groupCount+" entries");
            }
            mData[structStart-2] = (char) groupCount;
            groupStart = start;

            // Do we have the "runt" to skip?
            if (mData[structStart] == CHAR_NULL) {
                structStart += 2;
                ++groupStart;
            }

            int structEnd = mSize;
            ++charIndex;
            for (; structStart < structEnd; structStart += 2) {
                int groupSize = mData[structStart]; // no sign expansion, is ok
                // Ok, count gotten, can now put the 'link' (pointer) in there
                if (groupSize == 1) {
                    /* One optimization; if it'd lead to a single runt
                     * entry, we can just add 'null' link:
                     */
                    if (words[groupStart].length() == charIndex) {
                        mData[structStart] = CHAR_NULL;
                    } else { // otherwise, let's just create end state:
                        mData[structStart] = linkTo(mSize);
                        constructLeaf(charIndex, groupStart);
                    }
                } else {
                    mData[structStart] = linkTo(mSize);
                    constructBranch(charIndex, groupStart,
                                    groupStart + groupSize);
                }
                groupStart += groupSize;
            }

            // done!
        }

        /**
         * Method called to add leaf entry to word set; basically
         * "here is the rest of the only matching word"
         *
         * @param charIndex Index of the first character of the word
         *   that is to be stored in the leaf
         * @param wordIndex Index of the word
         */
        private void constructLeaf(int charIndex, int wordIndex)
        {
            String word = mWords[wordIndex];
            int len = word.length();
            int remaining = len - charIndex;

            if (remaining > MAX_LEAF_LENGTH) {
                throw tooLarge("unique word tail of "+remaining+" characters");
            }

            char[] data = mData;

            // need room for 1 header char, rest of the word
            if ((mSize + len + 1) >= data.length) {
                data = expand(len+1);
            }

            data[mSize++] = (char) (NEGATIVE_OFFSET + remaining);
            for (; charIndex < len; ++charIndex) {
                data[mSize++] = word.charAt(charIndex);
            }
        }

        /**
         * Converts offset of a node to the link that is stored in a
         * branch entry, verifying that it fits in a single char instead
         * of silently truncating it.
         */
        private static char linkTo(int offset)
        {
            if (offset > Character.MAX_VALUE) {
                throw tooLarge("node offset "+offset);
            }
            return (char) offset;
        }

        private static IllegalArgumentException tooLarge(String detail)
        {
            return new IllegalArgumentException
                ("Word set too large to be encoded as WordSet ("+detail+")");
        }

        /**
         * Replaces the buffer with a bigger one that has room for at
         * least <code>needSpace</code> more characters.
         *
         * @return The new buffer (also assigned to mData)
         */
        private char[] expand(int needSpace)
        {
            char[] old = mData;
            int len = old.length;
            int newSize = len + ((len < 4096) ? len : (len >> 1));

            /* Let's verify we get enough; should always be true but
             * better safe than sorry
             */
            if (newSize < (mSize + needSpace)) {
                newSize = mSize + needSpace + 64;
            }
            mData = new char[newSize];
            System.arraycopy(old, 0, mData, 0, len);
            return mData;
        }
    }
}