// src.java.com.ctc.wstx.util.WordResolver Maven / Gradle / Ivy
//
// Go to download
package com.ctc.wstx.util;

import java.util.*;

/**
 * A specialized Map/Symbol table - like data structure that can be used
 * for both checking whether a word (passed in as a char array) exists
 * in certain set of words AND getting that word as a String.
 * It is reasonably efficient both time and space-wise, at least for
 * certain use cases; specifically, if there is no existing key to use,
 * it is a more efficient way to get to a shared copy of that String.
 * The general usage pattern is expected
 * to be such that most checks are positive, ie. that the word indeed
 * is contained in the structure.
 *
 * Although this is an efficient data struct for specific set of usage
 * patterns, one restriction is that the full set of words to include has to
 * be known before constructing the instance. Also, the size of the set is
 * limited: it can contain at most {@link #MAX_WORDS} words, and the
 * compressed index built from the words has to fit in 0xE000 (57344)
 * chars, which corresponds to total word content of roughly 28000
 * characters.
 *
 * TODO: Should document the internal data structure...
 */
public final class WordResolver
{
    /**
     * Maximum number of words (Strings) an instance can contain
     */
    public final static int MAX_WORDS = 0x2000;

    /**
     * Character stored in place of a "real" character to mark the
     * entry of a word that ends exactly at a branch (a "runt" word,
     * like "id" when the set also has "identifier").
     * Since this value doubles as the end-of-word marker, words that
     * themselves contain the null character can not be reliably
     * resolved.
     */
    final static char CHAR_NULL = (char) 0;

    /**
     * Offset added to numbers to mark 'negative' numbers. Asymmetric,
     * since range of negative markers needed is smaller than positive
     * numbers...
     *<p>
     * In practice: a link value at or above this offset refers to
     * word number <code>(link - NEGATIVE_OFFSET)</code> in the word
     * array; a link value below it is an offset within the compressed
     * data array.
     */
    final static int NEGATIVE_OFFSET = 0x10000 - MAX_WORDS;

    /**
     * This is actually just a guess; but in general linear search should
     * be faster for short sequences (definitely for 4 or less; maybe up
     * to 8 or less?)
     *<p>
     * Branches with fewer entries than this are scanned linearly;
     * others are searched using binary search.
     */
    final static int MIN_BINARY_SEARCH = 7;

    /**
     * Compressed presentation of the word set; null if (and only if)
     * the set contains just a single word.
     *<p>
     * The data consists of "branches". Each branch starts with one
     * char that contains number of entries in the branch, followed by
     * that many 2-char entries. First char of an entry is the character
     * to match; second is the link to follow on match: either offset
     * of the next branch within this array (values below
     * {@link #NEGATIVE_OFFSET}), or a reference to the only remaining
     * candidate word (values at or above {@link #NEGATIVE_OFFSET}),
     * in which case rest of the word is compared directly against
     * the candidate.
     * If one of the words ends at the branch, the first entry of
     * the branch is a special one, with {@link #CHAR_NULL} as the
     * character, and reference to that word as the link.
     * Root branch starts at offset 0.
     */
    final char[] mData;

    /**
     * Array of actual words returned resolved for matches.
     */
    final String[] mWords;

    /*
    ////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////
     */

    private WordResolver(String[] words, char[] index) {
        mWords = words;
        mData = index;
    }

    /**
     * Tries to construct an instance given ordered set of words.
     *
     * Note: currently maximum number of words that can be contained
     * is limited to {@link #MAX_WORDS}; additionally, maximum length
     * of all such words can not exceed roughly 28000 characters.
     *<p>
     * Note: lookup relies on words being ordered by character values
     * (the natural ordering of Strings); sets that use some other
     * ordering are presumably not supported.
     *
     * @param wordSet Set of words (Strings) to include; must contain
     *   at least one word
     *
     * @return WordResolver constructed for given set of words, if
     *   the word set size is not too big; null to indicate "too big"
     *   instance (either too many words, or too big compressed index).
     *
     * @throws IllegalArgumentException If the word set is empty
     */
    public static WordResolver constructInstance(TreeSet wordSet)
    {
        if (wordSet.size() > MAX_WORDS) {
            return null;
        }
        return new Builder(wordSet).construct();
    }

    /*
    ////////////////////////////////////////////////
    // Public API
    ////////////////////////////////////////////////
     */

    /**
     * @return Number of words contained
     */
    public int size() {
        return mWords.length;
    }

    /*
    public int indexSize() {
        return mData.length;
    }
    */

    /**
     * @param str Character array that contains the word to find
     * @param start Index of the first character of the word
     * @param end Index following the last character of the word,
     *   so that end - start equals word length (similar
     *   to the way String.substring() has).
     *
     * @return (Shared) string instance of the word, if it exists in
     *   the word set; null if not.
     */
    public String find(char[] str, final int start, final int end)
    {
        final char[] data = mData;

        // 03-Jan-2006, TSa: Special case; one entry
        if (data == null) {
            return findFromOne(str, start, end);
        }

        int ptr = 0; // pointer to compressed set data; starts at root
        int offset = start;

        while (true) {
            // End of input String? Need to match the runt entry!
            if (offset == end) {
                if (data[ptr+1] == CHAR_NULL) {
                    return mWords[data[ptr+2] - NEGATIVE_OFFSET];
                }
                return null;
            }

            // Need to find the branch to follow, if any
            int count = data[ptr];
            ptr = findLink(data, ptr+1, count, str[offset++]);
            if (ptr < 0) { // No match!
                return null;
            }

            // Ok; now, is it the end (just one candidate left)?
            if (ptr >= NEGATIVE_OFFSET) {
                String word = mWords[ptr - NEGATIVE_OFFSET];
                if (word.length() != (end - start)) {
                    return null;
                }
                // Chars before offset were matched via index already
                for (int i = offset - start; offset < end; ++i, ++offset) {
                    if (word.charAt(i) != str[offset]) {
                        return null;
                    }
                }
                return word;
            }
        }
        // never gets here
    }

    /**
     * Lookup method used when there is just a single word (and thus
     * no compressed index): does straight comparison against it.
     */
    private String findFromOne(char[] str, final int start, final int end)
    {
        String word = mWords[0];
        int len = end-start;
        if (word.length() != len) {
            return null;
        }
        for (int i = 0; i < len; ++i) {
            if (word.charAt(i) != str[start+i]) {
                return null;
            }
        }
        return word;
    }

    /**
     * @param str Word to find
     *
     * @return (Shared) string instance of the word, if it exists in
     *   the word set; null if not.
     */
    public String find(String str)
    {
        final char[] data = mData;

        // 03-Jan-2006, TSa: Special case; one entry
        if (data == null) {
            String word = mWords[0];
            return word.equals(str) ? word : null;
        }

        int ptr = 0; // pointer to compressed set data; starts at root
        int offset = 0;
        final int end = str.length();

        while (true) {
            // End of input String? Need to match the runt entry!
            if (offset == end) {
                if (data[ptr+1] == CHAR_NULL) {
                    return mWords[data[ptr+2] - NEGATIVE_OFFSET];
                }
                return null;
            }

            // Need to find the branch to follow, if any
            int count = data[ptr];
            ptr = findLink(data, ptr+1, count, str.charAt(offset++));
            if (ptr < 0) { // No match!
                return null;
            }

            // Ok; now, is it the end (just one candidate left)?
            if (ptr >= NEGATIVE_OFFSET) {
                String word = mWords[ptr - NEGATIVE_OFFSET];
                if (word.length() != end) {
                    return null;
                }
                // Chars before offset were matched via index already
                for (; offset < end; ++offset) {
                    if (word.charAt(offset) != str.charAt(offset)) {
                        return null;
                    }
                }
                return word;
            }
        }
        // never gets here
    }

    /*
    ////////////////////////////////////////////////
    // Re-defined public methods
    ////////////////////////////////////////////////
     */

    /**
     * @return Comma-separated list of the words contained
     */
    public String toString()
    {
        StringBuffer sb = new StringBuffer(16 + (mWords.length << 3));
        for (int i = 0, len = mWords.length; i < len; ++i) {
            if (i > 0) {
                sb.append(", ");
            }
            sb.append(mWords[i]);
        }
        return sb.toString();
    }

    /*
    ////////////////////////////////////////////////
    // Private methods
    ////////////////////////////////////////////////
     */

    /**
     * Helper method that finds the entry matching given character
     * from within a single branch of the compressed data.
     *<p>
     * Note: only the entries that belong to the branch are ever
     * checked. A branch may well have just a single entry (when all
     * the words under it share the next character); looking past
     * its end would interpret header of the following branch as
     * an entry.
     *
     * @param data Compressed word set data
     * @param ptr Offset of the first entry of the branch (one past
     *   the entry count)
     * @param count Number of entries the branch has
     * @param c Character to find the entry for
     *
     * @return Link of the matching entry (non-negative), if one
     *   was found; -1 if the branch has no entry for the character
     */
    private static int findLink(char[] data, int ptr, int count, char c)
    {
        // Linear or binary search?
        if (count < MIN_BINARY_SEARCH) {
            for (int branchEnd = ptr + (count << 1); ptr < branchEnd; ptr += 2) {
                if (data[ptr] == c) {
                    return data[ptr+1];
                }
            }
            return -1;
        }

        // Ok, binary search; entries are ordered by character
        int low = 0;
        int high = count-1;

        while (low <= high) {
            int mid = (low + high) >> 1;
            int ix = ptr + (mid << 1);
            int diff = data[ix] - c;
            if (diff > 0) { // char was 'higher', need to go down
                high = mid-1;
            } else if (diff < 0) { // lower, need to go up
                low = mid+1;
            } else { // match (so far)
                return data[ix+1];
            }
        }
        return -1;
    }

    /*
    ////////////////////////////////////////////////
    // Helper classes
    ////////////////////////////////////////////////
     */

    /**
     * Helper class that builds the compressed index for given
     * (ordered) set of words.
     */
    private final static class Builder
    {
        /**
         * Words to index, in the order the source set returned them
         */
        final String[] mWords;

        /**
         * Work buffer for the compressed index; expanded as necessary.
         * Null if there is just one word (no index is needed then).
         */
        char[] mData;

        /**
         * Number of characters currently used from mData
         */
        int mSize;

        /**
         * @param wordSet Words to index; has to contain at least one
         *
         * @throws IllegalArgumentException If the set is empty
         */
        public Builder(TreeSet wordSet)
        {
            int wordCount = wordSet.size();

            mWords = new String[wordCount];
            wordSet.toArray(mWords);

            /* 03-Jan-2006, TSa: Special case: just one entry; if so,
             *   let's leave char array null, and just have the String
             *   array with one entry.
             */
            if (wordCount < 2) {
                if (wordCount == 0) { // not legal
                    throw new IllegalArgumentException("Word set must contain at least one word");
                }
                mData = null;
            } else {
                /* Let's guess approximate size we should need, assuming
                 * average word length of 6 characters, overhead matching
                 * compression (ie. about 1-to-1 ratio overall)
                 */
                int size = wordCount * 6;
                if (size < 256) {
                    size = 256;
                }
                mData = new char[size];
            }
        }

        /**
         * @return Resolver that uses the compressed structure built
         *   from the word set; or null if the structure would be too
         *   big for its offsets to be distinguishable from word
         *   references.
         */
        public WordResolver construct() 
        {
            char[] result;

            /* 03-Jan-2006, TSa: Special case: just one entry; if so,
             *   let's leave char array null, and just have the String
             *   array with one entry.
             */
            if (mData == null) {
                result = null;
            } else {
                constructBranch(0, 0, mWords.length);
                
                // Too big? (offsets would overlap with word references)
                if (mSize > NEGATIVE_OFFSET) {
                    return null;
                }
                
                // Let's trim the buffer to the size actually used
                result = new char[mSize];
                System.arraycopy(mData, 0, result, 0, mSize);
            }

            return new WordResolver(mWords, result);
        }

        /**
         * Method that is called recursively to build the data
         * representation for a branch, ie. part of word set tree
         * that still has more than one ending
         *
         * @param charIndex Index of the character in words to consider
         *   for this round
         * @param start Index of the first word to be processed
         * @param end Index of the word after last word to be processed
         *   (so that number of words is end - start)
         */
        private void constructBranch(int charIndex, int start, int end)
        {
            // If more than one entry, need to divide into groups

            // First, need to add placeholder for branch count:
            if (mSize >= mData.length) {
                expand(1);
            }
            mData[mSize++] = 0; // placeholder!
            /* structStart will point to second char of first entry
             * (which will temporarily have entry count, eventually 'link'
             * to continuation)
             */
            int structStart = mSize + 1;
            int groupCount = 0;
            int groupStart = start;
            String[] words = mWords;
            boolean gotRunt;

            /* First thing we need to do is a special check for the
             * first entry -- it may be "runt" word, one that has no
             * more chars but also has a longer version ("id" vs.
             * "identifier"). If so, it needs to be marked; this is done
             * by adding a special entry before other entries (since such
             * entry would always be ordered first alphabetically)
             */
            if (words[groupStart].length() == charIndex) { // yup, got one:
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                /* First null marks the "missing" char (or, end-of-word);
                 * and then we need the index
                 */
                mData[mSize++] = CHAR_NULL;
                mData[mSize++] = (char) (NEGATIVE_OFFSET + groupStart);

                // Ok, let's then ignore that entry
                ++groupStart;
                ++groupCount;
                gotRunt = true;
            } else {
                gotRunt = false;
            }

            // Ok, then, let's find the ('real') groupings:
            while (groupStart < end) {
                // Inner loop, let's find the group:
                char c = words[groupStart].charAt(charIndex);
                int j = groupStart+1;
                while (j < end && words[j].charAt(charIndex) == c) {
                    ++j;
                }
                /* Ok, let's store the char in there, along with count;
                 * count will be needed in second, and will then get
                 * overwritten with actual data later on
                 */
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                mData[mSize++] = c;
                mData[mSize++] = (char) (j - groupStart); // entries in group
                groupStart = j;
                ++groupCount;
            }

            /* Ok, groups found; need to loop through them, recursively
             * calling branch and/or leaf methods
             */
            // first let's output the header, ie. group count:
            mData[structStart-2] = (char) groupCount;
            groupStart = start;

            // Do we have the "runt" to skip?
            if (gotRunt) {
                structStart += 2;
                ++groupStart;
            }

            /* Need to remember where entries of this branch end, since
             * recursive calls will append their data after them.
             */
            int structEnd = mSize;
            ++charIndex;
            for (; structStart < structEnd; structStart += 2) {
                groupCount = (int) mData[structStart]; // no sign expansion, is ok
                /* Ok, count gotten, can either create a branch (if more than
                 * one entry) or leaf (just one entry)
                 */
                if (groupCount == 1) {
                    mData[structStart] = (char) (NEGATIVE_OFFSET + groupStart);
                } else {
                    // Sub-branch gets appended at the current end of data
                    mData[structStart] = (char) mSize;
                    constructBranch(charIndex, groupStart,
                                    groupStart + groupCount);
                }
                groupStart += groupCount;
            }

            // done!
        }

        /**
         * Method called to replace the work buffer with a bigger one
         * (doubling the size for buffers shorter than 4096 chars,
         * growing by 50% for longer ones), retaining existing contents.
         *
         * @param needSpace Number of chars that need to fit in after
         *   the currently used ones
         *
         * @return The new, bigger work buffer
         */
        private char[] expand(int needSpace)
        {
            char[] old = mData;
            int len = old.length;
            int newSize = len + ((len < 4096) ? len : (len >> 1));

            /* Let's verify we get enough; should always be true but
             * better safe than sorry
             */
            if (newSize < (mSize + needSpace)) {
                newSize = mSize + needSpace + 64;
            }
            mData = new char[newSize];
            System.arraycopy(old, 0, mData, 0, len);
            return mData;
        }
    }

    /*
    ////////////////////////////////////////////////////
    // Simple test driver, useful for debugging
    // (uncomment if needed -- commented out so it won't
    // affect coverage testing)
    ////////////////////////////////////////////////////
     */

    /*
    public static void main(String[] args)
    {
        if (args.length < 2) {
            System.err.println("Usage: "+WordResolver.class+" word1 [word2] ... [wordN] keyword");
            System.exit(1);
        }
        String key = args[args.length-1];
        TreeSet words = new TreeSet();
        for (int i = 0; i < args.length-1; ++i) {
            words.add(args[i]);
        }

        WordResolver set = WordResolver.constructInstance(words);

//outputData(set.mData);

        // Ok, and then the test!
        char[] keyA = new char[key.length() + 4];
        key.getChars(0, key.length(), keyA, 2);
        //System.out.println("Word '"+key+"' found via array search: "+WordResolver.find(data, keyA, 2, key.length() + 2));
        System.out.println("Word '"+key+"' found via array search: "+set.find(keyA, 2, key.length() + 2));
    }

    static void outputData(char[] data)
    {
        for (int i = 0; i < data.length; ++i) {
            char c = data[i];
            System.out.print(Integer.toHexString(i)+" ["+Integer.toHexString(c)+"]");
            if (c > 32 && c <= 127) { // printable char (letter)
                System.out.println(" -> '"+c+"'");
            } else {
                System.out.println();
            }
        }
    }
    */
}