All downloads are free. The search and download functionality uses the official Maven repository.

com.ctc.wstx.util.WordResolver Maven / Gradle / Ivy

package com.ctc.wstx.util;

import java.util.*;

/**
 * A specialized Map/Symbol table - like data structure that can be used
 * for both checking whether a word (passed in as a char array or String)
 * exists in a certain set of words AND getting that word as a shared
 * String instance.
 * It is intended to be reasonably efficient for certain use cases;
 * specifically, if there is no existing key to use, it is a cheaper way
 * to get to a shared copy of that String. The general usage pattern is
 * expected to be such that most checks are positive, ie. that the word
 * indeed is contained in the structure.
 * <p>
 * Restrictions: the full set of words has to be known before constructing
 * the instance; at most {@link #MAX_WORDS} words can be included; and the
 * compressed index built from the words may not exceed
 * {@code NEGATIVE_OFFSET} (0xE000) characters. Value 0 ({@code CHAR_NULL})
 * is reserved as the end-of-word marker within the index, so words that
 * contain that character are not supported.
 * <p>
 * Internal data structure: words are kept in a sorted String array, and
 * (when there are two or more words) a char array index describes a tree
 * of "branches", the root branch starting at index 0. Each branch is laid
 * out as
 * <pre>
 *   [count] [char 0] [link 0] [char 1] [link 1] ... [char count-1] [link count-1]
 * </pre>
 * with entries in ascending character order. A link value that is at
 * least {@code NEGATIVE_OFFSET} denotes a leaf: subtracting
 * {@code NEGATIVE_OFFSET} gives the index of the only candidate word,
 * whose remaining characters still need to be compared. Any smaller link
 * is the index of the child branch to continue from. If one of the words
 * ends exactly at a branch (a "runt", like "id" when "identifier" is also
 * included), the first entry of that branch has {@code CHAR_NULL} as its
 * character and a leaf link to that word.
 * When the set has just one word, no index is built at all
 * ({@code mData} is null).
 */
public final class WordResolver
{
    /**
     * Maximum number of words (Strings) an instance can contain
     */
    public final static int MAX_WORDS = 0x2000;

    /**
     * Marker used in place of a character for the "runt" entry of a branch.
     */
    final static char CHAR_NULL = (char) 0;

    /**
     * Offset added to word indexes to mark leaf links ('negative' numbers).
     * Asymmetric, since range of negative markers needed is smaller than
     * that of positive numbers (index pointers).
     */
    final static int NEGATIVE_OFFSET = 0x10000 - MAX_WORDS;

    /**
     * Minimum number of entries a branch must have for binary search to
     * be used instead of linear search.
     * This is actually just a guess; but in general linear search should
     * be faster for short sequences.
     */
    final static int MIN_BINARY_SEARCH = 7;

    /**
     * Compressed presentation of the word set; null if there is
     * only a single word.
     */
    final char[] mData;

    /**
     * Array of actual words returned for matches.
     */
    final String[] mWords;

    /*
    ////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////
     */

    private WordResolver(String[] words, char[] index)
    {
        mWords = words;
        mData = index;
    }

    /**
     * Tries to construct an instance given ordered set of words.
     * <p>
     * Note: the index is built assuming that iteration order of the set is
     * the natural (character by character) String ordering. Maximum number
     * of words is limited to {@link #MAX_WORDS}; additionally, the
     * compressed index built from the words must fit in
     * {@code NEGATIVE_OFFSET} characters.
     *
     * @param wordSet Non-empty set of words to include
     *
     * @return WordResolver constructed for given set of words, if
     *   the word set size is not too big; null to indicate "too big"
     *   instance.
     *
     * @throws IllegalArgumentException if the set is empty
     */
    public static WordResolver constructInstance(TreeSet<String> wordSet)
    {
        if (wordSet.size() > MAX_WORDS) {
            return null;
        }
        return new Builder(wordSet).construct();
    }

    /*
    ////////////////////////////////////////////////
    // Public API
    ////////////////////////////////////////////////
     */

    /**
     * @return Number of words contained
     */
    public int size() {
        return mWords.length;
    }

    /*
    public int indexSize() { return mData.length; }
    */

    /**
     * @param str Character array that contains the word to find
     * @param start Index of the first character of the word
     * @param end Index following the last character of the word,
     *   so that <code>end - start</code> equals word length (similar
     *   to the way <code>String.substring()</code> has).
     *
     * @return (Shared) string instance of the word, if it exists in
     *   the word set; null if not.
     */
    public String find(char[] str, final int start, final int end)
    {
        final char[] data = mData;

        // Special case: single word, no index
        if (data == null) {
            return findFromOne(str, start, end);
        }

        int ptr = 0; // points to the header (count) of the current branch
        int offset = start;

        while (true) {
            // End of input? Only the runt entry (if any) can match
            if (offset == end) {
                return (data[ptr + 1] == CHAR_NULL)
                    ? mWords[data[ptr + 2] - NEGATIVE_OFFSET] : null;
            }
            ptr = findBranch(data, ptr, str[offset++]);
            if (ptr < 0) { // no entry for the character
                return null;
            }
            // Leaf? If so, just need to verify the rest of the word
            if (ptr >= NEGATIVE_OFFSET) {
                String word = mWords[ptr - NEGATIVE_OFFSET];
                if (word.length() != (end - start)) {
                    return null;
                }
                for (int i = offset - start; offset < end; ++i, ++offset) {
                    if (word.charAt(i) != str[offset]) {
                        return null;
                    }
                }
                return word;
            }
        }
    }

    /**
     * Lookup used when the set has a single word (and thus no index).
     */
    private String findFromOne(char[] str, final int start, final int end)
    {
        String word = mWords[0];
        int len = end - start;
        if (word.length() != len) {
            return null;
        }
        for (int i = 0; i < len; ++i) {
            if (word.charAt(i) != str[start + i]) {
                return null;
            }
        }
        return word;
    }

    /**
     * @param str Word to find
     *
     * @return (Shared) string instance of the word, if it exists in
     *   the word set; null if not.
     */
    public String find(String str)
    {
        final char[] data = mData;

        // Special case: single word, no index
        if (data == null) {
            String word = mWords[0];
            return word.equals(str) ? word : null;
        }

        int ptr = 0; // points to the header (count) of the current branch
        int offset = 0;
        final int end = str.length();

        while (true) {
            // End of input? Only the runt entry (if any) can match
            if (offset == end) {
                return (data[ptr + 1] == CHAR_NULL)
                    ? mWords[data[ptr + 2] - NEGATIVE_OFFSET] : null;
            }
            ptr = findBranch(data, ptr, str.charAt(offset++));
            if (ptr < 0) { // no entry for the character
                return null;
            }
            // Leaf? If so, just need to verify the rest of the word
            if (ptr >= NEGATIVE_OFFSET) {
                String word = mWords[ptr - NEGATIVE_OFFSET];
                if (word.length() != end) {
                    return null;
                }
                for (; offset < end; ++offset) {
                    if (word.charAt(offset) != str.charAt(offset)) {
                        return null;
                    }
                }
                return word;
            }
        }
    }

    /**
     * Helper method that locates the entry for given character within
     * a single branch.
     *
     * @param data Compressed index
     * @param ptr Index of the header (entry count) of the branch
     * @param c Character to find an entry for
     *
     * @return Link stored for the matching entry (child branch index, or
     *   leaf marker that is at least {@code NEGATIVE_OFFSET}); -1 if the
     *   branch has no entry for the character
     */
    private static int findBranch(char[] data, int ptr, char c)
    {
        final int count = data[ptr];
        final int first = ptr + 1;

        if (count < MIN_BINARY_SEARCH) {
            /* Linear search. A branch can have just a single entry (when
             * all of its words share the next character), so every entry
             * must be bounds-checked against the count: reading a "second"
             * entry unconditionally would interpret the header of the
             * child branch that follows as an entry.
             */
            for (int ix = first, last = first + (count << 1); ix < last; ix += 2) {
                if (data[ix] == c) {
                    return data[ix + 1];
                }
            }
            return -1;
        }

        // Binary search over entries (sorted by character)
        int low = 0;
        int high = count - 1;
        while (low <= high) {
            int mid = (low + high) >>> 1;
            int ix = first + (mid << 1);
            int diff = data[ix] - c;
            if (diff > 0) { // entry char was 'higher', need to go down
                high = mid - 1;
            } else if (diff < 0) { // lower, need to go up
                low = mid + 1;
            } else {
                return data[ix + 1];
            }
        }
        return -1;
    }

    /*
    ////////////////////////////////////////////////
    // Re-defined public methods
    ////////////////////////////////////////////////
     */

    /**
     * @return Contained words, in order, separated by ", "
     */
    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder(16 + (mWords.length << 3));
        for (int i = 0, len = mWords.length; i < len; ++i) {
            if (i > 0) {
                sb.append(", ");
            }
            sb.append(mWords[i]);
        }
        return sb.toString();
    }

    /*
    ////////////////////////////////////////////////
    // Helper classes
    ////////////////////////////////////////////////
     */

    /**
     * Helper class that builds the compressed index for a word set.
     */
    private final static class Builder
    {
        /**
         * Words to include, in iteration order of the source set
         */
        final String[] mWords;

        /**
         * Buffer in which the index is built; null when there is only
         * one word (in which case no index is needed)
         */
        char[] mData;

        /**
         * Number of characters currently used from mData
         */
        int mSize;

        public Builder(TreeSet<String> wordSet)
        {
            int wordCount = wordSet.size();
            mWords = wordSet.toArray(new String[wordCount]);

            /* Special case: just one entry; if so, let's leave char array
             * null, and just have the String array with one entry.
             */
            if (wordCount < 2) {
                if (wordCount == 0) { // not legal
                    throw new IllegalArgumentException("Word set must contain at least one word");
                }
                mData = null;
            } else {
                /* Let's guess approximate size we should need, assuming
                 * average word length of 6 characters, overhead matching
                 * compression (ie. about 1-to-1 ratio overall)
                 */
                mData = new char[Math.max(256, wordCount * 6)];
            }
        }

        /**
         * @return Resolver for the words this builder was given; or null
         *   if the compressed index would be too big for links to be
         *   distinguishable from leaf markers
         */
        public WordResolver construct()
        {
            // Single word: no index
            if (mData == null) {
                return new WordResolver(mWords, null);
            }
            constructBranch(0, 0, mWords.length);

            // Too big?
            if (mSize > NEGATIVE_OFFSET) {
                return null;
            }
            return new WordResolver(mWords, Arrays.copyOf(mData, mSize));
        }

        /**
         * Method that is called recursively to build the data
         * representation for a branch, ie. part of word set tree
         * that still has more than one ending
         *
         * @param charIndex Index of the character in words to consider
         *   for this round
         * @param start Index of the first word to be processed
         * @param end Index of the word after last word to be processed
         *   (so that number of words is <code>end - start</code>)
         */
        private void constructBranch(int charIndex, int start, int end)
        {
            // First, need to add placeholder for the entry count:
            if (mSize >= mData.length) {
                expand(1);
            }
            final int headerIndex = mSize;
            mData[mSize++] = 0; // placeholder!

            final String[] words = mWords;
            int entryCount = 0;
            int groupStart = start;

            /* First thing we need to do is a special check for the
             * first word -- it may be a "runt" word, one that has no
             * more chars but also has a longer version ("id" vs.
             * "identifier"). If so, it needs to be marked; this is done
             * by adding a special entry before other entries (since such
             * a word is always ordered first alphabetically)
             */
            final boolean gotRunt = (words[groupStart].length() == charIndex);
            if (gotRunt) {
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                // Null char marks end-of-word; followed by leaf link
                mData[mSize++] = CHAR_NULL;
                mData[mSize++] = (char) (NEGATIVE_OFFSET + groupStart);
                ++groupStart;
                ++entryCount;
            }

            // Index of the link slot of the first 'real' (non-runt) entry
            int slot = mSize + 1;

            // Ok, then, let's find the ('real') groupings:
            while (groupStart < end) {
                char c = words[groupStart].charAt(charIndex);
                int groupEnd = groupStart + 1;
                while (groupEnd < end && words[groupEnd].charAt(charIndex) == c) {
                    ++groupEnd;
                }
                /* Let's store the char, along with number of words in the
                 * group; latter is needed in the second pass, and gets
                 * overwritten with the actual link then
                 */
                if ((mSize + 2) > mData.length) {
                    expand(2);
                }
                mData[mSize++] = c;
                mData[mSize++] = (char) (groupEnd - groupStart);
                groupStart = groupEnd;
                ++entryCount;
            }

            // Entries found; can now fill in the header
            mData[headerIndex] = (char) entryCount;

            /* Second pass: replace word counts with links, recursively
             * building child branches for groups of more than one word.
             * Note: mData may get re-allocated during recursion, so it
             * must be accessed via the field.
             */
            groupStart = gotRunt ? (start + 1) : start;
            final int slotEnd = mSize;
            final int nextCharIndex = charIndex + 1;

            for (; slot < slotEnd; slot += 2) {
                int wordsInGroup = mData[slot]; // char; no sign expansion
                if (wordsInGroup == 1) { // leaf
                    mData[slot] = (char) (NEGATIVE_OFFSET + groupStart);
                } else { // child branch gets appended at current end
                    mData[slot] = (char) mSize;
                    constructBranch(nextCharIndex, groupStart, groupStart + wordsInGroup);
                }
                groupStart += wordsInGroup;
            }
        }

        /**
         * Grows the build buffer: doubles it while it is shorter than
         * 4096 chars, grows it by 50% after that.
         *
         * @param needSpace Number of additional characters that must fit
         */
        private void expand(int needSpace)
        {
            int len = mData.length;
            int newSize = len + ((len < 4096) ? len : (len >> 1));

            /* Let's verify we get enough; should always be true but
             * better safe than sorry
             */
            if (newSize < (mSize + needSpace)) {
                newSize = mSize + needSpace + 64;
            }
            mData = Arrays.copyOf(mData, newSize);
        }
    }

    /*
    ////////////////////////////////////////////////////
    // Simple test driver, useful for debugging
    // (uncomment if needed -- commented out so it won't
    // affect coverage testing)
    ////////////////////////////////////////////////////
     */

    /*
    public static void main(String[] args)
    {
        if (args.length < 2) {
            System.err.println("Usage: "+WordResolver.class+" word1 [word2] ... [wordN] keyword");
            System.exit(1);
        }
        String key = args[args.length-1];
        TreeSet<String> words = new TreeSet<String>();
        for (int i = 0; i < args.length-1; ++i) {
            words.add(args[i]);
        }

        WordResolver set = WordResolver.constructInstance(words);

        //outputData(set.mData);

        // Ok, and then the test!
        char[] keyA = new char[key.length() + 4];
        key.getChars(0, key.length(), keyA, 2);
        //System.out.println("Word '"+key+"' found via array search: "+WordResolver.find(data, keyA, 2, key.length() + 2));
        System.out.println("Word '"+key+"' found via array search: "+set.find(keyA, 2, key.length() + 2));
    }

    static void outputData(char[] data)
    {
        for (int i = 0; i < data.length; ++i) {
            char c = data[i];
            System.out.print(Integer.toHexString(i)+" ["+Integer.toHexString(c)+"]");
            if (c > 32 && c <= 127) { // printable char (letter)
                System.out.println(" -> '"+c+"'");
            } else {
                System.out.println();
            }
        }
    }
    */
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy