All downloads are free. Search and download functionality uses the official Maven repository.

com.ctc.wstx.util.WordSet Maven / Gradle / Ivy

There is a newer version: 1.0.0
Show newest version
package com.ctc.wstx.util;

import java.util.*;

/**
 * An efficient (both memory and time) implementation of a Set used to
 * verify that a given word is contained within the set. The general usage
 * pattern is expected to be such that most checks are positive, ie. that
 * the word indeed is contained in the set.
 *<p>
 * According to the original author, performance of the set is comparable
 * to that of {@link java.util.TreeSet} for Strings, ie. 2-3x slower than
 * {@link java.util.HashSet} when using pre-constructed Strings. This is
 * generally result of algorithmic complexity of structures; Word and Tree
 * sets are roughly logarithmic to the whole data, whereas Hash set is
 * linear to the length of key. However:
 *<ul>
 * <li>WordSet can use char arrays as keys, without constructing Strings.
 *   In cases where there is no (need for) Strings, WordSet seems to be
 *   about twice as fast, even without considering additional GC caused
 *   by temporary String instances.
 *  </li>
 * <li>WordSet is more compact in its memory presentation; if Strings are
 *   shared its size is comparable to optimally filled HashSet, and if
 *   no such Strings exists, its much more compact (relatively speaking)
 *  </li>
 *</ul>
 *<p>
 * Although this is an efficient set for specific set of usage patterns,
 * one restriction is that the full set of words to include has to be
 * known before constructing the set. Also, the size of the set is
 * limited: node offsets, group sizes and leaf lengths are all stored in
 * single 16-bit chars (the original documentation estimates the practical
 * limit at about 20k characters of total word content). The factory
 * methods throw {@link IllegalArgumentException} when a value does not
 * fit, instead of silently producing a corrupt structure.
 *<p>
 * NOTE: the character U+0000 is used internally as the marker for a
 * "runt" word (a word that is a prefix of a longer word), so words or
 * lookup keys containing U+0000 are not reliably supported.
 */
public final class WordSet
{
    final static char CHAR_NULL = (char) 0;

    /**
     * Offset added to numbers to mark 'negative' numbers. Asymmetric,
     * since range of negative markers needed is smaller than positive
     * numbers... A node header at or above this value denotes a leaf
     * (header minus offset is the number of characters that follow);
     * a header below it is the entry count of a branch.
     */
    final static int NEGATIVE_OFFSET = 0xC000;

    /**
     * This is actually just a guess; but in general linear search should
     * be faster for short sequences (definitely for 4 or less; maybe up
     * to 8 or less?)
     */
    final static int MIN_BINARY_SEARCH = 7;

    /**
     * Value returned by {@link #findLink} when no branch entry matches;
     * can never collide with a real link, which is always a char value.
     */
    private final static int NO_MATCH = -1;

    /**
     * Compressed presentation of the word set.
     */
    final char[] mData;

    /*
    ////////////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////////////
    */

    private WordSet(char[] data) {
        mData = data;
    }

    /**
     * Builds a word set containing exactly the given words.
     *
     * @param wordSet Words to include; may be empty
     * @return Set that answers membership queries for the given words
     * @throws IllegalArgumentException if the words do not fit in the
     *   16-bit compressed representation
     */
    public static WordSet constructSet(TreeSet<String> wordSet) {
        return new WordSet(constructRaw(wordSet));
    }

    /**
     * Builds only the raw compressed structure, for use with the static
     * <code>contains</code> methods.
     *
     * @param wordSet Words to include; may be empty
     * @return Compressed structure describing the given words
     * @throws IllegalArgumentException if the words do not fit in the
     *   16-bit compressed representation
     */
    public static char[] constructRaw(TreeSet<String> wordSet) {
        return new Builder(wordSet).construct();
    }

    /*
    ////////////////////////////////////////////////
    // Public API
    ////////////////////////////////////////////////
    */

    /**
     * @param buf Buffer that contains the word to check
     * @param start Index of the first character of the word
     * @param end Index after the last character of the word
     * @return True if the word is contained in this set
     */
    public boolean contains(char[] buf, int start, int end) {
        return contains(mData, buf, start, end);
    }

    /**
     * Checks whether the word <code>str[start..end)</code> is contained
     * in the given compressed structure.
     *
     * @param data Structure produced by {@link #constructRaw}
     * @param str Buffer that contains the word to check
     * @param start Index of the first character of the word
     * @param end Index after the last character of the word
     * @return True if the word is contained in the structure
     */
    public static boolean contains(char[] data, char[] str, int start, int end)
    {
        int ptr = 0; // pointer to compressed set data

        do {
            final int left = end - start;
            // End of input? Then a word has to end at this very node
            if (left == 0) {
                return endsAt(data, ptr);
            }
            final int header = data[ptr++];
            // Leaf: rest of the input must equal rest of the only word
            if (header >= NEGATIVE_OFFSET) {
                if (left != header - NEGATIVE_OFFSET) {
                    return false;
                }
                while (start < end) {
                    if (data[ptr++] != str[start++]) {
                        return false;
                    }
                }
                return true;
            }
            // Branch: need to find the entry to follow, if any
            ptr = findLink(data, ptr, header, str[start++]);
            if (ptr == NO_MATCH) {
                return false;
            }
        } while (ptr != 0);

        // Null link means the word ends here; input must end as well
        return (start == end);
    }

    /**
     * @param str Word to check
     * @return True if the word is contained in this set
     */
    public boolean contains(String str) {
        return contains(mData, str);
    }

    /**
     * Checks whether the given word is contained in the given compressed
     * structure.
     *
     * @param data Structure produced by {@link #constructRaw}
     * @param str Word to check
     * @return True if the word is contained in the structure
     */
    public static boolean contains(char[] data, String str)
    {
        // Same logic as the array-based variant, reading via charAt()
        int ptr = 0; // pointer to compressed set data
        int start = 0;
        final int end = str.length();

        do {
            final int left = end - start;
            // End of input? Then a word has to end at this very node
            if (left == 0) {
                return endsAt(data, ptr);
            }
            final int header = data[ptr++];
            // Leaf: rest of the input must equal rest of the only word
            if (header >= NEGATIVE_OFFSET) {
                if (left != header - NEGATIVE_OFFSET) {
                    return false;
                }
                while (start < end) {
                    if (data[ptr++] != str.charAt(start++)) {
                        return false;
                    }
                }
                return true;
            }
            // Branch: need to find the entry to follow, if any
            ptr = findLink(data, ptr, header, str.charAt(start++));
            if (ptr == NO_MATCH) {
                return false;
            }
        } while (ptr != 0);

        // Null link means the word ends here; input must end as well
        return (start == end);
    }

    /*
    ////////////////////////////////////////////////
    // Private methods
    ////////////////////////////////////////////////
    */

    /**
     * Determines whether a word ends exactly at the node starting at
     * <code>ptr</code>, ie. whether input that has been fully consumed
     * on arrival at this node is a member.
     */
    private static boolean endsAt(char[] data, int ptr)
    {
        final int header = data[ptr];
        if (header >= NEGATIVE_OFFSET) {
            // Leaf; matches only if it has no characters left
            return (header == NEGATIVE_OFFSET);
        }
        /* Branch; a word ends here only if the first entry is the runt
         * marker. Count check guards the entry-less branch that is used
         * to represent an empty set.
         */
        return (header > 0) && (data[ptr+1] == CHAR_NULL);
    }

    /**
     * Finds the branch entry for character <code>c</code>.
     *
     * @param data Compressed structure
     * @param ptr Index of the first (char, link) entry of the branch
     * @param count Number of entries in the branch
     * @param c Character to look for
     * @return Link stored for the matching entry (0 for "word ends
     *   here"), or {@link #NO_MATCH} if no entry matches
     */
    private static int findLink(char[] data, int ptr, int count, char c)
    {
        // Linear or binary search?
        if (count < MIN_BINARY_SEARCH) {
            /* Must not assume there are at least two entries: words
             * sharing their next character produce a single-entry branch.
             */
            for (int branchEnd = ptr + (count << 1); ptr < branchEnd; ptr += 2) {
                if (data[ptr] == c) {
                    return data[ptr+1];
                }
            }
            return NO_MATCH;
        }

        // Entries are sorted by character, so binary search works
        int low = 0;
        int high = count - 1;
        while (low <= high) {
            final int mid = (low + high) >>> 1;
            final int ix = ptr + (mid << 1);
            final int diff = data[ix] - c;
            if (diff > 0) { // entry char was 'higher', need to go down
                high = mid - 1;
            } else if (diff < 0) { // lower, need to go up
                low = mid + 1;
            } else { // match
                return data[ix+1];
            }
        }
        return NO_MATCH;
    }

    /*
    ////////////////////////////////////////////////
    // Helper classes
    ////////////////////////////////////////////////
    */

    /**
     * Helper that converts a set of words into the compressed structure.
     */
    private final static class Builder
    {
        /**
         * Words to include, in natural (char value) order
         */
        final String[] mWords;

        char[] mData;

        /**
         * Number of characters currently used from mData
         */
        int mSize;

        public Builder(TreeSet<String> wordSet)
        {
            mWords = wordSet.toArray(new String[wordSet.size()]);
            /* Grouping and binary search both rely on natural ordering;
             * sorting the copy is cheap for already sorted input and
             * protects against sets that use a custom comparator.
             */
            Arrays.sort(mWords);

            /* Let's guess approximate size we should need, assuming
             * average word length of 6 characters, and 100% overhead
             * in structure:
             */
            int size = mWords.length * 12;
            if (size < 256) {
                size = 256;
            }
            mData = new char[size];
        }

        /**
         * @return Raw character data that contains compressed structure
         *   of the word set
         */
        public char[] construct()
        {
            if (mWords.length == 0) {
                // Empty set: a branch with no entries matches nothing
                mData[mSize++] = 0;
            } else if (mWords.length == 1) {
                // Degenerate case of 1 word
                constructLeaf(0, 0);
            } else {
                constructBranch(0, 0, mWords.length);
            }

            char[] result = new char[mSize];
            System.arraycopy(mData, 0, result, 0, mSize);
            return result;
        }

        /**
         * Method that is called recursively to build the data
         * representation for a branch, ie. part of word set tree
         * that still has more than one ending
         *
         * @param charIndex Index of the character in words to consider
         *   for this round
         * @param start Index of the first word to be processed
         * @param end Index of the word after last word to be processed
         *   (so that number of words is end - start)
         */
        private void constructBranch(int charIndex, int start, int end)
        {
            final String[] words = mWords;

            // First, need to add placeholder for the entry count:
            ensureRoom(1);
            final int headerIx = mSize++;
            mData[headerIx] = 0;

            int groupStart = start;
            int groupCount = 0;

            /* First thing we need to do is a special check for the
             * first entry -- it may be "runt" word, one that has no
             * more chars but also has a longer version ("id" vs.
             * "identifier"). If there is such a word, it'll always
             * be first in alphabetic ordering:
             */
            final boolean hasRunt = (words[start].length() == charIndex);
            if (hasRunt) {
                ensureRoom(2);
                /* Nulls mark both imaginary branching null char and
                 * "missing link" to the rest
                 */
                mData[mSize++] = CHAR_NULL;
                mData[mSize++] = CHAR_NULL;
                ++groupStart;
                ++groupCount;
            }

            // Second slot of the first 'real' entry
            final int firstLinkIx = mSize + 1;

            // Ok, then, let's find the ('real') groupings:
            while (groupStart < end) {
                final char c = words[groupStart].charAt(charIndex);
                int next = groupStart + 1;
                while (next < end && words[next].charAt(charIndex) == c) {
                    ++next;
                }
                /* Store the char along with group size; size is needed
                 * in the second pass, where it gets overwritten with
                 * the actual link
                 */
                ensureRoom(2);
                mData[mSize++] = c;
                mData[mSize++] = toChar(next - groupStart, Character.MAX_VALUE);
                groupStart = next;
                ++groupCount;
            }

            // Header must stay below the range reserved for leaf markers
            mData[headerIx] = toChar(groupCount, NEGATIVE_OFFSET - 1);

            /* Ok, groups found; need to loop through them, recursively
             * calling branch and/or leaf methods
             */
            final int entriesEnd = mSize;
            final int nextCharIndex = charIndex + 1;
            int wordIx = hasRunt ? (start + 1) : start;

            for (int linkIx = firstLinkIx; linkIx < entriesEnd; linkIx += 2) {
                final int groupSize = mData[linkIx];

                if (groupSize == 1 && words[wordIx].length() == nextCharIndex) {
                    /* One optimization; if it'd lead to a single runt
                     * entry, we can just add 'null' link:
                     */
                    mData[linkIx] = CHAR_NULL;
                } else {
                    // Sub-structure starts at current end of data
                    mData[linkIx] = toChar(mSize, Character.MAX_VALUE);
                    if (groupSize == 1) {
                        constructLeaf(nextCharIndex, wordIx);
                    } else {
                        constructBranch(nextCharIndex, wordIx, wordIx + groupSize);
                    }
                }
                wordIx += groupSize;
            }
        }

        /**
         * Method called to add leaf entry to word set; basically
         * "here is the rest of the only matching word"
         *
         * @param charIndex Index of the first character not yet covered
         *   by branches leading here
         * @param wordIndex Index of the word in the word array
         */
        private void constructLeaf(int charIndex, int wordIndex)
        {
            final String word = mWords[wordIndex];
            final int len = word.length();
            final int remaining = len - charIndex;

            // Validate before allocating anything for an oversized word
            final char header = toChar(NEGATIVE_OFFSET + remaining, Character.MAX_VALUE);

            // need room for 1 header char, rest of the word
            ensureRoom(remaining + 1);
            mData[mSize++] = header;
            word.getChars(charIndex, len, mData, mSize);
            mSize += remaining;
        }

        /**
         * Makes sure at least <code>needSpace</code> more characters
         * can be appended to mData, growing the buffer if necessary.
         */
        private void ensureRoom(int needSpace)
        {
            final int needed = mSize + needSpace;
            if (needed <= mData.length) {
                return;
            }
            final char[] old = mData;
            final int len = old.length;
            // Double small buffers, grow bigger ones by 50%
            int newSize = len + ((len < 4096) ? len : (len >> 1));
            if (newSize < needed) {
                newSize = needed + 64;
            }
            mData = new char[newSize];
            System.arraycopy(old, 0, mData, 0, mSize);
        }

        /**
         * Narrows a structural value (link, count or leaf header) to a
         * char, refusing values that would be truncated or misread.
         *
         * @throws IllegalArgumentException if value exceeds max
         */
        private static char toChar(int value, int max)
        {
            if (value > max) {
                throw new IllegalArgumentException(
                    "Word set too large for compact representation: value "
                    + value + " exceeds limit " + max);
            }
            return (char) value;
        }
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy