com.shikhir.util.stringops.NGramSet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Lsh4Text Show documentation
Show all versions of Lsh4Text Show documentation
This is a simplified implementation for Locality sensitive hashing(LSH) for text documents
The newest version!
package com.shikhir.util.stringops;
import java.util.Iterator;
import java.util.LinkedHashSet;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
/**
* The {@link NGramSet} can be used to crate ngrams and character ngrams.
*
* @see StringList
*/
public class NGramSet implements Iterable {
protected static final String COUNT = "count";
private LinkedHashSet mNGrams = new LinkedHashSet();
/**
* Initializes an empty instance.
*/
public NGramSet() {
}
/**
* Adds one NGram, if it already exists the count increase by one.
*
* @param ngram
*/
public void add(StringList ngram) {
mNGrams.add(ngram);
}
/**
* Adds NGrams up to the specified length to the current instance.
*
* @param ngram the tokens to build the uni-grams, bi-grams, tri-grams, ..
* from.
* @param minLength - minimal length
* @param maxLength - maximal length
*/
public void add(StringList ngram, int minLength, int maxLength) {
if (minLength < 1 || maxLength < 1)
throw new IllegalArgumentException("minLength and maxLength param must be at least 1. " +
"minLength=" + minLength + ", maxLength= " + maxLength);
if (minLength > maxLength)
throw new IllegalArgumentException("minLength param must not be larger than " +
"maxLength param. minLength=" + minLength + ", maxLength= " + maxLength);
for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
for (int textIndex = 0;
textIndex + lengthIndex - 1 < ngram.size(); textIndex++) {
String[] grams = new String[lengthIndex];
for (int i = textIndex; i < textIndex + lengthIndex; i++) {
grams[i - textIndex] = ngram.getToken(i);
}
add(new StringList(grams));
}
}
}
/**
* Adds character NGrams to the current instance.
*
* @param chars
* @param minLength
* @param maxLength
*/
public void add(CharSequence chars, int minLength, int maxLength) {
for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
for (int textIndex = 0;
textIndex + lengthIndex - 1 < chars.length(); textIndex++) {
String gram = StringUtil.toLowerCase(
chars.subSequence(textIndex, textIndex + lengthIndex));
add(new StringList(new String[]{gram}));
}
}
}
/**
* Removes the specified tokens form the NGram model, they are just dropped.
*
* @param tokens
*/
public void remove(StringList tokens) {
mNGrams.remove(tokens);
}
/**
* Checks fit he given tokens are contained by the current instance.
*
* @param tokens
*
* @return true if the ngram is contained
*/
public boolean contains(StringList tokens) {
return mNGrams.contains(tokens);
}
/**
* Retrieves the number of {@link StringList} entries in the current instance.
*
* @return number of different grams
*/
public int size() {
return mNGrams.size();
}
/**
* Retrieves an {@link Iterator} over all {@link StringList} entries.
*
* @return iterator over all grams
*/
@Override
public Iterator iterator() {
return mNGrams.iterator();
}
/**
* Retrieves the total count of all Ngrams.
*
* @return total count of all ngrams
*/
public int numberOfGrams() {
return mNGrams.size();
}
/**
* Creates a dictionary which contain all {@link StringList} which
* are in the current {@link NGramSet}.
*
* Entries which are only different in the case are merged into one.
*
* Calling this method is the same as calling {@link #toDictionary(boolean)} with true.
*
* @return a dictionary of the ngrams
*/
public Dictionary toDictionary() {
return toDictionary(false);
}
/**
* Creates a dictionary which contains all {@link StringList}s which
* are in the current {@link NGramSet}.
*
* @param caseSensitive Specifies whether case distinctions should be kept
* in the creation of the dictionary.
*
* @return a dictionary of the ngrams
*/
public Dictionary toDictionary(boolean caseSensitive) {
Dictionary dict = new Dictionary(caseSensitive);
for (StringList stringList : this) {
dict.put(stringList);
}
return dict;
}
@Override
public boolean equals(Object obj) {
boolean result;
if (obj == this) {
result = true;
}
else if (obj instanceof NGramSet) {
NGramSet model = (NGramSet) obj;
result = mNGrams.equals(model.mNGrams);
}
else {
result = false;
}
return result;
}
@Override
public String toString() {
return "Size: " + size();
}
@Override
public int hashCode() {
return mNGrams.hashCode();
}
}