com.shikhir.lsh.untrimmed.forest.shingling.ShinglingSet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Lsh4Text Show documentation
Show all versions of Lsh4Text Show documentation
This is a simplified implementation of locality-sensitive hashing (LSH) for text documents.
The newest version!
package com.shikhir.lsh.untrimmed.forest.shingling;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.shikhir.util.stringops.NGramSet;
import com.shikhir.util.stringops.StringOperations;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;
public class ShinglingSet{
// Shingles held by this set, keyed by the id each shingle reports via getId()
// (see addShingling). A TreeMap keeps its keys in sorted order.
private TreeMap shinglingSet = new TreeMap();
/**
 * Creates an empty shingling set; shingles are added later through
 * {@code addShingling}.
 */
public ShinglingSet() {
    super();
}
/**
 * Creates a shingling set that is pre-populated from a single text.
 *
 * <p>All arguments are forwarded unchanged to {@code addShingling}.
 *
 * @param text       text to shingle
 * @param wordTokens forwarded to {@code addShingling}
 * @param kGramsMin  forwarded to {@code addShingling}
 * @param kGramsMax  forwarded to {@code addShingling}
 */
ShinglingSet(String text, boolean wordTokens, int kGramsMin, int kGramsMax) {
    this.addShingling(text, wordTokens, kGramsMin, kGramsMax);
}
/**
 * Splits a message into its distinct shingles (n-grams).
 *
 * <p>The text is trimmed and tokenized with {@code SimpleTokenizer}. When
 * {@code wordTokens} is {@code true} the whole token list is handed to
 * {@code NGramSet.add}; otherwise every token is handed over individually as a
 * string (the "character tokens" mode). Each resulting n-gram is wrapped in a
 * {@code Shingle}; duplicates, as decided by {@code Shingle}'s
 * {@code equals}/{@code hashCode}, are dropped and the order in which the
 * n-grams were produced is kept.
 *
 * @param text       message to shingle; {@code null} or text that is empty
 *                   after trimming yields an empty array
 * @param wordTokens {@code true} to build n-grams over the token list,
 *                   {@code false} to build them from each token separately
 * @param kGramsMin  minimum n-gram size passed to {@code NGramSet.add}
 * @param kGramsMax  maximum n-gram size passed to {@code NGramSet.add}
 * @return the distinct shingles; an empty array when there is nothing to
 *         shingle, never {@code null}
 */
public static Shingle[] getTokensForMessage(String text, boolean wordTokens, int kGramsMin, int kGramsMax) {
    // The null check must come before trim(); trimming first would throw.
    if (text == null) {
        return new Shingle[0];
    }
    String trimmed = text.trim();
    if (trimmed.length() == 0) {
        return new Shingle[0];
    }

    // Inspect the raw token array before wrapping it, so that "no tokens"
    // is reported the same way as "no text": with an empty array.
    String[] rawTokens = SimpleTokenizer.INSTANCE.tokenize(trimmed);
    if (rawTokens.length == 0) {
        return new Shingle[0];
    }
    StringList slTokens = new StringList(rawTokens);

    NGramSet nGramModel = new NGramSet();
    if (wordTokens) {
        nGramModel.add(slTokens, kGramsMin, kGramsMax);
    } else { // character tokens
        for (String strTkn : slTokens) {
            nGramModel.add(strTkn, kGramsMin, kGramsMax);
        }
    }

    // LinkedHashSet de-duplicates on add() while preserving insertion order.
    Set<Shingle> distinct = new LinkedHashSet<Shingle>();
    for (StringList ngram : nGramModel) {
        distinct.add(new Shingle(ngram.toString()));
    }
    return distinct.toArray(new Shingle[0]);
}
/**
 * Shingles the given text and stores every resulting shingle in this set.
 *
 * <p>Shingles are stored under the key returned by {@code getId()}; a shingle
 * whose id is already present replaces the stored entry.
 *
 * @param text       text to shingle
 * @param wordTokens forwarded to {@code getTokensForMessage}
 * @param kGramsMin  forwarded to {@code getTokensForMessage}
 * @param kGramsMax  forwarded to {@code getTokensForMessage}
 */
public void addShingling(String text, boolean wordTokens, int kGramsMin, int kGramsMax) {
    Shingle[] shingleArray = getTokensForMessage(text, wordTokens, kGramsMin, kGramsMax);
    // Guard against a null result so that a message yielding no tokens simply
    // adds nothing instead of failing in the loop below.
    if (shingleArray == null) {
        return;
    }
    for (Shingle s : shingleArray) {
        shinglingSet.put(s.getId(), s);
    }
}
/**
 * Reports how many entries are currently stored in this set.
 *
 * @return the number of entries in the backing map
 */
public int size() {
    final int entryCount = shinglingSet.size();
    return entryCount;
}
/**
 * Tests whether a shingle with the given id is stored in this set.
 *
 * @param id shingle id to look up
 * @return {@code true} if the backing map has an entry for {@code id}
 */
public boolean contains(Integer id) {
    final boolean present = shinglingSet.containsKey(id);
    return present;
}
/**
 * Returns the ids of all stored shingles as a freshly allocated array.
 *
 * @return every key of the backing map, sorted in ascending order
 */
public Integer[] getAllId() {
    final Object[] rawIds = shinglingSet.keySet().toArray();
    // Re-type the Object[] snapshot as Integer[] before sorting it.
    final Integer[] ids = Arrays.copyOf(rawIds, rawIds.length, Integer[].class);
    Arrays.sort(ids);
    return ids;
}
public Integer[] subset(int count) {
if(count>shinglingSet.size()) throw new IllegalArgumentException();
Integer[] subSet = new Integer[count];
Set keys = shinglingSet.keySet();
int i=0;
Iterator itr = keys.iterator();
while (itr.hasNext() && i