com.shikhir.lsh.untrimmed.forest.UntrimmedForest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Lsh4Text Show documentation
Show all versions of Lsh4Text Show documentation
This is a simplified implementation of locality-sensitive hashing (LSH) for text documents.
The newest version!
package com.shikhir.lsh.untrimmed.forest;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.logging.Logger;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringUtils;
import com.shikhir.lsh.trimmed.TForest;
import com.shikhir.lsh.untrimmed.forest.shingling.ForestShingle;
import com.shikhir.lsh.untrimmed.forest.shingling.Shingle;
import com.shikhir.lsh.untrimmed.forest.shingling.ShinglingSet;
import com.shikhir.util.stringops.Stopwords;
import com.shikhir.util.stringops.StringOperations;
import com.shikhir.util.stringops.normalize.Normalize;
public class UntrimmedForest {
/**
 * Frequency table of every shingle seen so far, keyed by shingle id.
 * {@link #buildForest(int)} sets this to {@code null} once a trimmed forest has been built.
 */
private TreeMap<Integer, ForestShingle> untrimmedForestMap = new TreeMap<>();
/** Number of {@code addDocument} calls that produced at least one shingle. */
private int documentInsertionCount = 0;
/** When {@code true}, {@code addDocument} strips stop words before shingling. */
private boolean removeStopWords = false;
/** When {@code true}, {@code addDocument} strips stop characters before shingling. */
private boolean removeStopCharacters = false;
/** When {@code false}, {@code addDocument} lower-cases the text before shingling. */
private boolean caseSensitive = false;
/** When {@code true}, {@code addDocument} passes the text through {@code Normalize.all}. */
private boolean normalize = false;
private static final Logger log = Logger.getLogger(UntrimmedForest.class.getName());
/** Vector size returned by {@code getDefaultVector} when no smaller size applies. */
private static final int RECOMMENDED_VECTOR_SIZE = 1000;

/** Creates an empty forest with every preprocessing option switched off. */
public UntrimmedForest() {
}
/**
 * Reports how many distinct shingles are currently stored in the untrimmed forest.
 * Throws {@link NullPointerException} once {@code buildForest} has released the map.
 *
 * @return the number of entries in the untrimmed forest map
 */
public int size() {
    final int distinctShingles = this.untrimmedForestMap.size();
    return distinctShingles;
}
/**
 * Exposes the internal shingle table.
 *
 * @return the live map of shingle id to shingle (not a copy); {@code null} after
 *         {@code buildForest} has released it
 */
public TreeMap<Integer, ForestShingle> getUntrimmedForestMap() {
    return untrimmedForestMap;
}
/**
 * Replaces the internal shingle table. The document counter is left untouched.
 *
 * @param untrimmedForestMap map of shingle id to shingle; stored by reference, not copied
 */
public void setUntrimmedForestMap(TreeMap<Integer, ForestShingle> untrimmedForestMap) {
    this.untrimmedForestMap = untrimmedForestMap;
}
/**
 * Tells whether stop words are stripped from documents before shingling.
 *
 * @return {@code true} if stop-word removal is enabled
 */
public boolean isRemoveStopWords() {
    return this.removeStopWords;
}
/**
 * Enables or disables stop-word removal for subsequently added documents.
 *
 * @param removeStopWords {@code true} to run {@code Stopwords.removeStopWords} on each document
 */
public void setRemoveStopWords(boolean removeStopWords) {
    this.removeStopWords = removeStopWords;
}
/**
 * Tells whether stop characters are stripped from documents before shingling.
 *
 * @return {@code true} if stop-character removal is enabled
 */
public boolean isRemoveStopCharacters() {
    return this.removeStopCharacters;
}
/**
 * Enables or disables stop-character removal for subsequently added documents.
 *
 * @param removeStopCharacters {@code true} to run {@code StringOperations.removeStopChar}
 *                             on each document
 */
public void setRemoveStopCharacters(boolean removeStopCharacters) {
    this.removeStopCharacters = removeStopCharacters;
}
/**
 * Reports how many documents have contributed shingles to this forest.
 * Documents that produced no shingles are not counted.
 *
 * @return the document insertion counter
 */
public int getDocumentSize() {
    final int insertedDocuments = this.documentInsertionCount;
    return insertedDocuments;
}
/**
 * Shingles one document and folds its shingles into the untrimmed forest.
 * <p>
 * Preprocessing is applied in this order, each step only when its option is enabled:
 * stop-character removal, normalization, lower-casing (when not case sensitive),
 * stop-word removal. A document that is empty after preprocessing, or that yields
 * no shingles, is ignored and does not count as an inserted document.
 *
 * @param document   the text to add; must not be {@code null} or blank
 * @param wordTokens passed through to {@code ShinglingSet.getTokensForMessage}
 * @param minKGram   passed through to {@code ShinglingSet.getTokensForMessage}
 * @param maxKGram   passed through to {@code ShinglingSet.getTokensForMessage}
 * @throws NullPointerException     if {@code document} is {@code null}
 * @throws IllegalArgumentException if {@code document} is blank
 */
public void addDocument(String document, boolean wordTokens, int minKGram, int maxKGram) {
    Objects.requireNonNull(document, "document parameter must not be null");
    if (StringUtils.isBlank(document)) {
        throw new IllegalArgumentException("document parameter cannot be empty");
    }

    String text = document;
    if (this.removeStopCharacters) {
        text = StringOperations.removeStopChar(text);
    }
    if (normalize) {
        text = Normalize.all(text);
    }
    if (!isCaseSensitive()) {
        text = text.toLowerCase();
    }
    if (this.removeStopWords) {
        text = Stopwords.removeStopWords(text);
    }
    if (text.trim().isEmpty()) {
        return;
    }

    final Shingle[] shingles = ShinglingSet.getTokensForMessage(text, wordTokens, minKGram, maxKGram);
    if (shingles == null || shingles.length == 0) {
        return;
    }

    documentInsertionCount++;
    for (Shingle shingle : shingles) {
        // Look up by the shingle's id; create a zero-count entry on first sight.
        ForestShingle entry = untrimmedForestMap.get(shingle.getId());
        if (entry == null) {
            entry = new ForestShingle(shingle.getToken(), 0);
        }
        entry.increment();
        untrimmedForestMap.put(entry.getId(), entry);
    }
}
/**
 * Enables or disables text normalization for subsequently added documents.
 * Normalization is off unless this setter turns it on. When enabled, each
 * document is passed through {@code Normalize.all} before shingling, which is
 * meant to increase the chance of signature collisions (e.g. for digits).
 *
 * @param normalize {@code true} to normalize documents before shingling
 */
public void setNormalize(boolean normalize) {
    this.normalize = normalize;
}
/**
* This will print all the shingles and their count from the unTrimmed forest in descending order of frequency count.
* This could be used to identify the vector size needed to build a forest.
*
* @param head the count of top shingles to be returned
*/
public void printTopShingleAndCount(int head) {
int size=untrimmedForestMap.size();
if(size forest = getUntrimmedForest(true);
for(int i=0; i getUntrimmedForest(boolean decending) {
if (untrimmedForestMap == null)
throw new NullPointerException();
ArrayList forestMapValues = new ArrayList(untrimmedForestMap.values());
if (decending) {
Collections.sort(forestMapValues, Collections.reverseOrder());
} else {
Collections.sort(forestMapValues);
}
return forestMapValues;
}
/**
 * Finds where, in the forest sorted by descending frequency, the shingle counts
 * drop to {@code countNumber} or below. The result can be used to choose the
 * size of the vector.
 *
 * @param countNumber the frequency count to look for
 * @return the index of the first shingle whose count is less than or equal to
 *         {@code countNumber}, or the forest size if no shingle qualifies
 */
public int findCountofIndexInUntrimmedForest(int countNumber) {
    ArrayList<ForestShingle> forest = getUntrimmedForest(true);
    final int forestSize = forest.size();
    for (int i = 0; i < forestSize; i++) {
        if (forest.get(i).getShingleCountInForest() <= countNumber) {
            return i;
        }
    }
    return forestSize;
}
/**
 * Removes redundant shingles from the untrimmed forest. This function is useful when the
 * encoding is by characters instead of words.
 * <p>
 * The shingles are sorted in reverse natural order. A shingle is removed from the forest
 * when another shingle's token contains its token (with '[' and ']' stripped) and the
 * value {@code p} is within {@code percentage}.
 * <p>
 * NOTE(review): the inner loop header and the definitions of {@code jToken} and {@code p}
 * are missing from this copy of the file (the line starting {@code for(int j=i;} is
 * truncated), so the method does not compile as shown. Presumably {@code p} is the
 * percentage difference between the two frequency counts -- restore from the original source.
 *
 * @param percentage The percentage of frequency count a token must be in the range of in order to remove
 */
public void cleanUntrimmedForest(int percentage) {
// Snapshot the values so entries can be removed from the map while scanning.
ForestShingle[] values = untrimmedForestMap.values().toArray(new ForestShingle[untrimmedForestMap.size()]);
Arrays.sort(values, Collections.reverseOrder());
for(int i=0; i< values.length; i++) {
// iCount is not referenced in the visible lines; presumably used in the lost part of the inner loop.
int iCount = values[i].getShingleCountInForest();
// Compare tokens without their square brackets.
String iToken = values[i].getToken().replace("[","").replace("]", "");
// Progress report every 1000 entries, logged only when there are more than 10000 values.
if(i%1000==0) {
float outputPercent = (float) (100.0*i/values.length);
String formattedString = String.format("%.02f", outputPercent);
if(values.length>10000) {
log.info(formattedString+"% done ");
}
}
// NOTE(review): truncated line -- loop condition, jToken and p are not visible here.
for(int j=i; j percentage) {
break;
};
// The candidate at index i is dropped when a later token contains its token.
if(jToken.contains(iToken) && p <= percentage) {
untrimmedForestMap.remove(values[i].getId());
break;
}
}
}
}
}
/**
 * Removes every shingle whose frequency count is less than or equal to
 * {@code countNumber} from the untrimmed forest. Despite the method name, the
 * bound is inclusive.
 *
 * @param countNumber the highest frequency count that is still removed
 */
public void removeLessThanFrequency(int countNumber) {
    ArrayList<ForestShingle> forest = getUntrimmedForest(true);
    // Collect ids first, then remove them from the map.
    TreeSet<Integer> idsToRemove = new TreeSet<Integer>();
    for (ForestShingle shingle : forest) {
        if (shingle.getShingleCountInForest() <= countNumber) {
            idsToRemove.add(shingle.getId());
        }
    }
    for (Integer id : idsToRemove) {
        untrimmedForestMap.remove(id);
    }
}
/**
 * Gets the default vector size if none is provided.
 * <ul>
 * <li>Forests with fewer than 800 shingles: the forest size minus one, never below zero.</li>
 * <li>Otherwise: the index at which shingle counts drop to 1 or below, if that index is
 *     under 1200.</li>
 * <li>Otherwise: the recommended vector size (1000).</li>
 * </ul>
 *
 * @return the default vector size; never negative
 */
public int getDefaultVector() {
    final int forestSize = this.untrimmedForestMap.size();
    if (forestSize < 800) {
        // Clamp so an empty forest yields 0 rather than -1.
        return Math.max(0, forestSize - 1);
    }
    final int duplicateIndex = findCountofIndexInUntrimmedForest(1);
    if (duplicateIndex < 1200) {
        return duplicateIndex;
    }
    return RECOMMENDED_VECTOR_SIZE;
}
/**
 * Builds a trimmed forest of {@code vectorSize} entries from the untrimmed forest by
 * keeping the most frequent shingles and dropping the rest. Each kept shingle is added
 * with the ratio of its count to the number of inserted documents.
 * <p>
 * The untrimmed forest is released afterwards to free memory, so this instance can no
 * longer be used to add documents or build another forest.
 *
 * @param vectorSize The size of the vector used to build the forest
 * @return the trimmed forest, configured with this instance's preprocessing options
 * @throws IllegalArgumentException if {@code vectorSize} exceeds the number of shingles
 */
public TForest buildForest(int vectorSize) {
    if (vectorSize > untrimmedForestMap.size()) {
        throw new IllegalArgumentException("vectorSize (" + vectorSize
                + ") exceeds the number of shingles in the untrimmed forest ("
                + untrimmedForestMap.size() + ")");
    }
    TForest trimmedForest = new TForest(removeStopCharacters, normalize, removeStopWords, caseSensitive);
    trimmedForest.setRemoveStopCharacters(removeStopCharacters);
    trimmedForest.setNormalize(normalize);
    trimmedForest.setRemoveStopWords(removeStopWords);
    trimmedForest.setCaseSensitive(caseSensitive);

    ArrayList<ForestShingle> unTrimmedForestAL = getUntrimmedForest(true);
    for (int j = 0; j < vectorSize; j++) {
        ForestShingle shingle = unTrimmedForestAL.get(j);
        float percentage = (float) (1.0 * shingle.getShingleCountInForest() / documentInsertionCount);
        trimmedForest.add(shingle.getId(), percentage);
    }
    // NOTE(review): presumably TForest's "finish building" step -- confirm it is not
    // meant to be the Object.finalize() hook.
    trimmedForest.finalize();
    untrimmedForestMap = null; // releasing to free up memory
    return trimmedForest;
}
/**
 * Builds a trimmed forest from the untrimmed forest by removing the leaves with the
 * lowest frequency of use. The vector size is the one chosen by
 * {@code getDefaultVector()}.
 *
 * @return the trimmed forest
 */
public TForest buildForest() {
    final int defaultVectorSize = getDefaultVector();
    return buildForest(defaultVectorSize);
}
/**
 * Builds a forest from every shingle in the untrimmed forest. This should only be
 * used if the forest is small (less than 1000 shingles) or there is very little
 * redundancy.
 *
 * @return the trimmed forest containing all shingles
 */
public TForest buildFullForest() {
    final int everyShingle = untrimmedForestMap.size();
    return buildForest(everyShingle);
}
/**
 * Tells whether documents keep their original letter case.
 *
 * @return {@code true} if documents are not lower-cased before shingling
 */
public boolean isCaseSensitive() {
    return this.caseSensitive;
}
/**
 * Chooses whether subsequently added documents keep their original letter case.
 *
 * @param caseSensitive {@code true} to keep case, {@code false} to lower-case documents
 */
public void setCaseSensitive(boolean caseSensitive) {
    this.caseSensitive = caseSensitive;
}
}