com.shikhir.lsh.trimmed.TForest (Maven / Gradle / Ivy)

A simplified implementation of locality-sensitive hashing (LSH) for text documents.

package com.shikhir.lsh.trimmed;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

import com.shikhir.lsh.untrimmed.forest.shingling.Shingle;
import com.shikhir.lsh.untrimmed.forest.shingling.ShinglingSet;
import com.shikhir.util.stringops.Stopwords;
import com.shikhir.util.stringops.StringOperations;
import com.shikhir.util.stringops.normalize.Normalize;

import info.debatty.java.lsh.LSHMinHash;
import info.debatty.java.lsh.MinHash;

public class TForest {

	TreeMap<Integer, TShingleProperties> trimmedForest = new TreeMap<>();

	private boolean removeStopCharacters;
	private boolean normalize;
	private boolean removeStopWords;
	private boolean caseSensitive;
	public static final int LSH_SEED = 1234567890; // fixed seed so MinHash signatures and LSH buckets are reproducible across runs

	public TForest() {
		
	}
	
	public TForest(boolean removeStopCharacters, boolean normalize, boolean removeStopWords, boolean caseSensitive) {
		this.removeStopCharacters = removeStopCharacters;
		this.normalize = normalize;
		this.removeStopWords = removeStopWords;
		this.setCaseSensitive(caseSensitive);
	}

	public Sentence getSentence(String document, boolean wordTokens, int minKGrams, int maxKGrams) {
 		Shingle[] shinglingArr = getShingleArr(	document, wordTokens, minKGrams, maxKGrams, 
 												removeStopCharacters, normalize, removeStopWords, 
 												caseSensitive);
		float percentage = 1.0f;
		ArrayList<Integer> locationAL = new ArrayList<>();
		ArrayList<Integer> idAL = new ArrayList<>();

		for(Shingle iTkn: shinglingArr) {
			idAL.add(iTkn.getId());
			TShingleProperties prop = trimmedForest.get(iTkn.getId());
			if(prop==null) {
				continue;
			}
			locationAL.add(prop.getLocation());
			percentage = percentage * prop.getPercentage();
		}
		Sentence retVal = new Sentence(document, percentage, locationAL, idAL );
		return retVal;
	}

	private Shingle[] getShingleArr(String document, boolean wordTokens, int minKGrams, int maxKGrams, 
										boolean removeStopCharacters, boolean normalize, boolean removeStopWords, boolean caseSensitive) {
	    if(StringUtils.isBlank(document)) throw new IllegalArgumentException("document parameter cannot be empty");
		if (this.trimmedForest == null) {
			throw new NullPointerException("trimmedForest has not been initialized");
		}

		document = removeStopCharacters?StringOperations.removeStopChar(document):document;
		document = normalize?Normalize.all(document):document;
		document = isCaseSensitive()?document:document.toLowerCase();
		if(removeStopWords) document = Stopwords.removeStopWords(document);

		return ShinglingSet.getTokensForMessage(document, wordTokens, minKGrams, maxKGrams);	
	}	
	
	/**
	 * Returns the boolean vector for a document, based on this forest. The forest
	 * must be built before a vector can be created, and the vector's size is fixed
	 * by the size of the forest. The forest is built by hashing each shingling into
	 * an integer and storing it in a sorted structure.
	 * 
	 * @param document The text of the document for which the boolean vector is
	 *                 being created
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGrams minimum size of shingling
	 * @param maxKGrams maximum size of shingling
	 * @param removeStopCharacters if true, stop characters are removed before shingling
	 * @param normalize if true, the text is normalized before shingling
	 * @param removeStopWords if true, stop words are removed before shingling
	 *                 
	 * @return The boolean vector for the document
	 */

	public boolean[] getVector(String document, boolean wordTokens, int minKGrams, int maxKGrams, 
							   boolean removeStopCharacters, boolean normalize, boolean removeStopWords) {

		Shingle[] shinglingArr = getShingleArr(document, wordTokens, minKGrams, maxKGrams, removeStopCharacters, normalize, removeStopWords, caseSensitive);
		
		int forestSize = this.trimmedForest.size();
		boolean[] vector = new boolean[forestSize];

		for(Shingle s: shinglingArr) {
			TShingleProperties index = trimmedForest.get(s.getId());
			if(index!=null) {
				vector[index.getLocation()]=true;
			}
		}		
		return vector;
	}
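	/*
	 * Illustrative usage (not part of the original source; the text and k-gram
	 * sizes are made-up placeholders). This overload lets a caller override the
	 * instance preprocessing flags for a single call, assuming the forest has
	 * already been populated with add() and indexed with finalize():
	 *
	 *   boolean[] v = forest.getVector("Some example text", true, 2, 3,
	 *                                  false, false, false); // skip all preprocessing for this call
	 *   // v.length == forest.size()
	 */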
	
	
	public void add(Integer id, Float percentage) {
		TShingleProperties property = new TShingleProperties(percentage);
		trimmedForest.put(id, property);
	}
	
	
	/**
	 * Assigns a vector location to every shingling in the forest, in sorted key
	 * order. Call this once after all entries have been added with add(); vectors,
	 * signatures, and buckets depend on these locations.
	 */
	public void finalize() {
		int i=0;
		for (Map.Entry<Integer, TShingleProperties> entry : trimmedForest.entrySet()) {
			entry.getValue().setLocation(i);
			i++;
		}
	}
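	/*
	 * Illustrative build sequence (not part of the original source; the shingling
	 * ids and percentages are made-up placeholders, e.g. produced by hashing
	 * shinglings from a corpus):
	 *
	 *   TForest forest = new TForest(true, true, true, false);
	 *   forest.add(101, 0.20f);  // shingling id -> fraction of corpus documents containing it
	 *   forest.add(202, 0.05f);
	 *   forest.add(303, 0.60f);
	 *   forest.finalize();       // assign vector locations before calling getVector()/getBuckets()
	 */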
	
	/**
	 * Returns the boolean vector for a document, based on this forest, using the
	 * preprocessing settings configured on this instance. The forest must be built
	 * before a vector can be created, and the vector's size is fixed by the size of
	 * the forest. The forest is built by hashing each shingling into an integer and
	 * storing it in a sorted structure.
	 * 
	 * @param document The text of the document for which the boolean vector is
	 *                 being created
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGrams minimum size of shingling
	 * @param maxKGrams maximum size of shingling
	 *                 
	 * @return The boolean vector for the document
	 */
	public boolean[] getVector(String document, boolean wordTokens, int minKGrams, int maxKGrams) {
	    if(StringUtils.isBlank(document)) throw new IllegalArgumentException("document parameter cannot be empty");

		document = removeStopCharacters?StringOperations.removeStopChar(document):document;
		document = normalize?Normalize.all(document):document;
		
		if(removeStopWords) {
			document = Stopwords.removeStopWords(document);
		}
		document = isCaseSensitive()?document: document.toLowerCase();

		ShinglingSet set = new ShinglingSet();
		set.addShingling(document, wordTokens, minKGrams, maxKGrams);

		Integer[] idsArry = set.getAllId();

		int forestSize = this.trimmedForest.size();
		boolean[] vector = new boolean[forestSize];

		for(Integer id: idsArry) {
			TShingleProperties prop = trimmedForest.get(id);
			if(prop==null) continue;
			vector[prop.getLocation()] = true;
		}
		return vector;
	}
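	/*
	 * Illustrative usage (not part of the original source): this overload applies
	 * the preprocessing flags configured on the instance, assuming the forest has
	 * already been built and finalized:
	 *
	 *   boolean[] v = forest.getVector("Some example text", true, 2, 3);
	 *   // each true entry marks a forest shingling that also occurs in the document
	 */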
	

	/**
	 * Estimates a default bucket size, used when none is provided to getBuckets().
	 * The estimate is the square root of the forest size multiplied by a constant.
	 * 
	 * @return the default bucket size, based on the size of the forest
	 */
	
	public int defaultBucketSize() {
		final int BUCKET_MULTIPLIER = 3;
		int bucketSize = (int) (Math.sqrt(trimmedForest.size()) * BUCKET_MULTIPLIER);
		return bucketSize;
	}
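	/*
	 * Worked example (not from the original source): for a forest of 10,000
	 * shinglings, sqrt(10000) = 100, so the estimated bucket size is 100 * 3 = 300.
	 */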
	
	/**
	 * Computes the MinHash signature of a document, using the forest vector.
	 * 
	 * @param document The text of the document for which the signature is being created
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGram minimum size of shingling
	 * @param maxKGram maximum size of shingling
	 * @param similarityError The acceptable similarity estimation error
	 * @return the MinHash signature of the document
	 */
	
	public int[] getMinHashSignature(String document, boolean wordTokens, int minKGram, int maxKGram, double similarityError) {
	    if(StringUtils.isBlank(document)) throw new IllegalArgumentException("document parameter cannot be empty");

		MinHash minhash = new MinHash(similarityError, trimmedForest.size(), TForest.LSH_SEED);
		return minhash.signature(getVector(document, wordTokens, minKGram, maxKGram));
	}
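	/*
	 * Illustrative usage (not part of the original source; the 0.1 similarity
	 * error is an arbitrary choice):
	 *
	 *   int[] sig = forest.getMinHashSignature("Some example text", true, 2, 3, 0.1);
	 *   // the signature length is determined by the error and the forest size
	 */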

	/**
	 * Counts how many of the document's shinglings are present in the forest.
	 * 
	 * @param document The text of the document to be checked against the forest
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGram minimum size of shingling
	 * @param maxKGram maximum size of shingling
	 * @return the number of the document's shinglings found in the forest
	 */
	
	public int countDocumentShinglingsInForest(String document, boolean wordTokens, int minKGram, int maxKGram) {
	    if(StringUtils.isBlank(document)) throw new IllegalArgumentException("document parameter cannot be empty");

		document = removeStopCharacters?StringOperations.removeStopChar(document):document;
		document = normalize?Normalize.all(document):document;
		
		if(removeStopWords) document = Stopwords.removeStopWords(document);
		
		Sentence sentence = getSentence(document, wordTokens, minKGram, maxKGram);
		return sentence.getDictionaryLocation().size();
	}
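	/*
	 * Illustrative usage (not part of the original source): a cheap way to gauge
	 * how well a document is covered by the forest before doing heavier work:
	 *
	 *   int hits = forest.countDocumentShinglingsInForest("Some example text", true, 2, 3);
	 */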
	

	
	/**
	 * Computes the similarity of two MinHash signatures. Comparing signatures is a
	 * faster, though less accurate, alternative to comparing the full bodies of the
	 * documents directly.
	 * 
	 * @param sig1            The MinHash signature of the first document vector
	 * @param sig2            The MinHash signature of the second document vector
	 * @param similarityError The acceptable similarity estimation error
	 * @return The similarity of the two signatures
	 * 
	 */

	public double signatureSimilarity(int[] sig1, int[] sig2, double similarityError) {
		MinHash minhash = new MinHash(similarityError, this.trimmedForest.size(), LSH_SEED);
		return minhash.similarity(sig1, sig2);
	}
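	/*
	 * Illustrative usage (not part of the original source): comparing two documents
	 * through their signatures, using the same similarity error for both:
	 *
	 *   int[] sigA = forest.getMinHashSignature("first document text", true, 2, 3, 0.1);
	 *   int[] sigB = forest.getMinHashSignature("second document text", true, 2, 3, 0.1);
	 *   double sim = forest.signatureSimilarity(sigA, sigB, 0.1); // roughly 0 to 1
	 */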

	
	private static int[] removeDuplicates(int[] iArr) {
		// remove duplicate buckets
		for (int i = 0; i < iArr.length - 1; i++) {
			for (int j = i + 1; j < iArr.length; j++) {
				if (iArr[i] == iArr[j]) {
					iArr = ArrayUtils.remove(iArr, j);
					j--; // stay at the same index, since the next element shifted down
				}
			}
		}
		return iArr;
	}
	
	/**
	 * Returns all the possible buckets which may contain this text. This function
	 * must be used with the same stages and bucketSize parameters for all searches
	 * in order to get a valid result. You will need to store the signature/content
	 * key/value pair in the bucket and then do a signature similarity check. If the
	 * signatures are similar, you should then apply a stronger test of similarity,
	 * such as Levenshtein distance or cosine similarity, on the actual body of the
	 * document. A default bucket size is estimated for convenience.
	 * 
	 * @param document The document which needs to be analyzed
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGrams minimum size of shingling
	 * @param maxKGrams maximum size of shingling
	 * @param stages   determines the number of possible buckets
	 * @return list of possible buckets which may contain this text
	 */
	public int[] getBuckets(String document, boolean wordTokens, int minKGrams, int maxKGrams, int stages) {
		return getBuckets(document, wordTokens, minKGrams, maxKGrams, stages, defaultBucketSize());
	}

	/**
	 * Returns all the possible buckets which may contain this text. This function
	 * must be used with the same stages and bucketSize parameters for all searches
	 * in order to get a valid result. You will need to store the signature/content
	 * key/value pair in the bucket and then do a signature similarity check. If the
	 * signatures are similar, you should then apply a stronger test of similarity,
	 * such as Levenshtein distance or cosine similarity, on the actual body of the
	 * document.
	 * 
	 * @param document   The document which needs to be analyzed
	 * @param wordTokens if true, word tokens are assumed, otherwise characters
	 * @param minKGrams minimum size of shingling
	 * @param maxKGrams maximum size of shingling
	 * @param stages     determines the number of possible buckets
	 * @param bucketSize the number of buckets per stage used by the LSH hashing
	 * @return list of possible buckets which may contain this text
	 */
	public int[] getBuckets(String document, boolean wordTokens, int minKGrams, int maxKGrams, int stages, int bucketSize) {
	    Objects.requireNonNull(document, "document must not be null");
	    if(StringUtils.isBlank(document)) throw new IllegalArgumentException("document parameter cannot be empty");
	    
		boolean[] vector = getVector(document, wordTokens, minKGrams, maxKGrams);

		// Create and configure LSH algorithm
		LSHMinHash lsh = new LSHMinHash(stages, bucketSize, vector.length, TForest.LSH_SEED);
		int[] buckets = lsh.hash(vector);
		Arrays.sort(buckets);

		// remove duplicate buckets
		return removeDuplicates(buckets);
	}
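	/*
	 * Illustrative candidate lookup (not part of the original source; the stage
	 * count, bucket size, and index structure are made-up placeholders). Documents
	 * are stored under every bucket returned for them; at query time only those
	 * buckets are inspected, and candidates are confirmed with signatureSimilarity()
	 * or a stronger comparison:
	 *
	 *   int stages = 4;
	 *   int bucketSize = forest.defaultBucketSize();
	 *   Map<Integer, List<String>> index = new HashMap<>();
	 *   for (int b : forest.getBuckets("a stored document", true, 2, 3, stages, bucketSize)) {
	 *       index.computeIfAbsent(b, k -> new ArrayList<>()).add("a stored document");
	 *   }
	 *   for (int b : forest.getBuckets("a query document", true, 2, 3, stages, bucketSize)) {
	 *       List<String> candidates = index.getOrDefault(b, Collections.emptyList());
	 *       // compare the query against each candidate
	 *   }
	 */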

	
	public void printTrimmedForest() {
		for (Map.Entry<Integer, TShingleProperties> entry : trimmedForest.entrySet()) {
			System.out.println(entry.getKey()+" -> "+entry.getValue().getPercentage()+" -> "+ entry.getValue().getLocation());
		}
	}

	public int size() {
		return trimmedForest.size();
	}
	
	public boolean isRemoveStopCharacters() {
		return removeStopCharacters;
	}

	public void setRemoveStopCharacters(boolean removeStopCharacters) {
		this.removeStopCharacters = removeStopCharacters;
	}

	public boolean isRemoveStopWords() {
		return removeStopWords;
	}

	public void setRemoveStopWords(boolean removeStopWords) {
		this.removeStopWords = removeStopWords;
	}

	public boolean isNormalize() {
		return normalize;
	}

	public void setNormalize(boolean normalize) {
		this.normalize = normalize;
	}


	public boolean isCaseSensitive() {
		return caseSensitive;
	}


	public void setCaseSensitive(boolean caseSensitive) {
		this.caseSensitive = caseSensitive;
	}

	public TreeMap<Integer, TShingleProperties> getTrimmedForest() {
		return trimmedForest;
	}

	public void setTrimmedForest(TreeMap<Integer, TShingleProperties> trimmedForest) {
		this.trimmedForest = trimmedForest;
	}

}



