
package org.molgenis.data.semanticsearch.string;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * Implements the n-gram distance algorithm so that a similarity score can be calculated between two strings. Both
 * input strings are tokenized into n-grams; the n-gram size is fixed at 2 (bigrams) by the N_GRAMS constant. The
 * two groups of tokens are then compared to work out the similarity score. In addition, a default list of stop
 * words is defined in {@link #STOPWORDSLIST}; the "removeStopWords" parameter of
 * {@link #stringMatching(String, String, boolean)} indicates whether these meaningless words are stripped from the
 * input strings before matching.
 *
 * How to use?
 *
 * double similarityScore = NGramDistanceAlgorithm.stringMatching("Smoking", "Smoker", false);
 * System.out.println(similarityScore);
 *
 * Internally, stringMatching() builds an n-gram frequency map for each input with createNGrams(String, boolean)
 * and scores the two maps with the private calculateScore(Map, Map).
 *
 * @author Chao Pang
 */
public class NGramDistanceAlgorithm
{
    private static final int N_GRAMS = 2;

    public static final Set<String> STOPWORDSLIST;

    static
    {
        STOPWORDSLIST = Sets.newHashSet("a", "about", "above", "after", "again", "against", "all", "am", "an",
                "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below",
                "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does",
                "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't",
                "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's",
                "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
                "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't",
                "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
                "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's",
                "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them",
                "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've",
                "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd",
                "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
                "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you",
                "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "many", ")", "(");
    }

    public static double stringMatching(String queryOne, String queryTwo)
    {
        // Stop words are removed by default
        return stringMatching(queryOne, queryTwo, true);
    }

    public static double stringMatching(String queryOne, String queryTwo, boolean removeStopWords)
    {
        return calculateScore(createNGrams(queryOne.toLowerCase().trim(), removeStopWords),
                createNGrams(queryTwo.toLowerCase().trim(), removeStopWords));
    }

    /**
     * Creates the n-gram tokens of a string. The input is split on spaces, optionally filtered for stop words,
     * stemmed, and each stemmed word is padded with '^' and '$' before the bigrams are collected.
     *
     * @param inputQuery the string to tokenize
     * @param removeStopWords whether stop words are removed before tokenizing
     * @return a map of n-gram tokens with their corresponding frequency
     */
    public static Map<String, Integer> createNGrams(String inputQuery, boolean removeStopWords)
    {
        List<String> wordsInString = Lists.newArrayList(Stemmer.replaceIllegalCharacter(inputQuery).split(" "));
        if (removeStopWords) wordsInString.removeAll(STOPWORDSLIST);
        List<String> stemmedWordsInString = wordsInString.stream().map(Stemmer::stem).collect(Collectors.toList());
        Map<String, Integer> tokens = new HashMap<>();
        for (String singleWord : stemmedWordsInString)
        {
            if (!StringUtils.isEmpty(singleWord))
            {
                // Pad the word with '^' and '$' so the first and last characters each
                // produce a boundary n-gram, e.g. "^s" and "e$" for the word "smoke"
                StringBuilder singleString = new StringBuilder(singleWord.length() + 2);
                singleString.append('^').append(singleWord.toLowerCase()).append('$');
                int length = singleString.length();
                for (int i = 0; i <= length - N_GRAMS; i++)
                {
                    String token = singleString.substring(i, i + N_GRAMS);
                    // Count how often each n-gram occurs within this string
                    tokens.merge(token, 1, Integer::sum);
                }
            }
        }
        return tokens;
    }
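
    // Illustrative example (not in the original source): assuming the package-local
    // Stemmer reduces "smoking" to "smoke", createNGrams("Smoking", false) pads the
    // stem to "^smoke$" and returns the bigram frequency map
    // {^s=1, sm=1, mo=1, ok=1, ke=1, e$=1}.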

    /**
     * Calculates the n-gram similarity score between two n-gram frequency maps.
     *
     * @param inputStringTokens n-gram frequencies of the input string
     * @param ontologyTermTokens n-gram frequencies of the ontology term
     * @return a similarity score between 0 and 100
     */
    private static double calculateScore(Map<String, Integer> inputStringTokens,
            Map<String, Integer> ontologyTermTokens)
    {
        if (inputStringTokens.isEmpty() || ontologyTermTokens.isEmpty()) return 0;
        int totalToken = getTotalNumTokens(inputStringTokens) + getTotalNumTokens(ontologyTermTokens);
        int numMatchedToken = 0;
        for (String token : inputStringTokens.keySet())
        {
            if (ontologyTermTokens.containsKey(token))
            {
                numMatchedToken += Math.min(inputStringTokens.get(token), ontologyTermTokens.get(token));
            }
        }
        // Dice coefficient over the two n-gram multisets, scaled to a percentage (0-100)
        return 2.0 * numMatchedToken / totalToken * 100;
    }
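
    // Worked example (illustrative, not in the original source): assuming the stems
    // "smoke" (bigrams ^s, sm, mo, ok, ke, e$) and "smoker" (bigrams ^s, sm, mo, ok,
    // ke, er, r$), 5 tokens match out of 6 + 7 in total, so the score is
    // 2.0 * 5 / 13 * 100 ≈ 76.9.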

    private static int getTotalNumTokens(Map<String, Integer> tokenFrequencies)
    {
        int totalNum = 0;
        for (Integer frequency : tokenFrequencies.values())
        {
            totalNum += frequency;
        }
        return totalNum;
    }
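
    // Minimal usage sketch (not part of the original class), assuming the
    // package-local Stemmer helper is available on the classpath
    public static void main(String[] args)
    {
        // Identical inputs always score 100
        System.out.println(stringMatching("Smoking", "Smoking")); // 100.0
        // Related words share most of their bigrams and score in between
        System.out.println(stringMatching("Smoking", "Smoker", false));
        // The two-argument overload removes stop words such as "have" and "you"
        System.out.println(stringMatching("Smoking", "Have you smoked last year?"));
    }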
}