// org.molgenis.semanticsearch.string.NGramDistanceAlgorithm (Maven / Gradle / Ivy)
// Artifact: molgenis-semantic-search
package org.molgenis.semanticsearch.string;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
/**
 * Static utility that scores the similarity of two strings with an n-gram distance.
 *
 * <p>Each input is split into words, optionally stripped of the stop words in {@link
 * #STOPWORDSLIST}, stemmed with {@code Stemmer}, and every remaining word is wrapped in {@code ^}
 * and {@code $} boundary markers before being cut into overlapping n-grams (bigrams). The score is
 * computed from the n-gram frequencies shared by both inputs.
 *
 * <p>Usage:
 *
 * <pre>{@code
 * double similarityScore = NGramDistanceAlgorithm.stringMatching("Smoking", "Smoker");
 *
 * Map<String, Integer> tokens =
 *     NGramDistanceAlgorithm.createNGrams("Have you smoked last year?", true); // drop stop words
 * }</pre>
 *
 * @author Chao Pang
 */
@SuppressWarnings("java:S2386") // false positive: Mutable fields should not be "public static"
public final class NGramDistanceAlgorithm {
  /** Length of the n-grams produced by {@link #createNGrams(String, boolean)}. */
  private static final int N_GRAMS = 2;

  /**
   * Lower-case stop words that can be removed before tokenizing. The set is immutable; "you" is
   * listed once, near the top, to keep the set's original iteration order.
   */
  public static final Set<String> STOPWORDSLIST =
      ImmutableSet.of(
          "a",
          "you",
          "about",
          "above",
          "after",
          "again",
          "against",
          "all",
          "am",
          "an",
          "and",
          "any",
          "are",
          "aren't",
          "as",
          "at",
          "be",
          "because",
          "been",
          "before",
          "being",
          "below",
          "between",
          "both",
          "but",
          "by",
          "can't",
          "cannot",
          "could",
          "couldn't",
          "did",
          "didn't",
          "do",
          "does",
          "doesn't",
          "doing",
          "don't",
          "down",
          "during",
          "each",
          "few",
          "for",
          "from",
          "further",
          "had",
          "hadn't",
          "has",
          "hasn't",
          "have",
          "haven't",
          "having",
          "he",
          "he'd",
          "he'll",
          "he's",
          "her",
          "here",
          "here's",
          "hers",
          "herself",
          "him",
          "himself",
          "his",
          "how",
          "how's",
          "i",
          "i'd",
          "i'll",
          "i'm",
          "i've",
          "if",
          "in",
          "into",
          "is",
          "isn't",
          "it",
          "it's",
          "its",
          "itself",
          "let's",
          "me",
          "more",
          "most",
          "mustn't",
          "my",
          "myself",
          "no",
          "nor",
          "not",
          "of",
          "off",
          "on",
          "once",
          "only",
          "or",
          "other",
          "ought",
          "our",
          "ours",
          "ourselves",
          "out",
          "over",
          "own",
          "same",
          "shan't",
          "she",
          "she'd",
          "she'll",
          "she's",
          "should",
          "shouldn't",
          "so",
          "some",
          "such",
          "than",
          "that",
          "that's",
          "the",
          "their",
          "theirs",
          "them",
          "themselves",
          "then",
          "there",
          "there's",
          "these",
          "they",
          "they'd",
          "they'll",
          "they're",
          "they've",
          "this",
          "those",
          "through",
          "to",
          "too",
          "under",
          "until",
          "up",
          "very",
          "was",
          "wasn't",
          "we",
          "we'd",
          "we'll",
          "we're",
          "we've",
          "were",
          "weren't",
          "what",
          "what's",
          "when",
          "when's",
          "where",
          "where's",
          "which",
          "while",
          "who",
          "who's",
          "whom",
          "why",
          "why's",
          "with",
          "won't",
          "would",
          "wouldn't",
          "you'd",
          "you'll",
          "you're",
          "you've",
          "your",
          "yours",
          "yourself",
          "yourselves",
          "many",
          ")",
          "(");

  /** Not instantiable: all members are static. */
  private NGramDistanceAlgorithm() {}

  /**
   * Scores the similarity of two strings after lower-casing, trimming and removing stop words.
   *
   * @param queryOne first string to compare; must not be {@code null}
   * @param queryTwo second string to compare; must not be {@code null}
   * @return a score from 0 (no shared n-grams, or an input without tokens) to 100 (identical
   *     n-gram frequencies)
   */
  public static double stringMatching(String queryOne, String queryTwo) {
    return calculateScore(
        createNGrams(queryOne.toLowerCase(Locale.ROOT).trim(), true),
        createNGrams(queryTwo.toLowerCase(Locale.ROOT).trim(), true));
  }

  /**
   * Creates the n-gram tokens of a string.
   *
   * @param inputQuery text to tokenize
   * @param removeStopWords whether words found in {@link #STOPWORDSLIST} are dropped first; the
   *     comparison ignores case
   * @return a map of n-gram tokens with the corresponding frequency
   */
  public static Map<String, Integer> createNGrams(String inputQuery, boolean removeStopWords) {
    List<String> words =
        Lists.newArrayList(Stemmer.replaceIllegalCharacter(inputQuery).split(" "));
    if (removeStopWords) {
      // The stop words are lower case, so compare case-insensitively; otherwise "Have" or "The"
      // would survive when this method is called directly with mixed-case text.
      words.removeIf(word -> STOPWORDSLIST.contains(word.toLowerCase(Locale.ROOT)));
    }

    Map<String, Integer> tokens = new HashMap<>();
    for (String word : words) {
      String stemmedWord = Stemmer.stem(word);
      if (StringUtils.isEmpty(stemmedWord)) {
        continue;
      }
      // Pad the word with boundary markers so that its first and last characters each contribute
      // an n-gram of their own ("^s", "g$", ...).
      String padded = '^' + stemmedWord.toLowerCase(Locale.ROOT) + '$';
      // Slide a window of N_GRAMS characters over the padded word.
      for (int start = 0; start + N_GRAMS <= padded.length(); start++) {
        tokens.merge(padded.substring(start, start + N_GRAMS), 1, Integer::sum);
      }
    }
    return tokens;
  }

  /**
   * Calculates the n-gram similarity of two token frequency maps.
   *
   * <p>The score is {@code 2 * matched / total * 100}, where {@code matched} sums, for every token
   * present in both maps, the smaller of its two frequencies, and {@code total} is the number of
   * tokens in both maps together.
   *
   * @return the score, or 0 when either map is empty
   */
  private static double calculateScore(
      Map<String, Integer> inputStringTokens, Map<String, Integer> ontologyTermTokens) {
    if (inputStringTokens.isEmpty() || ontologyTermTokens.isEmpty()) {
      return 0;
    }
    int totalToken = getTotalNumTokens(inputStringTokens) + getTotalNumTokens(ontologyTermTokens);
    if (totalToken == 0) {
      // Guards the division below against maps that only hold zero frequencies.
      return 0;
    }
    int numMatchedToken = 0;
    for (Entry<String, Integer> token : inputStringTokens.entrySet()) {
      Integer otherFrequency = ontologyTermTokens.get(token.getKey());
      if (otherFrequency != null) {
        numMatchedToken += Math.min(token.getValue(), otherFrequency);
      }
    }
    return 2.0 * numMatchedToken / totalToken * 100;
  }

  /** Sums the frequencies of all tokens in the given map. */
  private static int getTotalNumTokens(Map<String, Integer> inputStringTokens) {
    int totalNum = 0;
    for (int frequency : inputStringTokens.values()) {
      totalNum += frequency;
    }
    return totalNum;
  }
}