All downloads are free. Search and download functionality uses the official Maven repository.

org.molgenis.semanticsearch.string.NGramDistanceAlgorithm Maven / Gradle / Ivy

The newest version!
package org.molgenis.semanticsearch.string;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;

/**
 * This class has implemented n-gram distance algorithm so a similarity score could be calculated
 * between two sequences. The two input strings would be tokenized depending on what nGrams we have
 * specified. The default ngram is 2 which can be changed in the constructor. The two groups of
 * tokens will be further used to work out the similarity score. In addition, by default a list of
 * stop words has been defined, in the method stringMatching(), one of the parameters
 * "removeStopWords" indicates whether the stop words will be used to remove the useless or
 * meaningless words from the String. This the stop words could be customized by
 * setStopWords(List stopWords) or setStopWords(String[] stopWords).
 *
 * 

How to use? LevenShteinDistanceModel model = new LevenShteinDistanceModel(2); double * similarityScore = model.stringMatching("Smoking", "Smoker", false); * System.out.println(similarityScore); * *

The other way List tokens_1 = model.createNGrams("Smoking", false); List * tokens_2 = model.createNGrams("Have you smoked last year?", true); //remove stop words! double * similarityScore = model.calculateScore(tokens_1, tokens_2); * * @author Chao Pang */ @SuppressWarnings("java:S2386") // false positive: Mutable fields should not be "public static" public class NGramDistanceAlgorithm { private static int N_GRAMS = 2; public static final Set STOPWORDSLIST; static { STOPWORDSLIST = ImmutableSet.of( "a", "you", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "many", ")", 
"("); } private NGramDistanceAlgorithm() {} public static double stringMatching(String queryOne, String queryTwo) { return calculateScore( createNGrams(queryOne.toLowerCase().trim(), true), createNGrams(queryTwo.toLowerCase().trim(), true)); } /** * create n-grams tokens of the string. * * @return a map of ngram tokens with the corresponding frequency */ public static Map createNGrams(String inputQuery, boolean removeStopWords) { List wordsInString = Lists.newArrayList(Stemmer.replaceIllegalCharacter(inputQuery).split(" ")); if (removeStopWords) { wordsInString.removeAll(STOPWORDSLIST); } List stemmedWordsInString = wordsInString.stream().map(Stemmer::stem).collect(Collectors.toList()); Map tokens = new HashMap<>(); // Padding the string for (String singleWord : stemmedWordsInString) { if (!StringUtils.isEmpty(singleWord)) { // The s$ will be the produced from two words. StringBuilder singleString = new StringBuilder(singleWord.length() + 2); singleString.append('^').append(singleWord.toLowerCase()).append('$'); int length = singleString.length(); for (int i = 0; i < length - 1; i++) { String token = null; if (i + N_GRAMS < length) { token = singleString.substring(i, i + N_GRAMS); } else { token = singleString.substring(length - 2); } if (!tokens.containsKey(token)) { tokens.put(token, 1); } else { tokens.put(token, (tokens.get(token) + 1)); } } } } return tokens; } /** Calculate the ngram distance */ private static double calculateScore( Map inputStringTokens, Map ontologyTermTokens) { if (inputStringTokens.size() == 0 || ontologyTermTokens.size() == 0) { return (double) 0; } int totalToken = getTotalNumTokens(inputStringTokens) + getTotalNumTokens(ontologyTermTokens); int numMatchedToken = 0; for (Entry token : inputStringTokens.entrySet()) { if (ontologyTermTokens.containsKey(token.getKey())) { numMatchedToken += Math.min(token.getValue(), ontologyTermTokens.get(token.getKey())); } } if (totalToken == 0) { return 0; } else { return 2.0 * numMatchedToken / 
totalToken * 100; } } private static int getTotalNumTokens(Map inputStringTokens) { int totalNum = 0; for (Integer frequency : inputStringTokens.values()) { totalNum += frequency; } return totalNum; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy