org.molgenis.semanticsearch.string.NGramDistanceAlgorithm Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of molgenis-semantic-search Show documentation
Semantic data search service functionality.
The newest version!
package org.molgenis.semanticsearch.string;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;

/**
 * Static utility that scores the similarity of two strings by comparing their character n-grams.
 *
 * <p>Each input is cleaned with {@code Stemmer.replaceIllegalCharacter}, split on single spaces,
 * optionally stripped of the words in {@link #STOPWORDSLIST}, and stemmed word by word with {@code
 * Stemmer.stem}. Every remaining word is lower-cased, padded to {@code ^word$} and cut into
 * overlapping n-grams of size 2 (bigrams). The score is the percentage of n-grams the two inputs
 * have in common:
 *
 * <pre>score = 2 * matched / (total(first) + total(second)) * 100</pre>
 *
 * <p>Usage:
 *
 * <pre>{@code
 * double similarityScore = NGramDistanceAlgorithm.stringMatching("Smoking", "Smoker");
 *
 * // or inspect the n-gram frequencies directly (second argument: remove stop words)
 * Map<String, Integer> tokens =
 *     NGramDistanceAlgorithm.createNGrams("Have you smoked last year?", true);
 * }</pre>
 *
 * @author Chao Pang
 */
@SuppressWarnings("java:S2386") // false positive: Mutable fields should not be "public static"
public class NGramDistanceAlgorithm {

  /** Number of characters per n-gram. */
  private static final int N_GRAMS = 2;

  /**
   * Words (and the two parenthesis characters) that are dropped from a query when stop-word
   * removal is requested. The set is immutable.
   */
  public static final Set<String> STOPWORDSLIST =
      ImmutableSet.of(
          "a",
          "you",
          "about",
          "above",
          "after",
          "again",
          "against",
          "all",
          "am",
          "an",
          "and",
          "any",
          "are",
          "aren't",
          "as",
          "at",
          "be",
          "because",
          "been",
          "before",
          "being",
          "below",
          "between",
          "both",
          "but",
          "by",
          "can't",
          "cannot",
          "could",
          "couldn't",
          "did",
          "didn't",
          "do",
          "does",
          "doesn't",
          "doing",
          "don't",
          "down",
          "during",
          "each",
          "few",
          "for",
          "from",
          "further",
          "had",
          "hadn't",
          "has",
          "hasn't",
          "have",
          "haven't",
          "having",
          "he",
          "he'd",
          "he'll",
          "he's",
          "her",
          "here",
          "here's",
          "hers",
          "herself",
          "him",
          "himself",
          "his",
          "how",
          "how's",
          "i",
          "i'd",
          "i'll",
          "i'm",
          "i've",
          "if",
          "in",
          "into",
          "is",
          "isn't",
          "it",
          "it's",
          "its",
          "itself",
          "let's",
          "me",
          "more",
          "most",
          "mustn't",
          "my",
          "myself",
          "no",
          "nor",
          "not",
          "of",
          "off",
          "on",
          "once",
          "only",
          "or",
          "other",
          "ought",
          "our",
          "ours",
          "ourselves",
          "out",
          "over",
          "own",
          "same",
          "shan't",
          "she",
          "she'd",
          "she'll",
          "she's",
          "should",
          "shouldn't",
          "so",
          "some",
          "such",
          "than",
          "that",
          "that's",
          "the",
          "their",
          "theirs",
          "them",
          "themselves",
          "then",
          "there",
          "there's",
          "these",
          "they",
          "they'd",
          "they'll",
          "they're",
          "they've",
          "this",
          "those",
          "through",
          "to",
          "too",
          "under",
          "until",
          "up",
          "very",
          "was",
          "wasn't",
          "we",
          "we'd",
          "we'll",
          "we're",
          "we've",
          "were",
          "weren't",
          "what",
          "what's",
          "when",
          "when's",
          "where",
          "where's",
          "which",
          "while",
          "who",
          "who's",
          "whom",
          "why",
          "why's",
          "with",
          "won't",
          "would",
          "wouldn't",
          // "you" is already listed near the top of the set; the repeated entry was redundant.
          "you'd",
          "you'll",
          "you're",
          "you've",
          "your",
          "yours",
          "yourself",
          "yourselves",
          "many",
          ")",
          "(");

  private NGramDistanceAlgorithm() {}

  /**
   * Scores the similarity of two strings. Both inputs are lower-cased (locale-independently) and
   * trimmed, stop words are removed, and the resulting n-gram frequency maps are compared.
   *
   * @param queryOne first string, must not be {@code null}
   * @param queryTwo second string, must not be {@code null}
   * @return similarity as a percentage: 0 when either string yields no n-grams, 100 when both
   *     yield identical n-gram frequencies
   * @throws NullPointerException if either argument is {@code null}
   */
  public static double stringMatching(String queryOne, String queryTwo) {
    return calculateScore(
        createNGrams(queryOne.toLowerCase(Locale.ROOT).trim(), true),
        createNGrams(queryTwo.toLowerCase(Locale.ROOT).trim(), true));
  }

  /**
   * Creates the n-gram tokens of a string.
   *
   * <p>The input is passed through {@code Stemmer.replaceIllegalCharacter}, split on single
   * spaces and each word is stemmed with {@code Stemmer.stem}. Empty words (for example those
   * produced by consecutive spaces) contribute no tokens.
   *
   * @param inputQuery the text to tokenize
   * @param removeStopWords whether words contained in {@link #STOPWORDSLIST} are dropped before
   *     stemming; the comparison ignores case
   * @return a mutable map of n-gram tokens with the corresponding frequency
   */
  public static Map<String, Integer> createNGrams(String inputQuery, boolean removeStopWords) {
    List<String> wordsInString =
        Lists.newArrayList(Stemmer.replaceIllegalCharacter(inputQuery).split(" "));
    if (removeStopWords) {
      // The stop-word list is all lower case and the tokens are lower-cased below anyway, so the
      // filter must not depend on the caller having lower-cased the input first.
      wordsInString.removeIf(word -> STOPWORDSLIST.contains(word.toLowerCase(Locale.ROOT)));
    }
    List<String> stemmedWordsInString =
        wordsInString.stream().map(Stemmer::stem).collect(Collectors.toList());

    Map<String, Integer> tokens = new HashMap<>();
    for (String singleWord : stemmedWordsInString) {
      if (StringUtils.isEmpty(singleWord)) {
        continue;
      }
      // Pad the word so that its first and last characters also form boundary n-grams
      // (e.g. "^s" and "g$" for "smoking").
      String paddedWord = "^" + singleWord.toLowerCase(Locale.ROOT) + "$";
      // Slide a window of N_GRAMS characters over the padded word.
      for (int start = 0; start + N_GRAMS <= paddedWord.length(); start++) {
        tokens.merge(paddedWord.substring(start, start + N_GRAMS), 1, Integer::sum);
      }
    }

    return tokens;
  }

  /**
   * Calculates the n-gram similarity of two token frequency maps.
   *
   * @param inputStringTokens n-gram frequencies of the first string
   * @param ontologyTermTokens n-gram frequencies of the second string
   * @return {@code 2 * matched / total * 100}, where {@code matched} sums, per shared n-gram, the
   *     smaller of its two frequencies and {@code total} is the number of n-grams in both maps; 0
   *     if either map is empty
   */
  private static double calculateScore(
      Map<String, Integer> inputStringTokens, Map<String, Integer> ontologyTermTokens) {
    if (inputStringTokens.isEmpty() || ontologyTermTokens.isEmpty()) {
      return 0;
    }
    int totalToken = getTotalNumTokens(inputStringTokens) + getTotalNumTokens(ontologyTermTokens);
    if (totalToken == 0) {
      // Guard against dividing by zero.
      return 0;
    }

    int numMatchedToken = 0;
    for (Entry<String, Integer> token : inputStringTokens.entrySet()) {
      Integer otherFrequency = ontologyTermTokens.get(token.getKey());
      if (otherFrequency != null) {
        numMatchedToken += Math.min(token.getValue(), otherFrequency);
      }
    }
    return 2.0 * numMatchedToken / totalToken * 100;
  }

  /** Sums the frequencies of all tokens in the given map. */
  private static int getTotalNumTokens(Map<String, Integer> inputStringTokens) {
    return inputStringTokens.values().stream().mapToInt(Integer::intValue).sum();
  }
}