All downloads are free. Search and download functionality uses the official Maven repository.

org.molgenis.semanticsearch.string.Stemmer Maven / Gradle / Ivy

There is a newer version: 8.4.5
Show newest version
package org.molgenis.semanticsearch.string;

import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.tartarus.snowball.ext.PorterStemmer;

/**
 * Static helpers for normalising free text and reducing words to their stems with the Snowball
 * {@link PorterStemmer}.
 *
 * <p>This class is not instantiable.
 */
public final class Stemmer {
  private static final String ILLEGAL_REGEX_PATTERN = "[^a-zA-Z0-9 ]";

  /** Compiled once; matches every character that is not an ASCII letter, digit or space. */
  private static final Pattern ILLEGAL_CHARACTERS = Pattern.compile(ILLEGAL_REGEX_PATTERN);

  /** Compiled once; matches a run of one or more space characters. */
  private static final Pattern MULTIPLE_SPACES = Pattern.compile(" +");

  private Stemmer() {}

  /**
   * Remove illegal characters from the string and stem each single word.
   *
   * <p>The phrase is first normalised with {@link #replaceIllegalCharacter(String)}, then split on
   * single spaces. Each word is stemmed, and words whose stem is empty are dropped.
   *
   * @param phrase the text to clean and stem; must not be {@code null}
   * @return a string that consists of the stemmed words separated by single spaces, or an empty
   *     string when no non-empty stem remains
   */
  public static String cleanStemPhrase(String phrase) {
    StringBuilder stringBuilder = new StringBuilder();
    for (String word : replaceIllegalCharacter(phrase).split(" ")) {
      String stemmedWord = stem(word);
      if (StringUtils.isNotEmpty(stemmedWord)) {
        // Separator goes before every word except the first, so there is no trailing space.
        if (stringBuilder.length() > 0) {
          stringBuilder.append(' ');
        }

        stringBuilder.append(stemmedWord);
      }
    }
    return stringBuilder.toString();
  }

  /**
   * Stem a single word.
   *
   * <p>A new {@link PorterStemmer} is created for every call, so no stemmer state is shared
   * between invocations.
   *
   * @param word the word to stem
   * @return the stemmer's current value after stemming {@code word}
   */
  public static String stem(String word) {
    PorterStemmer porterStemmer = new PorterStemmer();
    porterStemmer.setCurrent(word);
    porterStemmer.stem();
    return porterStemmer.getCurrent();
  }

  /**
   * Stem every term and join the results with a single space.
   *
   * <p>Unlike {@link #cleanStemPhrase(String)}, the terms are not cleaned first and empty stems
   * are not filtered out. The order of the result follows the iteration order of {@code terms}.
   *
   * @param terms the terms to stem; must not be {@code null}
   * @return the stemmed terms separated by single spaces
   */
  public static String stemAndJoin(Set<String> terms) {
    return terms.stream().map(Stemmer::stem).collect(Collectors.joining(" "));
  }

  /**
   * Normalise a string to lower-case ASCII letters, digits and single spaces.
   *
   * <p>Every character outside {@code [a-zA-Z0-9 ]} is replaced by a space, runs of spaces are
   * collapsed to one, leading and trailing whitespace is trimmed, and the result is lower-cased.
   *
   * @param string the text to normalise; must not be {@code null}
   * @return the normalised text
   */
  public static String replaceIllegalCharacter(String string) {
    String withoutIllegalCharacters = ILLEGAL_CHARACTERS.matcher(string).replaceAll(" ");
    String singleSpaced = MULTIPLE_SPACES.matcher(withoutIllegalCharacters).replaceAll(" ");
    // Locale.ROOT keeps the result ASCII: with the default locale set to Turkish, 'I' would
    // lower-case to the dotless 'ı', which is outside the alphabet this method guarantees.
    return singleSpaced.trim().toLowerCase(Locale.ROOT);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy