org.molgenis.semanticsearch.string.Stemmer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of molgenis-semantic-search Show documentation
Show all versions of molgenis-semantic-search Show documentation
Semantic data search service functionality.
package org.molgenis.semanticsearch.string;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.tartarus.snowball.ext.PorterStemmer;
/**
 * Static helpers that normalise free text and reduce words to their stems using the Snowball
 * {@link PorterStemmer}.
 *
 * <p>Utility class: it only exposes static methods and cannot be instantiated.
 */
public final class Stemmer {
  /** Matches every character that is not an ASCII letter, an ASCII digit or a space. */
  private static final String ILLEGAL_REGEX_PATTERN = "[^a-zA-Z0-9 ]";

  /** Compiled once so the regular expressions are not re-compiled on every call. */
  private static final Pattern ILLEGAL_CHARACTERS = Pattern.compile(ILLEGAL_REGEX_PATTERN);

  /** Matches a run of one or more consecutive spaces. */
  private static final Pattern SPACE_RUNS = Pattern.compile(" +");

  private Stemmer() {}

  /**
   * Remove illegal characters from the string and stem each single word.
   *
   * <p>The phrase is first normalised with {@link #replaceIllegalCharacter(String)}, then split on
   * single spaces; every resulting word is stemmed and the non-empty stems are joined with a single
   * space.
   *
   * @param phrase the text to clean and stem; must not be {@code null}
   * @return a string that consists of stemmed words separated by single spaces; empty when no
   *     non-empty stem remains
   */
  public static String cleanStemPhrase(String phrase) {
    StringBuilder stringBuilder = new StringBuilder();
    for (String word : replaceIllegalCharacter(phrase).split(" ")) {
      String stemmedWord = stem(word);
      // An empty phrase still yields one empty token from split(); skip such tokens.
      if (StringUtils.isNotEmpty(stemmedWord)) {
        if (stringBuilder.length() > 0) {
          stringBuilder.append(' ');
        }
        stringBuilder.append(stemmedWord);
      }
    }
    return stringBuilder.toString();
  }

  /**
   * Stem a single word.
   *
   * <p>The word is handed to the stemmer as-is: no cleaning or lower-casing is applied here. A
   * fresh {@link PorterStemmer} is created for every call, so no stemmer state is shared between
   * calls.
   *
   * @param word the word to stem
   * @return the stemmer's current value after stemming
   */
  public static String stem(String word) {
    PorterStemmer porterStemmer = new PorterStemmer();
    porterStemmer.setCurrent(word);
    porterStemmer.stem();
    return porterStemmer.getCurrent();
  }

  /**
   * Stem every term and join the stems with a single space.
   *
   * <p>Unlike {@link #cleanStemPhrase(String)}, the terms are neither cleaned nor filtered: each
   * term is stemmed as given and every stem, including an empty one, takes part in the join. The
   * output order is the iteration order of {@code terms}.
   *
   * @param terms the terms to stem; must not be {@code null}
   * @return the stemmed terms separated by single spaces; empty when {@code terms} is empty
   */
  public static String stemAndJoin(Set<String> terms) {
    return terms.stream().map(Stemmer::stem).collect(Collectors.joining(" "));
  }

  /**
   * Normalise a string for stemming.
   *
   * <p>Every character other than an ASCII letter, an ASCII digit or a space is replaced by a
   * space, runs of spaces are collapsed into one, leading and trailing whitespace is trimmed and
   * the result is lower-cased.
   *
   * @param string the text to normalise; must not be {@code null}
   * @return the normalised, lower-case text
   */
  public static String replaceIllegalCharacter(String string) {
    String withoutIllegalCharacters = ILLEGAL_CHARACTERS.matcher(string).replaceAll(" ");
    String singleSpaced = SPACE_RUNS.matcher(withoutIllegalCharacters).replaceAll(" ");
    // Locale.ROOT keeps the result independent of the JVM default locale; a Turkish default
    // locale would otherwise lower-case 'I' to the non-ASCII dotless 'ı'.
    return singleSpaced.trim().toLowerCase(Locale.ROOT);
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy