All downloads are free. Search and download functionality uses the official Maven repository.

org.molgenis.data.semanticsearch.string.Stemmer Maven / Gradle / Ivy

package org.molgenis.data.semanticsearch.string;

import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.PorterStemmer;

/**
 * Cleans free text and reduces its words to their stems using a Snowball {@link PorterStemmer}.
 * <p>
 * One stemmer instance is held per {@code Stemmer}; {@link #stem(String)} is {@code synchronized} so that its
 * set-current / stem / get-current sequence on that shared instance is executed as one unit.
 */
public class Stemmer
{
	/** One or more consecutive characters that are not ASCII letters or digits (this includes spaces). */
	private static final Pattern NON_ALPHANUMERIC_RUN = Pattern.compile("[^a-zA-Z0-9]+");

	private final SnowballProgram porterStemmer;

	public Stemmer()
	{
		porterStemmer = new PorterStemmer();
	}

	/**
	 * Remove illegal characters from the string and stem each single word
	 * 
	 * @param phrase
	 *            the text to clean and stem; must not be {@code null}
	 * @return a string that consists of the non-empty stemmed words, separated by single spaces; empty when the
	 *         phrase contains no letters or digits
	 * @throws NullPointerException
	 *             if {@code phrase} is {@code null}
	 */
	public String cleanStemPhrase(String phrase)
	{
		StringBuilder stringBuilder = new StringBuilder();
		for (String word : replaceIllegalCharacter(phrase).split(" "))
		{
			String stemmedWord = stem(word);
			// An empty cleaned phrase splits into a single empty token; skip it instead of emitting a blank
			if (StringUtils.isNotEmpty(stemmedWord))
			{
				if (stringBuilder.length() > 0)
				{
					stringBuilder.append(' ');
				}

				stringBuilder.append(stemmedWord);
			}
		}
		return stringBuilder.toString();
	}

	/**
	 * Stems a single word. The word is passed to the stemmer as-is; no cleaning or lower-casing is applied here.
	 * 
	 * @param word
	 *            the word to stem
	 * @return the stemmer's result for {@code word}
	 */
	public synchronized String stem(String word)
	{
		porterStemmer.setCurrent(word);
		porterStemmer.stem();
		return porterStemmer.getCurrent();
	}

	/**
	 * Stems every term and joins the results with a single space, in the iteration order of the set. Unlike
	 * {@link #cleanStemPhrase(String)}, terms are not cleaned first and empty stems are not filtered out.
	 * 
	 * @param terms
	 *            the terms to stem
	 * @return the stemmed terms joined by spaces; empty when {@code terms} is empty
	 */
	public String stemAndJoin(Set<String> terms)
	{
		return terms.stream().map(this::stem).collect(Collectors.joining(" "));
	}

	/**
	 * Replaces every run of characters other than ASCII letters and digits with a single space, strips leading
	 * and trailing whitespace and lower-cases the result.
	 * 
	 * @param string
	 *            the text to clean; must not be {@code null}
	 * @return the cleaned, lower-cased text with words separated by single spaces
	 * @throws NullPointerException
	 *             if {@code string} is {@code null}
	 */
	public String replaceIllegalCharacter(String string)
	{
		// Collapsing whole runs in one pass gives the same result as replacing each illegal character with a
		// space and then squeezing repeated spaces.
		String cleaned = NON_ALPHANUMERIC_RUN.matcher(string).replaceAll(" ").trim();
		// Locale.ROOT keeps the mapping locale-independent (e.g. 'I' -> 'i' even under a Turkish default locale)
		return cleaned.toLowerCase(Locale.ROOT);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy