All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.openlca.text.PhraseParser Maven / Gradle / Ivy

There is a newer version: 2.2.1
Show newest version
package org.openlca.text;

import java.util.HashSet;
import java.util.Set;

public class PhraseParser {

	private final HashSet stopWords;
	private final HashSet fillSet;

	public PhraseParser() {
		this(null);
	}

	public PhraseParser(Set stopWords) {
		if (stopWords == null || stopWords.isEmpty()) {
			this.stopWords = null;
		} else {
			this.stopWords = new HashSet<>();
			for (var w : stopWords) {
				if (w == null || w.isBlank())
					continue;
				var trimmed = w.trim().toLowerCase();
				this.stopWords.add(trimmed);
			}
		}
		this.fillSet = new HashSet<>();
	}

	/**
	 * Parses the words of the given phrase into a new word buffer.
	 */
	public WordBuffer parse(String phrase) {
		var buffer = new WordBuffer();
		parseInto(buffer, phrase);
		return buffer;
	}

	/**
	 * Resets the given buffer and parses the phrase into the buffer.
	 */
	public void parseInto(WordBuffer buffer, String phrase) {
		fillSet.clear();
		buffer.reset();

		if (phrase == null || phrase.isBlank())
			return;

		var feed = phrase.trim().toLowerCase();
		var word = new StringBuilder();
		Runnable wordEnd = () -> {
			if (word.isEmpty())
				return;
			var w = word.toString().toLowerCase();
			word.setLength(0);
			if (fillSet.contains(w) || isStopWord(w))
				return;
			fillSet.add(w);
			buffer.add(w);
		};

		for (char c : feed.toCharArray()) {
			if (Character.isLetterOrDigit(c)
				|| c == '-' || c == '%') {
				word.append(c);
				continue;
			}
			wordEnd.run();
		}
		wordEnd.run();

	}

	private boolean isStopWord(String word) {
		return stopWords != null && stopWords.contains(word);
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy