org.conqat.lib.commons.string.SimpleNLPUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of teamscale-lib-commons Show documentation
Provides common utility functions
There is a newer version: 2024.7.2
Show newest version
/*
 * Copyright (c) CQSE GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.conqat.lib.commons.string;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.collections.CollectionUtils;

/** Utility class for natural language processing. */
public class SimpleNLPUtils {

	/**
	 * The regex pattern used to identify word boundaries. Using only {@code \s} is not enough, because
	 * it does not cover Unicode separators such as the no-break space (NBSP). The pattern therefore
	 * additionally matches {@code \p{Z}}, i.e., all characters of the Unicode separator categories
	 * (space, line and paragraph separators). A run of one or more such characters is treated as a
	 * single boundary.
	 */
	private static final Pattern WORD_SEPARATION_PATTERN = Pattern.compile("[\\s\\p{Z}]+");

	/** List of irregular superlative adjectives. */
	public static final List<String> IRREGULAR_SUPERLATIVES = Arrays.asList("best", "worst", "least", "furthest",
			"farthest");

	/** List of irregular comparative adjectives. */
	public static final List<String> IRREGULAR_COMPARATIVES = Arrays.asList("better", "worse", "less", "further",
			"farther");

	/**
	 * Regular expression to match words with at least two syllables. This regular expression uses the
	 * simplified assumption that a syllable is formed by a group of vowels followed by at least one
	 * non-vowel. The English language sometimes considers the letter 'y' as a vowel, for instance in
	 * words like "myth" or "symmetry", where 'y' makes a sound that is often associated with vowels.
	 * That's why 'y' is included in this implementation.
	 * <ul>
	 * <li>{@code \b} : Matches a word boundary</li>
	 * <li>{@code \w*?} : Matches any (possibly empty) sequence of word characters, as few as
	 * possible</li>
	 * <li>{@code ([aeiouy]+[^aeiouy\s]+){2,}} : Matches at least two repetitions of one or more
	 * lower-case vowels followed by one or more characters that are neither lower-case vowels nor
	 * whitespace</li>
	 * <li>{@code \w*?} : Matches any (possibly empty) sequence of word characters, as few as
	 * possible</li>
	 * <li>{@code \b} : Matches a word boundary</li>
	 * </ul>
	 * Only lower-case vowels are recognized unless the expression is compiled case-insensitively. A
	 * vowel group at the very end of a word is not counted, as it is not followed by a non-vowel: for
	 * instance "better" is matched, whereas "idea" is not.
	 * <p>
	 * Please note this pattern is a simplification. For accurate syllable counting in natural
	 * languages, more advanced linguistic processing techniques or libraries may be required.
	 */
	public static final String REGEX_WORD_WITH_TWO_SYLLABLES = "\\b\\w*?([aeiouy]+[^aeiouy\\s]+){2,}\\w*?\\b";

	/**
	 * Pattern that matches sentences in a text. After skipping optional leading whitespace, group 1
	 * captures the shortest character sequence that starts with a non-whitespace character and ends
	 * with one of the terminators ".", "!" or "?", provided the terminator is followed either by
	 * whitespace and an upper-case letter or by the end of the input. {@link Pattern#DOTALL} lets a
	 * sentence span multiple lines.
	 */
	private static final Pattern SENTENCE_END_PATTERN = Pattern.compile("\\s*(\\S.*?[.!?])(?=\\s+\\p{Lu}|$)",
			Pattern.DOTALL);

	/**
	 * Represents the details of a sentence, including the sentence itself, the starting offset
	 * (inclusive), and the ending offset (inclusive).
	 */
	public static class SentenceDetails {

		/** The text of the sentence. */
		private final String sentence;

		/** Offset of the first character of the sentence (inclusive). */
		private final int startOffset;

		/** Offset of the last character of the sentence (inclusive). */
		private final int endOffset;

		/**
		 * Creates the details for a single sentence.
		 *
		 * @param sentence
		 *            the text of the sentence
		 * @param startOffset
		 *            offset of the first character of the sentence (inclusive)
		 * @param endOffset
		 *            offset of the last character of the sentence (inclusive)
		 */
		public SentenceDetails(String sentence, int startOffset, int endOffset) {
			this.sentence = sentence;
			this.startOffset = startOffset;
			this.endOffset = endOffset;
		}

		/** Returns the text of the sentence. */
		public String getSentence() {
			return sentence;
		}

		/** Returns the offset of the first character of the sentence (inclusive). */
		public int getStartOffset() {
			return startOffset;
		}

		/** Returns the offset of the last character of the sentence (inclusive). */
		public int getEndOffset() {
			return endOffset;
		}
	}

	/**
	 * Splits the given text into words using whitespace (including control characters such as line
	 * breaks or tabs) as boundary. Blank fragments, e.g. the empty string produced by leading
	 * whitespace, are not part of the result.
	 *
	 * @param text
	 *            the text to split, may be {@code null}
	 * @return the words in order of appearance; an empty list if the text is considered empty by
	 *         {@code StringUtils.isEmpty}
	 *
	 * @see #WORD_SEPARATION_PATTERN
	 */
	public static @NonNull List<String> splitIntoWords(@Nullable String text) {
		if (StringUtils.isEmpty(text)) {
			return CollectionUtils.emptyList();
		}
		return Arrays.stream(WORD_SEPARATION_PATTERN.split(text))
				.filter(word -> !org.apache.commons.lang3.StringUtils.isBlank(word)).collect(Collectors.toList());
	}

	/**
	 * Replace all substrings that match the given ignore-pattern with spaces without changing the
	 * length of the text or other character positions.
	 *
	 * @param ignorePattern
	 *            the pattern describing the substrings to blank out
	 * @param text
	 *            the text to process, may be {@code null}
	 * @return the text with every match overwritten by spaces, or the empty string if the text is
	 *         {@code null}
	 */
	public static @NonNull String removeIgnoredSubstrings(@NonNull Pattern ignorePattern, @Nullable String text) {
		if (text == null) {
			return StringUtils.EMPTY_STRING;
		}

		// The matcher works on the unmodified text. As each replacement has exactly the
		// length of the match, the offsets stay valid for the builder as well.
		Matcher matcher = ignorePattern.matcher(text);
		StringBuilder builder = new StringBuilder(text);
		while (matcher.find()) {
			int startIndex = matcher.start();
			int endIndex = matcher.end();
			builder.replace(startIndex, endIndex, StringUtils.repeat(StringUtils.SPACE, endIndex - startIndex));
		}
		return builder.toString();
	}

	/**
	 * Check if the given Unicode code point is considered a word boundary character, i.e., anything
	 * other than letters or digits.
	 */
	private static boolean isWordBoundary(int codePoint) {
		return !Character.isLetterOrDigit(codePoint);
	}

	/**
	 * Check if the word starting at {@code index} with length {@code wordLength} within {@code text}
	 * has word boundary characters at its left and right. Word boundary characters include any
	 * character that is not a letter or a digit, as determined by the {@link #isWordBoundary} method.
	 * The start and the end of the text count as word boundaries as well.
	 * <p>
	 * The neighboring characters are inspected as full Unicode code points, so letters and digits
	 * outside the Basic Multilingual Plane (which are encoded as surrogate pairs) are not mistaken for
	 * boundaries.
	 *
	 * @param index
	 *            offset of the first character of the word within {@code text}
	 * @param wordLength
	 *            length of the word in {@code char} units
	 * @param text
	 *            the text containing the word
	 * @return {@code true} if the word is delimited by word boundaries on both sides
	 * @throws IndexOutOfBoundsException
	 *             if {@code index} is greater than the length of the text or
	 *             {@code index + wordLength} is negative
	 */
	public static boolean hasWordBoundaries(int index, int wordLength, String text) {
		// no word boundary on the left?
		if (index > 0 && !isWordBoundary(text.codePointBefore(index))) {
			return false;
		}
		// no word boundary on the right?
		int endIndex = index + wordLength;
		return endIndex >= text.length() || isWordBoundary(text.codePointAt(endIndex));
	}

	/**
	 * Split the input text into sentences and provide their respective start and end offsets.
	 * <p>
	 * Each sentence is identified by sentence terminators (".", "!" and "?"). For each sentence, an
	 * instance of {@link SentenceDetails} is created, capturing the sentence text (without leading
	 * whitespace), start offset (inclusive), and end offset (inclusive). The method returns a list of
	 * {@link SentenceDetails} for all sentences in the input text, in order of appearance.
	 * <p>
	 * Limitations: The sentence splitting method currently has the following limitations:
	 * <ul>
	 * <li>A terminator is only accepted as the end of a sentence if it is directly followed by
	 * whitespace and an upper-case letter, or by the end of the text. Sentences that end with other
	 * characters (like quotes or parentheses) after the terminator are therefore not split there. For
	 * example, the text {@code He said "Stop!" Then he left.} is returned as one single sentence.</li>
	 * <li>The sentence terminators (".", "!" and "?") are considered as the end of the sentences. In
	 * some contexts, these characters may not represent the end of a sentence. For example, in
	 * honorifics (like Mr., Mrs., Dr.), abbreviations, or decimal numbers.</li>
	 * <li>It may not correctly handle sentences where the next sentence does not start with an
	 * upper-case letter after the punctuation.</li>
	 * <li>Text after the last sentence that does not end with a terminator is not part of the
	 * result.</li>
	 * </ul>
	 *
	 * @param text
	 *            the text to split, may be {@code null}
	 * @return the details of all detected sentences; an empty list if the text is considered empty by
	 *         {@code StringUtils.isEmpty}
	 */
	public static List<SentenceDetails> splitIntoSentences(String text) {
		if (StringUtils.isEmpty(text)) {
			return CollectionUtils.emptyList();
		}
		List<SentenceDetails> sentences = new ArrayList<>();
		Matcher matcher = SENTENCE_END_PATTERN.matcher(text);

		while (matcher.find()) {
			// group 1 excludes the leading whitespace consumed by the pattern
			String sentence = matcher.group(1);
			int start = matcher.start(1);
			// Matcher#end is exclusive, SentenceDetails expects an inclusive offset
			int end = matcher.end(1) - 1;
			sentences.add(new SentenceDetails(sentence, start, end));
		}
		return sentences;
	}
}