org.conqat.lib.commons.string.SimpleNLPUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of teamscale-lib-commons Show documentation
Show all versions of teamscale-lib-commons Show documentation
Provides common utility functions
/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.conqat.lib.commons.string;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.collections.CollectionUtils;
/** Utility class for natural language processing. */
public class SimpleNLPUtils {

    /**
     * The regex pattern used to identify word boundaries. Using only {@code \s} is not enough, as it
     * does not cover Unicode separators. The pattern therefore additionally uses {@code \p{Z}}, the
     * Unicode "separator" category (space, line and paragraph separators), which includes
     * characters such as the no-break space (NBSP, U+00A0).
     */
    private static final Pattern WORD_SEPARATION_PATTERN = Pattern.compile("[\\s\\p{Z}]+");

    /** Unmodifiable list of irregular superlative adjectives. */
    public static final List<String> IRREGULAR_SUPERLATIVES = Collections
            .unmodifiableList(Arrays.asList("best", "worst", "least", "furthest", "farthest"));

    /** Unmodifiable list of irregular comparative adjectives. */
    public static final List<String> IRREGULAR_COMPARATIVES = Collections
            .unmodifiableList(Arrays.asList("better", "worse", "less", "further", "farther"));

    /**
     * Regular expression to match words with at least two syllables. This regular expression uses the
     * simplified assumption that a syllable is formed by a group of vowels followed by a group of
     * non-vowels. The English language sometimes considers the letter 'y' as a vowel, for instance in
     * words like "myth" or "symmetry", where 'y' makes a sound that is often associated with vowels.
     * That's why 'y' is included in this implementation.
     *
     * <ul>
     * <li>{@code \b} : Word boundary at the start of the word</li>
     * <li>{@code \w*?} : Any (possibly empty) sequence of word characters, matched reluctantly</li>
     * <li>{@code ([aeiouy]+[^aeiouy\s]+){2,}} : At least two repetitions of one or more lower-case
     * vowels followed by one or more characters that are neither a lower-case vowel nor
     * whitespace</li>
     * <li>{@code \w*?} : Any (possibly empty) sequence of word characters, matched reluctantly</li>
     * <li>{@code \b} : Word boundary at the end of the word</li>
     * </ul>
     *
     * Please note this pattern is a simplification: the expression itself only lists lower-case
     * vowels, and every vowel group has to be followed by at least one non-vowel character, so a word
     * such as "data", whose second vowel group ends the word, is not matched. For accurate syllable
     * counting in natural languages, more advanced linguistic processing techniques or libraries may
     * be required.
     */
    public static final String REGEX_WORD_WITH_TWO_SYLLABLES = "\\b\\w*?([aeiouy]+[^aeiouy\\s]+){2,}\\w*?\\b";

    /**
     * Pattern that matches sentences in a text. Leading whitespace is skipped; group 1 captures the
     * sentence itself, which starts at a non-whitespace character and reluctantly extends up to a
     * terminator ({@code .}, {@code !} or {@code ?}) that is followed either by whitespace and an
     * upper-case letter or by the end of the input. {@link Pattern#DOTALL} lets a sentence span line
     * breaks.
     */
    private static final Pattern SENTENCE_END_PATTERN = Pattern.compile("\\s*(\\S.*?[.!?])(?=\\s+\\p{Lu}|$)",
            Pattern.DOTALL);

    /**
     * Represents the details of a sentence, including the sentence itself, the starting offset
     * (inclusive), and the ending offset (inclusive).
     */
    public static class SentenceDetails {

        /** The text of the sentence. */
        private final String sentence;

        /** Offset of the first character of the sentence (inclusive). */
        private final int startOffset;

        /** Offset of the last character of the sentence (inclusive). */
        private final int endOffset;

        /**
         * Creates the details for a single sentence.
         *
         * @param sentence
         *            the text of the sentence
         * @param startOffset
         *            offset of the first character of the sentence (inclusive)
         * @param endOffset
         *            offset of the last character of the sentence (inclusive)
         */
        public SentenceDetails(String sentence, int startOffset, int endOffset) {
            this.sentence = sentence;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        /** Returns the text of the sentence. */
        public String getSentence() {
            return sentence;
        }

        /** Returns the offset of the first character of the sentence (inclusive). */
        public int getStartOffset() {
            return startOffset;
        }

        /** Returns the offset of the last character of the sentence (inclusive). */
        public int getEndOffset() {
            return endOffset;
        }
    }

    /**
     * Splits the given text into words using whitespace (including control characters such as line
     * breaks or tabs) as boundary.
     *
     * @param text
     *            the text to split, may be {@code null}
     * @return the words in order of appearance without blank entries; an empty list if the text is
     *         {@code null} or empty
     *
     * @see #WORD_SEPARATION_PATTERN
     */
    public static @NonNull List<String> splitIntoWords(@Nullable String text) {
        if (StringUtils.isEmpty(text)) {
            return CollectionUtils.emptyList();
        }
        // Splitting yields an empty leading token if the text starts with a separator, hence the
        // filter for blank entries.
        return Arrays.stream(WORD_SEPARATION_PATTERN.split(text))
                .filter(word -> !org.apache.commons.lang3.StringUtils.isBlank(word)).collect(Collectors.toList());
    }

    /**
     * Replaces all substrings that match the ignore-pattern with spaces without changing the length
     * of the text or other character positions.
     *
     * @param ignorePattern
     *            the pattern describing the substrings to blank out
     * @param text
     *            the text to process, may be {@code null}
     * @return the text with every match overwritten by spaces; the empty string if the text is
     *         {@code null}
     */
    public static @NonNull String removeIgnoredSubstrings(@NonNull Pattern ignorePattern, @Nullable String text) {
        if (text == null) {
            return StringUtils.EMPTY_STRING;
        }
        Matcher matcher = ignorePattern.matcher(text);
        StringBuilder builder = new StringBuilder(text);
        while (matcher.find()) {
            int startIndex = matcher.start();
            int endIndex = matcher.end();
            // The replacement has exactly the length of the match, so the offsets reported by the
            // matcher (which operates on the original text) stay valid for the builder.
            builder.replace(startIndex, endIndex, StringUtils.repeat(StringUtils.SPACE, endIndex - startIndex));
        }
        return builder.toString();
    }

    /**
     * Check if the given character is considered a word boundary character, i.e., anything other than
     * letters or digits.
     */
    private static boolean isWordBoundary(char character) {
        return !Character.isLetterOrDigit(character);
    }

    /**
     * Check if the word starting at {@code index} with length {@code wordLength} within {@code text}
     * has word boundary characters at its left and right. Word boundary characters include any
     * character that is not a letter or a digit, as determined by the {@link #isWordBoundary} method.
     * The start and the end of the text count as word boundaries as well.
     *
     * @param index
     *            offset of the first character of the word within {@code text}; callers are expected
     *            to pass an offset that lies within the text
     * @param wordLength
     *            number of characters of the word
     * @param text
     *            the text that contains the word
     * @return {@code true} if the word is delimited on both sides, {@code false} otherwise
     */
    public static boolean hasWordBoundaries(int index, int wordLength, String text) {
        // no word boundary on the left?
        if (index > 0 && !isWordBoundary(text.charAt(index - 1))) {
            return false;
        }
        // no word boundary on the right?
        return index + wordLength >= text.length() || isWordBoundary(text.charAt(index + wordLength));
    }

    /**
     * Split the input text into sentences and provide their respective start and end offsets.
     *
     * <p>
     * Each sentence is identified by sentence terminators (".", "!" &amp; "?"). For each sentence, an
     * instance of {@link SentenceDetails} is created, capturing the sentence text, start offset
     * (inclusive), and end offset (inclusive). Leading whitespace is not part of a sentence.
     *
     * <p>
     * Limitations: The sentence splitting method currently has the following limitations:
     *
     * <ul>
     * <li>A terminator only ends a sentence if it is followed by whitespace and an upper-case letter
     * or by the end of the text. A terminator that is directly followed by another character (like a
     * closing quote or parenthesis) is therefore not recognized as a sentence end. For example,
     * {@code He said "Stop!" Then he left.} is returned as one single sentence.</li>
     * <li>The sentence terminators (".", "!" &amp; "?") are considered as the end of the sentences.
     * In some contexts, these characters may not represent the end of a sentence. For example, in
     * honorifics (like Mr., Mrs., Dr.), abbreviations, or decimal numbers.</li>
     * <li>Sentences are not separated if the next sentence does not start with an upper-case letter
     * after the punctuation.</li>
     * <li>Trailing text that is not closed by a sentence terminator is not reported at all.</li>
     * </ul>
     *
     * @param text
     *            the text to split
     * @return the details of all detected sentences in order of appearance; an empty list if the
     *         text is empty
     */
    public static List<SentenceDetails> splitIntoSentences(String text) {
        if (StringUtils.isEmpty(text)) {
            return CollectionUtils.emptyList();
        }
        List<SentenceDetails> sentences = new ArrayList<>();
        Matcher matcher = SENTENCE_END_PATTERN.matcher(text);
        while (matcher.find()) {
            // Group 1 excludes the leading whitespace consumed by the pattern. The matcher reports
            // an exclusive end, which is converted to the inclusive end offset.
            sentences.add(new SentenceDetails(matcher.group(1), matcher.start(1), matcher.end(1) - 1));
        }
        return sentences;
    }
}