All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.conqat.lib.commons.string.SimpleNLPUtils Maven / Gradle / Ivy

There is a newer version: 2024.7.2
Show newest version
/*
 * Copyright (c) CQSE GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.conqat.lib.commons.string;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.collections.CollectionUtils;

/** Utility class for natural language processing. */
public class SimpleNLPUtils {

	/**
	 * The regex pattern used to identify word boundaries. Using only "\\s" here is not enough. We use
	 * \p{Zs}" which matches all unicode horizontal whitespace characters including invisible
	 * non-whitespace characters such as NBSP.
	 */
	private static final Pattern WORD_SEPARATION_PATTERN = Pattern.compile("[\\s\\p{Z}]+");

	/** List of irregular superlative adjectives. */
	public static final List IRREGULAR_SUPERLATIVES = Arrays.asList("best", "worst", "least", "furthest",
			"farthest");

	/** List of irregular comparative adjectives */
	public static final List IRREGULAR_COMPARATIVES = Arrays.asList("better", "worse", "less", "further",
			"farther");

	/**
	 * Regular expression to match words with at least two syllables. This regular expression uses the
	 * simplified assumption that a syllable is formed by a vowel. The English language sometimes
	 * considers the letter 'y' as a vowel, for instance in words like "myth" or "symmetry", where 'y'
	 * makes a sound that is often associated with vowels. That's why 'y' is included in this
	 * implementation.
	 * 
    *
  • [A-Za-z]* : Matches any sequence of letters (upper-case and lower-case), including an empty * sequence
  • *
  • [aeiouy] : Matches a single vowel
  • *
  • [A-Za-z]* : Matches any sequence of letters (upper-case and lower-case), including an empty * sequence
  • *
  • [aeiouy] : Matches a single vowel
  • *
  • [A-Za-z]* : Matches any sequence of letters (upper-case and lower-case), including an empty * sequence
  • *
* Please note this pattern is a simplification. For accurate syllable counting in natural * languages, more advanced linguistic processing techniques or libraries may be required. */ public static final String REGEX_WORD_WITH_TWO_SYLLABLES = "\\b\\w*?([aeiouy]+[^aeiouy\\s]+){2,}\\w*?\\b"; /** Pattern that matches sentences in a text. */ private static final Pattern SENTENCE_END_PATTERN = Pattern.compile("\\s*(\\S.*?[.!?])(?=\\s+\\p{Lu}|$)", Pattern.DOTALL); /** * Represents the details of a sentence, including the sentence itself, the starting offset * (inclusive), and the ending offset (inclusive). */ public static class SentenceDetails { private final String sentence; private final int startOffset; private final int endOffset; public SentenceDetails(String sentence, int startOffset, int endOffset) { this.sentence = sentence; this.startOffset = startOffset; this.endOffset = endOffset; } public String getSentence() { return sentence; } public int getStartOffset() { return startOffset; } public int getEndOffset() { return endOffset; } } /** * Splits the given text into words using whitespace (including control characters such as line * breaks or tabs) as boundary. * * @see #WORD_SEPARATION_PATTERN */ public static @NonNull List splitIntoWords(@Nullable String text) { if (StringUtils.isEmpty(text)) { return CollectionUtils.emptyList(); } return Arrays.stream(WORD_SEPARATION_PATTERN.split(text)) .filter(word -> !org.apache.commons.lang3.StringUtils.isBlank(word)).collect(Collectors.toList()); } /** * Replace substrings that match one of the ignore-patterns with spaces without changing the length * of the text or other character positions. 
*/ public static @NonNull String removeIgnoredSubstrings(@NonNull Pattern ignorePattern, @Nullable String text) { if (text == null) { return StringUtils.EMPTY_STRING; } Matcher matcher = ignorePattern.matcher(text); StringBuilder builder = new StringBuilder(text); while (matcher.find()) { int startIndex = matcher.start(); int endIndex = matcher.end(); builder.replace(startIndex, endIndex, StringUtils.repeat(StringUtils.SPACE, endIndex - startIndex)); } return builder.toString(); } /** * Check if the given character is considered a word boundary character, i.e., anything other than * letters or digits. */ private static boolean isWordBoundary(char character) { return !Character.isLetterOrDigit(character); } /** * Check if the word starting at {@code index} with length {@code wordLength} within {@code text} * has word boundary characters at its left and right. Word boundary characters include any * character that is not a letter or a digit, as determined by the {@link #isWordBoundary} method. */ public static boolean hasWordBoundaries(int index, int wordLength, String text) { // no word boundary on the left? if (index > 0 && !isWordBoundary(text.charAt(index - 1))) { return false; } // no word boundary on the right? return index + wordLength >= text.length() || isWordBoundary(text.charAt(index + wordLength)); } /** * Split the input text into sentences and provide their respective start and end offsets. * * Each sentence is identified by sentence terminators (".", "!" & "?"). For each sentence, an * instance of {@link SentenceDetails} is created, capturing the sentence text, start offset * (inclusive), and end offset (inclusive). These details are stored in a list of objects. The * method returns a list of {@link SentenceDetails} for all sentences in the input text. * *

* Limitations: The sentence splitting method currently has the following limitations: *

    *
  • The method may not properly handle sentences that end with non-word characters (like quotes * or parentheses) before the space and the next sentence starting with an uppercase letter or * reaching the end of the line. For example, a text like `She said, "This is a quote!".` will be * split into `She said,`, `"This is a quote!`); while it is likely that the expected split sentence * is `She said, "This is a quote!".`
  • *
  • The sentence terminators (".", "!" & "?") are considered as the end of the sentences. In some * contexts, these characters may not represent the end of a sentence. For example, in honorifics * (like Mr., Mrs., Dr.), abbreviations, or decimal numbers.
  • *
  • It may not correctly handle sentences where the next sentence does not start with an * uppercase letter after the punctuation.
  • *
*

*/ public static List splitIntoSentences(String text) { if (StringUtils.isEmpty(text)) { return CollectionUtils.emptyList(); } List sentences = new ArrayList<>(); Matcher matcher = SENTENCE_END_PATTERN.matcher(text); while (matcher.find()) { // skip leading whitespaces String sentence = matcher.group(1); int start = matcher.start(1); int end = matcher.end(1); sentences.add(new SentenceDetails(sentence, start, end - 1)); } return sentences; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy