// com.metaeffekt.artifact.analysis.utils.StringStats Maven / Gradle / Ivy
/*
* Copyright 2021-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metaeffekt.artifact.analysis.utils;
import lombok.Getter;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Builds internal structures for matching strings in a normalized fashion.
 * <p>
 * An instance keeps the trimmed original string, a normalized representation of it (separators enclosed by
 * spaces, quotes removed, comment characters and redundant whitespace collapsed) and a lower-case copy of the
 * normalized representation. Lookups via {@link #indexOf(StringStats, boolean)} are cached per instance.
 * <p>
 * Instances are created through {@link #normalize(String, boolean)} or
 * {@link #normalize(String, boolean, boolean)}.
 */
public class StringStats {

    /** One or more whitespace characters (including line breaks). */
    private static final Pattern PATTERN_WHITESPACES = Pattern.compile("[\\r\\n\\s]+");

    /** Delimiter that template files may use; removed before all other normalization steps. */
    private static final Pattern PATTERN_TEMPLATE_DELIMITER = Pattern.compile("\\.\\\\\" ");

    /** The {@code dnl} separator used in .m4 autoconf files. */
    private static final Pattern PATTERN_DNL_SEPARATOR = Pattern.compile(" dnl ");

    /** Two lower-case letters, a space-enclosed hyphen, two lower-case letters. */
    private static final Pattern PATTERN_HYPHEN_LOWERCASE = Pattern.compile("([a-z]{2}) - ([a-z]{2})");

    /** Two upper-case letters, a space-enclosed hyphen, two upper-case letters. */
    private static final Pattern PATTERN_HYPHEN_UPPERCASE = Pattern.compile("([A-Z]{2}) - ([A-Z]{2})");

    /** A space-enclosed comma. */
    private static final Pattern PATTERN_COMMA = Pattern.compile(" , ");

    /** The simple wildcard {@code *}; used to split a normalized search string into its text parts. */
    private static final Pattern PATTERN_SIMPLE_WILDCARD = Pattern.compile("\\*");

    /** The expression delimiters <code>*{</code> and <code>}*</code> as they appear in the original string. */
    private static final Pattern PATTERN_WILDCARD_ORIGINAL = Pattern.compile("\\*\\{|}\\*");

    /** Matches original strings that start with <code>*{</code> or end with <code>}*</code> (not supported). */
    private static final Pattern FAILURE_PATTERN = Pattern.compile("(\\*\\{.*)|(.*}\\*)");

    /** Quote characters; every one of them is removed during normalization. */
    public static final String QUOTES = "\"\u0027\u0060\u00B4\u2018\u2019\u201C\u201D";

    /** Separator characters; every one of them is enclosed by spaces during normalization. */
    public static final String SEPARATORS = "():;,.\\!?=+-_[]{}<>";

    /**
     * Comment characters replaced by a space when normalizing a match string. This is a plain list of
     * characters, not a regular expression; the wildcard {@code *} is deliberately not part of it.
     */
    public static final String COMMENT_NO_WILDCARD = "\\/|#";

    /**
     * Comment characters replaced by a space when normalizing a non-match string. This is a plain list of
     * characters, not a regular expression; it additionally covers {@code *}.
     */
    public static final String COMMENT_WITH_WILDCARD = "\\*/|#";

    public static final String MARKER = "##MARKER##";

    /** Original strings starting with this prefix are treated as regular expressions by {@code indexOf}. */
    public static final String PREFIX_REGEXP = "^";

    /** Result returned by {@code indexOf} when the value was not found. */
    public static final SimpleIntPair PAIR_NO_MATCH = SimpleIntPair.of(-1, -1);

    private static final int[] EMPTY_INT_ARRAY = new int[0];

    /** Template of the single-element marker that {@code matchRegexp} reports for unsupported expressions. */
    private static final int[] FAILURE_INT = {-1};

    /** Maximum number of characters the text matched by an expression may span between two text elements. */
    private static final int MAX_WILDCARD_GAP = 500;

    @Getter
    private final String originalString;

    /** {@code true} for match and sub-match instances; such instances reject {@link #update(String)}. */
    private final boolean isImmutable;

    @Getter
    private String normalizedString;

    @Getter
    private String normalizedStringLowerCase;

    /** Per-instance cache of {@code indexOf} results; cleared whenever the normalized string is replaced. */
    private final transient Map<String, SimpleIntPair> indexOfCache = new HashMap<>();

    /** Cache for match instances (isMatch without isSubmatch), keyed by the original string as passed in. */
    private static final Map<String, StringStats> STRINGSTAT_PERM_CACHE =
            Collections.synchronizedMap(new HashMap<>());

    /**
     * Cache for sub-match instances, keyed by the original string as passed in.
     * <p>
     * NOTE(review): the cached value references its trimmed original string, and {@link String#trim()} returns
     * the very same instance when there is nothing to trim. In that case the value strongly references its own
     * weak key, so such entries may never be evicted - confirm whether this is intended.
     */
    private static final Map<String, StringStats> STRINGSTAT_TEMP_CACHE =
            Collections.synchronizedMap(new WeakHashMap<>());

    private StringStats(String originalString, boolean isMatch, boolean isSubmatch) {
        this.originalString = originalString.trim();
        this.isImmutable = isMatch || isSubmatch;
        this.normalizedString = normalizeInternal(this.originalString, isMatch, isSubmatch);
        // Locale.ROOT keeps the lower-case form independent of the JVM default locale (e.g. Turkish dotless i)
        this.normalizedStringLowerCase = this.normalizedString.toLowerCase(Locale.ROOT);
    }

    /**
     * Creates (or fetches from a cache) the {@link StringStats} for the given string.
     * <p>
     * Sub-match instances are served from a weak cache, match instances from a permanent cache; all other
     * instances are created anew on every call.
     *
     * @param originalString The string to normalize; must not be {@code null}.
     * @param isMatch Whether the string is a match string; wildcards ({@code *}) are preserved in this case.
     * @param isSubmatch Whether the string is a part of a match string; the normalized string is not enclosed
     *                   with spaces in this case.
     *
     * @return The normalized representation of the string.
     */
    public static StringStats normalize(String originalString, boolean isMatch, boolean isSubmatch) {
        final Map<String, StringStats> cache;
        if (isSubmatch) {
            cache = STRINGSTAT_TEMP_CACHE;
        } else if (isMatch) {
            cache = STRINGSTAT_PERM_CACHE;
        } else {
            cache = null;
        }

        if (cache == null) {
            return new StringStats(originalString, isMatch, isSubmatch);
        }

        final StringStats cached = cache.get(originalString);
        if (cached != null) {
            return cached;
        }

        // get and put are not one atomic step; concurrent callers may each create an equivalent instance
        final StringStats newInstance = new StringStats(originalString, isMatch, isSubmatch);
        cache.put(originalString, newInstance);
        return newInstance;
    }

    /**
     * Computes the normalized representation of the given string.
     *
     * @param originalString The string to normalize.
     * @param isMatch Whether the string is a match string ({@code *} is preserved).
     * @param isSubmatch Whether the string is a part of a match string (no enclosing spaces are added).
     *
     * @return The normalized string.
     */
    private static String normalizeInternal(String originalString, boolean isMatch, boolean isSubmatch) {
        String normalized = originalString.trim();

        // template files may use this as delimiter
        normalized = PATTERN_TEMPLATE_DELIMITER.matcher(normalized).replaceAll("");

        // enclose separators with spaces
        for (int i = 0; i < SEPARATORS.length(); i++) {
            final char separator = SEPARATORS.charAt(i);
            normalized = normalized.replace(String.valueOf(separator), " " + separator + " ");
        }

        // normalize enumerators; replace with dash '-'
        normalized = replaceEachCharacter(normalized, "\u2022°", "-");

        // eliminate quotes
        normalized = replaceEachCharacter(normalized, QUOTES, "");

        // for contains we do not replace the wildcards
        normalized = replaceEachCharacter(normalized, isMatch ? COMMENT_NO_WILDCARD : COMMENT_WITH_WILDCARD, " ");

        if (!isSubmatch) {
            // enclose whole string with spaces (no boundaries)
            normalized = " " + normalized + " ";
        }

        // FIXME: move to mappings; should not be hard-coded here
        // dnl is a separator used in .m4 autoconf files
        normalized = PATTERN_DNL_SEPARATOR.matcher(normalized).replaceAll(" ");

        // remove multiple whitespaces
        normalized = PATTERN_WHITESPACES.matcher(normalized).replaceAll(" ");

        // some specialists introduce hyphens; we remove them again
        // NOTE: this is a rather conservative approach; the split word must have the same case
        //   Other such cases must be covered by explicit mappings
        // FIXME: this should be rather performed on the content with markers; try to move to mappings
        normalized = PATTERN_HYPHEN_LOWERCASE.matcher(normalized).replaceAll("$1$2");
        normalized = PATTERN_HYPHEN_UPPERCASE.matcher(normalized).replaceAll("$1$2");

        // FIXME-KKL: is this a good idea? Text meaning can significantly change without comma. Needs revision.
        // remove ','; replace with ' '
        normalized = PATTERN_COMMA.matcher(normalized).replaceAll(" ");

        // remove multiple whitespaces; second pass (required at least due to the comma replacement)
        normalized = PATTERN_WHITESPACES.matcher(normalized).replaceAll(" ");

        return normalized;
    }

    /** Replaces every occurrence of each single character of {@code characters} with {@code replacement}. */
    private static String replaceEachCharacter(String input, String characters, String replacement) {
        String result = input;
        for (int i = 0; i < characters.length(); i++) {
            result = result.replace(String.valueOf(characters.charAt(i)), replacement);
        }
        return result;
    }

    /**
     * Checks whether this {@link StringStats} contains the given {@link StringStats} value.
     *
     * @param value The value to check for containment.
     * @param caseSensitive Whether to match case-sensitive.
     *
     * @return Returns {@code true} when the value is contained in this {@link StringStats} instance.
     */
    public final boolean contains(final StringStats value, final boolean caseSensitive) {
        return indexOf(value, caseSensitive).getLeft() > -1;
    }

    /**
     * Tries to find value in the local (this) instance. Returns a pair consisting of start and end index.
     * <p>
     * Depending on the value one of four strategies is applied:
     * <ul>
     *   <li>the original string starts with {@link #PREFIX_REGEXP}: the remainder is applied as regular
     *       expression on the normalized string (case sensitivity is not considered),</li>
     *   <li>the original string contains <code>*{</code>: expression matching via
     *       {@link #matchRegexp(StringStats)},</li>
     *   <li>the normalized string contains {@code *}: the parts between the wildcards must occur in
     *       sequence,</li>
     *   <li>otherwise: plain substring search.</li>
     * </ul>
     * Results are cached per instance.
     *
     * @param value The value to compute the index of.
     * @param caseSensitive Whether the check is case-sensitive or not.
     *
     * @return A pair of start and end index; {@link #PAIR_NO_MATCH} when the value was not found.
     */
    public final SimpleIntPair indexOf(final StringStats value, final boolean caseSensitive) {
        // OPTIMIZATION-TARGET: this method is central for license matching
        final String searchString = value.getNormalizedString(!caseSensitive);
        final String originalSearchString = value.getOriginalString();

        // compute cache key and access cache
        final String cacheKey = caseSensitive ? searchString : searchString + "-false";
        final SimpleIntPair cachedIndex = indexOfCache.get(cacheKey);
        if (cachedIndex != null) {
            return cachedIndex;
        }

        // support regular expressions for special situations; regular expressions are identified by prefixed '^'
        if (originalSearchString.startsWith(PREFIX_REGEXP)) {
            // NOTE: case sensitivity parameter is not considered
            final java.util.regex.Matcher matcher =
                    Pattern.compile(originalSearchString.substring(1)).matcher(normalizedString);
            final SimpleIntPair result = matcher.find() ?
                    SimpleIntPair.of(matcher.start(), matcher.end()) : PAIR_NO_MATCH;
            return cacheAndReturn(cacheKey, result);
        }

        // detect expression in search-string
        if (originalSearchString.contains("*{")) {
            final List<int[]> matches = matchRegexp(value);
            if (matches == null || matches.isEmpty()) {
                return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
            }
            final int[] firstMatch = matches.get(0);
            // unsupported expressions are reported as a single-element marker; there is no start/end to return
            if (firstMatch.length < 2 || firstMatch[0] < 0) {
                return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
            }
            return cacheAndReturn(cacheKey, SimpleIntPair.of(firstMatch[0], firstMatch[1]));
        }

        final String baseString = getNormalizedString(!caseSensitive);

        if (searchString.contains("*")) {
            return cacheAndReturn(cacheKey, indexOfWildcardSequence(baseString, searchString));
        }

        // optimized code no wildcard support
        final int index = baseString.indexOf(searchString);
        return cacheAndReturn(cacheKey, index == -1 ?
                PAIR_NO_MATCH : SimpleIntPair.of(index, index + searchString.length()));
    }

    /**
     * Locates the parts of {@code searchString} separated by {@code *} one after the other in
     * {@code baseString}.
     *
     * @return Start of the first part and end of the last part; {@link #PAIR_NO_MATCH} when a part is missing
     *         or the search string has no parts at all.
     */
    private static SimpleIntPair indexOfWildcardSequence(String baseString, String searchString) {
        // OPTIMIZATION: 6%
        final String[] searchStringElements = PATTERN_SIMPLE_WILDCARD.split(searchString);

        int minIndex = Integer.MAX_VALUE;
        int maxIndex = 0;
        int startIndex = 0;
        for (final String searchStringElement : searchStringElements) {
            // OPTIMIZATION: 93%
            final int index = baseString.indexOf(searchStringElement, startIndex);

            // fast exit (as soon as one searchStringElement is not found)
            if (index == -1) {
                return PAIR_NO_MATCH;
            }

            // continue the search behind the element just found
            maxIndex = index + searchStringElement.length();
            startIndex = maxIndex;

            // memorize first index; this is what we return
            if (minIndex == Integer.MAX_VALUE) {
                minIndex = index;
            }
        }

        return minIndex == Integer.MAX_VALUE ? PAIR_NO_MATCH : SimpleIntPair.of(minIndex, maxIndex);
    }

    private SimpleIntPair cacheAndReturn(final String cacheKey, final SimpleIntPair indexPair) {
        indexOfCache.put(cacheKey, indexPair);
        return indexPair;
    }

    @Override
    public String toString() {
        return normalizedString;
    }

    /**
     * Collects the start indexes of the original string of {@code matchStats} in the normalized string of this
     * instance.
     *
     * @see #matchIndexes(StringStats, String)
     */
    public int[] allMatchesOriginalString(StringStats matchStats) {
        return matchIndexes(matchStats, matchStats.getOriginalString());
    }

    /**
     * Collects the start indexes of the normalized string of {@code matchStats} in the normalized string of
     * this instance.
     *
     * @see #matchIndexes(StringStats, String)
     */
    public int[] allMatches(StringStats matchStats) {
        return matchIndexes(matchStats, matchStats.getNormalizedString());
    }

    /**
     * Collects the start indexes of all non-overlapping, case-sensitive occurrences of {@code matchString} in
     * the normalized string of this instance.
     * <p>
     * When {@code matchString} contains a normalized expression (<code>* {</code>), the start indexes reported
     * by {@link #matchRegexp(StringStats)} for {@code matchStats} are returned instead.
     *
     * @param matchStats The match; only consulted for expression matching and for the error message.
     * @param matchString The string to search for.
     *
     * @return The start indexes in ascending order; an empty array when {@code matchString} is empty or
     *         nothing was found.
     *
     * @throws IllegalStateException When {@code matchString} contains a wildcard that is not part of an
     *                               expression that can be evaluated.
     */
    public int[] matchIndexes(StringStats matchStats, String matchString) {
        if (StringUtils.isEmpty(matchString)) {
            return EMPTY_INT_ARRAY;
        }

        // quick-check whether any wildcard is included
        if (matchString.contains("*")) {
            // NOTE: *{ is normalized to * {
            // matchRegexp reports null when the original string carries no expression at all
            final List<int[]> ranges = matchString.contains("* {") ? matchRegexp(matchStats) : null;
            if (ranges == null) {
                // contains wildcard, but no expression
                throw new IllegalStateException(
                        "Wildcard not supported in match string: " + matchStats.getOriginalString());
            }

            // project to array of start indexes
            final int[] matches = new int[ranges.size()];
            for (int i = 0; i < matches.length; i++) {
                matches[i] = ranges.get(i)[0];
            }
            return matches;
        }

        // subsequently no wildcards need to be treated; collect non-overlapping matches from left to right
        final List<Integer> indexes = new ArrayList<>();
        final int length = matchString.length();
        int fromIndex = 0;
        int index;
        while ((index = normalizedString.indexOf(matchString, fromIndex)) != -1) {
            indexes.add(index);
            fromIndex = index + length;
        }

        if (indexes.isEmpty()) {
            return EMPTY_INT_ARRAY;
        }
        return indexes.stream().mapToInt(Integer::intValue).toArray();
    }

    /**
     * Matches a search string with embedded expressions (<code>text*{regexp}*text</code>) against the
     * normalized string of this instance.
     *
     * @param matchStats The match whose original string carries the expressions.
     *
     * @return <ul>
     *           <li>a list of {@code {start, end}} arrays, one per match found,</li>
     *           <li>an empty list when a text element does not occur or the expression delimiters are not
     *               balanced,</li>
     *           <li>a single-element list holding {@code {-1}} when the original string starts with
     *               <code>*{</code> or ends with <code>}*</code> (not supported),</li>
     *           <li>{@code null} when the original string does not contain <code>*{</code>.</li>
     *         </ul>
     */
    public List<int[]> matchRegexp(StringStats matchStats) {
        final String originalSearchString = matchStats.getOriginalString();

        // FIXME: regex's at the beginning/ending of a string should work!
        if (FAILURE_PATTERN.matcher(originalSearchString).matches()) {
            // hand out a copy; the marker template must not be modifiable by callers
            return Collections.singletonList(FAILURE_INT.clone());
        }

        if (!originalSearchString.contains("*{")) {
            return null;
        }

        // search string split into alternating text (even positions) and regular expression (odd positions)
        final String[] searchStringElements = PATTERN_WILDCARD_ORIGINAL.split(originalSearchString);

        // a well-formed sequence is text (regexp text)+; anything else cannot be evaluated as triplets
        if (searchStringElements.length < 3 || searchStringElements.length % 2 == 0) {
            return new ArrayList<>();
        }

        // for every text element all matches in the normalized string are collected (start indexes)
        final List<int[]> results = new ArrayList<>();
        for (int i = 0; i < searchStringElements.length; i += 2) {
            final int[] matches = allMatches(StringStats.normalize(searchStringElements[i], true, true));

            // FIXME: review early exit
            // once an element is observed to not match; the whole sequence cannot be matched;
            // we return with empty result.
            if (matches.length == 0) {
                return new ArrayList<>();
            }
            results.add(matches);
        }

        // combine the matches of the text elements; each checkTriplet pass joins two neighbouring text elements
        // considering the regular expression between them
        final List<int[]> matches = checkTriplet(searchStringElements, results, 0, new ArrayList<>());

        // a tuple is created when the first triplet matches and receives its end only when the last triplet
        // matches; tuples that were never completed still carry the initial end of 0 and are dropped
        matches.removeIf(ints -> ints[1] == 0);

        return matches;
    }

    /**
     * Evaluates the triplet (text, regexp, text) starting at {@code currentElement} and recurses into the
     * following triplet until the last one is reached.
     *
     * @param searchStringElements Alternating text and regexp elements; text elements at even positions.
     * @param indexes Start indexes of the text elements; one array per text element in order of appearance.
     * @param currentElement Position of the left text element of the triplet in {@code searchStringElements}.
     * @param tuples Accumulator for {@code {start, end}} results; modified in place.
     *
     * @return The {@code tuples} list passed in.
     */
    private List<int[]> checkTriplet(String[] searchStringElements, List<int[]> indexes, int currentElement,
                                     List<int[]> tuples) {
        final int leftLength =
                normalize(searchStringElements[currentElement], true, true).getNormalizedString().length();
        final int rightLength =
                normalize(searchStringElements[currentElement + 2], true, true).getNormalizedString().length();
        final String regexp = searchStringElements[currentElement + 1];

        // only text elements (even positions) have an entry in indexes; hence the position is halved
        final int textElementIndex = currentElement / 2;
        final int[] indexLeft = indexes.get(textElementIndex);
        final int[] indexRight = indexes.get(textElementIndex + 1);

        final boolean isFirstTriplet = currentElement == 0;
        final boolean isLastTriplet = currentElement == searchStringElements.length - 3;

        // compiled on first use only; an invalid expression is reported no earlier than before
        Pattern compiledRegexp = null;

        // FIXME-KKL: needs review; looking for optimization potentials
        // NOTE(review): the recursion does not pass on the index matched for the right element; the next
        //  triplet iterates all occurrences of its left element again - confirm this is intended
        for (final int indexCurrent : indexLeft) {
            final int gapStart = indexCurrent + leftLength;

            for (final int indexNext : indexRight) {
                // the right element must start behind the start of the left element
                if (indexCurrent >= indexNext) {
                    continue;
                }
                // the right element must not overlap the left element and must start within range
                if (gapStart > indexNext || gapStart + MAX_WILDCARD_GAP < indexNext) {
                    continue;
                }

                // the text between both elements is applied to the regexp; trimmed, since we are agnostic to
                // whitespaces; the regexp may match completely optionally
                final String toBeMatched = normalizedString.substring(gapStart, indexNext).trim();
                if (compiledRegexp == null) {
                    compiledRegexp = Pattern.compile(regexp);
                }
                if (!compiledRegexp.matcher(toBeMatched).matches()) {
                    continue;
                }

                if (isFirstTriplet) {
                    // start is known; the end (initially 0) is set once the last triplet matches
                    tuples.add(new int[]{indexCurrent, 0});
                }

                if (isLastTriplet) {
                    tuples.get(tuples.size() - 1)[1] = indexNext + rightLength;
                } else {
                    checkTriplet(searchStringElements, indexes, currentElement + 2, tuples);
                }
            }
        }

        return tuples;
    }

    /**
     * Replaces the normalized string of this instance and discards all cached {@code indexOf} results, since
     * they refer to the previous content.
     *
     * @param normalizedString The new normalized string; must not be {@code null}.
     *
     * @throws IllegalStateException When this instance was created as match or sub-match.
     */
    public void update(String normalizedString) {
        if (isImmutable) {
            throw new IllegalStateException("Cannot modify immutable StringStats.");
        }

        // derive the lower-case form first; a null argument fails before any field is modified
        final String lowerCase = normalizedString.toLowerCase(Locale.ROOT);

        this.normalizedString = normalizedString;
        this.normalizedStringLowerCase = lowerCase;
        this.indexOfCache.clear();
    }

    /**
     * @param lowercase Whether to return the lower-case variant.
     *
     * @return The normalized string; in lower-case when requested.
     */
    public final String getNormalizedString(boolean lowercase) {
        return lowercase ? normalizedStringLowerCase : normalizedString;
    }

    /**
     * Shortcut for {@link #normalize(String, boolean, boolean)} with {@code isSubmatch} set to {@code false}.
     *
     * @param originalString The string to normalize.
     * @param isMatch Whether the string is a match string.
     *
     * @return The normalized representation of the string.
     */
    public static StringStats normalize(String originalString, boolean isMatch) {
        return normalize(originalString, isMatch, false);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy