All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.metaeffekt.artifact.analysis.utils.StringStats Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2021-2024 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metaeffekt.artifact.analysis.utils;

import lombok.Getter;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Builds internal structures for matching strings in a normalized fashion.
 * <p>
 * An instance keeps the trimmed original string, a normalized variant of it (separators enclosed by spaces,
 * quotes removed, whitespace collapsed, ...) and a lower-case copy of the normalized variant. Instances created
 * for match or submatch strings are immutable and are cached by their original string; all other instances can
 * be modified using {@link #update(String)}.
 * <p>
 * NOTE(review): the per-instance index cache is a plain {@link HashMap}; instances are presumably not meant to
 * be searched from several threads concurrently - confirm against callers.
 */
public class StringStats {

    private static final Pattern PATTERN_WHITESPACES = Pattern.compile("[\\r\\n\\s]+");
    private static final Pattern PATTERN_TEMPLATE_DELIMITER = Pattern.compile("\\.\\\\\" ");
    private static final Pattern PATTERN_DNL_SEPARATOR = Pattern.compile(" dnl ");
    private static final Pattern PATTERN_HYPHEN_LOWERCASE = Pattern.compile("([a-z]{2}) - ([a-z]{2})");
    private static final Pattern PATTERN_HYPHEN_UPPERCASE = Pattern.compile("([A-Z]{2}) - ([A-Z]{2})");
    private static final Pattern PATTERN_COMMA = Pattern.compile(" , ");
    private static final Pattern PATTERN_SIMPLE_WILDCARD = Pattern.compile("\\*");
    private static final Pattern PATTERN_WILDCARD_ORIGINAL = Pattern.compile("\\*\\{|}\\*");
    private static final Pattern FAILURE_PATTERN = Pattern.compile("(\\*\\{.*)|(.*}\\*)");

    /** Quote characters; every one of them is removed during normalization. */
    public static final String QUOTES = "\"\u0027\u0060\u00B4\u2018\u2019\u201C\u201D";

    /** Separator characters; every one of them is enclosed by spaces during normalization. */
    public static final String SEPARATORS = "():;,.\\!?=+-_[]{}<>";

    /**
     * Comment characters replaced by a space when normalizing a match string. The string is processed character
     * by character (it is not used as a regular expression); the asterisk is not part of it.
     */
    public static final String COMMENT_NO_WILDCARD = "\\/|#";

    /**
     * Comment characters replaced by a space when normalizing a non-match string. The string is processed
     * character by character (it is not used as a regular expression); the asterisk is part of it.
     */
    public static final String COMMENT_WITH_WILDCARD = "\\*/|#";

    /** Placeholder used to locate the replaced region when evaluating regular expression searches. */
    public static final String MARKER = "##MARKER##";

    /** Prefix identifying an original search string as a regular expression. */
    public static final String PREFIX_REGEXP = "^";

    /** Result returned by {@link #indexOf(StringStats, boolean)} when nothing was found. */
    public static final SimpleIntPair PAIR_NO_MATCH = SimpleIntPair.of(-1, -1);

    private static final int[] EMPTY_INT_ARRAY = new int[0];

    /** Single-element marker returned by {@link #matchRegexp(StringStats)} for unsupported patterns. */
    private static final int[] FAILURE_INT = {-1};

    @Getter
    private final String originalString;

    private final boolean isImmutable;

    @Getter
    private String normalizedString;

    @Getter
    private String normalizedStringLowerCase;

    /** Caches the results of {@link #indexOf(StringStats, boolean)}; cleared whenever the content is updated. */
    private transient final Map<String, SimpleIntPair> indexOfCache = new HashMap<>();

    /** Instances created with {@code isMatch == true} (and {@code isSubmatch == false}), keyed by original string. */
    private static final Map<String, StringStats> STRINGSTAT_PERM_CACHE =
            Collections.synchronizedMap(new HashMap<>());

    /** Instances created with {@code isSubmatch == true}, keyed by original string; backed by a weak map. */
    private static final Map<String, StringStats> STRINGSTAT_TEMP_CACHE =
            Collections.synchronizedMap(new WeakHashMap<>());

    private StringStats(String originalString, boolean isMatch, boolean isSubmatch) {
        this.originalString = originalString.trim();
        this.isImmutable = isMatch || isSubmatch;
        this.normalizedString = normalizeInternal(this.originalString, isMatch, isSubmatch);
        // Locale.ROOT keeps the lower-case form independent of the JVM default locale
        this.normalizedStringLowerCase = this.normalizedString.toLowerCase(Locale.ROOT);
    }

    /**
     * Creates (or fetches from a cache) the {@link StringStats} for the given string.
     * <p>
     * Submatch instances are kept in the temporary cache, match instances in the permanent cache; any other
     * instance is created anew on every call and is not cached.
     *
     * @param originalString The string to normalize; must not be {@code null}.
     * @param isMatch        Whether the string is a match string (wildcards are preserved).
     * @param isSubmatch     Whether the string is a fragment of a match string (not enclosed by spaces).
     *
     * @return The {@link StringStats} instance representing the string.
     */
    public static StringStats normalize(String originalString, boolean isMatch, boolean isSubmatch) {
        final Map<String, StringStats> cache;
        if (isSubmatch) {
            cache = STRINGSTAT_TEMP_CACHE;
        } else if (isMatch) {
            cache = STRINGSTAT_PERM_CACHE;
        } else {
            cache = null;
        }

        if (cache == null) {
            return new StringStats(originalString, isMatch, isSubmatch);
        }

        final StringStats cached = cache.get(originalString);
        if (cached != null) {
            return cached;
        }
        final StringStats newInstance = new StringStats(originalString, isMatch, isSubmatch);
        cache.put(originalString, newInstance);
        return newInstance;
    }

    /**
     * Computes the normalized representation of the given string.
     *
     * @param originalString The string to normalize.
     * @param isMatch        When {@code true} the asterisk is not treated as comment character.
     * @param isSubmatch     When {@code true} the result is not enclosed by spaces.
     *
     * @return The normalized string.
     */
    private String normalizeInternal(String originalString, boolean isMatch, boolean isSubmatch) {
        String normalized = originalString.trim();

        // template files may use this as delimiter
        normalized = PATTERN_TEMPLATE_DELIMITER.matcher(normalized).replaceAll("");

        // enclose separators with spaces
        for (int i = 0; i < SEPARATORS.length(); i++) {
            final char character = SEPARATORS.charAt(i);
            normalized = normalized.replace(String.valueOf(character), " " + character + " ");
        }

        // normalize enumerators; replace with dash '-'
        final String enumerators = "\u2022°";
        for (int i = 0; i < enumerators.length(); i++) {
            normalized = normalized.replace(enumerators.substring(i, i + 1), "-");
        }

        // eliminate quotes
        for (int i = 0; i < QUOTES.length(); i++) {
            normalized = normalized.replace(QUOTES.substring(i, i + 1), "");
        }

        // for contains we do not replace the wildcards
        final String comments = isMatch ? COMMENT_NO_WILDCARD : COMMENT_WITH_WILDCARD;
        for (int i = 0; i < comments.length(); i++) {
            normalized = normalized.replace("" + comments.charAt(i), " ");
        }

        if (!isSubmatch) {
            // enclose whole string with spaces (no boundaries)
            normalized = " " + normalized + " ";
        }

        // FIXME: move to mappings; should not be hard-coded here
        // dnl is a separator used in .m4 autoconf files
        normalized = PATTERN_DNL_SEPARATOR.matcher(normalized).replaceAll(" ");

        // remove multiple whitespaces
        normalized = PATTERN_WHITESPACES.matcher(normalized).replaceAll(" ");

        // some specialists introduce hyphens; we remove them again
        // NOTE: this is a rather conservative approach; the split word must have the same case
        //   Other such cases must be covered by explicit mappings
        // FIXME: this should be rather performed on the content with markers; try to move to mappings
        normalized = PATTERN_HYPHEN_LOWERCASE.matcher(normalized).replaceAll("$1$2");
        normalized = PATTERN_HYPHEN_UPPERCASE.matcher(normalized).replaceAll("$1$2");

        // FIXME-KKL: is this a good idea? Text meaning can significantly change without comma. Needs revision.
        // remove ','; replace with ' '
        normalized = PATTERN_COMMA.matcher(normalized).replaceAll(" ");

        // remove multiple whitespaces; second pass (required at least due to comma replacement)
        normalized = PATTERN_WHITESPACES.matcher(normalized).replaceAll(" ");

        return normalized;
    }

    /**
     * Checks whether this {@link StringStats} contains the given {@link StringStats} value.
     *
     * @param value         The value to check for containment.
     * @param caseSensitive Whether to match case-sensitive.
     *
     * @return Returns {@code true} when the value is contained in this {@link StringStats} instance.
     */
    public final boolean contains(final StringStats value, final boolean caseSensitive) {
        return indexOf(value, caseSensitive).getLeft() > -1;
    }

    /**
     * Tries to find value in the local (this) instance. Returns a pair consisting of start and end index.
     * <p>
     * Three kinds of search strings are supported:
     * <ul>
     *   <li>original strings prefixed with {@link #PREFIX_REGEXP} are evaluated as regular expression on the
     *       normalized string (the case sensitivity parameter is not considered),</li>
     *   <li>original strings containing <code>*{</code> are evaluated by {@link #matchRegexp(StringStats)},</li>
     *   <li>all other strings are searched literally; a plain {@code *} in the normalized search string separates
     *       fragments that must occur in the given order.</li>
     * </ul>
     * Results are cached per instance.
     *
     * @param value         The value to compute the index of.
     * @param caseSensitive Whether the check is case-sensitive or not.
     *
     * @return A pair of start and end index; {@link #PAIR_NO_MATCH} when the value was not found.
     */
    public final SimpleIntPair indexOf(final StringStats value, final boolean caseSensitive) {

        // OPTIMIZATION-TARGET: this method is central for license matching

        final String searchString = value.getNormalizedString(!caseSensitive);
        final String originalSearchString = value.getOriginalString();

        // compute cache key
        final String cacheKey = caseSensitive ? searchString : searchString + "-false";

        // access cache
        final SimpleIntPair cachedIndex = indexOfCache.get(cacheKey);
        if (cachedIndex != null) {
            return cachedIndex;
        }

        // support regular expressions for special situations; regular expressions are identified by prefixed '^'
        if (originalSearchString.startsWith(PREFIX_REGEXP)) {
            // NOTE: case sensitivity parameter is not considered
            final String replacedString = normalizedString.replaceFirst(originalSearchString.substring(1), MARKER);
            final int minIndex = replacedString.indexOf(MARKER);
            if (minIndex == -1) {
                // nothing was replaced; report a proper no-match instead of a pair with a bogus end index
                return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
            }
            // length of the region that was replaced by the marker
            final int replacedLength = normalizedString.length() - (replacedString.length() - MARKER.length());
            return cacheAndReturn(cacheKey, SimpleIntPair.of(minIndex, minIndex + replacedLength));
        }

        // detect regexp in search-string
        if (originalSearchString.contains("*{")) { //alternatively with .matches and regex ".*\\*\\{(.*?)\\}\\*.*"
            final List<int[]> matches = matchRegexp(value);
            if (matches == null || matches.isEmpty()) {
                return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
            }
            final int[] firstMatch = matches.get(0);
            if (firstMatch.length < 2) {
                // single-element failure marker (unsupported pattern); there is no end index to report
                return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
            }
            return cacheAndReturn(cacheKey, SimpleIntPair.of(firstMatch[0], firstMatch[1]));
        }

        final String baseString = getNormalizedString(!caseSensitive);

        if (searchString.contains("*")) {

            // OPTIMIZATION: 6%
            final String[] searchStringElements = PATTERN_SIMPLE_WILDCARD.split(searchString);

            int minIndex = Integer.MAX_VALUE;
            int maxIndex = 0;
            int startIndex = 0;
            for (final String searchStringElement : searchStringElements) {
                // OPTIMIZATION: 93%
                final int index = baseString.indexOf(searchStringElement, startIndex);
                // fast exit (as soon as one searchStringElement is not found)
                if (index == -1) {
                    return cacheAndReturn(cacheKey, PAIR_NO_MATCH);
                }

                // skip from found index + searchStringElement length characters
                maxIndex = index + searchStringElement.length();
                startIndex = maxIndex;

                // memorize first index; this is what we return
                if (minIndex == Integer.MAX_VALUE) {
                    minIndex = index;
                }
            }
            return cacheAndReturn(cacheKey, minIndex == Integer.MAX_VALUE ?
                    PAIR_NO_MATCH : SimpleIntPair.of(minIndex, maxIndex));

        } else {
            // optimized code no wildcard support
            final int index = baseString.indexOf(searchString);
            return cacheAndReturn(cacheKey, index == -1 ?
                    PAIR_NO_MATCH : SimpleIntPair.of(index, index + searchString.length()));
        }
    }

    /**
     * Stores the result in the index cache and returns it.
     *
     * @param cacheKey  The key to store the result for.
     * @param indexPair The result to store.
     *
     * @return The given result.
     */
    private SimpleIntPair cacheAndReturn(final String cacheKey, final SimpleIntPair indexPair) {
        indexOfCache.put(cacheKey, indexPair);
        return indexPair;
    }

    @Override
    public String toString() {
        return normalizedString;
    }

    /**
     * Collects the start indexes of all matches of the original string of the given match.
     *
     * @param matchStats The match to look for.
     *
     * @return The start indexes of all matches in the normalized string of this instance.
     */
    public int[] allMatchesOriginalString(StringStats matchStats) {
        return matchIndexes(matchStats, matchStats.getOriginalString());
    }

    /**
     * Collects the start indexes of all matches of the normalized string of the given match.
     *
     * @param matchStats The match to look for.
     *
     * @return The start indexes of all matches in the normalized string of this instance.
     */
    public int[] allMatches(StringStats matchStats) {
        return matchIndexes(matchStats, matchStats.getNormalizedString());
    }

    /**
     * Collects the start indexes of all non-overlapping occurrences of the match string in the normalized string
     * of this instance.
     *
     * @param matchStats  The match; evaluated by {@link #matchRegexp(StringStats)} in case the match string
     *                    contains a wildcard expression.
     * @param matchString The string to search for.
     *
     * @return The start indexes of all matches; an empty array in case the match string is empty or not found.
     *
     * @throws IllegalStateException in case the match string contains a wildcard that is not part of a
     *                               <code>*{...}*</code> expression.
     */
    public int[] matchIndexes(StringStats matchStats, String matchString) {
        if (StringUtils.isEmpty(matchString)) {
            return EMPTY_INT_ARRAY;
        }

        // quick-check whether any wildcard is included
        if (matchString.contains("*")) {
            // NOTE: *{ is normalized to * {
            if (matchString.contains("* {")) {
                final List<int[]> regexpMatches = matchRegexp(matchStats);

                // null indicates that the original string carries no *{ expression; this is treated as plain
                // wildcard below instead of failing with a NullPointerException
                if (regexpMatches != null) {
                    // project to array of start indexes
                    final int[] matches = new int[regexpMatches.size()];
                    for (int i = 0; i < matches.length; i++) {
                        matches[i] = regexpMatches.get(i)[0];
                    }
                    return matches;
                }
            }
            // contains wildcard, but no expression
            throw new IllegalStateException("Plain wildcard not supported in match string " + matchStats.getOriginalString() + ". Use *{...}* pattern instead.");
        }

        // subsequently no wildcards need to be treated

        final String string = this.normalizedString;
        final int length = matchString.length();

        // last possible index; for while condition; computed once
        final int maxIndex = string.length() - length;

        // collect matches; the search continues behind the previous match (matches do not overlap)
        final List<Integer> indexes = new ArrayList<>();
        int index;
        int fromIndex = 0;
        do {
            index = string.indexOf(matchString, fromIndex);
            if (index != -1) {
                indexes.add(index);
                fromIndex = index + length;
            }
        } while (index != -1 && fromIndex <= maxIndex);

        if (indexes.isEmpty()) {
            return EMPTY_INT_ARRAY;
        }

        // list to array (optimization option to use lists in signatures instead?)
        final int[] indexArray = new int[indexes.size()];
        for (int i = 0; i < indexArray.length; i++) {
            indexArray[i] = indexes.get(i);
        }
        return indexArray;
    }

    /**
     * Matches a search string consisting of text elements separated by <code>*{regexp}*</code> expressions
     * against the normalized string of this instance.
     *
     * @param matchStats The match carrying the search string (the original string is evaluated).
     *
     * @return A list of {@code [start, end]} index pairs, one per match; an empty list in case there is no match
     *         or the sequence of elements is malformed; a list with the single element {@code [-1]} in case the
     *         search string starts or ends with an expression (not supported); {@code null} in case the search
     *         string does not contain <code>*{</code>.
     */
    public List<int[]> matchRegexp(StringStats matchStats) {
        final String originalSearchString = matchStats.getOriginalString();

        // FIXME: regex's at the beginning/ending of a string should work!
        if (FAILURE_PATTERN.matcher(originalSearchString).matches()) {
            return Collections.singletonList(FAILURE_INT);
        }

        if (!originalSearchString.contains("*{")) {
            return null;
        }

        // array with search string split into text and regular expression elements -> searchStringElements;
        // even positions hold text elements, odd positions hold regular expressions
        // NOTE: a precompiled pattern does not show any timing improvement
        final String[] searchStringElements = PATTERN_WILDCARD_ORIGINAL.split(originalSearchString);

        // for every text element of the search string all matches in the text are collected
        // -> starting index of each match
        final List<int[]> results = new ArrayList<>();
        for (int i = 0; i < searchStringElements.length; i += 2) {
            final int[] matches = allMatches(StringStats.normalize(searchStringElements[i], true, true));

            // FIXME: review early exit
            // once an element is observed to not match; the whole sequence cannot be matched;
            // we return with empty result.
            if (matches.length == 0) {
                return new ArrayList<>();
            }
            results.add(matches);
        }

        // a valid sequence is text (regexp text)+; anything else cannot be combined into triplets and would
        // otherwise run past the end of the element array
        if (searchStringElements.length < 3 || searchStringElements.length % 2 == 0) {
            return new ArrayList<>();
        }

        // Matches of text elements are being combined -> one checkTriplet() pass combines two text elements with
        // each other considering the regular expression between them.
        final List<int[]> matches = checkTriplet(searchStringElements, results, 0, new ArrayList<>());

        // FIXME: what does this snippet do? Remove all matches, where the end is 0? Does this ever happen?
        // (the end index remains 0 for tuples whose sequence could not be completed up to the last element)
        matches.removeIf(tuple -> tuple[1] == 0);

        return matches;
    }

    /**
     * Combines the matches of two neighbouring text elements considering the regular expression between them and
     * recursively continues with the subsequent triplet.
     *
     * @param searchStringElements The alternating text / regular expression elements of the search string.
     * @param indexes              The match indexes per text element; entry {@code k} belongs to the text
     *                             element at position {@code 2 * k} of the search string elements.
     * @param currentElement       The position of the left text element of the triplet to process.
     * @param tuples               The {@code [start, end]} tuples collected so far; modified by this method.
     *
     * @return The given list of tuples.
     */
    private List<int[]> checkTriplet(String[] searchStringElements, List<int[]> indexes, int currentElement,
                                     List<int[]> tuples) {
        final String searchStringLeft = searchStringElements[currentElement];
        final String searchStringRight = searchStringElements[currentElement + 2];

        final String normalizedLeft = normalize(searchStringLeft, true, true).getNormalizedString();
        final String normalizedRight = normalize(searchStringRight, true, true).getNormalizedString();

        // text elements reside at even positions; indexes holds one entry per text element
        final int halfWaysIndex = currentElement / 2;
        final int[] indexLeft = indexes.get(halfWaysIndex);
        final int[] indexRight = indexes.get(1 + halfWaysIndex);

        final String regexp = searchStringElements[currentElement + 1];

        // compiled lazily; once per invocation instead of once per candidate pair
        Pattern regexpPattern = null;

        final String text = getNormalizedString();
        final boolean isLastTriplet = currentElement == searchStringElements.length - 3;

        // FIXME-KKL: needs review; looking for optimization potentials
        for (final int indexCurrent : indexLeft) {
            final int leftEnd = indexCurrent + normalizedLeft.length();

            for (final int indexNext : indexRight) {
                // the right element must start behind the left element
                // NOTE: otherwise we need to catch up; continue
                if (indexCurrent >= indexNext || leftEnd > indexNext) {
                    continue;
                }

                // the gap between both elements must not exceed 500 characters
                if (leftEnd + 500 < indexNext) {
                    continue;
                }

                // extract the string between both elements to apply the regular expression;
                // trim, since we are agnostic to whitespaces; the regexp may match completely optionally
                final String toBeMatched = text.substring(leftEnd, indexNext).trim();

                if (regexpPattern == null) {
                    regexpPattern = Pattern.compile(regexp);
                }
                if (!regexpPattern.matcher(toBeMatched).matches()) {
                    continue;
                }

                // the first triplet opens a new tuple carrying the start index
                if (currentElement == 0) {
                    tuples.add(new int[]{indexCurrent, 0});
                }

                if (isLastTriplet) {
                    // the last triplet completes the most recent tuple with the end index
                    tuples.get(tuples.size() - 1)[1] = indexNext + normalizedRight.length();
                } else {
                    checkTriplet(searchStringElements, indexes, currentElement + 2, tuples);
                }
            }
        }
        return tuples;
    }

    /**
     * Replaces the normalized string of this instance. Cached index lookups are discarded, since they refer to
     * the previous content.
     *
     * @param normalizedString The new normalized string.
     *
     * @throws IllegalStateException in case this instance is immutable (match or submatch instance).
     */
    public void update(String normalizedString) {
        if (isImmutable) {
            throw new IllegalStateException("Cannot modify immutable StringStats.");
        }
        this.normalizedString = normalizedString;
        this.normalizedStringLowerCase = normalizedString.toLowerCase(Locale.ROOT);

        // cached positions were computed on the previous content
        indexOfCache.clear();
    }

    /**
     * Provides the normalized string in the requested case.
     *
     * @param lowercase Whether to return the lower-case variant.
     *
     * @return The normalized string; converted to lower-case if requested.
     */
    public final String getNormalizedString(boolean lowercase) {
        return lowercase ? normalizedStringLowerCase : normalizedString;
    }

    /**
     * Creates (or fetches from the cache) the {@link StringStats} for the given string; the string is not
     * treated as submatch.
     *
     * @param originalString The string to normalize.
     * @param isMatch        Whether the string is a match string.
     *
     * @return The {@link StringStats} instance representing the string.
     */
    public static StringStats normalize(String originalString, boolean isMatch) {
        return normalize(originalString, isMatch, false);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy