All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.undercouch.citeproc.helper.Levenshtein Maven / Gradle / Ivy

package de.undercouch.citeproc.helper;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;

/**
 * Uses {@link LevenshteinDistance#apply(CharSequence, CharSequence)}
 * to calculate the edit distance between two strings. Provides useful helper
 * methods to traverse a set of strings and select the most similar ones
 * to a given input string.
 * @author Michel Kraemer
 */
public class Levenshtein {
    private static class Item implements Comparable> {
        private final T str;
        private final int distance;

        public Item(T str, int distance) {
            this.str = str;
            this.distance = distance;
        }

        @Override
        public int compareTo(Item o) {
            return Integer.compare(distance, o.distance);
        }
    }

    /**
     * Searches the given collection of strings and returns the string that
     * has the lowest Levenshtein distance to a given second string t.
     * If the collection contains multiple strings with the same distance to
     * t only the first one will be returned.
     * @param  the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the second string
     * @return the string with the lowest Levenshtein distance
     */
    public static  T findMinimum(Collection ss,
            CharSequence t) {
        int min = Integer.MAX_VALUE;
        T result = null;
        for (T s : ss) {
            int d = LevenshteinDistance.getDefaultInstance().apply(s, t);
            if (d < min) {
                min = d;
                result = s;
            }
        }
        return result;
    }

    /**
     * Searches the given collection of strings and returns a collection of at
     * most n strings that have the lowest Levenshtein distance
     * to a given string t. The returned collection will be
     * sorted according to the distance with the string with the lowest
     * distance at the first position.
     * @param  the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @param n the maximum number of strings to return
     * @param threshold a threshold for individual item distances. Only items
     * with a distance below this threshold will be included in the result.
     * @return the strings with the lowest Levenshtein distance
     */
    public static  Collection findMinimum(
            Collection ss, CharSequence t, int n, int threshold) {
        LinkedList> result = new LinkedList<>();
        for (T s : ss) {
            int d = LevenshteinDistance.getDefaultInstance().apply(s, t);
            if (d < threshold) {
                result.offer(new Item<>(s, d));

                if (result.size() > n + 10) {
                    // resort, but not too often
                    Collections.sort(result);
                    while (result.size() > n) result.removeLast();
                }
            }
        }

        Collections.sort(result);
        while (result.size() > n) result.removeLast();

        List arr = new ArrayList<>(n);
        for (Item i : result) {
            arr.add(i.str);
        }
        return arr;
    }

    /**
     * Searches the given collection of strings and returns a collection of
     * strings similar to a given string t. Uses reasonable default
     * values for human-readable strings. The returned collection will be
     * sorted according to their similarity with the string with the best
     * match at the first position.
     * @param  the type of the strings in the given collection
     * @param ss the collection to search
     * @param t the string to compare to
     * @return a collection with similar strings
     */
    public static  Collection findSimilar(
            Collection ss, CharSequence t) {
        // look for strings prefixed by 't'
        Collection result = new LinkedHashSet<>();
        for (T s : ss) {
            if (StringUtils.startsWithIgnoreCase(s, t)) {
                result.add(s);
            }
        }

        // find strings according to their levenshtein distance
        Collection mins = findMinimum(ss, t, 5, Math.min(t.length() - 1, 7));
        result.addAll(mins);

        return result;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy