All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.openjpa.lib.util.StringDistance Maven / Gradle / Ivy

There is a newer version: 4.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.    
 */
package org.apache.openjpa.lib.util;

import java.util.Arrays;
import java.util.Collection;

/**
 * Utilities for calculating string distance.
 *
 * @author Marc Prud'hommeaux
 */
public class StringDistance {

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string.
     *
     * @see #getClosestLevenshteinDistance(String,Collection,int)
     */
    public static String getClosestLevenshteinDistance(String str,
        String[] candidates) {
        if (candidates == null)
            return null;
        return getClosestLevenshteinDistance(str, Arrays.asList(candidates));
    }

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string.
     *
     * @see #getClosestLevenshteinDistance(String,Collection,int)
     */
    public static String getClosestLevenshteinDistance(String str,
        Collection candidates) {
        return getClosestLevenshteinDistance(str, candidates,
            Integer.MAX_VALUE);
    }

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string.
     *
     * @see #getClosestLevenshteinDistance(String,Collection,int)
     */
    public static String getClosestLevenshteinDistance(String str,
        String[] candidates, int threshold) {
        if (candidates == null)
            return null;
        return getClosestLevenshteinDistance(str, Arrays.asList(candidates),
            threshold);
    }

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string and using the threshold as the specified
     * percentage of the length of the candidate string(0.0f-1.0f).
     *
     * @see #getClosestLevenshteinDistance(String,Collection,int)
     */
    public static String getClosestLevenshteinDistance(String str,
        String[] candidates, float thresholdPercentage) {
        if (candidates == null)
            return null;

        return getClosestLevenshteinDistance(str, Arrays.asList(candidates),
            thresholdPercentage);
    }

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string and using the threshold as the specified
     * percentage of the length of the candidate string(0.0f-1.0f).
     *
     * @see #getClosestLevenshteinDistance(String,Collection,int)
     */
    public static String getClosestLevenshteinDistance(String str,
        Collection candidates, float thresholdPercentage) {
        if (str == null)
            return null;

        thresholdPercentage = Math.min(thresholdPercentage, 1.0f);
        thresholdPercentage = Math.max(thresholdPercentage, 0.0f);

        return getClosestLevenshteinDistance(str, candidates,
            (int) (str.length() * thresholdPercentage));
    }

    /**
     * Returns the candidate string with the closest Levenshtein distance
     * to the given string.
     *
     * @param str the string to check
     * @param candidates the list of strings to test against
     * @param threshold the threshold distance a candidate must meet
     * @see #getLevenshteinDistance
     */
    public static String getClosestLevenshteinDistance(String str,
        Collection candidates, int threshold) {
        if (candidates == null || candidates.isEmpty())
            return null;

        String minString = null;
        int minValue = Integer.MAX_VALUE;

        for(String candidate : candidates) { 
            int distance = getLevenshteinDistance(str, candidate);
            if (distance < minValue) {
                minValue = distance;
                minString = candidate;
            }
        }

        // return the lowest close string only if we surpass the threshhold
        if (minValue <= threshold)
            return minString;
        else
            return null;
    }

    /**
     * Returns the Levenshtein distance between the two strings.
     * The distance is the minimum number of changes that need to be
     * applied to the first string in order to get to the second
     * string. For details of the algorithm, see
     * 
     * http://en.wikipedia.org/wiki/Levenshtein_distance.
     */
    public static int getLevenshteinDistance(String s, String t) {
        int n = s.length();
        int m = t.length();

        if (n == 0)
            return m;

        if (m == 0)
            return n;

        int[][] matrix = new int[n + 1][m + 1];

        for (int i = 0; i <= n; i++)
            matrix[i][0] = i;

        for (int j = 0; j <= m; j++)
            matrix[0][j] = j;

        for (int i = 1; i <= n; i++) {
            int si = s.charAt(i - 1);

            for (int j = 1; j <= m; j++) {
                int tj = t.charAt(j - 1);

                int cost;

                if (si == tj)
                    cost = 0;
                else
                    cost = 1;

                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1,
                    matrix[i - 1][j - 1] + cost);
            }
        }

        return matrix[n][m];
    }

    private static int min(int a, int b, int c) {
        int mi = a;

        if (b < mi)
            mi = b;

        if (c < mi)
            mi = c;

        return mi;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy