All downloads are free. Search and download functionality uses the official Maven repository.

org.simmetrics.metrics.Levenshtein Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * #%L
 * Simmetrics Core
 * %%
 * Copyright (C) 2014 - 2016 Simmetrics Authors
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

package org.simmetrics.metrics;

import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.Math.max;
import static org.simmetrics.metrics.Math.min;

import org.simmetrics.StringDistance;
import org.simmetrics.StringMetric;

/**
 * Calculates the Levenshtein distance and similarity over two strings.
 * 

* Insert/delete and substitute operations can be weighted. When the cost for * substitution is zero Levenshtein does not satisfy the coincidence property. *

* This class is immutable and thread-safe. * * @see Wikipedia - * Levenshtein distance * @see DamerauLevenshtein * */ public final class Levenshtein implements StringMetric, StringDistance { private final float maxCost; private final float insertDelete; private final float substitute; /** * Constructs a new weighted Levenshtein metric. When the cost for * substitution is zero Levenshtein does not satisfy the coincidence * property. * * @param insertDelete * positive non-zero cost of an insert or deletion operation * @param substitute * positive cost of a substitute operation */ public Levenshtein(float insertDelete, float substitute) { checkArgument(insertDelete > 0); checkArgument(substitute >= 0); this.maxCost = max(insertDelete, substitute); this.insertDelete = insertDelete; this.substitute = substitute; } /** * Constructs a new Levenshtein metric. */ public Levenshtein() { this(1.0f, 1.0f); } @Override public float compare(final String a, final String b) { if (a.isEmpty() && b.isEmpty()) { return 1.0f; } return 1.0f - (distance(a, b) / (maxCost * max(a.length(), b.length()))); } @Override public float distance(final String s, final String t) { if (s.isEmpty()) return t.length(); if (t.isEmpty()) return s.length(); if (s.equals(t)) return 0; final int tLength = t.length(); final int sLength = s.length(); float[] swap; float[] v0 = new float[tLength + 1]; float[] v1 = new float[tLength + 1]; // initialize v0 (the previous row of distances) // this row is A[0][i]: edit distance for an empty s // the distance is just the number of characters to delete from t for (int i = 0; i < v0.length; i++) { v0[i] = i * insertDelete; } for (int i = 0; i < sLength; i++) { // first element of v1 is A[i+1][0] // edit distance is delete (i+1) chars from s to match empty t v1[0] = (i + 1) * insertDelete; for (int j = 0; j < tLength; j++) { v1[j + 1] = min(v1[j] + insertDelete, v0[j + 1] + insertDelete, v0[j] + (s.charAt(i) == t.charAt(j) ? 
0.0f : substitute)); } swap = v0; v0 = v1; v1 = swap; } // latest results was in v1 which was swapped with v0 return v0[tLength]; } @Override public String toString() { return "Levenshtein [insertDelete=" + insertDelete + ", substitute=" + substitute + "]"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy