All Downloads are FREE. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.util.EditDistance Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.util; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Arrays;
import java.util.Objects;

/** Find the (Levenshtein) edit distance between two Strings or Character
 *  arrays.
 * By default it allows transposition. 
 *  
* This is an object so that you can save on the cost of allocating / * deallocating the large array when possible * @author Dan Klein * @author John Bauer - rewrote using DP instead of memorization */ public class EditDistance { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(EditDistance.class); final boolean allowTranspose; protected double[][] score = null; public EditDistance() { allowTranspose = true; } public EditDistance(boolean allowTranspose) { this.allowTranspose = allowTranspose; } protected void clear(int sourceLength, int targetLength) { if (score == null || score.length < sourceLength + 1 || score[0].length < targetLength + 1) { score = new double[sourceLength + 1][targetLength + 1]; } for (double[] aScore : score) { Arrays.fill(aScore, worst()); } } // CONSTRAINT SEMIRING START protected double best() { return 0.0; } protected double worst() { return Double.POSITIVE_INFINITY; } protected double unit() { return 1.0; } protected double better(double x, double y) { if (x < y) { return x; } return y; } protected double combine(double x, double y) { return x + y; } // CONSTRAINT SEMIRING END // COST FUNCTION BEGIN protected double insertCost(Object o) { return unit(); } protected double deleteCost(Object o) { return unit(); } protected double substituteCost(Object source, Object target) { if (source.equals(target)) { return best(); } return unit(); } double transposeCost(Object s1, Object s2, Object t1, Object t2) { if (s1.equals(t2) && s2.equals(t1)) { if (allowTranspose) { return unit(); } else { return 2*unit(); } } return worst(); } // COST FUNCTION END double score(Object[] source, int sPos, Object[] target, int tPos) { for (int i = 0; i <= sPos; ++i) { for (int j = 0; j <= tPos; ++j) { double bscore = score[i][j]; if (bscore != worst()) continue; if (i == 0 && j == 0) { bscore = best(); } else { if (i > 0) { bscore = better(bscore, (combine(score[i - 1][j], deleteCost(source[i - 1])))); } if (j > 0) { 
bscore = better(bscore, (combine(score[i][j - 1], insertCost(target[j - 1])))); } if (i > 0 && j > 0) { bscore = better(bscore, (combine(score[i - 1][j - 1], substituteCost(source[i - 1], target[j - 1])))); } if (i > 1 && j > 1) { bscore = better(bscore, (combine(score[i - 2][j - 2], transposeCost(source[i - 2], source[i - 1], target[j - 2], target[j - 1])))); } } score[i][j] = bscore; } } return score[sPos][tPos]; } public double score(Object[] source, Object[] target) { clear(source.length, target.length); return score(source, source.length, target, target.length); } public double score(String sourceStr, String targetStr) { if(sourceStr.equals(targetStr)) return 0; Object[] source = Characters.asCharacterArray(sourceStr); Object[] target = Characters.asCharacterArray(targetStr); clear(source.length, target.length); return score(source, source.length, target, target.length); } public static void main(String[] args) { if (args.length >= 2) { EditDistance d = new EditDistance(); System.out.println(d.score(args[0], args[1])); } else { log.info("usage: java EditDistance str1 str2"); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy