All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.scie.pdf.StringSimilarity Maven / Gradle / Ivy

/*
 * SCIE -- Spinal Cord Injury Information Extraction
 * Copyright (C) 2013, 2014
 * Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.citec.scie.pdf;

/**
 * Computes a similarity score between two Strings using an alignment/edit
 * distance approach. In this special variant of the edit distance, replacing
 * one digit with another digit is free, whereas every other mismatch as well
 * as every insertion or deletion costs 1.
 *
 * The edit distance is then transformed into a similarity by taking
 * 1 - distance / max{|a|,|b|}, i.e. 1 minus the number of costly edit
 * operations relative to the worst case (replace the whole first sequence with
 * the whole second sequence using only mismatches and elongate or shorten if
 * necessary = max{|a|,|b|}). This can be interpreted as a confidence value
 * that the two Strings represent the same content.
 *
 * @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
 */
public class StringSimilarity {

	public StringSimilarity() {
	}

	/**
	 * Calculates the similarity of the two given Strings as
	 * 1 - distance / max{|a|,|b|}, where the distance is an edit distance in
	 * which digit-for-digit replacements are free and every other mismatch,
	 * insertion or deletion costs 1.
	 *
	 * Two empty Strings are identical and therefore yield a similarity of 1
	 * (the formula itself would otherwise divide zero by zero).
	 *
	 * @param a the first string
	 * @param b the second string
	 * @return a confidence value (between 0 and 1) that the two Strings
	 * represent the same content
	 * @throws NullPointerException if a or b is null
	 */
	public double calculate(String a, String b) {
		final char[] aContent = a.toCharArray();
		final char[] bContent = b.toCharArray();

		final double worstCaseAlignDistance = Math.max(aContent.length, bContent.length);
		if (worstCaseAlignDistance == 0) {
			// both inputs are empty: nothing differs, and dividing by the
			// worst case distance below would produce NaN
			return 1;
		}

		// alignMat[i][j] holds the cheapest cost of aligning the first i
		// characters of a with the first j characters of b
		final int[][] alignMat = new int[aContent.length + 1][bContent.length + 1];

		//initialize first row (insertions only)
		for (int j = 1; j <= bContent.length; j++) {
			alignMat[0][j] = alignMat[0][j - 1] + 1;
		}
		//initialize first column (deletions only)
		for (int i = 1; i <= aContent.length; i++) {
			alignMat[i][0] = alignMat[i - 1][0] + 1;
		}
		//calculate alignment matrix
		for (int i = 1; i <= aContent.length; i++) {
			for (int j = 1; j <= bContent.length; j++) {
				final int insertionScore = alignMat[i][j - 1] + 1;
				final int deletionScore = alignMat[i - 1][j] + 1;
				final int replacementScore = alignMat[i - 1][j - 1]
						+ replacementCost(aContent[i - 1], bContent[j - 1]);

				alignMat[i][j] = Math.min(insertionScore, Math.min(deletionScore, replacementScore));
			}
		}

		final double actualAlignDistance = alignMat[aContent.length][bContent.length];

		return 1 - actualAlignDistance / worstCaseAlignDistance;
	}

	/**
	 * Returns the cost of aligning character x with character y: 0 if both
	 * are equal or both are digits, 1 otherwise.
	 */
	private static int replacementCost(char x, char y) {
		if (x == y) {
			return 0;
		}
		return (Character.isDigit(x) && Character.isDigit(y)) ? 0 : 1;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy