// de.citec.scie.pdf.StringSimilarity (Maven / Gradle / Ivy)
/*
* SCIE -- Spinal Cord Injury Information Extraction
* Copyright (C) 2013, 2014
* Raphael Dickfelder, Jan Göpfert, Benjamin Paaßen, Andreas Stöckel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.citec.scie.pdf;
/**
* This implements an algorithm to determine the similarity between Strings by
* utilizing an alignment/edit distance approach. In this special case of
* alignment the distance between numbers is regarded as zero while each
* mismatch otherwise is punished with 1 as is each deletion or insertion.
*
* The edit distance is then transformed into a similarity by taking
* 1-distance/(max{|a|,|b|}), which is 1 - the number of costly edit operations
* that had to be used relative to the worst case (replace the whole first
* sequence with the whole second sequence using only mismatches and elongate or
* shorten if necessary = max{|a|,|b|}). This can be interpreted as a confidence
* value that the two Strings represent the same content.
*
* @author Benjamin Paassen - bpaassen(at)techfak.uni-bielefeld.de
*/
public class StringSimilarity {

    /**
     * Creates a new similarity calculator. The class declares no fields, so
     * an instance carries no state between calls.
     */
    public StringSimilarity() {
    }

    /**
     * Computes a similarity score for two strings based on an edit distance.
     *
     * The distance is a Levenshtein-style alignment cost in which every
     * insertion and every deletion costs 1, replacing a character by an
     * identical character costs 0, replacing one digit by any other digit
     * (as determined by {@link Character#isDigit(char)}) also costs 0, and
     * every other replacement costs 1.
     *
     * The distance is turned into a similarity by computing
     * {@code 1 - distance / max(|a|, |b|)}, i.e. one minus the number of
     * costly edit operations relative to the worst case. If both strings are
     * empty there is nothing to align and the strings are identical, so 1.0
     * is returned directly instead of evaluating {@code 0 / 0} (which would
     * yield {@code NaN}).
     *
     * @param a the first string
     * @param b the second string
     * @return a confidence value (between 0 and 1) that the two Strings
     * represent the same content
     * @throws NullPointerException if {@code a} or {@code b} is
     * {@code null}
     */
    public double calculate(String a, String b) {
        final char[] aContent = a.toCharArray();
        final char[] bContent = b.toCharArray();

        final int worstCaseAlignDistance = Math.max(aContent.length, bContent.length);
        if (worstCaseAlignDistance == 0) {
            // Both strings are empty: they are trivially identical. Without
            // this guard the division below would be 0.0 / 0.0 == NaN.
            return 1.0;
        }

        final double actualAlignDistance = editDistance(aContent, bContent);
        return 1 - actualAlignDistance / worstCaseAlignDistance;
    }

    /**
     * Computes the digit-insensitive edit distance between two character
     * sequences, keeping only two rows of the alignment matrix at a time.
     */
    private static int editDistance(char[] aContent, char[] bContent) {
        int[] previousRow = new int[bContent.length + 1];
        int[] currentRow = new int[bContent.length + 1];

        // Row 0: aligning the empty prefix of a with b[0..j) takes j insertions.
        for (int j = 0; j <= bContent.length; j++) {
            previousRow[j] = j;
        }

        for (int i = 1; i <= aContent.length; i++) {
            // Column 0: aligning a[0..i) with the empty prefix takes i deletions.
            currentRow[0] = i;
            for (int j = 1; j <= bContent.length; j++) {
                final int insertionScore = currentRow[j - 1] + 1;
                final int deletionScore = previousRow[j] + 1;
                final int replacementScore = previousRow[j - 1]
                        + substitutionCost(aContent[i - 1], bContent[j - 1]);
                currentRow[j] = Math.min(insertionScore,
                        Math.min(deletionScore, replacementScore));
            }
            // The row just filled becomes the "previous" row of the next
            // iteration; the old previous row is reused as scratch space.
            final int[] swap = previousRow;
            previousRow = currentRow;
            currentRow = swap;
        }

        // After the final swap the last computed row is in previousRow.
        return previousRow[bContent.length];
    }

    /**
     * Returns the cost of aligning character x with character y: 0 if they are
     * equal or both digits, 1 otherwise.
     */
    private static int substitutionCost(char x, char y) {
        if (x == y) {
            return 0;
        }
        return (Character.isDigit(x) && Character.isDigit(y)) ? 0 : 1;
    }
}