// org.simmetrics.metrics.SmithWaterman
/*
* SimMetrics - SimMetrics is a java library of Similarity or Distance Metrics,
* e.g. Levenshtein Distance, that provide float based similarity measures
* between String Data. All metrics return consistent measures rather than
* unbounded similarity scores.
*
* Copyright (C) 2014 SimMetrics authors
*
* This file is part of SimMetrics. This program is free software: you can
* redistribute it and/or modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* SimMetrics. If not, see <http://www.gnu.org/licenses/>.
*/
package org.simmetrics.metrics;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static org.simmetrics.utils.Math.max3;
import org.simmetrics.StringMetric;
import org.simmetrics.metrics.functions.AffineGap;
import org.simmetrics.metrics.functions.Gap;
import org.simmetrics.metrics.functions.MatchMismatch;
import org.simmetrics.metrics.functions.Substitution;
/**
* Smith-Waterman algorithm providing a similarity measure between two strings.
* <p>
* This is the original algorithm as described by Smith and Waterman. The
* implementation uses quadratic space and cubic time.
* <p>
* This class is immutable and thread-safe if its substitution and gap functions
* are.
*
* @see NeedlemanWunch
* @see SmithWatermanGotoh
* @see <a href="http://en.wikipedia.org/wiki/Smith-Waterman_algorithm">Wikipedia
*      - Smith-Waterman algorithm</a>
*/
public class SmithWaterman implements StringMetric {

    private final Gap gap;
    private final Substitution substitution;
    private final int windowSize;

    /**
     * Constructs a new Smith Waterman metric. Uses an affine gap of
     * {@code -5.0 - gapLength}, a {@code -3.0} substitution penalty for
     * mismatches and {@code 5.0} for matches. The window size is
     * {@link Integer#MAX_VALUE}.
     */
    public SmithWaterman() {
        this(new AffineGap(-5.0f, -1.0f), new MatchMismatch(5.0f, -3.0f),
                Integer.MAX_VALUE);
    }

    /**
     * Constructs a new Smith Waterman metric.
     *
     * @param gap
     *            a gap function to score gaps by
     * @param substitution
     *            a substitution function to score substitutions by
     * @param windowSize
     *            a non-negative window that bounds the range of gap offsets
     *            examined when searching for the best insertion or deletion
     * @throws NullPointerException
     *             if {@code gap} or {@code substitution} is null
     * @throws IllegalArgumentException
     *             if {@code windowSize} is negative
     */
    public SmithWaterman(Gap gap, Substitution substitution, int windowSize) {
        checkNotNull(gap);
        checkNotNull(substitution);
        checkArgument(windowSize >= 0);
        this.gap = gap;
        this.substitution = substitution;
        this.windowSize = windowSize;
    }

    /**
     * Measures the similarity between two strings.
     *
     * @param a
     *            the first string, must not be null
     * @param b
     *            the second string, must not be null
     * @return {@code 1.0} when both strings are empty, {@code 0.0} when exactly
     *         one of them is empty, otherwise the best local alignment score
     *         divided by the length of the shorter string times
     *         {@code max(substitution.max(), gap.min())}
     */
    @Override
    public float compare(String a, String b) {
        if (a.isEmpty() && b.isEmpty()) {
            return 1.0f;
        }
        if (a.isEmpty() || b.isEmpty()) {
            return 0.0f;
        }
        // Normalize the raw score by the score bound used for the shorter string.
        float maxDistance = min(a.length(), b.length())
                * max(substitution.max(), gap.min());
        return smithWaterman(a, b) / maxDistance;
    }

    /**
     * Fills the scoring matrix for two non-empty strings and returns the
     * highest cell value found.
     *
     * @param a
     *            the first string, must not be empty
     * @param b
     *            the second string, must not be empty
     * @return the maximum value in the scoring matrix
     */
    private float smithWaterman(String a, String b) {
        final int n = a.length();
        final int m = b.length();
        final float[][] d = new float[n][m];

        // Initialize corner
        float best = d[0][0] = max(0, substitution.compare(a, 0, b, 0));

        // Initialize first column; the corner is already set, so start at 1
        // like the first-row loop below.
        for (int i = 1; i < n; i++) {
            d[i][0] = max3(0, bestDeletionGap(d, i, 0),
                    substitution.compare(a, i, b, 0));
            best = max(best, d[i][0]);
        }

        // Initialize first row
        for (int j = 1; j < m; j++) {
            d[0][j] = max3(0, bestInsertionGap(d, 0, j),
                    substitution.compare(a, 0, b, j));
            best = max(best, d[0][j]);
        }

        // Build matrix
        for (int i = 1; i < n; i++) {
            for (int j = 1; j < m; j++) {
                final float maxGapCost = max(bestDeletionGap(d, i, j),
                        bestInsertionGap(d, i, j));
                // Find most optimal of insertion, deletion and substitution
                d[i][j] = max3(0, maxGapCost,
                        d[i - 1][j - 1] + substitution.compare(a, i, b, j));
                best = max(best, d[i][j]);
            }
        }
        return best;
    }

    /**
     * Finds the best score reachable at cell {@code (i, j)} through a gap
     * within column {@code j}, never less than zero.
     * <p>
     * NOTE(review): {@code k} runs from {@code max(1, i - windowSize)} up to
     * but excluding {@code i}, so the source rows are {@code i - k} for those
     * {@code k}. When {@code windowSize} is smaller than {@code i} this selects
     * rows {@code 1..windowSize} rather than the rows nearest to {@code i}, and
     * row {@code 0} is never used as a gap source. Behavior is kept as-is;
     * confirm this is intended.
     */
    private float bestDeletionGap(float[][] d, int i, int j) {
        float maxGapCost = 0;
        for (int k = max(1, i - windowSize); k < i; k++) {
            maxGapCost = max(maxGapCost, d[i - k][j] + gap.value(i - k, i));
        }
        return maxGapCost;
    }

    /**
     * Finds the best score reachable at cell {@code (i, j)} through a gap
     * within row {@code i}, never less than zero. Uses the same {@code k}
     * range as the deletion search, applied to the column index.
     */
    private float bestInsertionGap(float[][] d, int i, int j) {
        float maxGapCost = 0;
        for (int k = max(1, j - windowSize); k < j; k++) {
            maxGapCost = max(maxGapCost, d[i][j - k] + gap.value(j - k, j));
        }
        return maxGapCost;
    }

    @Override
    public String toString() {
        return "SmithWaterman [gap=" + gap + ", substitution="
                + substitution + ", windowSize=" + windowSize + "]";
    }
}