All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.SmithWaterman Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
package net.maizegenetics.analysis.gbs;

/**
 * Created by IntelliJ IDEA.
 * User: ed
 * Date: May 31, 2008
 * Time: 9:35:16 AM
 * To change this template use File | Settings | File Templates.
 */

/*
 * SmithWaterman.java
 *
 * Copyright 2003 Sergio Anibal de Carvalho Junior
 *
 * This file is part of NeoBio.
 *
 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with NeoBio;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Proper attribution of the author as the source of the software would be appreciated.
 *
 * Sergio Anibal de Carvalho Junior		mailto:[email protected]
 * Department of Computer Science		http://www.dcs.kcl.ac.uk
 * King's College London, UK			http://www.kcl.ac.uk
 *
 * Please visit http://neobio.sourceforge.net
 *
 * This project was supervised by Professor Maxime Crochemore.
 *
 */
import java.util.Arrays;
import org.biojava.nbio.alignment.NeedlemanWunsch;

/**
 * This class implement the classic local alignment algorithm (with linear gap penalty
 * function) due to T.F.Smith and M.S.Waterman (1981).
 *
 * 

This algorithm is very similar to the {@linkplain NeedlemanWunsch} algorithm for * global alignment. The idea here also consists of building an (n+1 x m+1) matrix M given * two sequences A and B of sizes n and m, respectively. However, unlike in the global * alignment case, every position M[i,j] in the matrix contains the similarity score of * suffixes of A[1..i] and B[1..j].

* *

Starting from row 0, column 0, the {@link #computeMatrix computeMatrix} method * computes each position M[i,j] with the following recurrence:

* *
 * M[0,0] = M[0,j] = M[i,0] = 0
 * M[i,j] = max { M[i,j-1]   + scoreInsertion (B[j]),
 *                M[i-1,j-1] + scoreSubstitution (A[i], B[j]),
 *                M[i-1,j]   + scoreDeletion(A[i])             }
 * 
* *

Note that, here, all cells in the first row and column are set to zero. The best * local alignment score is the highest value found anywhere in the matrix.

* *

Just like in global alignment case, this algorithm has quadratic space complexity * because it needs to keep an (n+1 x m+1) matrix in memory. And since the work of * computing each cell is constant, it also has quadratic time complexity.

* *

After the matrix has been computed, the alignment can be retrieved by tracing a path * back in the matrix from the position of the highest score until a cell of value zero is * reached. This step is performed by the { * buildOptimalAlignment} method, and its time complexity is linear on the size of the * alignment. * *

If the similarity value only is needed (and not the alignment itself), it is easy to * reduce the space requirement to O(n) by keeping just the last row or column in memory. * This is precisely what is done by the {@link #computeScore computeScore} method. Note * that it still requires O(n2) time.

* *

For a more efficient approach to the local alignment problem, see the * {@linkplain CrochemoreLandauZivUkelson} algorithm. For global alignment, see the * {@linkplain NeedlemanWunsch} algorithm.

* * @author Sergio A. de Carvalho Jr. */ public class SmithWaterman { /** * The first sequence of an alignment. */ protected String seq1 = "CGGGTGTGACAGTCGTGCAGTCGACCGTTGGG"; /** * The second sequence of an alignment. */ protected String seq2 = "XXXXXCGGGTGTGACAGTCGTGCAGTCGACCGTTGGGXXXXXXX"; // protected String seq2="CGGGTGTGACAGTCGTGCAGTCGACCGTTGGG"; /** * The dynamic programming matrix. Each position (i, j) represents the best score * between a suffic of the firsts i characters of seq1 and a suffix of * the first j characters of seq2. */ protected int[][] matrix; /** * Indicate the row of where an optimal local alignment can be found in the matrix.. */ protected int max_row; /** * Indicate the column of where an optimal local alignment can be found in the matrix. */ protected int max_col; int rows, cols; byte[] bseq1, bseq2; int[] array; public SmithWaterman() { long time = System.currentTimeMillis(); int maxScore = 0; rows = seq1.length() + 1; cols = seq2.length() + 1; if (rows <= cols) { array = new int[rows]; } else { array = new int[cols]; } bseq1 = seq1.getBytes(); bseq2 = seq2.getBytes(); // matrix = new int[rows][cols]; for (int i = 0; i < 30000; i++) { // maxScore=computeMatrix(); maxScore = computeScore(); } // System.out.println(Arrays.deepToString(matrix)); //System.out.println("MaxScore:" + maxScore); //System.out.println("time:" + (System.currentTimeMillis() - time)); } public SmithWaterman(int rows, int cols) { this.rows = rows + 1; this.cols = cols + 1; int max = (rows > cols) ? rows : cols; array = new int[max]; // if (rows <= cols) {array = new int [this.rows];} // else{array = new int [this.cols];} } public SmithWaterman(byte[] b1, byte[] b2) { bseq1 = b1; bseq2 = b2; this.rows = b1.length + 1; this.cols = b2.length + 1; int max = (rows > cols) ? rows : cols; array = new int[max]; } int computeScore(byte[] b1, byte[] b2) { bseq1 = b1; bseq2 = b2; this.rows = b1.length + 1; this.cols = b2.length + 1; return computeScore(); } /** * Computes the dynamic programming matrix. * * If the scoring scheme is not compatible * with the loaded sequences. */ protected int computeMatrix() { int r, c, ins, sub, del, max_score; long time = System.currentTimeMillis(); // initiate first row for (c = 0; c < cols; c++) { matrix[0][c] = 0; } // keep track of the maximum score this.max_row = this.max_col = max_score = 0; // calculates the similarity matrix (row-wise) for (r = 1; r < rows; r++) { // initiate first column matrix[r][0] = 0; for (c = 1; c < cols; c++) { // ins = matrix[r][c-1] + scoreInsertion(seq2.charAt(c)); // sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r),seq2.charAt(c)); // del = matrix[r-1][c] + scoreDeletion(seq1.charAt(r)); ins = matrix[r][c - 1] - 1; // sub = matrix[r-1][c-1] + scoreSubstitution(seq1.charAt(r-1),seq2.charAt(c-1)); sub = matrix[r - 1][c - 1] + (bseq1[r - 1] == bseq2[c - 1] ? 2 : 0); del = matrix[r - 1][c] - 1; // choose the greatest matrix[r][c] = max(ins, sub, del, 0); if (matrix[r][c] > max_score) { // keep track of the maximum score max_score = matrix[r][c]; this.max_row = r; this.max_col = c; } } } System.out.println("MaxScore:" + max_score); System.out.println("time:" + (System.currentTimeMillis() - time)); return max_score; } /** * Computes the score of the best local alignment between the two sequences using the * scoring scheme previously set. This method calculates the similarity value only * (doesn't build the whole matrix so the alignment cannot be recovered, however it * has the advantage of requiring O(n) space only). * * @return the score of the best local alignment between the loaded sequences * with the loaded sequences. */ public int computeScore() { // int[] array; // int rows = seq1.length()+1, cols = seq2.length()+1; int r, c, tmp, ins, del, sub, max_score; long time = System.currentTimeMillis(); // keep track of the maximum score max_score = 0; if (rows <= cols) { // // goes columnwise // array = new int [rows]; // initiate first column for (r = 0; r < rows; r++) { array[r] = 0; } // calculate the similarity matrix (keep current column only) for (c = 1; c < cols; c++) { // set first position to zero (tmp hold values // that will be later moved to the array) tmp = 0; for (r = 1; r < rows; r++) { ins = array[r] - 1; // if(r>30) { // System.out.println(); // } sub = array[r - 1] + (bseq1[r - 1] == bseq2[c - 1] ? 2 : 0); del = tmp - 1; // move the temp value to the array array[r - 1] = tmp; // choose the greatest (or zero if all negative) tmp = max(ins, sub, del, 0); // keep track of the maximum score if (tmp > max_score) { max_score = tmp; } } // move the temp value to the array array[rows - 1] = tmp; } } else { // goes rowwise // array = new int [cols]; // initiate first row for (c = 0; c < cols; c++) { array[c] = 0; } // calculate the similarity matrix (keep current row only) for (r = 1; r < rows; r++) { // set first position to zero (tmp hold values // that will be later moved to the array) tmp = 0; for (c = 1; c < cols; c++) { ins = tmp - 1; sub = array[c - 1] + (bseq1[r - 1] == bseq2[c - 1] ? 2 : 0); del = array[c] - 1; // move the temp value to the array array[c - 1] = tmp; // choose the greatest (or zero if all negative) tmp = max(ins, sub, del, 0); // keep track of the maximum score if (tmp > max_score) { max_score = tmp; } } // move the temp value to the array array[cols - 1] = tmp; } } // System.out.println("SmithWaterman: MaxScore:" + max_score + " time:" + (System.currentTimeMillis() - time)); return max_score; } protected final int scoreInsertion(char a) { return -1; } protected final int scoreSubstitution(char a, char b) { if (a == b) { return 2; } return 0; } protected final int scoreDeletion(char a) { return -1; } /** * Helper method to compute the the greater of two values. * * @param v1 first value * @param v2 second value * @return the larger of v1 and v2 */ protected final int max(int v1, int v2) { return (v1 >= v2) ? v1 : v2; } /** * Helper method to compute the the greater of three values. * * @param v1 first value * @param v2 second value * @param v3 third value * @return the larger of v1, v2 and v3 */ protected final int max(int v1, int v2, int v3) { return (v1 >= v2) ? ((v1 >= v3) ? v1 : v3) : ((v2 >= v3) ? v2 : v3); } /** * Helper method to compute the the greater of four values. * * @param v1 first value * @param v2 second value * @param v3 third value * @param v4 fourth value * @return the larger of v1, v2 v3 and * v4 */ protected final int max(int v1, int v2, int v3, int v4) { int m1 = ((v1 >= v2) ? v1 : v2); int m2 = ((v3 >= v4) ? v3 : v4); return (m1 >= m2) ? m1 : m2; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy