All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.gbs.neobio.ScoringMatrix Maven / Gradle / Ivy

/*
 * ScoringMatrix.java
 *
 * Copyright 2003 Sergio Anibal de Carvalho Junior
 *
 * This file is part of NeoBio.
 *
 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with NeoBio;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Proper attribution of the author as the source of the software would be appreciated.
 *
 * Sergio Anibal de Carvalho Junior		mailto:[email protected]
 * Department of Computer Science		http://www.dcs.kcl.ac.uk
 * King's College London, UK			http://www.kcl.ac.uk
 *
 * Please visit http://neobio.sourceforge.net
 *
 * This project was supervised by Professor Maxime Crochemore.
 *
 */

package net.maizegenetics.analysis.gbs.neobio;

import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.IOException;

/**
 * This class implements a scoring scheme based on a substitution matrix. It is useful
 * to represent PAM and BLOSUM family of amino acids scoring matrices. Its constructor
 * loads such matrices from a file (or any other character stream). The following is an
 * extract of a BLOSUM62 scoring matrix file:
 * 
 *       A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
 *    A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
 *    R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
 *    ...
 *    B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
 *    Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
 *    X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
 *    * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
 * 
* *

Matrices are expected to follow this format. They must have one row an one column * for each defined character (not necessarily in the same order). Each row and column * must start with a distinct character (no repetition) and all row characters must have a * correspondent column, and vice versa.

* *

Value at position (i,j) represent the score of substituting character of row i for * character of column j. Insertion penalties are specified by the last row while deletion * penalties must be located at the last column (both represented by the special character * defined by the INDEL_CHAR constant). Note that it only supports an * additive gap cost function. In case any of this rules are not followed, an * {@linkplain InvalidScoringMatrixException} exception is raised by the constructor.

* *

If a scoring operation (substitution, insertion or deletion) involves a character * not found in the matrix, an exception is raised.

* * @author Sergio A. de Carvalho Jr. * @see InvalidScoringMatrixException */ public class ScoringMatrix extends ScoringScheme { /** * The character that indicates the row and column for insertion and deletion * penalties in the matrix. */ protected static final char INDEL_CHAR = '*'; /** * The character used to start a comment line in the scoring matrix file. */ protected static final char COMMENT_CHAR = '#'; /** * Stores matrix column headers in the order they were found. */ protected String col_codes; /** * Stores matrix row headers in the order they were found. */ protected String row_codes; /** * Stores values for each operation (substitution, insertion or deletion) defined by * this matrix. */ protected int matrix[][]; /** * Dimension of the (squared) matrix. */ protected int dimension; /** * The maximum absolute score that this matrix can return for any substitution, * deletion or insertion. */ protected int max_absolute_score; /** * Creates a new instance of a substitution matrix loaded from the character stream. * The case of characters is significant when subsequently computing their score. * * @param input character stream from where the matrix is read * @throws IOException if an I/O operation fails when reading from input * @throws InvalidScoringMatrixException if the matrix does not comply with the * specification */ public ScoringMatrix (Reader input) throws IOException, InvalidScoringMatrixException { this (input, true); } /** * Creates a new instance of a substitution matrix loaded from the character stream. * If case_sensitive is true, the case of characters is * significant when subsequently computing their score; otherwise the case is * ignored. * * @param input character stream from where the matrix is read * @param case_sensitive true if the case of characters must be * @throws IOException if an I/O operation fails when reading from input * @throws InvalidScoringMatrixException if the matrix does not comply with the * specification */ public ScoringMatrix (Reader input, boolean case_sensitive) throws IOException, InvalidScoringMatrixException { super (case_sensitive); StreamTokenizer in; StringBuffer buf = new StringBuffer(); int row, col, max_abs = 0; char c; // create a stream tokenizer on top of the input // stream and set the COMMENT_CHAR as the comment character in = new StreamTokenizer(input); in.commentChar(COMMENT_CHAR); // consider ends of line when reading the first row in.eolIsSignificant(true); // skip blank lines (if any) for (in.nextToken(); in.ttype == StreamTokenizer.TT_EOL; in.nextToken()); // read first row: column character codes while ((in.ttype != StreamTokenizer.TT_EOF) && (in.ttype != StreamTokenizer.TT_EOL)) { if (in.ttype == StreamTokenizer.TT_WORD) { if (in.sval.length() > 1) throw new InvalidScoringMatrixException ("Column headers must have one-character only."); buf.append(in.sval.charAt(0)); } else if (in.ttype == INDEL_CHAR) { buf.append(INDEL_CHAR); } else { throw new InvalidScoringMatrixException("Column headers must be " + "one-character codes or the special character '" + INDEL_CHAR + "'."); } in.nextToken(); } // convert everything to upper case if it's not case sensitive if (case_sensitive) col_codes = buf.toString(); else col_codes = buf.toString().toUpperCase(); dimension = col_codes.length(); // check if there's a column for deletion penalties if (col_codes.indexOf (INDEL_CHAR) == -1) throw new InvalidScoringMatrixException ("Matrix have no column for deletion penalties."); // check if there is at least one character code (besides the INDEL char) if (dimension < 2) throw new InvalidScoringMatrixException ("Matrix must have at least one column with a character code."); // check for repeated column codes for (int i = 0; i < dimension; i++) if (col_codes.indexOf(col_codes.charAt(i),i+1) > i) throw new InvalidScoringMatrixException ("Columns must have distinct one-character codes."); // allocate matrix matrix = new int[dimension][dimension]; // reset buffer buf.delete (0, dimension); // from now on, ignore ends of line in.eolIsSignificant(false); if (in.ttype == StreamTokenizer.TT_EOL) in.nextToken(); // read rest of matrix (one line for each character, but // not necessarily in the same order as the columns) for (row = 0; row < dimension && in.ttype != StreamTokenizer.TT_EOF; row++) { // start reading the line: the character code must come first if (in.ttype == StreamTokenizer.TT_WORD) { if (in.sval.length() > 1) throw new InvalidScoringMatrixException ("Codes must have one character only."); buf.append(in.sval.charAt(0)); } else if (in.ttype == INDEL_CHAR) { buf.append(INDEL_CHAR); } else { throw new InvalidScoringMatrixException ("Rows must start with an" + " one-character code or the special character '" + INDEL_CHAR + "'."); } // now, the set of values for (col = 0; col < dimension; col++) { // start reading the values if (in.nextToken() != StreamTokenizer.TT_NUMBER) throw new InvalidScoringMatrixException ("Invalid value at row " + (row+1) + ", column " + (col+1) + "."); matrix[row][col] = (int) in.nval; if (Math.abs(matrix[row][col]) > max_abs) max_abs = Math.abs(matrix[row][col]); } in.nextToken(); } // convert everything to upper case if it's not case sensitive if (case_sensitive) row_codes = buf.toString(); else row_codes = buf.toString().toUpperCase(); // check if read as many rows as columns if (row_codes.length() != dimension) throw new InvalidScoringMatrixException ("Matrix must have as many rows as columns."); // check if there's a row for insertion penalties if (row_codes.indexOf(INDEL_CHAR) == -1) throw new InvalidScoringMatrixException ("Matrix have no row for insertion penalties."); // check for repeated row codes for (int i = 0; i < dimension; i++) if (row_codes.indexOf(row_codes.charAt(i),i+1) > i) throw new InvalidScoringMatrixException ("Rows must have distinct one-character codes."); // check if all rows have a corresponding column for (int i = 0; i < dimension; i++) if (col_codes.indexOf(c = row_codes.charAt(i)) == -1) throw new InvalidScoringMatrixException ("There is no corresponding column for row character '" + c + "'."); // store the maximum absolute value found this.max_absolute_score = max_abs; } /** * Returns the score of a substitution of character a for character * b according to this scoring matrix. * * @param a first character * @param b second character * @return score of a substitution of character a for b * @throws IncompatibleScoringSchemeException if this substitution is not defined */ public int scoreSubstitution (char a, char b) throws IncompatibleScoringSchemeException { int r,c; if (case_sensitive) { r = row_codes.indexOf(a); c = col_codes.indexOf(b); } else { r = row_codes.indexOf(Character.toUpperCase(a)); c = col_codes.indexOf(Character.toUpperCase(b)); } if (r < 0 || c < 0) throw new IncompatibleScoringSchemeException ("Substitution of character " + a + " for " + b + " is not defined."); return matrix[r][c]; } /** * Returns the score of an insertion of character a according to this * scoring matrix. * * @param a character to be inserted * @return score of insertion of a * @throws IncompatibleScoringSchemeException if this character is not recognised */ public int scoreInsertion (char a) throws IncompatibleScoringSchemeException { return scoreSubstitution (INDEL_CHAR, a); } /** * Returns the score of a deletion of character a according to this * scoring matrix. * * @param a character to be deleted * @return score of deletion of a * @throws IncompatibleScoringSchemeException if this character is not recognised */ public int scoreDeletion (char a) throws IncompatibleScoringSchemeException { return scoreSubstitution (a, INDEL_CHAR); } /** * Tells whether this scoring scheme supports partial matches, which it does, although * a particular scoring matrix loaded by this instace might not. A partial match is * a situation when two characters are not equal but, for any reason, are regarded * as similar by this scoring scheme, which then returns a positive score value. This * is common for amino acid scoring matrices. * * @return always return true */ public boolean isPartialMatchSupported () { return true; } /** * Returns the maximum absolute score that this scoring scheme can return for any * substitution, deletion or insertion. * * @return maximum absolute score that can be returned */ public int maxAbsoluteScore () { return max_absolute_score; } /** * Returns a String representation of this scoring matrix. * * @return a String representation of this scoring matrix */ public String toString () { int row, col; StringBuffer buf = new StringBuffer(); // column numbers buf.append("Scoring matrix:\n\t"); for (col = 0; col < dimension; col++) { buf.append("\t" + col); } buf.append("\n\t"); // column headers for (col = 0; col < dimension; col++) { buf.append('\t'); buf.append(col_codes.charAt(col)); } // rest of matrix for (row = 0; row < dimension; row++) { // row number and code buf.append("\n" + row + "\t" + row_codes.charAt(row)); for (col = 0; col < dimension; col++) { buf.append('\t'); buf.append(matrix[row][col]); } } return buf.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy