com.hfg.bio.seq.alignment.PairwiseSeqAligner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.alignment;
import java.util.*;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.matrix.PSSM;
import com.hfg.bio.seq.alignment.matrix.SubstitutionMatrix;
import com.hfg.math.SimpleSampleStats;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
Tool for creating pairwise sequence alignments. Flexible configuration allows
the specification of whether the alignments returned should be global or local
with respect to each sequence. Gap penalties including whether or not to penalize
terminal gaps can also be specified per-sequence. Using a Position-specific score matrix
(PSSM) as the subject is also allowed.
Global alignments use a version of the Needleman-Wunsch alogrithm.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class PairwiseSeqAligner
{
private PairwiseSettings mSettings;
private BioSequence mQuery;
private BioSequence mSubject;
private PSSM mSubjectPSSM;
private String mQueryResidues;
private String mSubjectResidues;
int[] mQuerySubstitutionIndices;
int[] mSubjectSubstitutionIndices;
private int mMinY;
private int mMaxY;
private float[][] mScoreMatrix;
private byte[][] mTracebackMatrix;
private char mGapCharacter = sDefaultGapChar;
// Traceback matrix values
private byte DIAGONAL = 0;
private byte QUERY_GAP = -1;
private byte SUBJECT_GAP = 1;
private AlignmentSeed mNextSeed;
private float mBestScore;
private int mBestScoreX;
private int mBestScoreY;
private static char sDefaultGapChar = '-';
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public PairwiseSeqAligner()
{
mSettings = generateDefaultSettings();
}
//---------------------------------------------------------------------------
public PairwiseSeqAligner(PairwiseSettings inSettings)
{
setSettings(inSettings != null ? inSettings : generateDefaultSettings());
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public PairwiseSettings getSettings()
{
return mSettings;
}
//---------------------------------------------------------------------------
public PairwiseSeqAligner setSettings(PairwiseSettings inValue)
{
mSettings = inValue;
return this;
}
//---------------------------------------------------------------------------
public synchronized List align(NucleicAcid inQuery, NucleicAcid inSubject)
{
fillInMatrix(inQuery, inSubject);
List alignments = generateAlignments();
if (getSettings().getCalculateStatisticalScores())
{
calculateStatisticalScores(alignments);
}
return alignments;
}
//---------------------------------------------------------------------------
public synchronized List align(Protein inQuery, Protein inSubject)
{
fillInMatrix(inQuery, inSubject);
List alignments = generateAlignments();
if (getSettings().getCalculateStatisticalScores())
{
calculateStatisticalScores(alignments);
}
return alignments;
}
//---------------------------------------------------------------------------
public synchronized List align(BioSequence inQuery, PSSM inSubject)
{
fillInMatrix(inQuery, inSubject);
List alignments = generateAlignments();
if (getSettings().getCalculateStatisticalScores())
{
calculateStatisticalScores(alignments);
}
return alignments;
}
//---------------------------------------------------------------------------
public String showScoring(Collection inQueries, Protein inSubject)
{
Map scoringMap = new HashMap<>(inQueries.size());
SubstitutionMatrix matrix = getSettings().getSubstitutionMatrix(BioSequenceType.PROTEIN);
char[] subjectResidues = inSubject.getSequence().toCharArray();
int subjectLength = StringUtil.replaceAll(inSubject.getSequence(), "-", "").length();
for (Protein seq : inQueries)
{
int queryLength = StringUtil.replaceAll(seq.getSequence(), "-", "").length();
char[] queryResidues = seq.getSequence().toCharArray();
boolean nTerminalQueryGap = true;
boolean openQueryGap = false;
boolean nTerminalSubjectGap = true;
boolean openSubjectGap = false;
Float[] positionScores = new Float[seq.length()];
int index = 0;
int queryResidueCount = 0;
int subjectResidueCount = 0;
for (char queryResidue : queryResidues)
{
char subjectResidue = subjectResidues[index];
if (nTerminalQueryGap
&& queryResidue != '-')
{
nTerminalQueryGap = false;
}
if (nTerminalSubjectGap
&& subjectResidue != '-')
{
nTerminalSubjectGap = false;
}
Float positionScore;
if ('-' == queryResidue)
{
if ((nTerminalQueryGap
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT))
|| (queryLength == queryResidueCount
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT)))
{
positionScore = 0.0f;
}
else if (openQueryGap)
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getExtensionPenalty();
}
else
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getOpenPenalty();
openQueryGap = true;
}
subjectResidueCount++;
}
else if ('-' == subjectResidue)
{
if ((nTerminalSubjectGap
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.LEFT))
|| (subjectLength == subjectResidueCount
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT)))
{
positionScore = 0.0f;
}
else if (openSubjectGap)
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT).getExtensionPenalty();
}
else
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT).getOpenPenalty();
openSubjectGap = true;
}
queryResidueCount++;
}
else
{
openQueryGap = false;
openSubjectGap = false;
queryResidueCount++;
subjectResidueCount++;
positionScore = (getSettings().getScoreCaseInsensitive() ? matrix.scoreCaseInsensitive(subjectResidue, queryResidue) : matrix.score(subjectResidue, queryResidue));
}
positionScores[index++] = positionScore;
}
scoringMap.put(seq, positionScores);
}
StringBuilderPlus buffer = new StringBuilderPlus();
for (int position = 0; position < inSubject.length(); position++)
{
buffer.append(String.format("%3d. ", position + 1));
for (Protein seq : inQueries)
{
char residue = seq.getSequence().charAt(position);
Float score = scoringMap.get(seq)[position];
buffer.append(String.format("%c %5.2f ", residue, score));
}
buffer.appendln();
}
buffer.appendln();
buffer.append("TOTAL: ");
for (Protein seq : inQueries)
{
Float[] positionScores = scoringMap.get(seq);
float total = 0.0f;
for (int i = 0; i < positionScores.length; i++)
{
total += positionScores[i];
}
buffer.append(String.format("%.2f ", total));
}
buffer.appendln();
return buffer.toString();
}
//---------------------------------------------------------------------------
public String showScoring(BioSequence inQuery, PSSM inSubject)
{
List queries = new ArrayList<>(1);
queries.add(inQuery);
return showScoring(queries, inSubject);
}
//---------------------------------------------------------------------------
public String showScoring(Collection extends BioSequence> inQueries, PSSM inSubject)
{
Map scoringMap = new HashMap<>(inQueries.size());
for (BioSequence seq : inQueries)
{
int queryLength = StringUtil.replaceAll(seq.getSequence(), "-", "").length();
char[] residues = seq.getSequence().toCharArray();
boolean nTerminalGap = true;
boolean openGap = false;
Float[] positionScores = new Float[seq.length()];
int index = 0;
int residueCount = 0;
for (char residue : residues)
{
if (nTerminalGap
&& residue != '-')
{
nTerminalGap = false;
}
Float positionScore;
if ('-' == residue)
{
if ((nTerminalGap
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT))
|| (queryLength == residueCount
&& ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT)))
{
positionScore = 0.0f;
}
else if (openGap)
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getExtensionPenalty()
+ ((inSubject.score(index + 1, mGapCharacter) + 0.0001f) / 2);
}
else
{
positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getOpenPenalty()
+ ((inSubject.score(index + 1, mGapCharacter) + 0.0001f) / 2);
openGap = true;
}
}
else
{
openGap = false;
residueCount++;
positionScore = inSubject.score(index + 1, residue);
}
positionScores[index++] = positionScore;
}
scoringMap.put(seq, positionScores);
}
StringBuilderPlus buffer = new StringBuilderPlus();
for (int position = 0; position < inSubject.length(); position++)
{
buffer.append(String.format("%3d. ", position + 1));
for (BioSequence seq : inQueries)
{
char residue = seq.getSequence().charAt(position);
Float score = scoringMap.get(seq)[position];
buffer.append(String.format("%c %5.2f ", residue, score));
}
buffer.appendln();
}
buffer.appendln();
buffer.append("TOTAL: ");
for (BioSequence seq : inQueries)
{
Float[] positionScores = scoringMap.get(seq);
float total = 0.0f;
for (int i = 0; i < positionScores.length; i++)
{
total += positionScores[i];
}
buffer.append(String.format("%.2f ", total));
}
buffer.appendln();
return buffer.toString();
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
private PairwiseSettings generateDefaultSettings()
{
PairwiseSettings settings = new PairwiseSettings();
return settings;
}
//---------------------------------------------------------------------------
private void setup(BioSequence inQuery, BioSequence inSubject)
{
mQuery = inQuery;
mSubject = inSubject;
mQueryResidues = inQuery.getSequence();
mSubjectResidues = inSubject.getSequence();
int queryLength = inQuery.length();
int subjectLength = inSubject.length();
mMinY = 1;
mMaxY = subjectLength;
if (getSettings().getSubjectTemplateMode())
{
while (mMinY < subjectLength
&& mSubjectResidues.charAt(mMinY - 1) == '-')
{
mMinY++;
}
while (mMaxY > 1
&& mSubjectResidues.charAt(mMaxY - 1) == '-')
{
mMaxY--;
}
}
if (null == mScoreMatrix
|| inQuery.length() + 1 > mScoreMatrix.length
|| inSubject.length() +1 > mScoreMatrix[0].length)
{
mScoreMatrix = new float[queryLength + 1][subjectLength + 1];
mTracebackMatrix = new byte[queryLength + 1][subjectLength + 1];
}
mBestScore = -1;
mBestScoreX = -1;
mBestScoreY = -1;
mScoreMatrix[0][0] = 0;
mTracebackMatrix[0][0] = 0;
boolean queryLocal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean penalizeQueryLeftTerminalGaps = (subjectLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT));
boolean penalizeSubjectLeftTerminalGaps = (queryLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.LEFT));
GapPenalties queryGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.QUERY);
GapPenalties subjectGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT);
SubstitutionMatrix substitutionMatrix = getSettings().getSubstitutionMatrix(inQuery.getType());
// We will use pre-fetched substitution matrix indices to speed up the score matrix generation.
mQuerySubstitutionIndices = substitutionMatrix.getResidueIndicesForSequence(getSettings().getScoreCaseInsensitive() ? mQuery.getSequence().toUpperCase() : mQuery.getSequence());
mSubjectSubstitutionIndices = substitutionMatrix.getResidueIndicesForSequence(getSettings().getScoreCaseInsensitive() ? mSubject.getSequence().toUpperCase() : mSubject.getSequence());
float gapGapScore = 0;
if (substitutionMatrix.contains('-'))
{
gapGapScore = (getSettings().getScoreCaseInsensitive() ? substitutionMatrix.scoreCaseInsensitive('-', '-') : substitutionMatrix.score('-', '-'));
}
// First row
boolean openGap = true;
for (int x = 1; x <= queryLength; x++)
{
if (penalizeSubjectLeftTerminalGaps)
{
mScoreMatrix[x][0] = mScoreMatrix[x - 1][0]; // Start with the score to the left
if (openGap)
{
if (mQueryResidues.charAt(x - 1) == mGapCharacter)
{
mScoreMatrix[x][0] += gapGapScore;
}
else
{
openGap = false;
mScoreMatrix[x][0] += subjectGapPenalties.getOpenPenalty();
}
}
else
{
mScoreMatrix[x][0] += subjectGapPenalties.getExtensionPenalty();
}
}
else // No subject left terminal gap penalty
{
mScoreMatrix[x][0] = 0;
}
mTracebackMatrix[x][0] = SUBJECT_GAP;
}
// First column
for (int y = 1; y <= subjectLength; y++)
{
if (penalizeQueryLeftTerminalGaps)
{
mScoreMatrix[0][y] = mScoreMatrix[0][y - 1]; // Start with the score above
if (openGap)
{
if (mSubjectResidues.charAt(y - 1) == mGapCharacter)
{
mScoreMatrix[0][y] += gapGapScore;
}
else
{
openGap = false;
mScoreMatrix[0][y] += queryGapPenalties.getOpenPenalty();
}
}
else
{
mScoreMatrix[0][y] += queryGapPenalties.getExtensionPenalty();
}
}
else if (mSubjectResidues.charAt(y - 1) == '-')
{
mScoreMatrix[0][y] = mScoreMatrix[0][y - 1] + gapGapScore;
}
else // No query left terminal gap penalty
{
mScoreMatrix[0][y] = mScoreMatrix[0][y - 1];
}
mTracebackMatrix[0][y] = QUERY_GAP;
}
}
//---------------------------------------------------------------------------
private void setup(BioSequence inQuery, PSSM inSubject)
{
mQuery = inQuery;
mSubjectPSSM = inSubject;
mQueryResidues = inQuery.getSequence();
mSubjectResidues = StringUtil.polyChar(' ', inSubject.length()); //TODO: What should be done here?
mMinY = 1;
mMaxY = mSubjectPSSM.length();
if (null == mScoreMatrix
|| inQuery.length() + 1 > mScoreMatrix.length
|| inSubject.length() +1 > mScoreMatrix[0].length)
{
mScoreMatrix = new float[inQuery.length() + 1][inSubject.length() + 1];
mTracebackMatrix = new byte[inQuery.length() + 1][inSubject.length() + 1];
}
mBestScore = -1;
mBestScoreX = -1;
mBestScoreY = -1;
mScoreMatrix[0][0] = 0;
mTracebackMatrix[0][0] = 0;
boolean queryLocal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean fullLocal = queryLocal && subjectLocal;
boolean penalizeQueryLeftTerminalGaps = (fullLocal || subjectLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT));
boolean penalizeSubjectLeftTerminalGaps = (fullLocal || queryLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.LEFT));
GapPenalties queryGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.QUERY);
GapPenalties subjectGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT);
// First row
boolean openGap = true;
for (int x = 1; x <= inQuery.length(); x++)
{
if (penalizeSubjectLeftTerminalGaps)
{
mScoreMatrix[x][0] = mScoreMatrix[x - 1][0]; // Start with the score to the left
if (openGap)
{
if (mQueryResidues.charAt(x - 1) == mGapCharacter)
{
mScoreMatrix[x][0] += inSubject.score(x, mGapCharacter);
}
else
{
openGap = false;
mScoreMatrix[x][0] += subjectGapPenalties.getOpenPenalty();
}
}
else
{
mScoreMatrix[x][0] += subjectGapPenalties.getExtensionPenalty();
}
}
else // No subject left terminal gap penalty
{
mScoreMatrix[x][0] = 0;
}
mTracebackMatrix[x][0] = SUBJECT_GAP;
}
// First column
for (int y = 1; y <= inSubject.length(); y++)
{
if (penalizeQueryLeftTerminalGaps)
{
mScoreMatrix[0][y] = mScoreMatrix[0][y - 1]; // Start with the score above
mScoreMatrix[0][y] += queryGapPenalties.getExtensionPenalty();
}
else // No query left terminal gap penalty
{
mScoreMatrix[0][y] = 0;
}
mTracebackMatrix[0][y] = QUERY_GAP;
}
}
//---------------------------------------------------------------------------
private void fillInMatrix(BioSequence inQuery, BioSequence inSubject)
{
setup(inQuery, inSubject);
GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);
GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);
boolean queryLocal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean fullGlobal = (! queryLocal && ! subjectLocal);
boolean penalizeQueryRightTerminalGaps = (subjectLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT));
boolean penalizeSubjectRightTerminalGaps = (queryLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT));
boolean subjectTemplateMode = getSettings().getSubjectTemplateMode();
SubstitutionMatrix substitutionMatrix = getSettings().getSubstitutionMatrix(inQuery.getType());
// Use the raw matrix for speed so we won't have to repeatedly look up the substitution matrix indices.
float[][] rawSubstitutionMatrix = substitutionMatrix.getRawMatrix();
float gapGapScore = 0;
if (substitutionMatrix.contains('-'))
{
gapGapScore = (getSettings().getScoreCaseInsensitive() ? substitutionMatrix.scoreCaseInsensitive('-', '-') : substitutionMatrix.score('-', '-'));
}
int queryLength = inQuery.length();
int subjectLength = inSubject.length();
for (int y = 1; y <= subjectLength; y++)
{
for (int x = 1; x <= mQueryResidues.length(); x++)
{
// Calculate a diagonal score, a query gap score, and a subject gap score; then choose the highest score
// as this cell's value
float diagScore = mScoreMatrix[x - 1][y - 1] + rawSubstitutionMatrix[mQuerySubstitutionIndices[x - 1]][mSubjectSubstitutionIndices[y - 1]];
byte tracebackDirection = DIAGONAL;
float bestScore = diagScore;
// Don't create right-terminal gaps for the query if it is in local mode
if (fullGlobal
|| x < mQueryResidues.length())
{
float queryGapScore = mScoreMatrix[x][y - 1];
if (! penalizeQueryRightTerminalGaps
&& x == queryLength)
{
// Don't add any other penalty
}
else if (subjectTemplateMode
&& mSubjectResidues.charAt(y - 1) == '-')
{
// Don't penalize - subject has a gap at this position.
// However, only add the gap score if it is not a terminal gap.
if (y > mMinY
&& y < mMaxY)
{
queryGapScore += gapGapScore;
}
}
else
{
// If there was a gap in the query sequence at the preceding position, the gap opening penalty should apply and not the gap extension penalty.
// If we are in subject template mode and there was a subject gap at the preceding position, the gap opening penalty should apply and not the gap extension penalty.
boolean preceedingQueryGap = (mQueryResidues.charAt(x - 1) == '-');
queryGapScore += (preceedingQueryGap || (QUERY_GAP == mTracebackMatrix[x][y - 1] && (!subjectTemplateMode || y < 2 || mSubjectResidues.charAt(y - 2) != '-')) ? queryGapPenalties.getExtensionPenalty() : queryGapPenalties.getOpenPenalty());
}
if (queryGapScore > bestScore)
{
tracebackDirection = QUERY_GAP;
bestScore = queryGapScore;
}
}
// Don't create right-terminal gaps for the subject if it is in local mode
if (fullGlobal
|| y < mSubjectResidues.length())
{
float subjectGapScore = mScoreMatrix[x - 1][y];
if (! penalizeSubjectRightTerminalGaps
&& y == subjectLength)
{
// Full global and no subject right terminal gap penalty.
// Don't add any other penalty
}
else
{
subjectGapScore += (SUBJECT_GAP == mTracebackMatrix[x - 1][y] ? subjectGapPenalties.getExtensionPenalty() : subjectGapPenalties.getOpenPenalty());
}
if (subjectGapScore > bestScore)
{
tracebackDirection = SUBJECT_GAP;
bestScore = subjectGapScore;
}
}
mTracebackMatrix[x][y] = tracebackDirection;
mScoreMatrix[x][y] = bestScore;
// For perfomance, keep track of the best score.
// If only one alignment is desired, this will keep us from having to examine the whole matrix.
if (bestScore >= mBestScore)
{
mBestScore = bestScore;
mBestScoreX = x;
mBestScoreY = y;
}
}
}
if (getSettings().getDumpMatrices())
{
dumpMatrices();
}
}
//---------------------------------------------------------------------------
private void fillInMatrix(BioSequence inQuery, PSSM inSubject)
{
setup(inQuery, inSubject);
GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);
GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);
boolean queryLocal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
boolean fullLocal = queryLocal && subjectLocal;
boolean queryGlobal = ! queryLocal;
boolean subjectGlobal = ! subjectLocal;
boolean fullGlobal = queryGlobal && subjectGlobal;
boolean penalizeQueryRightTerminalGaps = (fullLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT));
boolean penalizeSubjectRightTerminalGaps = (fullLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT));
int queryLength = inQuery.length();
int subjectLength = inSubject.length();
for (int y = 1; y <= subjectLength; y++)
{
for (int x = 1; x <= mQueryResidues.length(); x++)
{
char queryResidue = mQueryResidues.charAt(x - 1);
// Calculate a diagonal score, a query gap score, and a subject gap score; then choose the highest score
// as this cell's value
float diagScore = mScoreMatrix[x - 1][y - 1] + inSubject.score(y, queryResidue);
byte tracebackDirection = DIAGONAL;
float bestScore = diagScore;
// Don't create right-terminal gaps for the query if it is in local mode
if (fullGlobal
|| x < mQueryResidues.length())
{
float queryGapScore = mScoreMatrix[x][y - 1];
if (!penalizeQueryRightTerminalGaps
&& x == queryLength)
{
// Don't add any other penalty
}
else
{
queryGapScore += (QUERY_GAP == (mTracebackMatrix[x][y - 1]) ? queryGapPenalties.getExtensionPenalty() * inSubject.getGapExtScore(y)
: queryGapPenalties.getOpenPenalty() * inSubject.getGapOpenScore(y) );
}
if (queryGapScore > bestScore)
{
tracebackDirection = QUERY_GAP;
bestScore = queryGapScore;
}
}
// Don't create right-terminal gaps for the subject if it is in local mode
if (fullGlobal
|| y < mSubjectResidues.length())
{
float subjectGapScore = mScoreMatrix[x - 1][y];
if (!penalizeSubjectRightTerminalGaps
&& y == subjectLength)
{
// Don't add any other penalty
}
else
{
subjectGapScore += (SUBJECT_GAP == (mTracebackMatrix)[x - 1][y] ? subjectGapPenalties.getExtensionPenalty() : subjectGapPenalties.getOpenPenalty());
}
if (subjectGapScore > bestScore)
{
tracebackDirection = SUBJECT_GAP;
bestScore = subjectGapScore;
}
}
mTracebackMatrix[x][y] = tracebackDirection;
mScoreMatrix[x][y] = bestScore;
// For perfomance, keep track of the best score.
// If only one alignment is desired, this will keep us from having to examine the whole matrix.
if (bestScore >= mBestScore)
{
mBestScore = bestScore;
mBestScoreX = x;
mBestScoreY = y;
}
}
}
if (getSettings().getDumpMatrices())
{
dumpMatrices();
}
}
//---------------------------------------------------------------------------
private void dumpMatrices()
{
StringBuilderPlus buffer = new StringBuilderPlus();
buffer.appendln("SCORE MATRIX:");
buffer.append(" ");
for (int x = 0; x < mQueryResidues.length(); x++)
{
buffer.append(String.format(" %c ", mQueryResidues.charAt(x)));
}
buffer.appendln("");
for (int y = 0; y <= mSubjectResidues.length(); y++)
{
if (0 == y)
{
buffer.append(" ");
}
else
{
if (mSubject != null)
{
buffer.append(String.format(" %c ", mSubjectResidues.charAt(y - 1)));
}
else
{
buffer.append(String.format("%3d. ", y));
}
}
for (int x = 0; x <= mQueryResidues.length(); x++)
{
buffer.append(String.format("%7.1f ", mScoreMatrix[x][y]));
}
buffer.appendln("");
}
buffer.appendln("\n\nTRACEBACK MATRIX:");
buffer.append(" ");
for (int x = 0; x < mQueryResidues.length(); x++)
{
buffer.append(String.format(" %c ", mQueryResidues.charAt(x)));
}
buffer.appendln("");
for (int y = 0; y <= mSubjectResidues.length(); y++)
{
if (0 == y)
{
buffer.append(" ");
}
else
{
if (mSubject != null)
{
buffer.append(String.format(" %c ", mSubjectResidues.charAt(y - 1)));
}
else
{
buffer.append(String.format("%3d. ", y));
}
}
for (int x = 0; x <= mQueryResidues.length(); x++)
{
buffer.append(String.format("%7d ", mTracebackMatrix[x][y]));
}
buffer.appendln("");
}
System.out.println(buffer);
}
//---------------------------------------------------------------------------
// Tried another implementation keeping a queue of seeds but it didn't perform as well
private boolean hasMoreSeeds()
{
mNextSeed = null;
boolean queryGlobal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.GLOBAL);
boolean subjectGlobal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.GLOBAL);
// Full global?
if (queryGlobal
&& subjectGlobal)
{
// Start from the bottom right cell in the matrix
float score = mScoreMatrix[mQueryResidues.length()][mSubjectResidues.length()];
if (score != Integer.MIN_VALUE)
{
mNextSeed = new AlignmentSeed(mQueryResidues.length(), mSubjectResidues.length(),score);
}
}
else if (mBestScore > 0)
{
mNextSeed = new AlignmentSeed(mBestScoreX, mBestScoreY, mBestScore);
mBestScore = -1;
mBestScoreX = -1;
mBestScoreY = -1;
}
else
{
int minX = 0;
int maxX = mQueryResidues.length();
float largestScore = 0;
int seedX = 0;
int seedY = 0;
if (subjectGlobal
&& getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT))
{
// Only search the bottom
for (int x = minX; x <= maxX; x++)
{
float score = mScoreMatrix[x][mMaxY];
if (score >= largestScore)
{
seedX = x;
seedY = mMaxY;
largestScore = score;
}
}
}
else if (queryGlobal
&& getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT))
{
// Only search the side
for (int y = mMinY - 1; y < mMaxY; y++)
{
float score = mScoreMatrix[maxX][y];
if (score >= largestScore)
{
seedX = maxX;
seedY = y;
largestScore = score;
}
}
}
else
{
for (int x = minX; x <= maxX; x++)
{
for (int y = mMinY - 1; y <= mMaxY; y++)
{
float score = mScoreMatrix[x][y];
if (score >= largestScore)
{
seedX = x;
seedY = y;
largestScore = score;
}
}
}
}
mNextSeed = new AlignmentSeed(seedX, seedY, largestScore);
}
if (mNextSeed != null
&& getSettings().getMinRawScore() != null
&& getSettings().getMinRawScore() > mNextSeed.getScore())
{
mNextSeed = null;
}
return (mNextSeed != null);
}
//---------------------------------------------------------------------------
private AlignmentSeed nextSeed()
{
return mNextSeed;
}
//---------------------------------------------------------------------------
private List generateAlignments()
{
List alignments = new ArrayList<>(mSettings.getMaxLocalAlignmentResults());
boolean queryGlobal = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.GLOBAL);
boolean subjectGlobal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.GLOBAL);
boolean fullGlobal = queryGlobal && subjectGlobal;
boolean fullLocal = ! queryGlobal && ! subjectGlobal;
boolean queryLocal = ! queryGlobal;
boolean subjectLocal = ! subjectGlobal;
int maxAlignments = (fullGlobal ? 1 : getSettings().getMaxLocalAlignmentResults());
while (hasMoreSeeds()
&& alignments.size() < maxAlignments)
{
AlignmentSeed seed = nextSeed();
// Note that these aligned sequences will be built up in the reverse order and then reversed at the end for performance
StringBuilder alignedQuery = new StringBuilder();
StringBuilder alignedSubject = new StringBuilder();
int x = seed.getX();
int y = seed.getY();
byte tracebackDirection = mTracebackMatrix[x][y];
// Query global / Subject local or Subject global / Query local alignment.
// Don't align past the end of the global seq.
while ((x > mQueryResidues.length()
&& queryGlobal
&& subjectLocal)
|| (y > mSubjectResidues.length()
&& subjectGlobal
&& queryLocal))
{
if (DIAGONAL == tracebackDirection)
{
x--;
y--;
}
else if (QUERY_GAP == tracebackDirection)
{
y--;
}
else if (SUBJECT_GAP == tracebackDirection)
{
x--;
}
tracebackDirection = mTracebackMatrix[x][y];
}
// int xLeftLimit = (queryGlobal && subjectLocal ? 1 : 0);
// int yLeftLimit = (subjectGlobal && queryLocal ? 1 : 0);
int xLeftLimit = (fullGlobal ? 0 : 1);
int yLeftLimit = (fullGlobal ? 0 : 1);
while ((x > 0
|| y > 0)
&& (fullLocal ? mScoreMatrix[x][y] > 0 : true))
{
float matrixScore = mScoreMatrix[x][y];
// Did we overlap a previous alignment?
if (Integer.MIN_VALUE == matrixScore)
{
break; // Skip
}
else if (x < xLeftLimit)
{
break; // Query global / Subject local alignment. Don't align past the beginning of the Query
}
else if (y < yLeftLimit)
{
break; // Subject global / Query local alignment. Don't align past the beginning of the Subject
}
// Deface this position so we won't get overlapping subsequences
mScoreMatrix[x][y] = Integer.MIN_VALUE;
if (DIAGONAL == tracebackDirection)
{
alignedQuery.append(mQueryResidues.charAt(x - 1));
alignedSubject.append(mSubjectResidues.charAt(y - 1));
x--;
y--;
}
else if (QUERY_GAP == tracebackDirection)
{
alignedQuery.append(mGapCharacter);
alignedSubject.append(mSubjectResidues.charAt(y - 1));
y--;
}
else if (SUBJECT_GAP == tracebackDirection)
{
alignedQuery.append(mQueryResidues.charAt(x - 1));
alignedSubject.append(mGapCharacter);
x--;
}
tracebackDirection = mTracebackMatrix[x][y];
}
// Did we overlap a previous alignment?
if (Integer.MIN_VALUE == mScoreMatrix[x][y])
{
continue; // Skip
}
else
{
// Deface this position so we won't get overlapping subsequences
mScoreMatrix[x][y] = Integer.MIN_VALUE;
}
if (queryGlobal
|| (alignedQuery.length() >= mSettings.getMinLocalAlignmentLength()
&& StringUtil.replaceAll(alignedQuery, "-", "").length() >= mSettings.getMinQueryMatchLength()))
{
int initialQueryPosition = x + 1;
int initialSubjectPosition = y + 1;
if (queryLocal && subjectGlobal)
{
if (y > 0)
{
// Add a left terminal query gap
alignedQuery.append(StringUtil.polyChar('-', y));
alignedSubject = alignedSubject.reverse();
alignedSubject.insert(0, mSubjectResidues, 0, y);
initialSubjectPosition = 1;
}
else
{
alignedSubject = alignedSubject.reverse();
}
alignedQuery = alignedQuery.reverse();
if (seed.getY() < mSubjectResidues.length())
{
// Add a right terminal query gap
alignedQuery.append(StringUtil.polyChar('-', mSubjectResidues.length() - seed.getY()));
alignedSubject.append(mSubjectResidues, seed.getY(), mSubjectResidues.length());
}
}
else if (subjectLocal && queryGlobal)
{
if (x > 0)
{
// Add a left terminal subject gap
alignedQuery = alignedQuery.reverse();
alignedQuery.insert(0, mQueryResidues, 0, x);
alignedSubject.append(StringUtil.polyChar('-', x));
initialSubjectPosition = 1;
}
else
{
alignedQuery = alignedQuery.reverse();
}
alignedSubject = alignedSubject.reverse();
if (seed.getX() < mQueryResidues.length())
{
// Add a right terminal subject gap
alignedQuery.append(mQueryResidues, seed.getX(), mQueryResidues.length());
alignedSubject.append(StringUtil.polyChar('-', mQueryResidues.length() - seed.getX()));
}
}
else
{
alignedQuery = alignedQuery.reverse();
alignedSubject = alignedSubject.reverse();
}
AlignedQuery query = new AlignedQuery(mQuery, alignedQuery, initialQueryPosition);
AlignedSubject subject = new AlignedSubject(mSubject, alignedSubject, initialSubjectPosition);
PairwiseSeqAlignment alignment = new PairwiseSeqAlignment(query, subject).setScore(seed.getScore());
alignment.setSettings(getSettings().clone());
// TODO: force full local alignment if specified
alignments.add(alignment);
}
if (getSettings().getDumpMatrices())
{
dumpMatrices();
}
}
return alignments;
}
//---------------------------------------------------------------------------
private void calculateStatisticalScores(List inAlignments)
{
String originalQuerySeq = mQuery.getSequence();
// Calculate the background
double prevMean = 0;
SimpleSampleStats stats = new SimpleSampleStats();
do
{
if (stats.getSampleSize() > 0)
{
prevMean = stats.getMean();
}
List seeds;
// Scramble the query
String querySeq = mQuery.getSequence();
mQuery.setSequence(StringUtil.scramble(querySeq));
if (mSubject != null)
{
fillInMatrix(mQuery, mSubject);
}
else // Working w/ a PSSM
{
fillInMatrix(mQuery, mSubjectPSSM);
}
stats.add(hasMoreSeeds() ? nextSeed().getScore() : 0);
// Repeat until the mean background value isn't changing significantly
}
while (stats.getSampleSize() < getSettings().getMinNumOfBackgroundCycles()
&& Math.abs(stats.getMean() - prevMean) > 0.000001 * stats.getMean());
// Calculate the decay constant
double lambda = Math.PI / (stats.getSampleStandardDeviation() * Math.sqrt(6));
// Estimate the extreme value distribution's location constant
double mu = stats.getMean() - 0.5772 / lambda;
for (PairwiseSeqAlignment alignment : inAlignments)
{
double eValue = Math.exp(-lambda * (alignment.getScore() - mu));
alignment.setEValue((float) eValue);
alignment.setPValue((float) (1 - Math.exp(-eValue)));
alignment.setZScore((float) ((alignment.getScore() - stats.getMean()) / stats.getSampleStandardDeviation()));
}
// Replace the original query seq
if (originalQuerySeq != null)
{
mQuery.setSequence(originalQuerySeq);
}
}
}