All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.alignment.PairwiseSeqAligner Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.alignment;

import java.util.*;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.matrix.PSSM;
import com.hfg.bio.seq.alignment.matrix.SubstitutionMatrix;
import com.hfg.math.SimpleSampleStats;
import com.hfg.util.CompareUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;


//------------------------------------------------------------------------------
/**
 Tool for creating pairwise sequence alignments. Flexible configuration allows
 the specification of whether the alignments returned should be global or local
 with respect to each sequence. Gap penalties including whether or not to penalize
 terminal gaps can also be specified per-sequence. Using a Position-specific score matrix
 (PSSM) as the subject is also allowed.

 Global alignments use a version of the Needleman-Wunsch alogrithm.
 @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------


public class PairwiseSeqAligner
{
   private BioSequence mQuery;
   private BioSequence mSubject;
   private PSSM mSubjectPSSM;
   private String mQueryResidues;
   private String mSubjectResidues;
   int[] mQuerySubstitutionIndices;
   int[] mSubjectSubstitutionIndices;
   private char mGapCharacter = sDefaultGapChar;
   private int mMinY;
   private int mMaxY;

   // Cached
   private int mQueryLength;
   private int mSubjectLength;

   private PairwiseSettings mSettings;

   // Cached settings values for performance
   private Integer mMinRawScore;
   private boolean mQueryLocal;
   private boolean mSubjectLocal;
   private boolean mPenalizeQueryLeftTerminalGaps;
   private boolean mPenalizeQueryRightTerminalGaps;
   private boolean mPenalizeSubjectLeftTerminalGaps;
   private boolean mPenalizeSubjectRightTerminalGaps;

   // Matrices
   private float[][] mScoreMatrix;
   private byte[][] mTracebackMatrix;

   // Traceback matrix values
   private byte DIAGONAL = 0;
   private byte QUERY_GAP = -1;
   private byte SUBJECT_GAP = 1;

   private AlignmentSeed mNextSeed = new AlignmentSeed();
   private boolean mHasMoreSeeds;
   
   private float mBestScore;
   private int   mBestScoreX;
   private int   mBestScoreY;

   private static char sDefaultGapChar = '-';

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   public PairwiseSeqAligner()
   {
      mSettings = generateDefaultSettings();
   }

   //---------------------------------------------------------------------------
   public PairwiseSeqAligner(PairwiseSettings inSettings)
   {
      setSettings(inSettings != null ? inSettings : generateDefaultSettings());
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   public PairwiseSettings getSettings()
   {
      return mSettings;
   }

   //---------------------------------------------------------------------------
   public PairwiseSeqAligner setSettings(PairwiseSettings inValue)
   {
      mSettings = inValue;
      return this;
   }

   //---------------------------------------------------------------------------
   public synchronized List align(NucleicAcid inQuery, NucleicAcid inSubject)
   {
      fillInMatrix(inQuery, inSubject);

      List alignments = generateAlignments();

      if (getSettings().getCalculateStatisticalScores())
      {
         calculateStatisticalScores(alignments);
      }

      return alignments;
   }

   //---------------------------------------------------------------------------
   public synchronized List align(Protein inQuery, Protein inSubject)
   {
      fillInMatrix(inQuery, inSubject);

      List alignments = generateAlignments();

      if (getSettings().getCalculateStatisticalScores())
      {
         calculateStatisticalScores(alignments);
      }

      return alignments;
   }

   //---------------------------------------------------------------------------
   public synchronized List align(BioSequence inQuery, PSSM inSubject)
   {
      fillInMatrix(inQuery, inSubject);

      List alignments = generateAlignments();

      if (getSettings().getCalculateStatisticalScores())
      {
         calculateStatisticalScores(alignments);
      }

      return alignments;
   }

   //---------------------------------------------------------------------------
   public String showScoring(Collection inQueries, Protein inSubject)
   {
      Map scoringMap = new HashMap<>(inQueries.size());

      SubstitutionMatrix matrix = getSettings().getSubstitutionMatrix(BioSequenceType.PROTEIN);

      char[] subjectResidues = inSubject.getSequence().toCharArray();
      int subjectLength = StringUtil.replaceAll(inSubject.getSequence(), "-", "").length();

      for (Protein seq : inQueries)
      {
         int queryLength = StringUtil.replaceAll(seq.getSequence(), "-", "").length();

         char[] queryResidues = seq.getSequence().toCharArray();

         boolean nTerminalQueryGap = true;
         boolean openQueryGap = false;

         boolean nTerminalSubjectGap = true;
         boolean openSubjectGap = false;

         Float[] positionScores = new Float[seq.length()];
         int index = 0;
         int queryResidueCount = 0;
         int subjectResidueCount = 0;
         for (char queryResidue : queryResidues)
         {
            char subjectResidue = subjectResidues[index];

            if (nTerminalQueryGap
                && queryResidue != '-')
            {
               nTerminalQueryGap = false;
            }

            if (nTerminalSubjectGap
                && subjectResidue != '-')
            {
               nTerminalSubjectGap = false;
            }


            Float positionScore;
            if ('-' == queryResidue)
            {
               if ((nTerminalQueryGap
                    && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT))
                   || (queryLength == queryResidueCount
                       && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT)))
               {
                  positionScore = 0.0f;
               }
               else if (openQueryGap)
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getExtensionPenalty();
               }
               else
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getOpenPenalty();
                  openQueryGap = true;
               }

               subjectResidueCount++;
            }
            else if ('-' == subjectResidue)
            {
               if ((nTerminalSubjectGap
                    && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.LEFT))
                   || (subjectLength == subjectResidueCount
                       && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT)))
               {
                  positionScore = 0.0f;
               }
               else if (openSubjectGap)
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT).getExtensionPenalty();
               }
               else
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT).getOpenPenalty();
                  openSubjectGap = true;
               }

               queryResidueCount++;
            }
            else
            {
               openQueryGap = false;
               openSubjectGap = false;
               queryResidueCount++;
               subjectResidueCount++;
               positionScore = (getSettings().getScoreCaseInsensitive() ? matrix.scoreCaseInsensitive(subjectResidue, queryResidue) : matrix.score(subjectResidue, queryResidue));
            }

            positionScores[index++] = positionScore;
         }

         scoringMap.put(seq, positionScores);
      }

      StringBuilderPlus buffer = new StringBuilderPlus();

      for (int position = 0; position < inSubject.length(); position++)
      {
         buffer.append(String.format("%3d.  ", position + 1));

         for (Protein seq : inQueries)
         {
            char residue = seq.getSequence().charAt(position);
            Float score = scoringMap.get(seq)[position];
            buffer.append(String.format("%c %5.2f   ", residue, score));
         }

         buffer.appendln();
      }

      buffer.appendln();
      buffer.append("TOTAL: ");
      for (Protein seq : inQueries)
      {
         Float[] positionScores = scoringMap.get(seq);
         float total = 0.0f;
         for (int i = 0; i < positionScores.length; i++)
         {
            total += positionScores[i];
         }
         buffer.append(String.format("%.2f    ", total));
      }
      buffer.appendln();

      return buffer.toString();
   }

   //---------------------------------------------------------------------------
   public String showScoring(BioSequence inQuery, PSSM inSubject)
   {
      List queries = new ArrayList<>(1);
      queries.add(inQuery);

      return showScoring(queries, inSubject);
   }

   //---------------------------------------------------------------------------
   public String showScoring(Collection inQueries, PSSM inSubject)
   {
      Map scoringMap = new HashMap<>(inQueries.size());

      for (BioSequence seq : inQueries)
      {
         int queryLength = StringUtil.replaceAll(seq.getSequence(), "-", "").length();

         char[] residues = seq.getSequence().toCharArray();

         boolean nTerminalGap = true;
         boolean openGap = false;

         Float[] positionScores = new Float[seq.length()];
         int index = 0;
         int residueCount = 0;
         for (char residue : residues)
         {
            if (nTerminalGap
                && residue != '-')
            {
               nTerminalGap = false;
            }

            Float positionScore;
            if ('-' == residue)
            {
               if ((nTerminalGap
                    && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT))
                   || (queryLength == residueCount
                       && ! getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT)))
               {
                  positionScore = 0.0f;
               }
               else if (openGap)
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getExtensionPenalty()
                                  + ((inSubject.score(index + 1, mGapCharacter) + 0.0001f) / 2);
               }
               else
               {
                  positionScore = getSettings().getGapPenalties(PairwiseSeqType.QUERY).getOpenPenalty()
                                  + ((inSubject.score(index + 1, mGapCharacter) + 0.0001f) / 2);
                  openGap = true;
               }
            }
            else
            {
               openGap = false;
               residueCount++;
               positionScore = inSubject.score(index + 1, residue);
            }

            positionScores[index++] = positionScore;
         }

         scoringMap.put(seq, positionScores);
      }

      StringBuilderPlus buffer = new StringBuilderPlus();

      for (int position = 0; position < inSubject.length(); position++)
      {
         buffer.append(String.format("%3d.  ", position + 1));

         for (BioSequence seq : inQueries)
         {
            char residue = seq.getSequence().charAt(position);
            Float score = scoringMap.get(seq)[position];
            buffer.append(String.format("%c %5.2f   ", residue, score));
         }

         buffer.appendln();
      }

      buffer.appendln();
      buffer.append("TOTAL: ");
      for (BioSequence seq : inQueries)
      {
         Float[] positionScores = scoringMap.get(seq);
         float total = 0.0f;
         for (int i = 0; i < positionScores.length; i++)
         {
            total += positionScores[i];
         }
         buffer.append(String.format("%.2f    ", total));
      }
      buffer.appendln();

      return buffer.toString();
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   private PairwiseSettings generateDefaultSettings()
   {
      PairwiseSettings settings = new PairwiseSettings();

      return settings;
   }

   //---------------------------------------------------------------------------
   private void commonSetup(BioSequence inQuery)
   {
      mQuery           = inQuery;
      mQueryResidues   = inQuery.getSequence();
      mQueryLength     = mQueryResidues.length();

      mMinRawScore = getSettings().getMinRawScore();

      mQueryLocal   = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
      mSubjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);

      mPenalizeQueryLeftTerminalGaps = (mSubjectLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.LEFT));
      mPenalizeQueryRightTerminalGaps = (mSubjectLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.QUERY, PairwiseSeqTerminus.RIGHT));
      mPenalizeSubjectLeftTerminalGaps = (mQueryLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.LEFT));
      mPenalizeSubjectRightTerminalGaps = (mQueryLocal ? false : getSettings().getPenalizeTerminalGaps(PairwiseSeqType.SUBJECT, PairwiseSeqTerminus.RIGHT));
   }

   //---------------------------------------------------------------------------
   private void setup(BioSequence inQuery, BioSequence inSubject)
   {
      commonSetup(inQuery);

      mSubject = inSubject;
      mSubjectResidues = inSubject.getSequence();
      mSubjectLength = inSubject.length();


      mMinY = 1;
      mMaxY  = mSubjectLength;
      if (getSettings().getSubjectTemplateMode())
      {
         while (mMinY < mSubjectLength
               && mSubjectResidues.charAt(mMinY - 1) == '-')
         {
            mMinY++;
         }

         while (mMaxY > 1
               && mSubjectResidues.charAt(mMaxY - 1) == '-')
         {
            mMaxY--;
         }
      }

      // Do we need to create a new or bigger scoring matrix?
      if (null == mScoreMatrix
          || mQueryLength + 1 > mScoreMatrix.length
          || mSubjectLength + 1 > mScoreMatrix[0].length)
      {
         mScoreMatrix = new float[mQueryLength + 1][mSubjectLength + 1];
         mTracebackMatrix = new byte[mQueryLength + 1][mSubjectLength + 1];
      }

      mBestScore  = -1;
      mBestScoreX = -1;
      mBestScoreY = -1;

      mScoreMatrix[0][0]     = 0;
      mTracebackMatrix[0][0] = 0;

      GapPenalties queryGapPenalties   = getSettings().getGapPenalties(PairwiseSeqType.QUERY);
      GapPenalties subjectGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT);

      SubstitutionMatrix substitutionMatrix = getSettings().getSubstitutionMatrix(inQuery.getType());
      // We will use pre-fetched substitution matrix indices to speed up the score matrix generation.
      mQuerySubstitutionIndices = substitutionMatrix.getResidueIndicesForSequence(getSettings().getScoreCaseInsensitive() ? mQuery.getSequence().toUpperCase() : mQuery.getSequence());
      mSubjectSubstitutionIndices = substitutionMatrix.getResidueIndicesForSequence(getSettings().getScoreCaseInsensitive() ? mSubject.getSequence().toUpperCase() : mSubject.getSequence());

      float gapGapScore = 0;
      if (substitutionMatrix.contains('-'))
      {
         gapGapScore = (getSettings().getScoreCaseInsensitive() ? substitutionMatrix.scoreCaseInsensitive('-', '-') : substitutionMatrix.score('-', '-'));
      }

      // First row
      boolean openGap = true;
      for (int x = 1; x <= mQueryLength; x++)
      {
         if (mPenalizeSubjectLeftTerminalGaps)
         {
            mScoreMatrix[x][0] = mScoreMatrix[x - 1][0]; // Start with the score to the left
            if (openGap)
            {
               if (mQueryResidues.charAt(x - 1) == mGapCharacter)
               {
                  mScoreMatrix[x][0] += gapGapScore;
               }
               else
               {
                  openGap = false;
                  mScoreMatrix[x][0] += subjectGapPenalties.getOpenPenalty();
               }
            }
            else
            {
               mScoreMatrix[x][0] += subjectGapPenalties.getExtensionPenalty();
            }
         }
         else // No subject left terminal gap penalty
         {
            mScoreMatrix[x][0] = 0;
         }

         mTracebackMatrix[x][0] = SUBJECT_GAP;
      }

      // First column
      openGap = true;
      for (int y = 1; y <= mSubjectLength; y++)
      {
         if (mPenalizeQueryLeftTerminalGaps)
         {
            mScoreMatrix[0][y] = mScoreMatrix[0][y - 1]; // Start with the score above

            if (openGap)
            {
               if (mSubjectResidues.charAt(y - 1) == mGapCharacter)
               {
                  mScoreMatrix[0][y] += gapGapScore;
               }
               else
               {
                  openGap = false;
                  mScoreMatrix[0][y] += queryGapPenalties.getOpenPenalty();
               }
            }
            else
            {
               mScoreMatrix[0][y] += queryGapPenalties.getExtensionPenalty();
            }
         }
         else if (mSubjectResidues.charAt(y - 1) == '-')
         {
            mScoreMatrix[0][y] = mScoreMatrix[0][y - 1] + gapGapScore;
         }
         else // No query left terminal gap penalty
         {
            mScoreMatrix[0][y] = mScoreMatrix[0][y - 1];
         }

         mTracebackMatrix[0][y] = QUERY_GAP;
      }
   }

   //---------------------------------------------------------------------------
   private void setup(BioSequence inQuery, PSSM inSubject)
   {
      commonSetup(inQuery);

      mSubjectPSSM = inSubject;
      mSubjectLength = inSubject.length();
      mSubjectResidues = StringUtil.polyChar(' ', mSubjectLength);   //TODO: What should be done here?


      mMinY = 1;
      mMaxY  = mSubjectPSSM.length();

      // Do we need to create a new or bigger scoring matrix?
      if (null == mScoreMatrix
          || mQueryLength + 1 > mScoreMatrix.length
          || mSubjectLength + 1 > mScoreMatrix[0].length)
      {
         mScoreMatrix = new float[mQueryLength + 1][mSubjectLength + 1];
         mTracebackMatrix = new byte[mQueryLength + 1][mSubjectLength + 1];
      }

      mBestScore  = -1;
      mBestScoreX = -1;
      mBestScoreY = -1;

      mScoreMatrix[0][0]     = 0;
      mTracebackMatrix[0][0] = 0;

      GapPenalties queryGapPenalties   = getSettings().getGapPenalties(PairwiseSeqType.QUERY);
      GapPenalties subjectGapPenalties = getSettings().getGapPenalties(PairwiseSeqType.SUBJECT);

      // First row
      boolean openGap = true;
      for (int x = 1; x <= mQueryLength; x++)
      {
         if (mPenalizeSubjectLeftTerminalGaps)
         {
            mScoreMatrix[x][0] = mScoreMatrix[x - 1][0]; // Start with the score to the left
            if (openGap)
            {
               if (mQueryResidues.charAt(x - 1) == mGapCharacter)
               {
                  mScoreMatrix[x][0] += inSubject.score(x, mGapCharacter);
               }
               else
               {
                  openGap = false;
                  mScoreMatrix[x][0] += subjectGapPenalties.getOpenPenalty();
               }
            }
            else
            {
               mScoreMatrix[x][0] += subjectGapPenalties.getExtensionPenalty();
            }
         }
         else // No subject left terminal gap penalty
         {
            mScoreMatrix[x][0] = 0;
         }

         mTracebackMatrix[x][0] = SUBJECT_GAP;
      }

      // First column
      for (int y = 1; y <= mSubjectLength; y++)
      {
         if (mPenalizeQueryLeftTerminalGaps)
         {
            mScoreMatrix[0][y] = mScoreMatrix[0][y - 1]; // Start with the score above

            mScoreMatrix[0][y] += queryGapPenalties.getExtensionPenalty();
         }
         else // No query left terminal gap penalty
         {
            mScoreMatrix[0][y] = 0;
         }

         mTracebackMatrix[0][y] = QUERY_GAP;
      }
   }

   //---------------------------------------------------------------------------
   private void fillInMatrix(BioSequence inQuery, BioSequence inSubject)
   {
      setup(inQuery, inSubject);

      GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);
      GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);

      boolean queryLocal   = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
      boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
      boolean fullGlobal = (! queryLocal && ! subjectLocal);

      boolean subjectTemplateMode = getSettings().getSubjectTemplateMode();

      SubstitutionMatrix substitutionMatrix = getSettings().getSubstitutionMatrix(inQuery.getType());
      // Use the raw matrix for speed so we won't have to repeatedly look up the substitution matrix indices.
      float[][] rawSubstitutionMatrix = substitutionMatrix.getRawMatrix();

      float gapGapScore = 0;
      if (substitutionMatrix.contains('-'))
      {
         gapGapScore = (getSettings().getScoreCaseInsensitive() ? substitutionMatrix.scoreCaseInsensitive('-', '-') : substitutionMatrix.score('-', '-'));
      }

      int queryLength = inQuery.length();
      int subjectLength = inSubject.length();

      for (int y = 1; y <= subjectLength; y++)
      {
         for (int x = 1; x <= mQueryLength; x++)
         {
            // Calculate a diagonal score, a query gap score, and a subject gap score; then choose the highest score
            // as this cell's value

            float diagScore = mScoreMatrix[x - 1][y - 1] + rawSubstitutionMatrix[mQuerySubstitutionIndices[x - 1]][mSubjectSubstitutionIndices[y - 1]];
            byte tracebackDirection = DIAGONAL;
            float bestScore = diagScore;

            // Don't create right-terminal gaps for the query if the subject is in local mode
            if (fullGlobal
                || x < mQueryLength)
            {
               float queryGapScore = mScoreMatrix[x][y - 1];
               if (! mPenalizeQueryRightTerminalGaps
                   && x == queryLength)
               {
                  // Don't add any other penalty
               }
               else if (subjectTemplateMode
                        && mSubjectResidues.charAt(y - 1) == '-')
               {
                  // Don't penalize - subject has a gap at this position.
                  // However, only add the gap score if it is not a terminal gap.
                  if (y > mMinY
                      && y < mMaxY)
                  {
                     queryGapScore += gapGapScore;
                  }
               }
               else
               {
                  // If there was a gap in the query sequence at the preceding position, the gap opening penalty should apply and not the gap extension penalty.
                  // If we are in subject template mode and there was a subject gap at the preceding position, the gap opening penalty should apply and not the gap extension penalty.
                  boolean preceedingQueryGap = (mQueryResidues.charAt(x - 1) == '-');
                  queryGapScore += (preceedingQueryGap
                                    || (QUERY_GAP == mTracebackMatrix[x][y - 1]
                                        && (! subjectTemplateMode
                                            || y < 2
                                            || mSubjectResidues.charAt(y - 2) != '-')) ? queryGapPenalties.getExtensionPenalty() : queryGapPenalties.getOpenPenalty());
               }

               if (queryGapScore > bestScore)
               {
                  tracebackDirection = QUERY_GAP;
                  bestScore = queryGapScore;
               }
            }

            // Don't create right-terminal gaps for the subject if the query is in local mode
            if (fullGlobal
                || y < mSubjectLength)
            {
               float subjectGapScore = mScoreMatrix[x - 1][y];
               if (! mPenalizeSubjectRightTerminalGaps
                   && y == subjectLength)
               {
                  // Full global and no subject right terminal gap penalty.
                  // Don't add any other penalty
               }
               else
               {
                  subjectGapScore += (SUBJECT_GAP == mTracebackMatrix[x - 1][y] ? subjectGapPenalties.getExtensionPenalty() : subjectGapPenalties.getOpenPenalty());
               }

               if (subjectGapScore > bestScore)
               {
                  tracebackDirection = SUBJECT_GAP;
                  bestScore = subjectGapScore;
               }
            }

            mTracebackMatrix[x][y] = tracebackDirection;
            mScoreMatrix[x][y] = bestScore;

            // For performance, keep track of the best score.
            // If only one alignment is desired, this will keep us from having to examine the whole matrix.
            if (bestScore >= mBestScore)
            {
               mBestScore = bestScore;
               mBestScoreX = x;
               mBestScoreY = y;
            }
         }
      }

      if (getSettings().getDumpMatrices())
      {
         dumpMatrices();
      }
   }

   //---------------------------------------------------------------------------
   private void fillInMatrix(BioSequence inQuery, PSSM inSubject)
   {
      setup(inQuery, inSubject);

      GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);
      GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);

      boolean queryLocal   = getSettings().getQueryAlignmentType().equals(PairwiseAlignmentType.LOCAL);
      boolean subjectLocal = getSettings().getSubjectAlignmentType().equals(PairwiseAlignmentType.LOCAL);
      boolean fullGlobal = (! queryLocal && ! subjectLocal);

      int queryLength = inQuery.length();
      int subjectLength = inSubject.length();

      float[] gapOpenScores = inSubject.getGapOpenScores();

      for (int y = 1; y <= subjectLength; y++)
      {
         for (int x = 1; x <= queryLength; x++)
         {
            char queryResidue = mQueryResidues.charAt(x - 1);

            // Calculate a diagonal score, a query gap score, and a subject gap score; then choose the highest score
            // as this cell's value

            float diagScore = mScoreMatrix[x - 1][y - 1] + inSubject.score(y, queryResidue);
            byte tracebackDirection = DIAGONAL;
            float bestScore = diagScore;

            // Don't create right-terminal gaps for the query if the subject is in local mode
            if (fullGlobal
                || x < queryLength)
            {
               float queryGapScore = mScoreMatrix[x][y - 1];
               if (! mPenalizeQueryRightTerminalGaps
                   && x == queryLength)
               {
                  // Don't add any other penalty
               }
               else
               {
                  // If there was a gap in the query sequence at the preceding position, the gap opening penalty should apply and not the gap extension penalty.
                  queryGapScore += (QUERY_GAP == (mTracebackMatrix[x][y - 1]) ? queryGapPenalties.getExtensionPenalty() * inSubject.getGapExtScore(y)
                                                                              : queryGapPenalties.getOpenPenalty() * gapOpenScores[y - 1] );
               }

               if (queryGapScore > bestScore)
               {
                  tracebackDirection = QUERY_GAP;
                  bestScore = queryGapScore;
               }
            }

            // Don't create right-terminal gaps for the subject if the query is in local mode
            if (fullGlobal
                || y < subjectLength)
            {
               float subjectGapScore = mScoreMatrix[x - 1][y];
               if (! mPenalizeSubjectRightTerminalGaps
                   && y == subjectLength)
               {
                  // Full global and no subject right terminal gap penalty.
                  // Don't add any other penalty
               }
               else
               {
                  subjectGapScore += (SUBJECT_GAP == (mTracebackMatrix)[x - 1][y] ? subjectGapPenalties.getExtensionPenalty() : subjectGapPenalties.getOpenPenalty());
               }

               if (subjectGapScore > bestScore)
               {
                  tracebackDirection = SUBJECT_GAP;
                  bestScore = subjectGapScore;
               }
            }

            mTracebackMatrix[x][y] = tracebackDirection;
            mScoreMatrix[x][y] = bestScore;

            // For performance, keep track of the best score.
            // If only one alignment is desired, this will keep us from having to examine the whole matrix.
            if (bestScore >= mBestScore)
            {
               mBestScore = bestScore;
               mBestScoreX = x;
               mBestScoreY = y;
            }
         }
      }

      if (getSettings().getDumpMatrices())
      {
         dumpMatrices();
      }
   }

   //---------------------------------------------------------------------------
   private void dumpMatrices()
   {
      StringBuilderPlus buffer = new StringBuilderPlus();
      buffer.appendln("SCORE MATRIX:");

      buffer.append("               ");
      for (int x = 0; x < mQueryLength; x++)
      {
         buffer.append(String.format("   %c     ", mQueryResidues.charAt(x)));
      }
      buffer.appendln("");

      for (int y = 0; y <= mSubjectLength; y++)
      {
         if (0 == y)
         {
            buffer.append("     ");
         }
         else
         {
            if (mSubject != null)
            {
               buffer.append(String.format("  %c  ", mSubjectResidues.charAt(y - 1)));
            }
            else
            {
               buffer.append(String.format("%3d. ", y));
            }
         }

         for (int x = 0; x <= mQueryLength; x++)
         {
            buffer.append(String.format("%7.1f  ", mScoreMatrix[x][y]));
         }
         buffer.appendln("");
      }

      buffer.appendln("\n\nTRACEBACK MATRIX:");
      buffer.append("               ");
      for (int x = 0; x < mQueryLength; x++)
      {
         buffer.append(String.format("   %c     ", mQueryResidues.charAt(x)));
      }
      buffer.appendln("");

      for (int y = 0; y <= mSubjectLength; y++)
      {
         if (0 == y)
         {
            buffer.append("    ");
         }
         else
         {
            if (mSubject != null)
            {
               buffer.append(String.format("  %c ", mSubjectResidues.charAt(y - 1)));
            }
            else
            {
               buffer.append(String.format("%3d. ", y));
            }
         }

         for (int x = 0; x <= mQueryLength; x++)
         {
            buffer.append(String.format("%7d  ", mTracebackMatrix[x][y]));
         }
         buffer.appendln("");
      }

      System.out.println(buffer);
   }

   //---------------------------------------------------------------------------
   // Tried another implementation keeping a queue of seeds but it didn't perform as well
   private boolean hasMoreSeeds()
   {
      mHasMoreSeeds = false;

      // Full global?
      if (! mQueryLocal
          && ! mSubjectLocal)
      {
         // Start from the bottom right cell in the matrix
         float score = mScoreMatrix[mQueryLength][mSubjectLength];
         if (score != Integer.MIN_VALUE)
         {
            mNextSeed.update(mQueryLength, mSubjectLength, score);
            mHasMoreSeeds = true;
         }
      }
      else if (mBestScore > 0)
      {
         mNextSeed.update(mBestScoreX, mBestScoreY, mBestScore);
         mBestScore  = -1;
         mBestScoreX = -1;
         mBestScoreY = -1;
         mHasMoreSeeds = true;
      }
      else
      {
         int minX = 0;
         int maxX = mQueryLength;

         float largestScore = 0;
         int seedX = 0;
         int seedY = 0;

         if (! mSubjectLocal // Global subject alignment
             && mPenalizeQueryRightTerminalGaps)
         {
            // Only search the bottom
            for (int x = minX; x <= maxX; x++)
            {
               float score = mScoreMatrix[x][mMaxY];
               if (score >= largestScore)
               {
                  seedX = x;
                  seedY = mMaxY;
                  largestScore = score;
               }
            }
         }
         else if (! mQueryLocal // Global query alignment
                  && mPenalizeSubjectRightTerminalGaps)
         {
            // Only search the side
            for (int y = mMinY - 1; y < mMaxY; y++)
            {
               float score = mScoreMatrix[maxX][y];
               if (score >= largestScore)
               {
                  seedX = maxX;
                  seedY = y;
                  largestScore = score;
               }
            }
         }
         else
         {
            for (int x = minX; x <= maxX; x++)
            {
               for (int y = mMinY - 1; y <= mMaxY; y++)
               {
                  float score = mScoreMatrix[x][y];
                  if (score >= largestScore)
                  {
                     seedX = x;
                     seedY = y;
                     largestScore = score;
                  }
               }
            }
         }

         mNextSeed.update(seedX, seedY, largestScore);
         mHasMoreSeeds = true;
      }

      if (mMinRawScore != null
          && mMinRawScore > mNextSeed.getScore())
      {
         mHasMoreSeeds = false;
      }

      return mHasMoreSeeds;
   }

   //---------------------------------------------------------------------------
   private AlignmentSeed nextSeed()
   {
      return mNextSeed;
   }

   //---------------------------------------------------------------------------
   private List generateAlignments()
   {
      List alignments = new ArrayList<>(mSettings.getMaxLocalAlignmentResults());

      boolean queryGlobal = ! mQueryLocal;
      boolean subjectGlobal = ! mSubjectLocal;
      boolean fullGlobal = queryGlobal && subjectGlobal;
      boolean fullLocal = ! queryGlobal && ! subjectGlobal;

      int maxAlignments = (fullGlobal ? 1 : getSettings().getMaxLocalAlignmentResults());

      while (hasMoreSeeds()
             && alignments.size() < maxAlignments)
      {
         AlignmentSeed seed = nextSeed();

         // Note that these aligned sequences will be built up in the reverse order and then reversed at the end for performance
         StringBuilder alignedQuery   = new StringBuilder();
         StringBuilder alignedSubject = new StringBuilder();

         int x = seed.getX();
         int y = seed.getY();
         float score = seed.getScore();

         byte tracebackDirection = mTracebackMatrix[x][y];

         // Query global / Subject local or Subject global / Query local alignment.
         // Don't align past the end of the global seq.
         while ((x > mQueryLength
                 && queryGlobal
                 && mSubjectLocal)
                || (y > mSubjectLength
                    && subjectGlobal
                    && mQueryLocal))
         {
            if (DIAGONAL == tracebackDirection)
            {
               x--;
               y--;
            }
            else if (QUERY_GAP == tracebackDirection)
            {
               y--;
            }
            else if (SUBJECT_GAP == tracebackDirection)
            {
               x--;
            }

            tracebackDirection = mTracebackMatrix[x][y];
         }

//         int xLeftLimit = (queryGlobal && subjectLocal ? 1 : 0);
//         int yLeftLimit = (subjectGlobal && queryLocal ? 1 : 0);
         int xLeftLimit = (fullGlobal ? 0 : 1);
         int yLeftLimit = (fullGlobal ? 0 : 1);

         while ((x > 0
                 || y > 0)
                && (fullLocal ? mScoreMatrix[x][y] > 0 : true))
         {
            float matrixScore = mScoreMatrix[x][y];

            // Did we overlap a previous alignment?
            if (Integer.MIN_VALUE == matrixScore)
            {
               break; // Skip
            }
            else if (x < xLeftLimit)
            {
               break;  // Query global / Subject local alignment. Don't align past the beginning of the Query
            }
            else if (y < yLeftLimit)
            {
               break;  // Subject global / Query local alignment. Don't align past the beginning of the Subject
            }

            // Deface this position so we won't get overlapping subsequences
            mScoreMatrix[x][y] = Integer.MIN_VALUE;

            if (DIAGONAL == tracebackDirection)
            {
               alignedQuery.append(mQueryResidues.charAt(x - 1));
               alignedSubject.append(mSubjectResidues.charAt(y - 1));
               x--;
               y--;
            }
            else if (QUERY_GAP == tracebackDirection)
            {
               alignedQuery.append(mGapCharacter);
               alignedSubject.append(mSubjectResidues.charAt(y - 1));
               y--;
            }
            else if (SUBJECT_GAP == tracebackDirection)
            {
               alignedQuery.append(mQueryResidues.charAt(x - 1));
               alignedSubject.append(mGapCharacter);
               x--;
            }

            tracebackDirection = mTracebackMatrix[x][y];
         }

         // Did we overlap a previous alignment?
         if (Integer.MIN_VALUE == mScoreMatrix[x][y])
         {
            continue; // Skip
         }
         else
         {
            // Deface this position so we won't get overlapping subsequences
            mScoreMatrix[x][y] = Integer.MIN_VALUE;
         }

         if (queryGlobal
             || (alignedQuery.length() >= mSettings.getMinLocalAlignmentLength()  // Length of aligned query meets minimum?
                 && StringUtil.replaceAll(alignedQuery, "-", "").length() >= mSettings.getMinQueryMatchLength()))
         {
            int initialQueryPosition   = x + 1;
            int initialSubjectPosition = y + 1;

            if (mQueryLocal && subjectGlobal)
            {
               if (y > 0)
               {
                  // Add a left terminal query gap
                  alignedQuery.append(StringUtil.polyChar('-', y));

                  alignedSubject = alignedSubject.reverse();
                  alignedSubject.insert(0, mSubjectResidues, 0, y);
                  initialSubjectPosition = 1;

                  if (mPenalizeQueryLeftTerminalGaps)
                  {
                     GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);

                     score += queryGapPenalties.getOpenPenalty()
                              + (queryGapPenalties.getExtensionPenalty() * (y - 1));
                  }
               }
               else
               {
                  alignedSubject = alignedSubject.reverse();
               }

               alignedQuery = alignedQuery.reverse();

               if (seed.getY() < mSubjectLength)
               {
                  // Add a right terminal query gap
                  int gapLength = mSubjectLength - seed.getY();

                  alignedQuery.append(StringUtil.polyChar('-', gapLength));
                  alignedSubject.append(mSubjectResidues, seed.getY(), mSubjectLength);

                  if (mPenalizeQueryRightTerminalGaps)
                  {
                     GapPenalties queryGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.QUERY);

                     score += queryGapPenalties.getOpenPenalty()
                              + (queryGapPenalties.getExtensionPenalty() * (gapLength - 1));
                  }
               }
            }
            else if (mSubjectLocal && queryGlobal)
            {
               if (x > 0)
               {
                  // Add a left terminal subject gap
                  alignedQuery = alignedQuery.reverse();
                  alignedQuery.insert(0, mQueryResidues, 0, x);

                  alignedSubject.append(StringUtil.polyChar('-', x));
                  initialSubjectPosition = 1;

                  if (mPenalizeSubjectLeftTerminalGaps)
                  {
                     GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);

                     score += subjectGapPenalties.getOpenPenalty()
                              + (subjectGapPenalties.getExtensionPenalty() * (x - 1));
                  }
               }
               else
               {
                  alignedQuery = alignedQuery.reverse();
               }

               alignedSubject = alignedSubject.reverse();

               if (seed.getX() < mQueryLength)
               {
                  // Add a right terminal subject gap
                  int gapLength = mQueryLength - seed.getX();

                  alignedQuery.append(mQueryResidues, seed.getX(), mQueryLength);
                  alignedSubject.append(StringUtil.polyChar('-', gapLength));

                  if (mPenalizeSubjectRightTerminalGaps)
                  {
                     GapPenalties subjectGapPenalties = mSettings.getGapPenalties(PairwiseSeqType.SUBJECT);

                     score += subjectGapPenalties.getOpenPenalty()
                              + (subjectGapPenalties.getExtensionPenalty() * (gapLength - 1));
                  }
               }
            }
            else
            {
               alignedQuery = alignedQuery.reverse();
               alignedSubject = alignedSubject.reverse();
            }

            if (score >= mMinRawScore)
            {
               AlignedQuery query = new AlignedQuery(mQuery, alignedQuery, initialQueryPosition);
               AlignedSubject subject;
               if (mSubject != null)
               {
                  subject = new AlignedSubject(mSubject, alignedSubject, initialSubjectPosition);
               }
               else
               {
                  subject = new AlignedSubject(mSubjectPSSM, alignedSubject, initialSubjectPosition);
               }

               PairwiseSeqAlignment alignment = new PairwiseSeqAlignment(query, subject).setScore(score);
               alignment.setSettings(getSettings().clone());

               // TODO: force full local alignment if specified

               alignments.add(alignment);
            }
         }

         if (getSettings().getDumpMatrices())
         {
            dumpMatrices();
         }
      }

      return alignments;
   }

   //---------------------------------------------------------------------------
   private void calculateStatisticalScores(List inAlignments)
   {
      String originalQuerySeq = mQuery.getSequence();

      // Calculate the background
      double prevMean = 0;
      SimpleSampleStats stats = new SimpleSampleStats();

      do
      {
         if (stats.getSampleSize() > 0)
         {
            prevMean = stats.getMean();
         }
         
         // Scramble the query
         String querySeq = mQuery.getSequence();
         mQuery.setSequence(StringUtil.scramble(querySeq));

         if (mSubject != null)
         {
            fillInMatrix(mQuery, mSubject);
         }
         else // Working w/ a PSSM
         {
            fillInMatrix(mQuery, mSubjectPSSM);
         }

         stats.add(hasMoreSeeds() ? nextSeed().getScore() : 0);

         // Repeat until the mean background value isn't changing significantly
      }
      while (stats.getSampleSize() < getSettings().getMinNumOfBackgroundCycles()
             || Math.abs(stats.getMean() - prevMean) > 0.000001 * stats.getMean());


      // Calculate the decay constant
      double lambda = Math.PI / (stats.getSampleStandardDeviation() * Math.sqrt(6));

      // Estimate the extreme value distribution's location constant
      double mu = stats.getMean() - 0.5772 / lambda;

      for (PairwiseSeqAlignment alignment : inAlignments)
      {
         double eValue = Math.exp(-lambda * (alignment.getScore() - mu));
         alignment.setEValue((float) eValue);

         alignment.setPValue((float) (1 - Math.exp(-eValue)));

         alignment.setZScore((float) ((alignment.getScore() - stats.getMean()) / stats.getSampleStandardDeviation()));
      }


      // Replace the original query seq
      if (originalQuerySeq != null)
      {
         mQuery.setSequence(originalQuerySeq);
      }
   }


   //###########################################################################
   // INNER CLASS
   //###########################################################################

   private class AlignmentSeed implements Comparable
   {
      private int   mX;
      private int   mY;
      private float mCumulativeScore;

      //###########################################################################
      // CONSTRUCTORS
      //###########################################################################

      //---------------------------------------------------------------------------
      public AlignmentSeed()
      {
      }

      //---------------------------------------------------------------------------
      public AlignmentSeed(int inX, int inY, float inScore)
      {
         mX = inX;
         mY = inY;
         mCumulativeScore = inScore;
      }

      //###########################################################################
      // PUBLIC METHODS
      //###########################################################################

      //---------------------------------------------------------------------------
      public void update(int inX, int inY, float inScore)
      {
         mX = inX;
         mY = inY;
         mCumulativeScore = inScore;
      }

      //---------------------------------------------------------------------------
      public int getX()
      {
         return mX;
      }

      //---------------------------------------------------------------------------
      public int getY()
      {
         return mY;
      }

      //---------------------------------------------------------------------------
      public float getScore()
      {
         return mCumulativeScore;
      }

      //---------------------------------------------------------------------------
      @Override
      public String toString()
      {
         return String.format("[%d,%d]: %.2f", mX, mY, mCumulativeScore);
      }

      //---------------------------------------------------------------------------
      public int compareTo(AlignmentSeed inObj2)
      {
         int result = 1;

         if (inObj2 != null)
         {
            // Sort by score high to low
            result = - CompareUtil.compare(getScore(), inObj2.getScore());

            if (0 == result)
            {
               result = CompareUtil.compare(getX(), inObj2.getX());

               if (0 == result)
               {
                  result = CompareUtil.compare(getY(), inObj2.getY());
               }
            }
         }

         return result;
      }

      //---------------------------------------------------------------------------
      @Override
      public boolean equals(Object inObj2)
      {
         boolean result = false;

         if (inObj2 != null
               && inObj2 instanceof AlignmentSeed)
         {
            AlignmentSeed seed2 = (AlignmentSeed) inObj2;
            result = (0 == compareTo(seed2));
         }

         return result;
      }

      //---------------------------------------------------------------------------
      @Override
      public int hashCode()
      {
         return (31 * mX) + (31 * mY) + (31 * Float.floatToIntBits(mCumulativeScore));
      }
   }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy