All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.NucleicAcidTrimmer Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq;


import java.util.Arrays;


//------------------------------------------------------------------------------
/**
 * Nucleotide quality trimmer based on Phred's modified-Mott trimming algorithm.
 *
 * @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class NucleicAcidTrimmer
{
   public static final String TRIM_RANGE = "trimRange";

   private int   mMinLength = 20;
   private float mThreshold = 0.05f;

   private double[] mQualityToProbability;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //--------------------------------------------------------------------------
   public NucleicAcidTrimmer()
   {
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   public int getMinLength()
   {
      return mMinLength;
   }

   //--------------------------------------------------------------------------
   public NucleicAcidTrimmer setMinLength(int inValue)
   {
      mMinLength = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public float getProbabilityThreshold()
   {
      return mThreshold;
   }

   //--------------------------------------------------------------------------
   public NucleicAcidTrimmer setProbabilityThreshold(float inValue)
   {
      mThreshold = inValue;
      return this;
   }


   //--------------------------------------------------------------------------
   public SeqLocation calculateTrimRange(NucleicAcid inSeq)
   {
      SeqLocation trimRange = null;

      // Don't bother trimming if the starting length is less than the min length
      // or it doesn't have quality scores
      if (inSeq.length() >= mMinLength
            && inSeq.getSeqQualityScores() != null)
      {
         SeqQualityScores quality = inSeq.getSeqQualityScores();
         double score = 0;
         double maxScore = 0;
         int begin_1based = 0;
         int end_1based = 0;
         int tmp = 0;
         for (int i = 0; i < inSeq.length(); i++)
         {
            short qualityScore = quality.getQualityScores()[i];

            score += mThreshold - qualityToProbability(qualityScore);

            if (score > maxScore)
            {
               maxScore = score;
               begin_1based = tmp + 1;
               end_1based = i + 1;
            }
            else if (score < 0)
            {
               score = 0;
               tmp = i + 1;
            }
         }

         if (end_1based - begin_1based + 1 >= mMinLength)
         {
            trimRange = new SeqLocation(begin_1based, end_1based);
         }
      }

      return trimRange;
   }

   //--------------------------------------------------------------------------
   public NucleicAcid trim(NucleicAcid inSeq)
   {
      NucleicAcid trimmedSeq = null;

      SeqLocation trimRange = calculateTrimRange(inSeq);
      if (trimRange != null)
      {
         trimmedSeq = inSeq.clone()
               .setSequence(inSeq.getSubSequence(trimRange));

         // Trim the quality score to match the new trimmed seq
         trimmedSeq.setSeqQualityScores(new SeqQualityScores(Arrays.copyOfRange(trimmedSeq.getSeqQualityScores().getQualityScores(), trimRange.getStart() - 1,
               trimRange.getEnd()), trimmedSeq.getSeqQualityScores().getScheme()));


         trimmedSeq.setAttribute(TRIM_RANGE, trimRange);
      }

      return trimmedSeq;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //--------------------------------------------------------------------------
   // Quality scores are assumed to be = -10log10(probability)
   private double qualityToProbability(short inQuality)
   {
      if (null == mQualityToProbability)
      {
         double[] values = new double[255];
         for (int i = 0; i < 255; i++)
         {
            values[i] = Math.pow(10.0, i / -10.0);
         }

         mQualityToProbability = values;
      }

      return mQualityToProbability[inQuality];
   }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy