com.hfg.bio.seq.NucleicAcidTrimmer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq;
import java.util.Arrays;
//------------------------------------------------------------------------------
/**
* Nucleotide quality trimmer based on Phred's modified-Mott trimming algorithm.
*
* @author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class NucleicAcidTrimmer
{
public static final String TRIM_RANGE = "trimRange";
private int mMinLength = 20;
private float mThreshold = 0.05f;
private double[] mQualityToProbability;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//--------------------------------------------------------------------------
public NucleicAcidTrimmer()
{
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//--------------------------------------------------------------------------
public int getMinLength()
{
return mMinLength;
}
//--------------------------------------------------------------------------
public NucleicAcidTrimmer setMinLength(int inValue)
{
mMinLength = inValue;
return this;
}
//--------------------------------------------------------------------------
public float getProbabilityThreshold()
{
return mThreshold;
}
//--------------------------------------------------------------------------
public NucleicAcidTrimmer setProbabilityThreshold(float inValue)
{
mThreshold = inValue;
return this;
}
//--------------------------------------------------------------------------
public SeqLocation calculateTrimRange(NucleicAcid inSeq)
{
SeqLocation trimRange = null;
// Don't bother trimming if the starting length is less than the min length
// or it doesn't have quality scores
if (inSeq.length() >= mMinLength
&& inSeq.getSeqQualityScores() != null)
{
SeqQualityScores quality = inSeq.getSeqQualityScores();
double score = 0;
double maxScore = 0;
int begin_1based = 0;
int end_1based = 0;
int tmp = 0;
for (int i = 0; i < inSeq.length(); i++)
{
short qualityScore = quality.getQualityScores()[i];
score += mThreshold - qualityToProbability(qualityScore);
if (score > maxScore)
{
maxScore = score;
begin_1based = tmp + 1;
end_1based = i + 1;
}
else if (score < 0)
{
score = 0;
tmp = i + 1;
}
}
if (end_1based - begin_1based + 1 >= mMinLength)
{
trimRange = new SeqLocation(begin_1based, end_1based);
}
}
return trimRange;
}
//--------------------------------------------------------------------------
public NucleicAcid trim(NucleicAcid inSeq)
{
NucleicAcid trimmedSeq = null;
SeqLocation trimRange = calculateTrimRange(inSeq);
if (trimRange != null)
{
trimmedSeq = inSeq.clone()
.setSequence(inSeq.getSubSequence(trimRange));
// Trim the quality score to match the new trimmed seq
trimmedSeq.setSeqQualityScores(new SeqQualityScores(Arrays.copyOfRange(trimmedSeq.getSeqQualityScores().getQualityScores(), trimRange.getStart() - 1,
trimRange.getEnd()), trimmedSeq.getSeqQualityScores().getScheme()));
trimmedSeq.setAttribute(TRIM_RANGE, trimRange);
}
return trimmedSeq;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//--------------------------------------------------------------------------
// Quality scores are assumed to be = -10log10(probability)
private double qualityToProbability(short inQuality)
{
if (null == mQualityToProbability)
{
double[] values = new double[255];
for (int i = 0; i < 255; i++)
{
values[i] = Math.pow(10.0, i / -10.0);
}
mQualityToProbability = values;
}
return mQualityToProbability[inQuality];
}
}