com.hfg.bio.seq.format.FASTQ Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqQualityScoreScheme;
import com.hfg.bio.seq.SeqQualityScores;
import com.hfg.exception.InvalidValueException;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
FASTQ sequence format encompassing sequences and their per-base sequencing quality scores.
From Wikipedia:
"A FASTQ file normally uses four lines per sequence.
Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description (like a FASTA title line).
Line 2 is the raw sequence letters.
Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again.
Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence."
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class FASTQ extends ReadableSeqFormatBase implements WritableSeqFormat
{
private SeqQualityScoreScheme mScheme = SeqQualityScoreScheme.sanger;
private static final Pattern sHeaderLinePattern = Pattern.compile("@(\\S+)(?:\\s+(.*?))?");
private static final Pattern sSequenceLinePattern = Pattern.compile("[A-Za-z\\*\\-]+");
private enum TargetLine
{
header,
sequence,
quality_header,
quality
}
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public FASTQ()
{
super(null);
}
//---------------------------------------------------------------------------
public FASTQ(BioSequenceFactory inSeqFactory)
{
super(inSeqFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public FASTQ setScheme(SeqQualityScoreScheme inValue)
{
if (null == inValue)
{
throw new InvalidValueException("The sequence scoring scheme cannot be set to null!");
}
mScheme = inValue;
return this;
}
//---------------------------------------------------------------------------
public SeqQualityScoreScheme getScheme()
{
return mScheme;
}
//---------------------------------------------------------------------------
public T readRecord(BufferedReader inReader)
throws SeqIOException
{
if (null == getBioSequenceFactory())
{
throw new SeqIOException("No BioSequence factory has been specified!");
}
int lineCount = 0;
T seq;
try
{
seq = getBioSequenceFactory().createSeqObj();
TargetLine targetLine = TargetLine.header;
String line;
while ((line = inReader.readLine()) != null)
{
lineCount++;
// Skip comment lines or blank lines. Note that quality lines sometimes start w/ '#'
// so we can't use that to indicate a comment line.
if (line.startsWith("//")
|| line.matches("\\s*"))
{
continue;
}
switch (targetLine)
{
case header:
parseHeaderLine(line, seq);
targetLine = TargetLine.sequence;
break;
case sequence:
parseSequenceLine(line, seq);
targetLine = TargetLine.quality_header;
break;
case quality_header:
parseQualityHeaderLine(line, seq);
targetLine = TargetLine.quality;
break;
case quality:
parseQualityLine(line, seq);
targetLine = null;
break;
}
if (null == targetLine)
{
break;
}
}
}
catch (SeqIOException e)
{
throw e;
}
catch (Exception e)
{
throw new SeqIOException(e);
}
return seq;
}
//---------------------------------------------------------------------------
public boolean isEndOfRecord(String inLine)
{
return inLine.startsWith("@");
}
//---------------------------------------------------------------------------
public boolean hasJanusDelimiter()
{
return true;
}
//---------------------------------------------------------------------------
public String write(T inSeq)
throws SeqIOException
{
StringWriter writer = new StringWriter();
write(inSeq, writer);
return writer.toString();
}
//---------------------------------------------------------------------------
public void write(T inSeq, OutputStream inStream)
throws SeqIOException
{
Writer writer = new OutputStreamWriter(inStream);
write(inSeq, writer);
try
{
writer.flush();
}
catch (Exception e)
{
throw new SeqIOException(e);
}
}
//---------------------------------------------------------------------------
public void write(T inSeq, Writer inWriter)
throws SeqIOException
{
BufferedWriter writer = null;
try
{
try
{
if (writer instanceof BufferedWriter)
{
writer = (BufferedWriter) inWriter;
} else
{
writer = new BufferedWriter(inWriter, 8196);
}
// Write the header line
writer.write("@");
writer.write(inSeq.getID());
if (StringUtil.isSet(inSeq.getDescription()))
{
writer.write(" " + inSeq.getDescription());
}
writer.write("\n");
// Write the sequence line
writer.write(inSeq.getSequence());
writer.write("\n");
// Write the quality header line
writer.write("+\n");
// Write the quality line
if (inSeq.getSeqQualityScores() != null)
{
String encodedQualityString = inSeq.getSeqQualityScores().getEncodedQualityString();
if (encodedQualityString != null)
{
writer.write(encodedQualityString);
}
}
writer.write("\n");
}
finally
{
if (writer != null)
{
writer.flush();
}
}
}
catch (SeqIOException e)
{
throw e;
}
catch (Exception e)
{
throw new SeqIOException(e);
}
}
//###########################################################################
// PROTECTED METHODS
//###########################################################################
//---------------------------------------------------------------------------
protected void parseHeaderLine(String inLine, T inSeq)
{
if (! inLine.startsWith("@"))
{
throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!");
}
Matcher m = sHeaderLinePattern.matcher(inLine);
if (m.matches())
{
inSeq.setID(m.group(1));
inSeq.setDescription(m.group(2));
}
else
{
throw new SeqFormatException("The header line" + StringUtil.singleQuote(inLine) + " is not in proper FASTQ format!");
}
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//---------------------------------------------------------------------------
private void parseSequenceLine(String inLine, T inSeq)
{
String seqString = StringUtil.replaceWhitespace(inLine, "");
if (! sSequenceLinePattern.matcher(seqString).matches())
{
throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!");
}
inSeq.setSequence(seqString);
}
//---------------------------------------------------------------------------
private void parseQualityHeaderLine(String inLine, T inSeq)
{
if (! inLine.startsWith("+"))
{
throw new SeqFormatException("Expected a FASTQ quality header line but found " + StringUtil.singleQuote(inLine) + "!");
}
// For now, skip parsing any id or description from here
}
//---------------------------------------------------------------------------
private void parseQualityLine(String inLine, T inSeq)
{
if (inLine.length() != inSeq.length())
{
throw new SeqFormatException("The FASTQ quality string is not the same length as the sequence!");
}
SeqQualityScores seqQualityScores = new SeqQualityScores(inLine, getScheme());
inSeq.setSeqQualityScores(seqQualityScores);
}
}