All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.format.FASTQ Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.format;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.bio.seq.SeqQualityScoreScheme;
import com.hfg.bio.seq.SeqQualityScores;
import com.hfg.exception.InvalidValueException;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 FASTQ sequence format encompassing sequences and their per-base sequencing quality scores.
 
From Wikipedia:
 "A FASTQ file normally uses four lines per sequence.

 Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description (like a FASTA title line).
 Line 2 is the raw sequence letters.
 Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again.
 Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence."
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class FASTQ extends ReadableSeqFormatBase implements WritableSeqFormat { private SeqQualityScoreScheme mScheme = SeqQualityScoreScheme.sanger; private static final Pattern sHeaderLinePattern = Pattern.compile("@(\\S+)(?:\\s+(.*?))?"); private static final Pattern sSequenceLinePattern = Pattern.compile("[A-Za-z\\*\\-]+"); private enum TargetLine { header, sequence, quality_header, quality } //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public FASTQ() { super(null); } //--------------------------------------------------------------------------- public FASTQ(BioSequenceFactory inSeqFactory) { super(inSeqFactory); } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public FASTQ setScheme(SeqQualityScoreScheme inValue) { if (null == inValue) { throw new InvalidValueException("The sequence scoring scheme cannot be set to null!"); } mScheme = inValue; return this; } //--------------------------------------------------------------------------- public SeqQualityScoreScheme getScheme() { return mScheme; } //--------------------------------------------------------------------------- public T readRecord(BufferedReader inReader) throws SeqIOException { if (null == getBioSequenceFactory()) { throw new SeqIOException("No BioSequence factory has been specified!"); } int lineCount = 0; T seq; try { seq = getBioSequenceFactory().createSeqObj(); TargetLine targetLine = TargetLine.header; String line; while ((line = inReader.readLine()) != null) { lineCount++; // Skip comment lines or blank lines. Note that quality lines sometimes start w/ '#' // so we can't use that to indicate a comment line. if (line.startsWith("//") || line.matches("\\s*")) { continue; } switch (targetLine) { case header: parseHeaderLine(line, seq); targetLine = TargetLine.sequence; break; case sequence: parseSequenceLine(line, seq); targetLine = TargetLine.quality_header; break; case quality_header: parseQualityHeaderLine(line, seq); targetLine = TargetLine.quality; break; case quality: parseQualityLine(line, seq); targetLine = null; break; } if (null == targetLine) { break; } } } catch (SeqIOException e) { throw e; } catch (Exception e) { throw new SeqIOException(e); } return seq; } //--------------------------------------------------------------------------- public boolean isEndOfRecord(String inLine) { return inLine.startsWith("@"); } //--------------------------------------------------------------------------- public boolean hasJanusDelimiter() { return true; } //--------------------------------------------------------------------------- public String write(T inSeq) throws SeqIOException { StringWriter writer = new StringWriter(); write(inSeq, writer); return writer.toString(); } //--------------------------------------------------------------------------- public void write(T inSeq, OutputStream inStream) throws SeqIOException { Writer writer = new OutputStreamWriter(inStream); write(inSeq, writer); try { writer.flush(); } catch (Exception e) { throw new SeqIOException(e); } } //--------------------------------------------------------------------------- public void write(T inSeq, Writer inWriter) throws SeqIOException { BufferedWriter writer = null; try { try { if (writer instanceof BufferedWriter) { writer = (BufferedWriter) inWriter; } else { writer = new BufferedWriter(inWriter, 8196); } // Write the header line writer.write("@"); writer.write(inSeq.getID()); if (StringUtil.isSet(inSeq.getDescription())) { writer.write(" " + inSeq.getDescription()); } writer.write("\n"); // Write the sequence line writer.write(inSeq.getSequence()); writer.write("\n"); // Write the quality header line writer.write("+\n"); // Write the quality line if (inSeq.getSeqQualityScores() != null) { String encodedQualityString = inSeq.getSeqQualityScores().getEncodedQualityString(); if (encodedQualityString != null) { writer.write(encodedQualityString); } } writer.write("\n"); } finally { if (writer != null) { writer.flush(); } } } catch (SeqIOException e) { throw e; } catch (Exception e) { throw new SeqIOException(e); } } //########################################################################### // PROTECTED METHODS //########################################################################### //--------------------------------------------------------------------------- protected void parseHeaderLine(String inLine, T inSeq) { if (! inLine.startsWith("@")) { throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!"); } Matcher m = sHeaderLinePattern.matcher(inLine); if (m.matches()) { inSeq.setID(m.group(1)); inSeq.setDescription(m.group(2)); } else { throw new SeqFormatException("The header line" + StringUtil.singleQuote(inLine) + " is not in proper FASTQ format!"); } } //########################################################################### // PRIVATE METHODS //########################################################################### //--------------------------------------------------------------------------- private void parseSequenceLine(String inLine, T inSeq) { String seqString = StringUtil.replaceWhitespace(inLine, ""); if (! sSequenceLinePattern.matcher(seqString).matches()) { throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!"); } inSeq.setSequence(seqString); } //--------------------------------------------------------------------------- private void parseQualityHeaderLine(String inLine, T inSeq) { if (! inLine.startsWith("+")) { throw new SeqFormatException("Expected a FASTQ quality header line but found " + StringUtil.singleQuote(inLine) + "!"); } // For now, skip parsing any id or description from here } //--------------------------------------------------------------------------- private void parseQualityLine(String inLine, T inSeq) { if (inLine.length() != inSeq.length()) { throw new SeqFormatException("The FASTQ quality string is not the same length as the sequence!"); } SeqQualityScores seqQualityScores = new SeqQualityScores(inLine, getScheme()); inSeq.setSeqQualityScores(seqQualityScores); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy