// com.hfg.bio.seq.format.IlluminaFASTQ Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of com_hfg Show documentation
// Show all versions of com_hfg Show documentation
// com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.NucleicAcid;
import com.hfg.util.BooleanUtil;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
FASTQ sequence format from Illumina. Fields are parsed from the header line into
attributes on the sequence object.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class IlluminaFASTQ extends FASTQ
{
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public IlluminaFASTQ()
{
super(null);
}
//---------------------------------------------------------------------------
public IlluminaFASTQ(BioSequenceFactory inSeqFactory)
{
super(inSeqFactory);
}
//###########################################################################
// PROTECTED METHODS
//###########################################################################
//---------------------------------------------------------------------------
// See: https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
// Header line format:
// @::::::: :::
//
// Ex: @HWI-M01141:63:A4NDL:1:1101:16668:1377 1:N:0:TATAGCGAGACACCGT
// instrument = HWI-M01141
// run number = 63
// flowcell ID = A4NDL
// lane = 1
// tile = 1101
// x-pos = 16668
// y-pos = 1377
// UMI (optional) =
// read = 1
// is filtered = N
// control number = 0
// index = TATAGCGAGACACCGT
protected void parseHeaderLine(String inLine, T inSeq)
{
// Let the super class break the header line into id and description
super.parseHeaderLine(inLine, inSeq);
// Extract Illumina fields from the id
String[] fields = inSeq.getID().split(":");
if (fields.length < 7
|| fields.length > 8)
{
throw new SeqFormatException("Unexpected number of fields in the header id " + StringUtil.singleQuote(inSeq.getID()) + "!");
}
inSeq.setAttribute("instrument", fields[0]);
inSeq.setAttribute("run number", Integer.parseInt(fields[1]));
inSeq.setAttribute("flowcell ID", fields[2]);
inSeq.setAttribute("lane", Integer.parseInt(fields[3]));
inSeq.setAttribute("tile", Integer.parseInt(fields[4]));
inSeq.setAttribute("x-pos", Integer.parseInt(fields[5]));
inSeq.setAttribute("y-pos", Integer.parseInt(fields[6]));
if (8 == fields.length)
{
inSeq.setAttribute("UMI", fields[7]);
}
// Extract Illumina fields from the description
fields = inSeq.getDescription().split(":");
if (fields.length != 4)
{
throw new SeqFormatException("Unexpected number of fields in the header description " + StringUtil.singleQuote(inSeq.getDescription()) + "!");
}
inSeq.setAttribute("read", Integer.parseInt(fields[0]));
inSeq.setAttribute("is filtered", BooleanUtil.valueOf(fields[1]));
inSeq.setAttribute("control number", Integer.parseInt(fields[2]));
inSeq.setAttribute("index", fields[3]);
}
}