com.hfg.bio.seq.format.FASTA Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.*;
import java.util.Collection;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.util.StringUtil;
//------------------------------------------------------------------------------
/**
FASTA sequence format. Allowed sequence characters are upper-case letters,
lower-case letters, '*' for stop codons, and '-' for gaps. Numbers and spaces
will be silently stripped from the sequence and any other characters will cause
a SeqFormatException.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class FASTA extends ReadableSeqFormatBase implements WritableSeqFormat
{
// Default number of sequence characters written per line.
private static final int sDefaultLineLength = 75;
// Header line layout: '>' immediately followed by a non-whitespace ID (group 1),
// optionally followed by whitespace and a free-text description (group 2).
private static final Pattern sHeaderLinePattern = Pattern.compile(">(\\S+)(?:\\s+(.*?))?");
private static final Logger LOGGER = Logger.getLogger(FASTA.class.getName());
// Number of sequence characters per output line. A null value means that the
// whole sequence is written on a single line.
private Integer mLineLength = sDefaultLineLength;
// Maximum number of parse exceptions tolerated per record (zero = fail on the first one).
private int mMaxExceptionsPerRecord = 0;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Creates a FASTA format object without a BioSequence factory.
 Note that readRecord() refuses to run while getBioSequenceFactory() returns null.
 */
public FASTA()
{
   this(null);
}
//---------------------------------------------------------------------------
/**
 Creates a FASTA format object.
 @param inSeqFactory the sequence factory that is handed to the superclass constructor
 */
public FASTA(BioSequenceFactory inSeqFactory)
{
   super(inSeqFactory);
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
/**
 Returns the java.util.logging Logger shared by all FASTA instances.
 The logger is named after the FASTA class.
 @return the class-level Logger
 */
public static Logger getLogger()
{
return LOGGER;
}
//---------------------------------------------------------------------------
/**
Specify the maximum number of Exceptions to tolerate per record. Defaults to zero,
which means that the first format problem aborts the read.
This mechanism only works with sequence objects that implement the BioSequencePlus interface.
While a record has recorded fewer than the specified maximum number of Exceptions,
a new format problem is logged as a warning and stored on the sequence object instead
of being thrown. The stored Exceptions can be retrieved via the getParseExceptions()
method on the BioSequencePlus sequence object. Once the maximum has been reached,
the next format problem is thrown.
* @param inValue the maximum number of Exceptions to tolerate per record
* @return this format object to facilitate method chaining.
*/
public FASTA setMaxExceptionsPerRecord(int inValue)
{
mMaxExceptionsPerRecord = inValue;
return this;
}
//---------------------------------------------------------------------------
/**
 Specifies how many sequence characters are written per line.
 The value is stored as given; no validation is performed here.
 @param inValue the number of sequence characters per line. A null value indicates
                that the whole sequence should be written on one line.
 @return this format object to facilitate method chaining.
 */
public FASTA setLineLength(Integer inValue)
{
mLineLength = inValue;
return this;
}
//---------------------------------------------------------------------------
/**
 Returns the number of sequence characters written per line.
 @return the configured line length, or null if the whole sequence is written on one line
 */
public Integer getLineLength()
{
return mLineLength;
}
//---------------------------------------------------------------------------
/**
 Reads a single FASTA record from the specified reader.
 <p>
 Lines starting with '#' or '//' and blank lines that precede the header are skipped.
 The first remaining line must be a header line starting with '>'; the text directly
 after the '>' up to the first whitespace becomes the sequence ID and the remainder
 (if any) becomes the description. Everything after the header line is streamed
 into the sequence object through a FASTASeqFilterReader.
 </p>
 <p>
 The filter reader is closed once the sequence has been set; since it is a
 java.io.FilterReader, closing it also closes inReader.
 </p>
 <p>
 A SeqFormatException is recorded on the sequence (and logged as a warning) instead of
 being thrown if a positive maximum was set via setMaxExceptionsPerRecord(), the sequence
 object implements BioSequencePlus, and fewer than that maximum have been recorded so far.
 In that case the partially populated sequence object is returned.
 </p>
 @param inReader the reader positioned at the start of a FASTA record
 @return the sequence object created by the BioSequence factory
 @throws SeqIOException if no BioSequence factory is available, if the record is not
         valid FASTA (and the problem is not tolerated), or if reading fails
 */
public T readRecord(BufferedReader inReader)
   throws SeqIOException
{
   if (null == getBioSequenceFactory())
   {
      throw new SeqIOException("No BioSequence factory has been specified!");
   }

   int lineCount = 0;
   T seq = null;
   try
   {
      seq = getBioSequenceFactory().createSeqObj();

      boolean headerLineFound = false;
      String line;
      while ((line = inReader.readLine()) != null)
      {
         lineCount++;

         // Skip comment lines or blank lines
         if (line.startsWith("#")
             || line.startsWith("//")
             || line.matches("\\s*"))
         {
            continue;
         }

         if (line.startsWith(">"))
         {
            headerLineFound = true;
            line = line.trim();

            if (seq.getID() != null)
            {
               throw new SeqFormatException("Line " + lineCount + ": Multiple header lines found in the sequence record!");
            }

            Matcher m = sHeaderLinePattern.matcher(line);
            if (m.matches())
            {
               seq.setID(m.group(1));
               // Group 2 is null when the header has no description
               seq.setDescription(m.group(2));
            }
            else
            {
               // Reached e.g. for a bare ">" or when whitespace directly follows the '>'
               throw new SeqFormatException("Line " + lineCount + ": The header line " + StringUtil.singleQuote(line) + " is not in proper FASTA format!");
            }

            // The header has been consumed; the remainder of the reader is sequence data
            break;
         }
         else
         {
            throw new SeqFormatException("Invalid FASTA Format! Expected header line but found " + StringUtil.singleQuote(line) + "!");
         }
      }

      if (! headerLineFound)
      {
         throw new SeqFormatException("No FASTA header line found!");
      }

      // The rest of the record should be sequence
      // Cleanup the sequence to remove spaces and numbers
      Reader filterReader = new FASTASeqFilterReader(seq, inReader);
      seq.setSequence(filterReader);
      filterReader.close();
   }
   catch (SeqFormatException e)
   {
      SeqIOException exception;
      // seq is still null if the factory itself raised the exception
      if (seq != null
          && StringUtil.isSet(seq.getID()))
      {
         // Add the sequence ID as context once it is known
         exception = new SeqIOException("Problem encountered while reading sequence "
                                        + StringUtil.singleQuote(seq.getID()) + "!", e);
      }
      else
      {
         exception = e;
      }

      if (mMaxExceptionsPerRecord > 0
          && seq instanceof BioSequencePlus
          && (! ((BioSequencePlus) seq).hadParseExceptions()
              || ((BioSequencePlus) seq).getParseExceptions().size() < mMaxExceptionsPerRecord))
      {
         // Still within the tolerated number of problems: record it and carry on
         ((BioSequencePlus) seq).addParseException(exception);
         getLogger().warning(exception.getMessage());
      }
      else
      {
         throw exception;
      }
   }
   catch (SeqIOException e)
   {
      throw e;
   }
   catch (Exception e)
   {
      throw new SeqIOException(e);
   }

   return seq;
}
//---------------------------------------------------------------------------
/**
 Tests whether the specified line starts with the FASTA header character '>'.
 @param inLine the line to examine; a null value results in a NullPointerException
 @return true if the line starts with '>'
 */
public boolean isEndOfRecord(String inLine)
{
return inLine.startsWith(">");
}
//---------------------------------------------------------------------------
/**
 Always returns true for the FASTA format.
 NOTE(review): the meaning of a "Janus delimiter" is defined outside of this file.
 Presumably it signals that the '>' line which ends one record is at the same time
 the first line of the next record - confirm against the code that calls this method.
 @return true
 */
public boolean hasJanusDelimiter()
{
return true;
}
//---------------------------------------------------------------------------
/**
 Formats all sequences of the specified collection as FASTA records.
 @param inSeqs the sequences to format
 @return the FASTA records for all sequences, concatenated in iteration order
 @throws SeqIOException if one of the sequences cannot be written
 */
public String write(Collection inSeqs)
   throws SeqIOException
{
   StringWriter fastaText = new StringWriter();

   for (T sequence : inSeqs)
   {
      // Each record is appended to the same in-memory writer
      write(sequence, fastaText);
   }

   return fastaText.toString();
}
//---------------------------------------------------------------------------
/**
 Formats the specified sequence as a FASTA record.
 @param inSeq the sequence to format
 @return the FASTA record as a String
 @throws SeqIOException if the sequence cannot be written
 */
public String write(T inSeq)
   throws SeqIOException
{
   StringWriter fastaRecord = new StringWriter();

   // Delegate to the Writer-based variant and collect its output in memory
   write(inSeq, fastaRecord);

   return fastaRecord.toString();
}
//---------------------------------------------------------------------------
/**
 Writes the specified sequence as a FASTA record to the specified stream.
 The stream is wrapped in an OutputStreamWriter created without an explicit charset,
 so the platform default encoding is used. The stream is flushed but not closed.
 @param inSeq the sequence to write
 @param inStream the stream that receives the FASTA record
 @throws SeqIOException if the sequence cannot be written or the final flush fails
 */
public void write(T inSeq, OutputStream inStream)
   throws SeqIOException
{
   Writer streamWriter = new OutputStreamWriter(inStream);

   write(inSeq, streamWriter);

   try
   {
      // Push anything still held by the encoder out to the stream
      streamWriter.flush();
   }
   catch (Exception flushFailure)
   {
      throw new SeqIOException(flushFailure);
   }
}
//---------------------------------------------------------------------------
/**
 Writes the specified sequence as a FASTA record to the specified writer.
 <p>
 The record consists of a header line ('>' + ID, followed by a space and the description
 if one is set) and the sequence. If a line length is configured, the sequence is wrapped
 so that every line except possibly the last one holds exactly that many characters.
 If the line length is null, the whole sequence is written on a single line.
 </p>
 <p>
 The sequence reader obtained from the sequence object is closed. The writer is flushed
 but not closed.
 </p>
 @param inSeq the sequence to write
 @param inWriter the writer that receives the FASTA record. It is used directly if it is
                 a BufferedWriter and wrapped in one otherwise.
 @throws SeqIOException if the configured line length is not positive or if
         reading the sequence or writing the record fails
 */
public void write(T inSeq, Writer inWriter)
   throws SeqIOException
{
   // Read the setting once so that the whole record is wrapped consistently
   Integer lineLength = mLineLength;
   if (lineLength != null
       && lineLength < 1)
   {
      // A zero-length buffer would never make progress; a negative one can't be allocated
      throw new SeqIOException("The line length must be a positive number but was " + lineLength + "!");
   }

   Reader seqReader = null;
   BufferedWriter writer = null;
   try
   {
      try
      {
         if (inWriter instanceof BufferedWriter)
         {
            writer = (BufferedWriter) inWriter;
         }
         else
         {
            writer = new BufferedWriter(inWriter, 8196);
         }

         // Write the header line
         writer.write(">");
         writer.write(inSeq.getID());
         if (StringUtil.isSet(inSeq.getDescription()))
         {
            writer.write(" " + inSeq.getDescription());
         }
         writer.write("\n");

         // Write the sequence lines
         seqReader = inSeq.getSequenceReader();

         // A null line length indicates that we should write the whole sequence on one line
         char[] buffer = new char[lineLength != null ? lineLength : 2048];

         // Number of characters already written to the current output line.
         // Only tracked when wrapping; stays zero for single-line output.
         int charsOnLine = 0;
         int numCharsRead;
         // Reader.read() may deliver fewer characters than requested, so only ask for
         // what is still missing on the current line and break the line once it is full.
         while ((numCharsRead = seqReader.read(buffer, 0, buffer.length - charsOnLine)) != -1)
         {
            writer.write(buffer, 0, numCharsRead);

            if (lineLength != null)
            {
               charsOnLine += numCharsRead;
               if (charsOnLine >= lineLength)
               {
                  writer.write("\n");
                  charsOnLine = 0;
               }
            }
         }

         // Terminate the single-line sequence or a trailing partial line
         if (null == lineLength
             || charsOnLine > 0)
         {
            writer.write("\n");
         }
      }
      finally
      {
         try
         {
            if (seqReader != null)
            {
               seqReader.close();
            }
         }
         finally
         {
            // Flush even if closing the sequence reader failed
            if (writer != null)
            {
               writer.flush();
            }
         }
      }
   }
   catch (SeqIOException e)
   {
      throw e;
   }
   catch (Exception e)
   {
      throw new SeqIOException(e);
   }
}
//###########################################################################
// INNER CLASS
//###########################################################################
/**
 Reader that cleans up the sequence portion of a FASTA record while it is being read.
 Whitespace and digits are silently dropped. Letters, '*' (stop codon) and '-' (gap) are
 passed through. Any other character results in a SeqFormatException which is either
 thrown or, if the enclosing format object is configured to tolerate parse exceptions
 and the sequence implements BioSequencePlus, recorded on the sequence object.
 A tolerated illegal character is still returned to the caller.
 */
private class FASTASeqFilterReader extends FilterReader
{
   // The sequence object that collects tolerated parse exceptions
   private final BioSequence mSeq;
   // Read-ahead buffer for the wrapped reader
   private final char[] mBuffer = new char[8196];
   // Number of valid characters in mBuffer (-1 once the end of the stream was hit)
   private int mBufferLimit;
   // Index of the next unread character in mBuffer
   private int mBufferIndex;
   private boolean mEndOfStreamReached;
   // 1-based sequence line and position within that line; used in error messages
   private int mLineCount = 1;
   private int mCharacterCount;

   //---------------------------------------------------------------------------
   FASTASeqFilterReader(BioSequence inSeq, Reader inReader)
   {
      super(inReader);
      mSeq = inSeq;
   }

   //---------------------------------------------------------------------------
   /**
    Returns the next sequence character, skipping whitespace and digits.
    @return the next sequence character or -1 if the end of the stream has been reached
    @throws IOException if reading fails or an illegal character is found that is not tolerated
    */
   @Override
   public int read()
      throws IOException
   {
      while (true)
      {
         int nextChar = innerRead();
         if (-1 == nextChar)
         {
            return -1;
         }

         mCharacterCount++;

         if (Character.isWhitespace(nextChar)
             || Character.isDigit(nextChar))
         {
            if ('\n' == nextChar)
            {
               mLineCount++;
               // Restart the count so that the first character of the new line is position 1
               mCharacterCount = 0;
            }

            // Stripped from the sequence; look at the next character
            continue;
         }

         if (Character.isLetter(nextChar) // Allow letters
             || '*' == nextChar           // Allow stop codons
             || '-' == nextChar)          // Allow gaps
         {
            return nextChar;
         }

         if ('>' == nextChar)
         {
            // This is severe enough that we don't want to continue processing
            // the sequence as if it belongs to this record.
            throw new SeqFormatException("The FASTA record start character " + StringUtil.singleQuote((char) nextChar) + " following this record must occur as the first character on the line!");
         }

         SeqFormatException e = new SeqFormatException("Illegal sequence character " + StringUtil.singleQuote((char) nextChar) + " encountered on sequence line " + mLineCount + " position " + mCharacterCount + "!");
         if (mMaxExceptionsPerRecord > 0
             && mSeq instanceof BioSequencePlus
             && (! ((BioSequencePlus) mSeq).hadParseExceptions()
                 || ((BioSequencePlus) mSeq).getParseExceptions().size() < mMaxExceptionsPerRecord))
         {
            // Still within the tolerated number of problems: record it and carry on
            ((BioSequencePlus) mSeq).addParseException(e);
            getLogger().warning(e.getMessage());
         }
         else
         {
            throw e;
         }

         // The tolerated character is handed to the caller unchanged
         return nextChar;
      }
   }

   //---------------------------------------------------------------------------
   /**
    Fills the specified portion of the buffer with filtered sequence characters.
    Reading continues until the requested number of characters has been stored or the
    end of the stream is reached.
    @param inBuffer the destination buffer
    @param inOffset the index at which to start storing characters
    @param inMaxReadLength the maximum number of characters to read
    @return the number of characters stored or -1 if the end of the stream was reached
            before any character could be stored
    @throws IOException if reading fails or an illegal character is found that is not tolerated
    @throws IndexOutOfBoundsException if the offset or length is negative or the range
            doesn't fit into the buffer
    */
   @Override
   public int read(char[] inBuffer, int inOffset, int inMaxReadLength)
      throws IOException
   {
      if (inOffset < 0
          || inMaxReadLength < 0
          || inMaxReadLength > inBuffer.length - inOffset)
      {
         throw new IndexOutOfBoundsException();
      }

      if (0 == inMaxReadLength)
      {
         // Nothing was requested, so no character may be consumed
         return 0;
      }

      int numCharsRead = 0;
      while (numCharsRead < inMaxReadLength)
      {
         int theChar = read();
         if (theChar < 0)
         {
            break;
         }

         inBuffer[inOffset + numCharsRead] = (char) theChar;
         numCharsRead++;
      }

      return (0 == numCharsRead ? -1 : numCharsRead);
   }

   //---------------------------------------------------------------------------
   /**
    Returns the next unfiltered character from the read-ahead buffer, refilling the
    buffer from the wrapped reader when it has been used up.
    @return the next raw character or -1 if the end of the stream has been reached
    @throws IOException if the wrapped reader fails
    */
   protected int innerRead()
      throws IOException
   {
      if (mBufferIndex >= mBufferLimit)
      {
         fillBuffer();
      }

      return (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]);
   }

   //---------------------------------------------------------------------------
   // Loads the next chunk from the wrapped reader and notes if the stream has ended.
   private void fillBuffer()
      throws IOException
   {
      mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length);
      if (-1 == mBufferLimit)
      {
         mEndOfStreamReached = true;
      }

      // Reset the index
      mBufferIndex = 0;
   }
}
}