All downloads are free. Search and download functionality uses the official Maven repository.

com.hfg.bio.seq.format.FASTA Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.format;

import java.io.*;
import java.util.Collection;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceFactory;
import com.hfg.bio.seq.BioSequencePlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 FASTA sequence format. Allowed sequence characters are upper-case letters,
 lower-case letters, '*' for stop codons, and '-' for gaps. Numbers and spaces
 will be silently stripped from the sequence and any other characters will cause
 a SeqFormatException.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class FASTA extends ReadableSeqFormatBase implements WritableSeqFormat { private Integer mLineLength = sDefaultLineLength; private int mMaxExceptionsPerRecord = 0; private static int sDefaultLineLength = 75; private static Pattern sHeaderLinePattern = Pattern.compile(">(\\S+)(?:\\s+(.*?))?"); private final static Logger LOGGER = Logger.getLogger(FASTA.class.getName()); //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public FASTA() { super(null); } //--------------------------------------------------------------------------- public FASTA(BioSequenceFactory inSeqFactory) { super(inSeqFactory); } //########################################################################### // PUBLIC METHODS 
//########################################################################### //--------------------------------------------------------------------------- public static Logger getLogger() { return LOGGER; } //--------------------------------------------------------------------------- /** Specify the maximum number of Exceptions to tolerate per record. Defaults to zero. This mechanism will only work with sequences objects that implement the BioSequencePlus interface. If a record produces less than the specified maximum number of Exceptions, the Exceptions can be retrieved via the getParseExceptions() method on the BioSequencePlus sequence object. * @param inValue the maximum number of Exceptions to tolerate per record * @return this format object to facilitate method chaining. */ public FASTA setMaxExceptionsPerRecord(int inValue) { mMaxExceptionsPerRecord = inValue; return this; } //--------------------------------------------------------------------------- public FASTA setLineLength(Integer inValue) { mLineLength = inValue; return this; } //--------------------------------------------------------------------------- public Integer getLineLength() { return mLineLength; } //--------------------------------------------------------------------------- public T readRecord(BufferedReader inReader) throws SeqIOException { if (null == getBioSequenceFactory()) { throw new SeqIOException("No BioSequence factory has been specified!"); } int lineCount = 0; T seq = null; try { seq = getBioSequenceFactory().createSeqObj(); boolean headerLineFound = false; String line; while ((line = inReader.readLine()) != null) { lineCount++; // Skip comment lines or blank lines if (line.startsWith("#") || line.startsWith("//") || line.matches("\\s*")) { continue; } if (line.startsWith(">")) { headerLineFound = true; line = line.trim(); if (seq.getID() != null) { throw new SeqFormatException("Line " + lineCount + ": Multiple header lines found in the sequence record!"); } Matcher m = 
sHeaderLinePattern.matcher(line); if (m.matches()) { seq.setID(m.group(1)); seq.setDescription(m.group(2)); } else { throw new SeqFormatException("Line " + lineCount + ": The header line" + StringUtil.singleQuote(line) + " is not in proper FASTA format!"); } break; } else { throw new SeqFormatException("Invalid FASTA Format! Expected header line but found " + StringUtil.singleQuote(line) + "!"); } } if (! headerLineFound) { throw new SeqFormatException("No FASTA header line found!"); } // The rest of the record should be sequence // Cleanup the sequence to remove spaces and numbers Reader filterReader = new FASTASeqFilterReader(seq, inReader); seq.setSequence(filterReader); filterReader.close(); } catch (SeqFormatException e) { SeqIOException exception; if (StringUtil.isSet(seq.getID())) { exception = new SeqIOException("Problem encountered while reading sequence " + StringUtil.singleQuote(seq.getID()) + "!", e); } else { exception = e; } if (mMaxExceptionsPerRecord > 0 && seq instanceof BioSequencePlus && (! 
((BioSequencePlus) seq).hadParseExceptions() || ((BioSequencePlus) seq).getParseExceptions().size() < mMaxExceptionsPerRecord)) { ((BioSequencePlus) seq).addParseException(exception); getLogger().warning(exception.getMessage()); } else { throw exception; } } catch (SeqIOException e) { throw e; } catch (Exception e) { throw new SeqIOException(e); } return seq; } //--------------------------------------------------------------------------- public boolean isEndOfRecord(String inLine) { return inLine.startsWith(">"); } //--------------------------------------------------------------------------- public boolean hasJanusDelimiter() { return true; } //--------------------------------------------------------------------------- public String write(Collection inSeqs) throws SeqIOException { StringWriter writer = new StringWriter(); for (T seq : inSeqs) { write(seq, writer); } return writer.toString(); } //--------------------------------------------------------------------------- public String write(T inSeq) throws SeqIOException { StringWriter writer = new StringWriter(); write(inSeq, writer); return writer.toString(); } //--------------------------------------------------------------------------- public void write(T inSeq, OutputStream inStream) throws SeqIOException { Writer writer = new OutputStreamWriter(inStream); write(inSeq, writer); try { writer.flush(); } catch (Exception e) { throw new SeqIOException(e); } } //--------------------------------------------------------------------------- public void write(T inSeq, Writer inWriter) throws SeqIOException { Reader seqReader = null; BufferedWriter writer = null; try { try { if (inWriter instanceof BufferedWriter) { writer = (BufferedWriter) inWriter; } else { writer = new BufferedWriter(inWriter, 8196); } // Write the header line writer.write(">"); writer.write(inSeq.getID()); if (StringUtil.isSet(inSeq.getDescription())) { writer.write(" " + inSeq.getDescription()); } writer.write("\n"); // Write the sequence lines 
seqReader = inSeq.getSequenceReader(); // A null line length indicates that we should write the whole sequence on one line int bufferSize = (mLineLength != null ? mLineLength : 2048); char[] buffer = new char[bufferSize]; int numBytesRead; while ((numBytesRead = seqReader.read(buffer)) != -1) { writer.write(buffer, 0, numBytesRead); if (mLineLength != null) { writer.write("\n"); } } if (null == mLineLength) { writer.write("\n"); } } finally { if (seqReader != null) { seqReader.close(); } if (writer != null) { writer.flush(); } } } catch (SeqIOException e) { throw e; } catch (Exception e) { throw new SeqIOException(e); } } //########################################################################### // INNER CLASS //########################################################################### private class FASTASeqFilterReader extends FilterReader { private BioSequence mSeq; private char[] mBuffer = new char[8196]; private int mBufferLimit; private int mBufferIndex; private boolean mEndOfStreamReached; private int mPrevChar = -1; private int mLineCount = 1; private int mCharacterCount; //--------------------------------------------------------------------------- FASTASeqFilterReader(BioSequence inSeq, Reader inReader) { super(inReader); mSeq = inSeq; } //--------------------------------------------------------------------------- @Override public int read() throws IOException { int returnChar; do { returnChar = innerRead(); mCharacterCount++; if (Character.isWhitespace(returnChar) || Character.isDigit(returnChar)) { if ('\n' == returnChar) { mLineCount++; mCharacterCount = 1; } continue; } else if (! Character.isLetter(returnChar) // Allow letters && returnChar != '*' // Allow stop codons && returnChar != '-' // Allow gaps && returnChar != -1) // Allow EOF { String msg; if ('>' == returnChar) { // This is severe enough that we don't want to continue processing // the sequence as if it belongs to this record. 
throw new SeqFormatException("The FASTA record start character " + StringUtil.singleQuote((char) returnChar) + " following this record must occur as the first character on the line!"); } else { SeqFormatException e = new SeqFormatException("Illegal sequence character " + StringUtil.singleQuote((char) returnChar) + " encountered on sequence line " + mLineCount + " position " + mCharacterCount + "!"); if (mMaxExceptionsPerRecord > 0 && mSeq instanceof BioSequencePlus && (!((BioSequencePlus) mSeq).hadParseExceptions() || ((BioSequencePlus) mSeq).getParseExceptions() .size() < mMaxExceptionsPerRecord)) { ((BioSequencePlus) mSeq).addParseException(e); getLogger().warning(e.getMessage()); } else { throw e; } } } } while (false); return returnChar; } //--------------------------------------------------------------------------- public int read(char[] inBuffer, int inOffset, int inMaxReadLength) throws IOException { int theChar; int numCharsRead = 0; do { theChar = read(); if (theChar > 0) { inBuffer[inOffset++] = (char) theChar; numCharsRead++; } } while (theChar >= 0 && numCharsRead < inMaxReadLength); return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead); } //--------------------------------------------------------------------------- protected int innerRead() throws IOException { if (mBufferIndex >= mBufferLimit) { fillBuffer(); } return (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); } //--------------------------------------------------------------------------- private void fillBuffer() throws IOException { mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length); if (-1 == mBufferLimit) { mEndOfStreamReached = true; } // Reset the index mBufferIndex = 0; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy