// com.hfg.bio.seq.format.BufferedSeqReader Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of com_hfg Show documentation
// Show all versions of com_hfg Show documentation
// com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.format;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import com.hfg.bio.seq.BioSequence;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.GZIP;
//------------------------------------------------------------------------------
/**
Buffered sequence reader.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class BufferedSeqReader
{
private ReadableSeqFormat mSeqFormatObj;
private BufferedReader mBufferedReader;
private boolean mEndOfContentReached;
private String mRecordStartLine;
private int mNumRecordsParsed;
// How long the record should be before compression is used.
private int mCompressionThreshold = sDefaultCompressionThreshold;
private StringBuilderPlus mUncompressedRecordChunk = new StringBuilderPlus(mCompressionThreshold + 500).setDelimiter("\n");
private List mCompressedRecordChunks = new ArrayList<>(50);
private int mCurrentRecordLength = 0;
// Default value for how long the record should be before compression is used.
private static final int sDefaultCompressionThreshold = 8 * 1024;
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
public BufferedSeqReader(BufferedReader inReader, ReadableSeqFormat inSeqFormatObj)
{
mBufferedReader = inReader;
mSeqFormatObj = inSeqFormatObj;
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public static ReadableSeqFormat determineSeqFormat(BufferedReader inReader, Collection> inSeqFormatObjects)
throws IOException
{
if (! inReader.markSupported())
{
throw new ProgrammingException("The passed reader must support setting a mark!");
}
// Grab a chunk of content from the start of the stream that will hopefully be enough
// to let us determine the format.
// We need to be careful to put the reader back in its original state when we are done.
int readAheadLimit = 10 * 1024;
inReader.mark(readAheadLimit);
// Put the chunk into a buffer we can reuse
char[] buffer = new char[readAheadLimit];
inReader.read(buffer);
inReader.reset();
ReadableSeqFormat successfulFormat = null;
// LimitedBufferedReader reader = new LimitedBufferedReader(new NoCloseBufferedReader(inReader), readAheadLimit);
for (ReadableSeqFormat seqFormat : inSeqFormatObjects)
{
try
{
BufferedReader reader = new BufferedReader(new CharArrayReader(buffer));
// reader.mark(readAheadLimit);
BufferedSeqReader seqReader = new BufferedSeqReader<>(reader, seqFormat);
seqReader.readNextRecord((int) (0.9 * readAheadLimit));
T seq = seqReader.next();
successfulFormat = seqFormat;
break;
}
catch (SeqIOException e)
{
// Ignore. Try the next format
}
finally
{
// reader.reset();
}
}
inReader.reset();
return successfulFormat;
}
//---------------------------------------------------------------------------
public BufferedSeqReader setCompressionThreshold(int inNumBytes)
{
mCompressionThreshold = inNumBytes;
return this;
}
//---------------------------------------------------------------------------
public void close()
throws IOException
{
mBufferedReader.close();
}
//---------------------------------------------------------------------------
public ReadableSeqFormat getSeqFormat()
{
return mSeqFormatObj;
}
//---------------------------------------------------------------------------
public synchronized boolean hasNext()
{
boolean result = false;
if (! endOfContentReached())
{
if (0 == mCurrentRecordLength)
{
readNextRecord();
}
result = mCurrentRecordLength > 0;
}
return result;
}
//---------------------------------------------------------------------------
public synchronized T next()
{
T nextSeq = null;
if (0 == mCurrentRecordLength)
{
readNextRecord();
}
if (mCurrentRecordLength > 0)
{
nextSeq = mSeqFormatObj.readRecord(getBufferedRecordReader());
mCurrentRecordLength = 0;
}
return nextSeq;
}
//---------------------------------------------------------------------------
public List readAll()
{
List seqs = new ArrayList();
while (hasNext())
{
seqs.add(next());
}
return seqs;
}
//---------------------------------------------------------------------------
protected boolean endOfContentReached()
{
return mEndOfContentReached;
}
//---------------------------------------------------------------------------
private void readNextRecord()
{
readNextRecord(null);
}
//---------------------------------------------------------------------------
private synchronized void readNextRecord(Integer inReadLimit)
{
if (! endOfContentReached())
{
// Start w/ a fresh record
mUncompressedRecordChunk.setLength(0);
mCompressedRecordChunks.clear();
mCurrentRecordLength = 0;
if (mRecordStartLine != null)
{
appendLineToCurrentRecord(mRecordStartLine);
mRecordStartLine = null;
}
try
{
String line;
while ((line = mBufferedReader.readLine()) != null)
{
if (line.length() > 0)
{
if (mSeqFormatObj.isEndOfRecord(line))
{
if (mSeqFormatObj.hasJanusDelimiter())
{
// Line is the end of one record and the start of another
if (0 == mNumRecordsParsed
&& 0 == mCurrentRecordLength)
{
// Line is the start of the first record
appendLineToCurrentRecord(line);
}
else
{
// Save the line for the next record
mRecordStartLine = line;
break;
}
}
else
{
// Line is the end of the record
appendLineToCurrentRecord(line);
break;
}
}
else
{
// Line is not the end of the record
appendLineToCurrentRecord(line);
}
}
// We might just be testing formats and need to quit before we go too far
if (inReadLimit != null
&& mCurrentRecordLength > inReadLimit)
{
break;
}
}
if (null == line)
{
mEndOfContentReached = true;
}
}
catch (IOException e)
{
throw new SeqIOException(e);
}
}
if (mCurrentRecordLength > 0)
{
mNumRecordsParsed++;
}
}
//--------------------------------------------------------------------------
// Note: inLine will not have a return at the end
private void appendLineToCurrentRecord(String inLine)
throws SeqIOException
{
mCurrentRecordLength += inLine.length() + 1;
mUncompressedRecordChunk.appendln(inLine);
if (mUncompressedRecordChunk.length() > mCompressionThreshold)
{
mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString()));
mUncompressedRecordChunk.setLength(0);
}
}
//--------------------------------------------------------------------------
private BufferedReader getBufferedRecordReader()
{
InputStream seqStream = null;
if (CollectionUtil.hasValues(mCompressedRecordChunks))
{
// Compress any leftover lines and add them to the rest of the chunks
if (mUncompressedRecordChunk.length() > 0)
{
mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString()));
}
seqStream = new GZIPRecordStreamer();
}
else if (mUncompressedRecordChunk.length() > 0)
{
seqStream = new ByteArrayInputStream(mUncompressedRecordChunk.toString().getBytes());
}
return new BufferedReader(new InputStreamReader(seqStream));
}
//##########################################################################
// INNER CLASS
//##########################################################################
private class GZIPRecordStreamer extends FilterInputStream
{
private byte[] mBuffer;
private int mBufferLimit;
private int mCurrentChunkIndex;
private int mCharIndex;
private boolean mDone = false;
private boolean mEndOfStreamReached;
private int mBytesStreamed;
//-----------------------------------------------------------------------
public GZIPRecordStreamer()
{
super(null);
mCurrentChunkIndex = 0;
}
//---------------------------------------------------------------------------
@Override
public int read(byte[] inBuffer, int inOffset, int inMaxReadLength)
throws IOException
{
byte theChar;
int numCharsRead = 0;
do
{
if (mCharIndex >= mBufferLimit)
{
fillBuffer();
}
theChar = (mEndOfStreamReached ? -1 : mBuffer[mCharIndex++]);
if (theChar > 0)
{
inBuffer[inOffset++] = theChar;
numCharsRead++;
}
}
while (theChar >= 0
&& numCharsRead < inMaxReadLength);
mBytesStreamed += numCharsRead;
return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead);
}
//---------------------------------------------------------------------------
private void fillBuffer()
throws IOException
{
if (mCurrentChunkIndex < mCompressedRecordChunks.size())
{
mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex++));
mBufferLimit = mBuffer.length;
// Reset the index
mCharIndex = 0;
}
else
{
mEndOfStreamReached = true;
}
}
//-----------------------------------------------------------------------
@Override
public int available()
throws IOException
{
return mCurrentRecordLength - mBytesStreamed;
}
//-----------------------------------------------------------------------
@Override
public void close()
throws IOException
{
// Do nothing
}
//-----------------------------------------------------------------------
@Override
public int read()
{
int nextChar = -1;
if (! mDone)
{
if (null == mBuffer)
{
mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex));
mBufferLimit = mBuffer.length;
// Reset the index
mCharIndex = 0;
}
nextChar = mBuffer[mCharIndex++];
if (mCharIndex >= mBufferLimit)
{
// This is the last char in this chunk.
mBuffer = null;
mCurrentChunkIndex++;
if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedRecordChunks.size())
{
// This was the last chunk.
mDone = true;
}
}
}
return nextChar;
}
}
}