All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.format.BufferedSeqReader Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.format;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import com.hfg.bio.seq.BioSequence;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.GZIP;

//------------------------------------------------------------------------------
/**
 Buffered sequence reader.
 
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class BufferedSeqReader { private ReadableSeqFormat mSeqFormatObj; private BufferedReader mBufferedReader; private boolean mEndOfContentReached; private String mRecordStartLine; private int mNumRecordsParsed; // How long the record should be before compression is used. private int mCompressionThreshold = sDefaultCompressionThreshold; private StringBuilderPlus mUncompressedRecordChunk = new StringBuilderPlus(mCompressionThreshold + 500).setDelimiter("\n"); private List mCompressedRecordChunks = new ArrayList<>(50); private int mCurrentRecordLength = 0; // Default value for how long the record should be before compression is used. private static final int sDefaultCompressionThreshold = 8 * 1024; //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- public BufferedSeqReader(BufferedReader inReader, ReadableSeqFormat inSeqFormatObj) { mBufferedReader = inReader; mSeqFormatObj = inSeqFormatObj; } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public static ReadableSeqFormat determineSeqFormat(BufferedReader inReader, Collection> inSeqFormatObjects) throws IOException { if (! inReader.markSupported()) { throw new ProgrammingException("The passed reader must support setting a mark!"); } // Grab a chunk of content from the start of the stream that will hopefully be enough // to let us determine the format. // We need to be careful to put the reader back in its original state when we are done. int readAheadLimit = 10 * 1024; inReader.mark(readAheadLimit); // Put the chunk into a buffer we can reuse char[] buffer = new char[readAheadLimit]; inReader.read(buffer); inReader.reset(); ReadableSeqFormat successfulFormat = null; // LimitedBufferedReader reader = new LimitedBufferedReader(new NoCloseBufferedReader(inReader), readAheadLimit); for (ReadableSeqFormat seqFormat : inSeqFormatObjects) { try { BufferedReader reader = new BufferedReader(new CharArrayReader(buffer)); // reader.mark(readAheadLimit); BufferedSeqReader seqReader = new BufferedSeqReader<>(reader, seqFormat); seqReader.readNextRecord((int) (0.9 * readAheadLimit)); T seq = seqReader.next(); successfulFormat = seqFormat; break; } catch (SeqIOException e) { // Ignore. Try the next format } finally { // reader.reset(); } } inReader.reset(); return successfulFormat; } //--------------------------------------------------------------------------- public BufferedSeqReader setCompressionThreshold(int inNumBytes) { mCompressionThreshold = inNumBytes; return this; } //--------------------------------------------------------------------------- public void close() throws IOException { mBufferedReader.close(); } //--------------------------------------------------------------------------- public ReadableSeqFormat getSeqFormat() { return mSeqFormatObj; } //--------------------------------------------------------------------------- public synchronized boolean hasNext() { boolean result = false; if (! endOfContentReached()) { if (0 == mCurrentRecordLength) { readNextRecord(); } result = mCurrentRecordLength > 0; } return result; } //--------------------------------------------------------------------------- public synchronized T next() { T nextSeq = null; if (0 == mCurrentRecordLength) { readNextRecord(); } if (mCurrentRecordLength > 0) { nextSeq = mSeqFormatObj.readRecord(getBufferedRecordReader()); mCurrentRecordLength = 0; } return nextSeq; } //--------------------------------------------------------------------------- public List readAll() { List seqs = new ArrayList(); while (hasNext()) { seqs.add(next()); } return seqs; } //--------------------------------------------------------------------------- protected boolean endOfContentReached() { return mEndOfContentReached; } //--------------------------------------------------------------------------- private void readNextRecord() { readNextRecord(null); } //--------------------------------------------------------------------------- private synchronized void readNextRecord(Integer inReadLimit) { if (! endOfContentReached()) { // Start w/ a fresh record mUncompressedRecordChunk.setLength(0); mCompressedRecordChunks.clear(); mCurrentRecordLength = 0; if (mRecordStartLine != null) { appendLineToCurrentRecord(mRecordStartLine); mRecordStartLine = null; } try { String line; while ((line = mBufferedReader.readLine()) != null) { if (line.length() > 0) { if (mSeqFormatObj.isEndOfRecord(line)) { if (mSeqFormatObj.hasJanusDelimiter()) { // Line is the end of one record and the start of another if (0 == mNumRecordsParsed && 0 == mCurrentRecordLength) { // Line is the start of the first record appendLineToCurrentRecord(line); } else { // Save the line for the next record mRecordStartLine = line; break; } } else { // Line is the end of the record appendLineToCurrentRecord(line); break; } } else { // Line is not the end of the record appendLineToCurrentRecord(line); } } // We might just be testing formats and need to quit before we go too far if (inReadLimit != null && mCurrentRecordLength > inReadLimit) { break; } } if (null == line) { mEndOfContentReached = true; } } catch (IOException e) { throw new SeqIOException(e); } } if (mCurrentRecordLength > 0) { mNumRecordsParsed++; } } //-------------------------------------------------------------------------- // Note: inLine will not have a return at the end private void appendLineToCurrentRecord(String inLine) throws SeqIOException { mCurrentRecordLength += inLine.length() + 1; mUncompressedRecordChunk.appendln(inLine); if (mUncompressedRecordChunk.length() > mCompressionThreshold) { mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString())); mUncompressedRecordChunk.setLength(0); } } //-------------------------------------------------------------------------- private BufferedReader getBufferedRecordReader() { InputStream seqStream = null; if (CollectionUtil.hasValues(mCompressedRecordChunks)) { // Compress any leftover lines and add them to the rest of the chunks if (mUncompressedRecordChunk.length() > 0) { mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString())); } seqStream = new GZIPRecordStreamer(); } else if (mUncompressedRecordChunk.length() > 0) { seqStream = new ByteArrayInputStream(mUncompressedRecordChunk.toString().getBytes()); } return new BufferedReader(new InputStreamReader(seqStream)); } //########################################################################## // INNER CLASS //########################################################################## private class GZIPRecordStreamer extends FilterInputStream { private byte[] mBuffer; private int mBufferLimit; private int mCurrentChunkIndex; private int mCharIndex; private boolean mDone = false; private boolean mEndOfStreamReached; private int mBytesStreamed; //----------------------------------------------------------------------- public GZIPRecordStreamer() { super(null); mCurrentChunkIndex = 0; } //--------------------------------------------------------------------------- @Override public int read(byte[] inBuffer, int inOffset, int inMaxReadLength) throws IOException { byte theChar; int numCharsRead = 0; do { if (mCharIndex >= mBufferLimit) { fillBuffer(); } theChar = (mEndOfStreamReached ? -1 : mBuffer[mCharIndex++]); if (theChar > 0) { inBuffer[inOffset++] = theChar; numCharsRead++; } } while (theChar >= 0 && numCharsRead < inMaxReadLength); mBytesStreamed += numCharsRead; return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead); } //--------------------------------------------------------------------------- private void fillBuffer() throws IOException { if (mCurrentChunkIndex < mCompressedRecordChunks.size()) { mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex++)); mBufferLimit = mBuffer.length; // Reset the index mCharIndex = 0; } else { mEndOfStreamReached = true; } } //----------------------------------------------------------------------- @Override public int available() throws IOException { return mCurrentRecordLength - mBytesStreamed; } //----------------------------------------------------------------------- @Override public void close() throws IOException { // Do nothing } //----------------------------------------------------------------------- @Override public int read() { int nextChar = -1; if (! mDone) { if (null == mBuffer) { mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex)); mBufferLimit = mBuffer.length; // Reset the index mCharIndex = 0; } nextChar = mBuffer[mCharIndex++]; if (mCharIndex >= mBufferLimit) { // This is the last char in this chunk. mBuffer = null; mCurrentChunkIndex++; if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedRecordChunks.size()) { // This was the last chunk. mDone = true; } } } return nextChar; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy