// com.hfg.bio.seq.format.BufferedSeqReader (Maven / Gradle / Ivy artifact listing)
//
// Part of com_hfg: the com.hfg xml, html, svg, and bioinformatics utility library.
// Note: there is a newer version of this library: 20240423
package com.hfg.bio.seq.format;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.CharArrayReader;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import com.hfg.bio.seq.BioSequence;
import com.hfg.exception.ProgrammingException;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.GZIP;

//------------------------------------------------------------------------------
/**
 Buffered sequence reader.
 
 @author J. Alex Taylor, hairyfatguy.com
 
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class BufferedSeqReader
{
   private ReadableSeqFormat mSeqFormatObj;
   private BufferedReader       mBufferedReader;
   private boolean              mEndOfContentReached;
   private String               mRecordStartLine;
   private int                  mNumRecordsParsed;
   // How long the record should be before compression is used.
   private int                  mCompressionThreshold = sDefaultCompressionThreshold;

   private StringBuilderPlus mUncompressedRecordChunk = new StringBuilderPlus(mCompressionThreshold + 500).setDelimiter("\n");
   private List      mCompressedRecordChunks  = new ArrayList<>(50);
   private int               mCurrentRecordLength     = 0;

   // Default value for how long the record should be before compression is used.
   private static final int   sDefaultCompressionThreshold = 8 * 1024;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   public BufferedSeqReader(BufferedReader inReader, ReadableSeqFormat inSeqFormatObj)
   {
      mBufferedReader = inReader;
      mSeqFormatObj   = inSeqFormatObj;
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   public static  ReadableSeqFormat determineSeqFormat(BufferedReader inReader, Collection> inSeqFormatObjects)
         throws IOException
   {
      if (! inReader.markSupported())
      {
         throw new ProgrammingException("The passed reader must support setting a mark!");
      }

      // Grab a chunk of content from the start of the stream that will hopefully be enough
      // to let us determine the format.
      // We need to be careful to put the reader back in its original state when we are done.
      int readAheadLimit = 10 * 1024;
      inReader.mark(readAheadLimit);
      // Put the chunk into a buffer we can reuse
      char[] buffer = new char[readAheadLimit];
      inReader.read(buffer);
      inReader.reset();

      ReadableSeqFormat successfulFormat = null;

 //     LimitedBufferedReader reader = new LimitedBufferedReader(new NoCloseBufferedReader(inReader), readAheadLimit);
      for (ReadableSeqFormat seqFormat : inSeqFormatObjects)
      {
         try
         {
            BufferedReader reader = new BufferedReader(new CharArrayReader(buffer));
//            reader.mark(readAheadLimit);
            BufferedSeqReader seqReader = new BufferedSeqReader<>(reader, seqFormat);
            seqReader.readNextRecord((int) (0.9 * readAheadLimit));
            T seq = seqReader.next();
            successfulFormat = seqFormat;
            break;
         }
         catch (SeqIOException e)
         {
            // Ignore. Try the next format
         }
         finally
         {
//            reader.reset();
         }
      }

      inReader.reset();

      return successfulFormat;
   }

   //---------------------------------------------------------------------------
   public BufferedSeqReader setCompressionThreshold(int inNumBytes)
   {
      mCompressionThreshold = inNumBytes;
      return this;
   }

   //---------------------------------------------------------------------------
   public void close()
         throws IOException
   {
      mBufferedReader.close();
   }

   //---------------------------------------------------------------------------
   public ReadableSeqFormat getSeqFormat()
   {
      return mSeqFormatObj;
   }

   //---------------------------------------------------------------------------
   public synchronized boolean hasNext()
   {
      boolean result = false;
      if (! endOfContentReached())
      {
         if (0 == mCurrentRecordLength)
         {
            readNextRecord();
         }

         result = mCurrentRecordLength > 0;
      }

      return result;
   }

   //---------------------------------------------------------------------------
   public synchronized T next()
   {
      T nextSeq = null;
      if (0 == mCurrentRecordLength)
      {
         readNextRecord();
      }

      if (mCurrentRecordLength > 0)
      {
         nextSeq = mSeqFormatObj.readRecord(getBufferedRecordReader());
         mCurrentRecordLength = 0;
      }

      return nextSeq;
   }

   //---------------------------------------------------------------------------
   public List readAll()
   {
      List seqs = new ArrayList();
      while (hasNext())
      {
         seqs.add(next());
      }

      return seqs;
   }

   //---------------------------------------------------------------------------
   protected boolean endOfContentReached()
   {
      return mEndOfContentReached;
   }

   //---------------------------------------------------------------------------
   private void readNextRecord()
   {
      readNextRecord(null);
   }

   //---------------------------------------------------------------------------
   private synchronized void readNextRecord(Integer inReadLimit)
   {
      if (! endOfContentReached())
      {
         // Start w/ a fresh record
         mUncompressedRecordChunk.setLength(0);
         mCompressedRecordChunks.clear();
         mCurrentRecordLength = 0;

         if (mRecordStartLine != null)
         {
            appendLineToCurrentRecord(mRecordStartLine);
            mRecordStartLine = null;
         }

         try
         {
            String line;
            while ((line = mBufferedReader.readLine()) != null)
            {
               if (line.length() > 0)
               {
                  if (mSeqFormatObj.isEndOfRecord(line))
                  {
                     if (mSeqFormatObj.hasJanusDelimiter())
                     {
                        // Line is the end of one record and the start of another
                        if (0 == mNumRecordsParsed
                            && 0 == mCurrentRecordLength)
                        {
                           // Line is the start of the first record
                           appendLineToCurrentRecord(line);
                        }
                        else
                        {
                           // Save the line for the next record
                           mRecordStartLine = line;
                           break;
                        }
                     }
                     else
                     {
                        // Line is the end of the record
                        appendLineToCurrentRecord(line);
                        break;
                     }
                  }
                  else
                  {
                     // Line is not the end of the record
                     appendLineToCurrentRecord(line);
                  }
               }

               // We might just be testing formats and need to quit before we go too far
               if (inReadLimit != null
                   && mCurrentRecordLength > inReadLimit)
               {
                  break;
               }
            }

            if (null == line)
            {
               mEndOfContentReached = true;
            }
         }
         catch (IOException e)
         {
            throw new SeqIOException(e);
         }
      }

      if (mCurrentRecordLength > 0)
      {
         mNumRecordsParsed++;
      }
   }

   //--------------------------------------------------------------------------
   // Note: inLine will not have a return at the end
   private void appendLineToCurrentRecord(String inLine)
         throws SeqIOException
   {
      mCurrentRecordLength += inLine.length() + 1;

      mUncompressedRecordChunk.appendln(inLine);
      if (mUncompressedRecordChunk.length() > mCompressionThreshold)
      {
         mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString()));
         mUncompressedRecordChunk.setLength(0);
      }
   }

   //--------------------------------------------------------------------------
   private BufferedReader getBufferedRecordReader()
   {
      InputStream seqStream = null;

      if (CollectionUtil.hasValues(mCompressedRecordChunks))
      {
         // Compress any leftover lines and add them to the rest of the chunks
         if (mUncompressedRecordChunk.length() > 0)
         {
            mCompressedRecordChunks.add(GZIP.compress(mUncompressedRecordChunk.toString()));
         }

         seqStream = new GZIPRecordStreamer();
      }
      else if (mUncompressedRecordChunk.length() > 0)
      {
         seqStream = new ByteArrayInputStream(mUncompressedRecordChunk.toString().getBytes());
      }

      return new BufferedReader(new InputStreamReader(seqStream));
   }

   //##########################################################################
   // INNER CLASS
   //##########################################################################

   private class GZIPRecordStreamer extends FilterInputStream
   {
      private byte[]    mBuffer;
      private int       mBufferLimit;
      private int       mCurrentChunkIndex;
      private int       mCharIndex;
      private boolean   mDone = false;
      private boolean   mEndOfStreamReached;
      private int       mBytesStreamed;

      //-----------------------------------------------------------------------
      public GZIPRecordStreamer()
      {
         super(null);
         mCurrentChunkIndex = 0;
      }

      //---------------------------------------------------------------------------
      @Override
      public int read(byte[] inBuffer, int inOffset, int inMaxReadLength)
            throws IOException
      {
         byte theChar;
         int numCharsRead = 0;
         do
         {
            if (mCharIndex >= mBufferLimit)
            {
               fillBuffer();
            }

            theChar = (mEndOfStreamReached ? -1 : mBuffer[mCharIndex++]);

            if (theChar > 0)
            {
               inBuffer[inOffset++] = theChar;
               numCharsRead++;
            }
         }
         while (theChar >= 0
                && numCharsRead < inMaxReadLength);

         mBytesStreamed += numCharsRead;

         return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead);
      }

      //---------------------------------------------------------------------------
      private void fillBuffer()
            throws IOException
      {
         if (mCurrentChunkIndex < mCompressedRecordChunks.size())
         {
            mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex++));
            mBufferLimit = mBuffer.length;

            // Reset the index
            mCharIndex = 0;
         }
         else
         {
            mEndOfStreamReached = true;
         }
      }

      //-----------------------------------------------------------------------
      @Override
      public int available()
            throws IOException
      {
         return mCurrentRecordLength - mBytesStreamed;
      }

      //-----------------------------------------------------------------------
      @Override
      public void close()
            throws IOException
      {
         // Do nothing
      }

      //-----------------------------------------------------------------------
      @Override
      public int read()
      {
         int nextChar = -1;
         if (! mDone)
         {
            if (null == mBuffer)
            {
               mBuffer = GZIP.uncompress(mCompressedRecordChunks.get(mCurrentChunkIndex));
               mBufferLimit = mBuffer.length;

               // Reset the index
               mCharIndex = 0;
            }

            nextChar = mBuffer[mCharIndex++];

            if (mCharIndex >= mBufferLimit)
            {
               // This is the last char in this chunk.
               mBuffer = null;
               mCurrentChunkIndex++;
               if (mCurrentChunkIndex < 0 || mCurrentChunkIndex == mCompressedRecordChunks.size())
               {
                  // This was the last chunk.
                  mDone = true;
               }
            }
         }

         return nextChar;
      }
   }

}