All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.alignment.KMerBitSet Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.alignment;

import java.util.BitSet;

import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;

//------------------------------------------------------------------------------
/**
 * BitSet container for holding detected k-mer instances from a sequence.
 *
 * @author J. Alex Taylor, hairyfatguy.com
 */
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------

public class KMerBitSet extends BitSet
{
   // Sequence type given at construction; it selects the alphabet used for indexing.
   private final BioSequenceType mSeqType;
   // Number of residues in each k-mer.
   private final int mKMerSize;
   // Maps an uppercase residue char (used as the array index) to its position in the alphabet, or -1.
   private final int[] mResidueIndexLookup;
   // Alphabet position substituted for any residue that is not in the alphabet.
   private final int mUnknownResidueIndex;

   // Residue characters recognized for protein sequences. 'X' is the unknown-residue character.
   private static final String sProteinAlphabet = "ACDEFGHIKLMNPQRSTVWXY";
   // Residue characters recognized for all other sequence types. 'N' is the unknown-residue character.
   private static final String sNucleicAcidAlphabet = "ACGNT";

   //--------------------------------------------------------------------------
   /**
    * Creates an empty bit set with one bit for every possible k-mer of the
    * specified size over the alphabet of the specified sequence type.
    *
    * @param inSeqType  the sequence type. PROTEIN selects the protein alphabet;
    *                   any other value selects the nucleic acid alphabet
    * @param inKMerSize the number of residues per k-mer
    * @throws IllegalArgumentException if inKMerSize is less than 1 or is so large
    *         that the number of possible k-mers cannot be indexed with an int
    */
   public KMerBitSet(BioSequenceType inSeqType, int inKMerSize)
   {
      super(computeBitCount(inSeqType, inKMerSize));
      mSeqType = inSeqType;
      mKMerSize = inKMerSize;

      String alphabet = getAlphabet();

      // Size the lookup table from the largest character in the alphabet
      // rather than assuming that the alphabet is listed in sorted order.
      char maxChar = 0;
      for (int i = 0; i < alphabet.length(); i++)
      {
         if (alphabet.charAt(i) > maxChar)
         {
            maxChar = alphabet.charAt(i);
         }
      }

      mResidueIndexLookup = new int[maxChar + 1];
      for (int i = 0; i < mResidueIndexLookup.length; i++)
      {
         mResidueIndexLookup[i] = -1;
      }

      for (int i = 0; i < alphabet.length(); i++)
      {
         mResidueIndexLookup[alphabet.charAt(i)] = i;
      }

      mUnknownResidueIndex = mResidueIndexLookup[getUnknownResidueChar()];
   }

   //--------------------------------------------------------------------------
   /**
    * @return the sequence type that was specified at construction
    */
   public BioSequenceType getBioSequenceType()
   {
      return mSeqType;
   }

   //--------------------------------------------------------------------------
   /**
    * Sets the bit for every k-mer found in the specified sequence.
    * Bits that are already set are left set, so repeated calls accumulate.
    * A sequence shorter than the k-mer size sets no bits.
    *
    * @param inSequence the sequence to scan for k-mers
    */
   public void fill(BioSequence inSequence)
   {
      byte[] seqIndices = getResidueIndicesForSequence(inSequence.getSequence());

      int alphabetSize = getAlphabet().length();
      int lastStart = seqIndices.length - mKMerSize;

      for (int start = 0; start <= lastStart; start++)
      {
         // Treat the k-mer as a base-alphabetSize number with the first residue as
         // the most significant digit. Integer (Horner) accumulation avoids
         // floating-point math, and the constructor's size check guarantees that
         // the value fits in an int.
         int bitIndex = 0;
         for (int offset = 0; offset < mKMerSize; offset++)
         {
            bitIndex = bitIndex * alphabetSize + seqIndices[start + offset];
         }

         set(bitIndex);
      }
   }

   //---------------------------------------------------------------------------
   /**
    * Counts the bits that are set in both this bit set and the specified bit set.
    *
    * @param inComparisonKMerBitSet the bit set to compare against
    * @return the number of bit indices that are set in both bit sets
    */
   public int getCommonKMerCount(KMerBitSet inComparisonKMerBitSet)
   {
      int count = 0;

      // Visit only the bits that are set in this bit set instead of testing every index.
      for (int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1))
      {
         if (inComparisonKMerBitSet.get(i))
         {
            count++;
         }

         if (Integer.MAX_VALUE == i)
         {
            break; // i + 1 would overflow to a negative index
         }
      }

      return count;
   }

   //---------------------------------------------------------------------------
   // Returns the number of distinct k-mers (alphabet size raised to the k-mer size)
   // or throws if the k-mer size is invalid or the count does not fit in an int.
   private static int computeBitCount(BioSequenceType inSeqType, int inKMerSize)
   {
      if (inKMerSize < 1)
      {
         throw new IllegalArgumentException("The k-mer size must be at least 1 but was " + inKMerSize + "!");
      }

      int alphabetSize = getAlphabet(inSeqType).length();

      // Accumulate in a long so that overflow of the int range can be detected.
      long bitCount = 1;
      for (int i = 0; i < inKMerSize; i++)
      {
         bitCount *= alphabetSize;
         if (bitCount > Integer.MAX_VALUE)
         {
            throw new IllegalArgumentException("A k-mer size of " + inKMerSize
                  + " is too large for an alphabet of " + alphabetSize + " residues!");
         }
      }

      return (int) bitCount;
   }

   //---------------------------------------------------------------------------
   // Returns the alphabet for the specified sequence type.
   private static String getAlphabet(BioSequenceType inSeqType)
   {
      return (inSeqType.equals(BioSequenceType.PROTEIN) ? sProteinAlphabet : sNucleicAcidAlphabet);
   }

   //---------------------------------------------------------------------------
   // Returns the alphabet for this bit set's sequence type.
   private String getAlphabet()
   {
      return getAlphabet(getBioSequenceType());
   }

   //---------------------------------------------------------------------------
   // Returns the alphabet character that stands in for unrecognized residues.
   private char getUnknownResidueChar()
   {
      return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? 'X' : 'N');
   }

   //---------------------------------------------------------------------------
   // Converts each residue of the sequence (case-insensitively) into its alphabet
   // position. Residues that are not in the alphabet get the unknown residue's position.
   private byte[] getResidueIndicesForSequence(String inSequence)
   {
      int length = inSequence.length();
      byte[] indices = new byte[length];

      for (int i = 0; i < length; i++)
      {
         // Character.toUpperCase() is locale-independent and always maps one char to one
         // char, so positions stay aligned with the original sequence.
         char theChar = Character.toUpperCase(inSequence.charAt(i));

         // Characters beyond the end of the lookup table (ex: 'Z' for proteins or 'U'
         // for nucleic acids) are not in the alphabet.
         int residueIndex = (theChar < mResidueIndexLookup.length ? mResidueIndexLookup[theChar] : -1);
         if (-1 == residueIndex)
         {
            residueIndex = mUnknownResidueIndex;
         }

         indices[i] = (byte) residueIndex;
      }

      return indices;
   }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy