com.hfg.bio.seq.alignment.KMerBitSet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.alignment;
import java.util.BitSet;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
//------------------------------------------------------------------------------
/**
* BitSet container for holding detected k-mer instances from a sequence.
*
* @author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg XML/HTML Coding Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class KMerBitSet extends BitSet
{
   // Each k-mer is mapped to a single bit. The residues of the k-mer are treated
   // as the digits of a base-N number (N = alphabet size, most significant digit
   // first) and that number is the bit index. A set bit means the k-mer was seen
   // in a sequence passed to fill().

   private BioSequenceType mSeqType;
   private int mKMerSize;
   // Indexed by (upper-case) residue character; holds the residue's position in
   // the alphabet or -1 if the character is not part of the alphabet.
   private int[] mResidueIndexLookup;
   // Alphabet position of the unknown residue ('X' for protein, 'N' otherwise).
   private int mUnknownResidueIndex;

   // Both alphabets are in ascending character order. The constructor relies on
   // this when it sizes the lookup table from the last character.
   private static String sProteinAlphabet = "ACDEFGHIKLMNPQRSTVWXY";
   private static String sNucleicAcidAlphabet = "ACGNT";

   //--------------------------------------------------------------------------
   /**
    * Creates an empty k-mer bit set.
    *
    * @param inSeqType  the type of sequence that will be analyzed. PROTEIN selects the
    *                   21 letter protein alphabet; any other value selects the 5 letter
    *                   nucleic acid alphabet.
    * @param inKMerSize the number of residues per k-mer
    * @throws IllegalArgumentException if inKMerSize is less than 1 or if the number of
    *         possible k-mers exceeds what can be addressed with an int bit index
    */
   public KMerBitSet(BioSequenceType inSeqType, int inKMerSize)
   {
      super(computeBitCount(inSeqType, inKMerSize));

      mSeqType = inSeqType;
      mKMerSize = inKMerSize;

      String alphabet = getAlphabet();

      // The last alphabet character has the highest char value, so a table of
      // that size + 1 can be indexed directly by any alphabet character.
      mResidueIndexLookup = new int[(int) alphabet.charAt(alphabet.length() - 1) + 1];
      for (int i = 0; i < mResidueIndexLookup.length; i++)
      {
         mResidueIndexLookup[i] = -1;
      }

      for (int i = 0; i < alphabet.length(); i++)
      {
         mResidueIndexLookup[(int) alphabet.charAt(i)] = i;
      }

      mUnknownResidueIndex = mResidueIndexLookup[(int) getUnknownResidueChar()];
   }

   //--------------------------------------------------------------------------
   /**
    * @return the sequence type that was specified at construction
    */
   public BioSequenceType getBioSequenceType()
   {
      return mSeqType;
   }

   //--------------------------------------------------------------------------
   /**
    * Sets the bit for every k-mer found in the specified sequence. Previously set
    * bits are left untouched. A sequence shorter than the k-mer size sets no bits.
    *
    * @param inSequence the sequence to scan for k-mers
    */
   public void fill(BioSequence inSequence)
   {
      byte[] seqIndices = getResidueIndicesForSequence(inSequence.getSequence());
      int alphabetSize = getAlphabet().length();

      for (int start = 0; start <= seqIndices.length - mKMerSize; start++)
      {
         // Horner evaluation of the k-mer as a base-alphabetSize number. Integer
         // arithmetic is used instead of Math.pow() to avoid floating point
         // conversions; the constructor guarantees the result fits in an int.
         int bitIndex = 0;
         for (int offset = 0; offset < mKMerSize; offset++)
         {
            bitIndex = bitIndex * alphabetSize + seqIndices[start + offset];
         }

         set(bitIndex);
      }
   }

   //---------------------------------------------------------------------------
   /**
    * Counts the distinct k-mers that are flagged in both this bit set and the
    * specified bit set.
    *
    * @param inComparisonKMerBitSet the bit set to compare against
    * @return the number of bit positions that are set in both bit sets
    */
   public int getCommonKMerCount(KMerBitSet inComparisonKMerBitSet)
   {
      int count = 0;

      // Only visit the bits that are set in this bit set rather than probing
      // every possible position.
      for (int i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1))
      {
         if (inComparisonKMerBitSet.get(i))
         {
            count++;
         }

         if (Integer.MAX_VALUE == i)
         {
            break; // i + 1 would overflow
         }
      }

      return count;
   }

   //---------------------------------------------------------------------------
   // Determines how many bits are needed to hold every possible k-mer and
   // rejects sizes that cannot be indexed with an int.
   private static int computeBitCount(BioSequenceType inSeqType, int inKMerSize)
   {
      if (inKMerSize < 1)
      {
         throw new IllegalArgumentException("The k-mer size must be at least 1 but was " + inKMerSize + "!");
      }

      int alphabetSize = (inSeqType.equals(BioSequenceType.PROTEIN) ? sProteinAlphabet : sNucleicAcidAlphabet).length();

      long bitCount = 1;
      for (int i = 0; i < inKMerSize; i++)
      {
         bitCount *= alphabetSize;
         if (bitCount > Integer.MAX_VALUE)
         {
            throw new IllegalArgumentException("A k-mer size of " + inKMerSize + " is too large for an alphabet of "
                                               + alphabetSize + " residues!");
         }
      }

      return (int) bitCount;
   }

   //---------------------------------------------------------------------------
   // Returns the residue alphabet that corresponds to the sequence type.
   private String getAlphabet()
   {
      return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? sProteinAlphabet : sNucleicAcidAlphabet);
   }

   //---------------------------------------------------------------------------
   // Returns the alphabet character used to represent an unrecognized residue.
   private char getUnknownResidueChar()
   {
      return (getBioSequenceType().equals(BioSequenceType.PROTEIN) ? 'X' : 'N');
   }

   //---------------------------------------------------------------------------
   // Converts each character of the sequence into its alphabet position.
   // Matching is case-insensitive and any character that is not part of the
   // alphabet is given the position of the unknown residue.
   private byte[] getResidueIndicesForSequence(String inSequence)
   {
      int length = inSequence.length();
      byte[] indices = new byte[length];

      for (int i = 0; i < length; i++)
      {
         // Character.toUpperCase() is independent of the default locale and
         // always yields exactly one char per input char.
         char theChar = Character.toUpperCase(inSequence.charAt(i));

         // Characters beyond the end of the lookup table (such as 'Z', 'U', or
         // anything non-ASCII) can't be in the alphabet.
         int residueIndex = (theChar < mResidueIndexLookup.length ? mResidueIndexLookup[theChar] : -1);
         if (-1 == residueIndex)
         {
            residueIndex = mUnknownResidueIndex;
         }

         indices[i] = (byte) residueIndex;
      }

      return indices;
   }
}