com.hfg.bio.seq.alignment.matrix.PSSM Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.alignment.matrix;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.hfg.bio.AminoAcid;
import com.hfg.bio.AminoAcidFreqTable;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.PositionalFrequencyMatrix;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.MultipleSequenceAlignment;
import com.hfg.math.MathUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedSet;
import com.hfg.xml.XMLTag;
//------------------------------------------------------------------------------
/**
Container for a Position-Specific Scoring Matrix (PSSM). Also known as a
Position-Weighted Matrix.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class PSSM
{
public enum Flag
{
CASE_SENSITIVE,
LOG2_WEIGHTED
}
private String mName;
private BioSequenceType mBioSequenceType;
private List mPositions;
private int mMinPosition;
private int mMaxPosition;
private List mResidues;
private String mResiduesString;
private Float[][] mMatrixArray;
private List mPositionalGapOpenScore;
private List mPositionalGapExtScore;
private float mDefaultScore = 0.0f;
private char mUnknownResidue;
private Set mFlags = new HashSet<>(4);
// Cached values
private Integer mLength;
private boolean mCaseSensitive;
private static final String XML_PSSM = "PSSM";
private static final String XML_NAME_ATT = "name";
private static final String XML_RESIDUES = "Residues";
private static final String XML_MATRIX_ROW = "MatrixRow";
private static final String XML_POSITION_ATT = "position";
//###########################################################################
// CONSTRUCTORS
//###########################################################################
//---------------------------------------------------------------------------
/**
Constructs a PSSM from a multiple sequence alignment.
@param inMSA source multiple sequence alignment
@param inFlags flag(s) to apply to the PSSM
*/
public PSSM(MultipleSequenceAlignment inMSA, Flag... inFlags)
{
this(inMSA, null, inFlags);
}
//---------------------------------------------------------------------------
/**
Constructs a PSSM from a multiple sequence alignment.
@param inMSA source multiple sequence alignment
@param inBackgroundAAFreq optional background amino acid frequency table. If null,
an equal residue probability is used.
@param inFlags flag(s) to apply to the PSSM
*/
public PSSM(MultipleSequenceAlignment inMSA, AminoAcidFreqTable inBackgroundAAFreq, Flag... inFlags)
{
// Flags must be set first
setFlags(inFlags);
mBioSequenceType = inMSA.getBioSequenceType();
if (BioSequenceType.NUCLEIC_ACID.equals(mBioSequenceType))
{
mUnknownResidue = 'N';
}
else
{
mUnknownResidue = 'X';
}
mResidues = new OrderedSet<>(30);
PositionalFrequencyMatrix freqMatrix;
if (mCaseSensitive)
{
freqMatrix = inMSA.getPositionFreqMatrix(new PositionalFrequencyMatrix.Flag[] {PositionalFrequencyMatrix.Flag.CASE_SENSITIVE});
}
else
{
freqMatrix = inMSA.getPositionFreqMatrix();
}
if (freqMatrix != null)
{
float totalPsuedocounts = (float) Math.sqrt(inMSA.size());
for (Character residue : freqMatrix.getResidueKeys())
{
mResidues.add(residue);
}
Map backgroundResidueFreqMap = new HashMap<>(30);
if (inBackgroundAAFreq != null)
{
for (AminoAcid aa : inBackgroundAAFreq.keySet())
{
mResidues.add(aa.getOneLetterCode());
if (aa.getOneLetterCode() != mUnknownResidue)
{
backgroundResidueFreqMap.put(aa.getOneLetterCode(), inBackgroundAAFreq.get(aa));
}
}
}
else
{
for (Character residue : freqMatrix.getResidueKeys())
{
if (residue != mUnknownResidue)
{
backgroundResidueFreqMap.put(residue, 1f);
}
}
}
mResiduesString = StringUtil.join(mResidues, "");
boolean log2Weighted = mFlags.contains(Flag.LOG2_WEIGHTED);
// Determine the number of rows and columns
mPositions = new ArrayList<>(freqMatrix.getPositionKeys());
Collections.sort(mPositions);
mMinPosition = mPositions.get(0);
mMaxPosition = mPositions.get(mPositions.size() - 1);
int numRows = mMaxPosition - mMinPosition + 1;
int numCols = mResidues.size();
// Create the 2D array
mMatrixArray = new Float[numRows][numCols];
mPositionalGapOpenScore = new ArrayList<>();
mPositionalGapExtScore = new ArrayList<>();
for (Integer position : freqMatrix.getPositionKeys())
{
for (Character residue : mResidues)
{
if (residue != mUnknownResidue)
{
Float bgFreq = backgroundResidueFreqMap.get(residue);
// Avoid divide by zero
if (null == bgFreq
|| 0.0f == bgFreq)
{
bgFreq = 0.0001f;
}
// float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (freqMatrix.getPositionTotal(position) + totalPsuedocounts);
float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (inMSA.size() + totalPsuedocounts);
if (log2Weighted)
{
weightedScore = (float) MathUtil.log2(weightedScore);
}
int residueIdx = mResiduesString.indexOf(residue);
mMatrixArray[position - mMinPosition][residueIdx] = weightedScore * 10;
}
}
float weightedScore = 1 - ((float) freqMatrix.getGapOpenCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts);
if (log2Weighted)
{
weightedScore = (float) MathUtil.log2(weightedScore);
}
mPositionalGapOpenScore.add(weightedScore);
weightedScore = 1 - ((float) freqMatrix.getGapExtCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts);
if (log2Weighted)
{
weightedScore = (float) MathUtil.log2(weightedScore);
}
mPositionalGapExtScore.add(weightedScore);
}
}
}
//---------------------------------------------------------------------------
public PSSM(XMLTag inXMLTag)
{
inXMLTag.verifyTagName(XML_PSSM);
setName(inXMLTag.getAttributeValue(XML_NAME_ATT));
XMLTag residuesTag = inXMLTag.getRequiredSubtagByName(XML_RESIDUES);
mResidues = new ArrayList<>(30);
for (char residue : residuesTag.getContent().toCharArray())
{
mResidues.add(residue);
}
mResiduesString = StringUtil.join(mResidues, "");
List matrixRowTags = inXMLTag.getSubtagsByName(XML_MATRIX_ROW);
List mPositions = new ArrayList<>(matrixRowTags.size());
for (XMLTag rowTag : matrixRowTags)
{
mPositions.add(Integer.parseInt(rowTag.getAttributeValue(XML_POSITION_ATT)));
}
Collections.sort(mPositions);
mMinPosition = mPositions.get(0);
mMaxPosition = mPositions.get(mPositions.size() - 1);
// Determine the number of rows and columns
int numRows = mMaxPosition - mMinPosition + 1;
int numCols = mResidues.size();
// Create the 2D array
mMatrixArray = new Float[numRows][numCols];
for (XMLTag matrixRowTag : matrixRowTags)
{
int position = Integer.parseInt(matrixRowTag.getAttributeValue(XML_POSITION_ATT));
String[] values = matrixRowTag.getContent().split("\\s+");
for (int j = 0; j < mResidues.size(); j++)
{
mMatrixArray[position - mMinPosition][j] = Float.parseFloat(values[j]);
}
}
}
//###########################################################################
// PUBLIC METHODS
//###########################################################################
//---------------------------------------------------------------------------
public float score(int inPosition, char inResidue)
{
char residue = inResidue;
if (! mCaseSensitive)
{
residue = Character.toUpperCase(residue);
}
float score = mDefaultScore;
if (residue != mUnknownResidue
&& inPosition >= mMinPosition
&& inPosition <= mMaxPosition)
{
// score = mMatrix.get(inPosition, residue);
int residueIdx = mResiduesString.indexOf(residue);
if (residueIdx >= 0)
{
score = mMatrixArray[inPosition - mMinPosition][residueIdx];
}
}
return score;
}
//---------------------------------------------------------------------------
public float getGapOpenScore(int inPosition)
{
return mPositionalGapOpenScore.get(inPosition - 1);
}
//---------------------------------------------------------------------------
// Used for eaking out a little more performance vs. repeated calls to getGapOpenScore().
public float[] getGapOpenScores()
{
float[] values = new float[mPositionalGapOpenScore.size()];
int i = 0;
for (Float value : mPositionalGapOpenScore)
{
values[i++] = (value != null ? value : Float.NaN);
}
return values;
}
//---------------------------------------------------------------------------
public float getGapExtScore(int inPosition)
{
return mPositionalGapExtScore.get(inPosition - 1);
}
//---------------------------------------------------------------------------
public String showScoring(Collection inSeqs)
{
StringBuilderPlus buffer = new StringBuilderPlus();
for (Integer position : mPositions)
{
buffer.append(String.format("%3d. ", position));
for (BioSequence seq : inSeqs)
{
char residue = seq.getSequence().charAt(position - 1);
buffer.append(String.format("%c %.1f ", residue, score(position, residue)));
}
buffer.appendln();
}
return buffer.toString();
}
//---------------------------------------------------------------------------
public String getName()
{
return mName;
}
//---------------------------------------------------------------------------
public PSSM setName(String inValue)
{
mName = inValue;
return this;
}
//---------------------------------------------------------------------------
public int length()
{
if (null == mLength)
{
int length = 0;
if (mMatrixArray != null)
{
// Since not all positions might be present in the matrix, return the max position value
length = mMaxPosition;
}
mLength = length;
}
return mLength;
}
//---------------------------------------------------------------------------
public XMLTag toXMLTag()
{
XMLTag rootTag = new XMLTag(XML_PSSM);
if (StringUtil.isSet(getName()))
{
rootTag.setAttribute(XML_NAME_ATT, getName());
}
rootTag.addSubtag(new XMLTag(XML_RESIDUES).setContent(StringUtil.join(mResidues, "")));
for (Integer position : mPositions)
{
XMLTag matrixRowTag = new XMLTag(XML_MATRIX_ROW);
matrixRowTag.setAttribute(XML_POSITION_ATT, position);
StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter(" ");
for (Character residue : mResidues)
{
int residueIdx = mResiduesString.indexOf(residue);
Float value = mMatrixArray[position - mMinPosition][residueIdx];
buffer.delimitedAppend(String.format("%.2f", value));
}
matrixRowTag.setContent(buffer);
rootTag.addSubtag(matrixRowTag);
}
return rootTag;
}
//###########################################################################
// PRIVATE METHODS
//###########################################################################
//--------------------------------------------------------------------------
private void setFlags(Flag[] inFlags)
{
mFlags.clear();
mCaseSensitive = false;
if (inFlags != null)
{
for (Flag flag : inFlags)
{
mFlags.add(flag);
}
if (mFlags.contains(Flag.CASE_SENSITIVE))
{
mCaseSensitive = true;
}
}
}
}