com.hfg.bio.seq.alignment.matrix.PSSM Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.seq.alignment.matrix;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.hfg.bio.AminoAcid;
import com.hfg.bio.AminoAcidFreqTable;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.PositionalFrequencyMatrix;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.MultipleSequenceAlignment;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.SparseMatrix;
import com.hfg.xml.XMLTag;
//------------------------------------------------------------------------------
/**
Container for a Position-Specific Scoring Matrix (PSSM). Also known as a
Position-Weighted Matrix.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class PSSM
{
public enum Flag
{
CASE_SENSITIVE
}
private String mName;
private BioSequenceType mBioSequenceType;
private SparseMatrix mMatrix;
private List mPositionalGapOpenScore;
private List mPositionalGapExtScore;
private float mDefaultScore = 0.0f;
private char mUnknownResidue;
private Set mFlags = new HashSet<>(4);
// Cached values
private Integer mLength;
private static final String XML_PSSM = "PSSM";
private static final String XML_NAME_ATT = "name";
private static final String XML_RESIDUES = "Residues";
private static final String XML_MATRIX_ROW = "MatrixRow";
private static final String XML_POSITION_ATT = "position";
//**************************************************************************
// CONSTRUCTORS
//**************************************************************************
//---------------------------------------------------------------------------
/**
Constructs a PSSM from a multiple sequence alignment.
@param inMSA source multiple sequence alignment
@param inFlags flag(s) to apply to the PSSM
*/
public PSSM(MultipleSequenceAlignment inMSA, Flag... inFlags)
{
this(inMSA, null, inFlags);
}
//---------------------------------------------------------------------------
/**
Constructs a PSSM from a multiple sequence alignment.
@param inMSA source multiple sequence alignment
@param inBackgroundAAFreq optional background amino acid frequency table. If null,
an equal residue probability is used.
@param inFlags flag(s) to apply to the PSSM
*/
public PSSM(MultipleSequenceAlignment inMSA, AminoAcidFreqTable inBackgroundAAFreq, Flag... inFlags)
{
// Flags must be set first
setFlags(inFlags);
mBioSequenceType = inMSA.getBioSequenceType();
if (BioSequenceType.NUCLEIC_ACID.equals(mBioSequenceType))
{
mUnknownResidue = 'N';
}
else
{
mUnknownResidue = 'X';
}
PositionalFrequencyMatrix freqMatrix;
if (mFlags.contains(Flag.CASE_SENSITIVE))
{
freqMatrix = inMSA.getPositionFreqMatrix(new PositionalFrequencyMatrix.Flag[] {PositionalFrequencyMatrix.Flag.CASE_SENSITIVE});
}
else
{
freqMatrix = inMSA.getPositionFreqMatrix();
}
if (freqMatrix != null)
{
float totalPsuedocounts = (float) Math.sqrt(inMSA.size());
Map backgroundResidueFreqMap = new HashMap<>(30);
if (inBackgroundAAFreq != null)
{
for (AminoAcid aa : inBackgroundAAFreq.keySet())
{
if (aa.getOneLetterCode() != mUnknownResidue)
{
backgroundResidueFreqMap.put(aa.getOneLetterCode(), inBackgroundAAFreq.get(aa));
}
}
}
else
{
for (Character residue : freqMatrix.getResidueKeys())
{
if (residue != mUnknownResidue)
{
backgroundResidueFreqMap.put(residue, 1f);
}
}
}
mMatrix = new SparseMatrix<>();
mPositionalGapOpenScore = new ArrayList<>();
mPositionalGapExtScore = new ArrayList<>();
List residues = new ArrayList<>(freqMatrix.getResidueKeys());
Collections.sort(residues);
for (Integer position : freqMatrix.getPositionKeys())
{
for (Character residue : residues)
{
if (residue != mUnknownResidue)
{
Float bgFreq = backgroundResidueFreqMap.get(residue);
// Avoid divide by zero
if (null == bgFreq
|| 0.0f == bgFreq)
{
bgFreq = 0.0001f;
}
// float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (freqMatrix.getPositionTotal(position) + totalPsuedocounts);
float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (inMSA.size() + totalPsuedocounts);
// mMatrix.put(position, residue, (float) - Math.log(weightedScore));
mMatrix.put(position, residue, weightedScore * 10);
}
}
float weightedScore = 1 - ((float) freqMatrix.getGapOpenCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts);
mPositionalGapOpenScore.add(weightedScore);
weightedScore = 1 - ((float) freqMatrix.getGapExtCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts);
mPositionalGapExtScore.add(weightedScore);
}
}
}
//---------------------------------------------------------------------------
public PSSM(XMLTag inXMLTag)
{
inXMLTag.verifyTagName(XML_PSSM);
setName(inXMLTag.getAttributeValue(XML_NAME_ATT));
XMLTag residuesTag = inXMLTag.getRequiredSubtagByName(XML_RESIDUES);
List residues = new ArrayList<>(30);
for (char residue : residuesTag.getContent().toCharArray())
{
residues.add(residue);
}
mMatrix = new SparseMatrix<>();
List matrixRowTags = inXMLTag.getSubtagsByName(XML_MATRIX_ROW);
for (XMLTag matrixRowTag : matrixRowTags)
{
int position = Integer.parseInt(matrixRowTag.getAttributeValue(XML_POSITION_ATT));
String[] values = matrixRowTag.getContent().split("\\s+");
int i = 0;
for (Character residue : residues)
{
mMatrix.put(position, residue, Float.parseFloat(values[i++]));
}
}
}
//**************************************************************************
// PUBLIC METHODS
//**************************************************************************
//---------------------------------------------------------------------------
public float score(int inPosition, char inResidue)
{
char residue = inResidue;
if (! mFlags.contains(Flag.CASE_SENSITIVE))
{
residue = Character.toUpperCase(residue);
}
Float score = null;
if (residue != mUnknownResidue)
{
score = mMatrix.get(inPosition, residue);
}
return (score != null ? score : mDefaultScore);
}
//---------------------------------------------------------------------------
public float getGapOpenScore(int inPosition)
{
return mPositionalGapOpenScore.get(inPosition - 1);
}
//---------------------------------------------------------------------------
public float getGapExtScore(int inPosition)
{
return mPositionalGapExtScore.get(inPosition - 1);
}
//---------------------------------------------------------------------------
public String showScoring(Collection inSeqs)
{
StringBuilderPlus buffer = new StringBuilderPlus();
for (Integer position : mMatrix.rowKeySet())
{
buffer.append(String.format("%3d. ", position));
for (BioSequence seq : inSeqs)
{
char residue = seq.getSequence().charAt(position - 1);
buffer.append(String.format("%c %.1f ", residue, score(position, residue)));
}
buffer.appendln();
}
return buffer.toString();
}
//---------------------------------------------------------------------------
public String getName()
{
return mName;
}
//---------------------------------------------------------------------------
public PSSM setName(String inValue)
{
mName = inValue;
return this;
}
//---------------------------------------------------------------------------
public int length()
{
if (null == mLength)
{
int length = 0;
if (mMatrix != null)
{
// Since not all positions might be present in the matrix, return the max position value
List positions = new ArrayList<>(mMatrix.rowKeySet());
Collections.sort(positions);
length = positions.get(positions.size() - 1);
}
mLength = length;
}
return mLength;
}
//---------------------------------------------------------------------------
public XMLTag toXMLTag()
{
XMLTag rootTag = new XMLTag(XML_PSSM);
if (StringUtil.isSet(getName()))
{
rootTag.setAttribute(XML_NAME_ATT, getName());
}
Set residues = mMatrix.colKeySet();
rootTag.addSubtag(new XMLTag(XML_RESIDUES).setContent(StringUtil.join(residues, "")));
for (Integer position : mMatrix.rowKeySet())
{
XMLTag matrixRowTag = new XMLTag(XML_MATRIX_ROW);
matrixRowTag.setAttribute(XML_POSITION_ATT, position);
StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter(" ");
Map rowMap = mMatrix.getRow(position);
for (Character residue : residues)
{
Float value = rowMap.get(residue);
buffer.delimitedAppend(String.format("%.2f", value));
}
matrixRowTag.setContent(buffer);
rootTag.addSubtag(matrixRowTag);
}
return rootTag;
}
//--------------------------------------------------------------------------
private void setFlags(Flag[] inFlags)
{
mFlags.clear();
if (inFlags != null)
{
for (Flag flag : inFlags)
{
mFlags.add(flag);
}
}
}
}