All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.alignment.matrix.PSSM Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.alignment.matrix;


import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.hfg.bio.AminoAcid;
import com.hfg.bio.AminoAcidFreqTable;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.PositionalFrequencyMatrix;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.MultipleSequenceAlignment;
import com.hfg.math.MathUtil;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.OrderedSet;
import com.hfg.xml.XMLTag;

//------------------------------------------------------------------------------
/**
 Container for a Position-Specific Scoring Matrix (PSSM). Also known as a
 Position-Weighted Matrix.
 
See http://en.wikipedia.org/wiki/Position_weight_matrix
@author J. Alex Taylor, hairyfatguy.com */ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class PSSM { public enum Flag { CASE_SENSITIVE, LOG2_WEIGHTED } private String mName; private BioSequenceType mBioSequenceType; private List mPositions; private int mMinPosition; private int mMaxPosition; private List mResidues; private String mResiduesString; private Float[][] mMatrixArray; private List mPositionalGapOpenScore; private List mPositionalGapExtScore; private float mDefaultScore = 0.0f; private char mUnknownResidue; private Set mFlags = new HashSet<>(4); // Cached values private Integer mLength; private boolean mCaseSensitive; private static final String XML_PSSM = "PSSM"; private static final String XML_NAME_ATT = "name"; private static final String XML_RESIDUES = "Residues"; private static final String XML_MATRIX_ROW = "MatrixRow"; private static final String XML_POSITION_ATT = "position"; //########################################################################### // CONSTRUCTORS //########################################################################### //--------------------------------------------------------------------------- /** Constructs a PSSM from a multiple sequence alignment. @param inMSA source multiple sequence alignment @param inFlags flag(s) to apply to the PSSM */ public PSSM(MultipleSequenceAlignment inMSA, Flag... inFlags) { this(inMSA, null, inFlags); } //--------------------------------------------------------------------------- /** Constructs a PSSM from a multiple sequence alignment. @param inMSA source multiple sequence alignment @param inBackgroundAAFreq optional background amino acid frequency table. If null, an equal residue probability is used. @param inFlags flag(s) to apply to the PSSM */ public PSSM(MultipleSequenceAlignment inMSA, AminoAcidFreqTable inBackgroundAAFreq, Flag... inFlags) { // Flags must be set first setFlags(inFlags); mBioSequenceType = inMSA.getBioSequenceType(); if (BioSequenceType.NUCLEIC_ACID.equals(mBioSequenceType)) { mUnknownResidue = 'N'; } else { mUnknownResidue = 'X'; } mResidues = new OrderedSet<>(30); PositionalFrequencyMatrix freqMatrix; if (mCaseSensitive) { freqMatrix = inMSA.getPositionFreqMatrix(new PositionalFrequencyMatrix.Flag[] {PositionalFrequencyMatrix.Flag.CASE_SENSITIVE}); } else { freqMatrix = inMSA.getPositionFreqMatrix(); } if (freqMatrix != null) { float totalPsuedocounts = (float) Math.sqrt(inMSA.size()); for (Character residue : freqMatrix.getResidueKeys()) { mResidues.add(residue); } Map backgroundResidueFreqMap = new HashMap<>(30); if (inBackgroundAAFreq != null) { for (AminoAcid aa : inBackgroundAAFreq.keySet()) { mResidues.add(aa.getOneLetterCode()); if (aa.getOneLetterCode() != mUnknownResidue) { backgroundResidueFreqMap.put(aa.getOneLetterCode(), inBackgroundAAFreq.get(aa)); } } } else { for (Character residue : freqMatrix.getResidueKeys()) { if (residue != mUnknownResidue) { backgroundResidueFreqMap.put(residue, 1f); } } } mResiduesString = StringUtil.join(mResidues, ""); boolean log2Weighted = mFlags.contains(Flag.LOG2_WEIGHTED); // Determine the number of rows and columns mPositions = new ArrayList<>(freqMatrix.getPositionKeys()); Collections.sort(mPositions); mMinPosition = mPositions.get(0); mMaxPosition = mPositions.get(mPositions.size() - 1); int numRows = mMaxPosition - mMinPosition + 1; int numCols = mResidues.size(); // Create the 2D array mMatrixArray = new Float[numRows][numCols]; mPositionalGapOpenScore = new ArrayList<>(); mPositionalGapExtScore = new ArrayList<>(); for (Integer position : freqMatrix.getPositionKeys()) { for (Character residue : mResidues) { if (residue != mUnknownResidue) { Float bgFreq = backgroundResidueFreqMap.get(residue); // Avoid divide by zero if (null == bgFreq || 0.0f == bgFreq) { bgFreq = 0.0001f; } // float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (freqMatrix.getPositionTotal(position) + totalPsuedocounts); float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (inMSA.size() + totalPsuedocounts); if (log2Weighted) { weightedScore = (float) MathUtil.log2(weightedScore); } int residueIdx = mResiduesString.indexOf(residue); mMatrixArray[position - mMinPosition][residueIdx] = weightedScore * 10; } } float weightedScore = 1 - ((float) freqMatrix.getGapOpenCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts); if (log2Weighted) { weightedScore = (float) MathUtil.log2(weightedScore); } mPositionalGapOpenScore.add(weightedScore); weightedScore = 1 - ((float) freqMatrix.getGapExtCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts); if (log2Weighted) { weightedScore = (float) MathUtil.log2(weightedScore); } mPositionalGapExtScore.add(weightedScore); } } } //--------------------------------------------------------------------------- public PSSM(XMLTag inXMLTag) { inXMLTag.verifyTagName(XML_PSSM); setName(inXMLTag.getAttributeValue(XML_NAME_ATT)); XMLTag residuesTag = inXMLTag.getRequiredSubtagByName(XML_RESIDUES); mResidues = new ArrayList<>(30); for (char residue : residuesTag.getContent().toCharArray()) { mResidues.add(residue); } mResiduesString = StringUtil.join(mResidues, ""); List matrixRowTags = inXMLTag.getSubtagsByName(XML_MATRIX_ROW); List mPositions = new ArrayList<>(matrixRowTags.size()); for (XMLTag rowTag : matrixRowTags) { mPositions.add(Integer.parseInt(rowTag.getAttributeValue(XML_POSITION_ATT))); } Collections.sort(mPositions); mMinPosition = mPositions.get(0); mMaxPosition = mPositions.get(mPositions.size() - 1); // Determine the number of rows and columns int numRows = mMaxPosition - mMinPosition + 1; int numCols = mResidues.size(); // Create the 2D array mMatrixArray = new Float[numRows][numCols]; for (XMLTag matrixRowTag : matrixRowTags) { int position = Integer.parseInt(matrixRowTag.getAttributeValue(XML_POSITION_ATT)); String[] values = matrixRowTag.getContent().split("\\s+"); for (int j = 0; j < mResidues.size(); j++) { mMatrixArray[position - mMinPosition][j] = Float.parseFloat(values[j]); } } } //########################################################################### // PUBLIC METHODS //########################################################################### //--------------------------------------------------------------------------- public float score(int inPosition, char inResidue) { char residue = inResidue; if (! mCaseSensitive) { residue = Character.toUpperCase(residue); } float score = mDefaultScore; if (residue != mUnknownResidue && inPosition >= mMinPosition && inPosition <= mMaxPosition) { // score = mMatrix.get(inPosition, residue); int residueIdx = mResiduesString.indexOf(residue); if (residueIdx >= 0) { score = mMatrixArray[inPosition - mMinPosition][residueIdx]; } } return score; } //--------------------------------------------------------------------------- public float getGapOpenScore(int inPosition) { return mPositionalGapOpenScore.get(inPosition - 1); } //--------------------------------------------------------------------------- // Used for eaking out a little more performance vs. repeated calls to getGapOpenScore(). public float[] getGapOpenScores() { float[] values = new float[mPositionalGapOpenScore.size()]; int i = 0; for (Float value : mPositionalGapOpenScore) { values[i++] = (value != null ? value : Float.NaN); } return values; } //--------------------------------------------------------------------------- public float getGapExtScore(int inPosition) { return mPositionalGapExtScore.get(inPosition - 1); } //--------------------------------------------------------------------------- public String showScoring(Collection inSeqs) { StringBuilderPlus buffer = new StringBuilderPlus(); for (Integer position : mPositions) { buffer.append(String.format("%3d. ", position)); for (BioSequence seq : inSeqs) { char residue = seq.getSequence().charAt(position - 1); buffer.append(String.format("%c %.1f ", residue, score(position, residue))); } buffer.appendln(); } return buffer.toString(); } //--------------------------------------------------------------------------- public String getName() { return mName; } //--------------------------------------------------------------------------- public PSSM setName(String inValue) { mName = inValue; return this; } //--------------------------------------------------------------------------- public int length() { if (null == mLength) { int length = 0; if (mMatrixArray != null) { // Since not all positions might be present in the matrix, return the max position value length = mMaxPosition; } mLength = length; } return mLength; } //--------------------------------------------------------------------------- public XMLTag toXMLTag() { XMLTag rootTag = new XMLTag(XML_PSSM); if (StringUtil.isSet(getName())) { rootTag.setAttribute(XML_NAME_ATT, getName()); } rootTag.addSubtag(new XMLTag(XML_RESIDUES).setContent(StringUtil.join(mResidues, ""))); for (Integer position : mPositions) { XMLTag matrixRowTag = new XMLTag(XML_MATRIX_ROW); matrixRowTag.setAttribute(XML_POSITION_ATT, position); StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter(" "); for (Character residue : mResidues) { int residueIdx = mResiduesString.indexOf(residue); Float value = mMatrixArray[position - mMinPosition][residueIdx]; buffer.delimitedAppend(String.format("%.2f", value)); } matrixRowTag.setContent(buffer); rootTag.addSubtag(matrixRowTag); } return rootTag; } //########################################################################### // PRIVATE METHODS //########################################################################### //-------------------------------------------------------------------------- private void setFlags(Flag[] inFlags) { mFlags.clear(); mCaseSensitive = false; if (inFlags != null) { for (Flag flag : inFlags) { mFlags.add(flag); } if (mFlags.contains(Flag.CASE_SENSITIVE)) { mCaseSensitive = true; } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy