All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.seq.alignment.matrix.PSSM Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.seq.alignment.matrix;


import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.hfg.bio.AminoAcid;
import com.hfg.bio.AminoAcidFreqTable;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.bio.seq.PositionalFrequencyMatrix;
import com.hfg.bio.seq.Protein;
import com.hfg.bio.seq.alignment.MultipleSequenceAlignment;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.SparseMatrix;
import com.hfg.xml.XMLTag;

//------------------------------------------------------------------------------
/**
 Container for a Position-Specific Scoring Matrix (PSSM). Also known as a
 Position-Weighted Matrix.
 
See http://en.wikipedia.org/wiki/Position_weight_matrix
@author J. Alex Taylor, hairyfatguy.com */ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class PSSM { public enum Flag { CASE_SENSITIVE } private String mName; private BioSequenceType mBioSequenceType; private SparseMatrix mMatrix; private List mPositionalGapOpenScore; private List mPositionalGapExtScore; private float mDefaultScore = 0.0f; private char mUnknownResidue; private Set mFlags = new HashSet<>(4); // Cached values private Integer mLength; private static final String XML_PSSM = "PSSM"; private static final String XML_NAME_ATT = "name"; private static final String XML_RESIDUES = "Residues"; private static final String XML_MATRIX_ROW = "MatrixRow"; private static final String XML_POSITION_ATT = "position"; //************************************************************************** // CONSTRUCTORS //************************************************************************** //--------------------------------------------------------------------------- /** Constructs a PSSM from a multiple sequence alignment. @param inMSA source multiple sequence alignment @param inFlags flag(s) to apply to the PSSM */ public PSSM(MultipleSequenceAlignment inMSA, Flag... inFlags) { this(inMSA, null, inFlags); } //--------------------------------------------------------------------------- /** Constructs a PSSM from a multiple sequence alignment. @param inMSA source multiple sequence alignment @param inBackgroundAAFreq optional background amino acid frequency table. If null, an equal residue probability is used. @param inFlags flag(s) to apply to the PSSM */ public PSSM(MultipleSequenceAlignment inMSA, AminoAcidFreqTable inBackgroundAAFreq, Flag... inFlags) { // Flags must be set first setFlags(inFlags); mBioSequenceType = inMSA.getBioSequenceType(); if (BioSequenceType.NUCLEIC_ACID.equals(mBioSequenceType)) { mUnknownResidue = 'N'; } else { mUnknownResidue = 'X'; } PositionalFrequencyMatrix freqMatrix; if (mFlags.contains(Flag.CASE_SENSITIVE)) { freqMatrix = inMSA.getPositionFreqMatrix(new PositionalFrequencyMatrix.Flag[] {PositionalFrequencyMatrix.Flag.CASE_SENSITIVE}); } else { freqMatrix = inMSA.getPositionFreqMatrix(); } if (freqMatrix != null) { float totalPsuedocounts = (float) Math.sqrt(inMSA.size()); Map backgroundResidueFreqMap = new HashMap<>(30); if (inBackgroundAAFreq != null) { for (AminoAcid aa : inBackgroundAAFreq.keySet()) { if (aa.getOneLetterCode() != mUnknownResidue) { backgroundResidueFreqMap.put(aa.getOneLetterCode(), inBackgroundAAFreq.get(aa)); } } } else { for (Character residue : freqMatrix.getResidueKeys()) { if (residue != mUnknownResidue) { backgroundResidueFreqMap.put(residue, 1f); } } } mMatrix = new SparseMatrix<>(); mPositionalGapOpenScore = new ArrayList<>(); mPositionalGapExtScore = new ArrayList<>(); List residues = new ArrayList<>(freqMatrix.getResidueKeys()); Collections.sort(residues); for (Integer position : freqMatrix.getPositionKeys()) { for (Character residue : residues) { if (residue != mUnknownResidue) { Float bgFreq = backgroundResidueFreqMap.get(residue); // Avoid divide by zero if (null == bgFreq || 0.0f == bgFreq) { bgFreq = 0.0001f; } // float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (freqMatrix.getPositionTotal(position) + totalPsuedocounts); float weightedScore = ((float) freqMatrix.getCount(residue, position) + (totalPsuedocounts * bgFreq)) / (inMSA.size() + totalPsuedocounts); // mMatrix.put(position, residue, (float) - Math.log(weightedScore)); mMatrix.put(position, residue, weightedScore * 10); } } float weightedScore = 1 - ((float) freqMatrix.getGapOpenCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts); mPositionalGapOpenScore.add(weightedScore); weightedScore = 1 - ((float) freqMatrix.getGapExtCount(position) + (totalPsuedocounts * 0.0001f)) / (inMSA.size() + totalPsuedocounts); mPositionalGapExtScore.add(weightedScore); } } } //--------------------------------------------------------------------------- public PSSM(XMLTag inXMLTag) { inXMLTag.verifyTagName(XML_PSSM); setName(inXMLTag.getAttributeValue(XML_NAME_ATT)); XMLTag residuesTag = inXMLTag.getRequiredSubtagByName(XML_RESIDUES); List residues = new ArrayList<>(30); for (char residue : residuesTag.getContent().toCharArray()) { residues.add(residue); } mMatrix = new SparseMatrix<>(); List matrixRowTags = inXMLTag.getSubtagsByName(XML_MATRIX_ROW); for (XMLTag matrixRowTag : matrixRowTags) { int position = Integer.parseInt(matrixRowTag.getAttributeValue(XML_POSITION_ATT)); String[] values = matrixRowTag.getContent().split("\\s+"); int i = 0; for (Character residue : residues) { mMatrix.put(position, residue, Float.parseFloat(values[i++])); } } } //************************************************************************** // PUBLIC METHODS //************************************************************************** //--------------------------------------------------------------------------- public float score(int inPosition, char inResidue) { char residue = inResidue; if (! mFlags.contains(Flag.CASE_SENSITIVE)) { residue = Character.toUpperCase(residue); } Float score = null; if (residue != mUnknownResidue) { score = mMatrix.get(inPosition, residue); } return (score != null ? score : mDefaultScore); } //--------------------------------------------------------------------------- public float getGapOpenScore(int inPosition) { return mPositionalGapOpenScore.get(inPosition - 1); } //--------------------------------------------------------------------------- public float getGapExtScore(int inPosition) { return mPositionalGapExtScore.get(inPosition - 1); } //--------------------------------------------------------------------------- public String showScoring(Collection inSeqs) { StringBuilderPlus buffer = new StringBuilderPlus(); for (Integer position : mMatrix.rowKeySet()) { buffer.append(String.format("%3d. ", position)); for (BioSequence seq : inSeqs) { char residue = seq.getSequence().charAt(position - 1); buffer.append(String.format("%c %.1f ", residue, score(position, residue))); } buffer.appendln(); } return buffer.toString(); } //--------------------------------------------------------------------------- public String getName() { return mName; } //--------------------------------------------------------------------------- public PSSM setName(String inValue) { mName = inValue; return this; } //--------------------------------------------------------------------------- public int length() { if (null == mLength) { int length = 0; if (mMatrix != null) { // Since not all positions might be present in the matrix, return the max position value List positions = new ArrayList<>(mMatrix.rowKeySet()); Collections.sort(positions); length = positions.get(positions.size() - 1); } mLength = length; } return mLength; } //--------------------------------------------------------------------------- public XMLTag toXMLTag() { XMLTag rootTag = new XMLTag(XML_PSSM); if (StringUtil.isSet(getName())) { rootTag.setAttribute(XML_NAME_ATT, getName()); } Set residues = mMatrix.colKeySet(); rootTag.addSubtag(new XMLTag(XML_RESIDUES).setContent(StringUtil.join(residues, ""))); for (Integer position : mMatrix.rowKeySet()) { XMLTag matrixRowTag = new XMLTag(XML_MATRIX_ROW); matrixRowTag.setAttribute(XML_POSITION_ATT, position); StringBuilderPlus buffer = new StringBuilderPlus().setDelimiter(" "); Map rowMap = mMatrix.getRow(position); for (Character residue : residues) { Float value = rowMap.get(residue); buffer.delimitedAppend(String.format("%.2f", value)); } matrixRowTag.setContent(buffer); rootTag.addSubtag(matrixRowTag); } return rootTag; } //-------------------------------------------------------------------------- private void setFlags(Flag[] inFlags) { mFlags.clear(); if (inFlags != null) { for (Flag flag : inFlags) { mFlags.add(flag); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy