// Listing header (web page residue, not part of the original source file):
// com.hfg.bio.seq.alignment.MultipleSequenceAlignment, from the com_hfg artifact
// (com.hfg xml, html, svg, and bioinformatics utility library).
package com.hfg.bio.seq.alignment;
import java.util.*;
import com.hfg.bio.phylogeny.DistanceMatrix;
import com.hfg.bio.phylogeny.DistanceMatrixModel;
import com.hfg.bio.phylogeny.UncorrectedModel;
import com.hfg.bio.seq.PositionalFrequencyMatrix;
import com.hfg.bio.seq.BioSequence;
import com.hfg.bio.seq.BioSequenceType;
import com.hfg.exception.ProgrammingException;
import com.hfg.network.Edge;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.DataTable;
import com.hfg.util.collection.SparseMatrix;
//------------------------------------------------------------------------------
/**
Container for aligned sequences.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class MultipleSequenceAlignment implements Cloneable
{
//**************************************************************************
// PRIVATE FIELDS
//**************************************************************************
private String mId;
private String mTitle;
private List mAlignedSeqs = new ArrayList<>();
private int mAlignmentLength;
private BioSequenceType mBioSequenceType;
private PositionalFrequencyMatrix mPositionFreqMatrix;
private DataTable mDataTable;
private Map mAttributes;
//**************************************************************************
// CONSTRUCTORS
//**************************************************************************
//---------------------------------------------------------------------------
public MultipleSequenceAlignment()
{
}
//---------------------------------------------------------------------------
public MultipleSequenceAlignment(Collection inAlignedSeqs)
{
if (inAlignedSeqs != null)
{
for (T seq : inAlignedSeqs)
{
addSequence(seq);
}
}
}
//**************************************************************************
// PUBLIC METHODS
//**************************************************************************
//---------------------------------------------------------------------------
public MultipleSequenceAlignment setId(String inValue)
{
mId = inValue;
return this;
}
//---------------------------------------------------------------------------
public String getId()
{
return mId;
}
//---------------------------------------------------------------------------
public MultipleSequenceAlignment setTitle(String inValue)
{
mTitle = inValue;
return this;
}
//---------------------------------------------------------------------------
public String getTitle()
{
return mTitle;
}
//---------------------------------------------------------------------------
public MultipleSequenceAlignment setDataTable(DataTable inValue)
{
mDataTable = inValue;
return this;
}
//---------------------------------------------------------------------------
public DataTable getDataTable()
{
return mDataTable;
}
//---------------------------------------------------------------------------
@Override
public MultipleSequenceAlignment clone()
{
MultipleSequenceAlignment cloneObj;
try
{
cloneObj = (MultipleSequenceAlignment) super.clone();
}
catch (CloneNotSupportedException e)
{
throw new ProgrammingException(e);
}
if (mAlignedSeqs != null)
{
cloneObj.mAlignedSeqs = new ArrayList<>(mAlignedSeqs.size());
for (T seq : mAlignedSeqs)
{
cloneObj.mAlignedSeqs.add((T) seq.clone());
}
}
if (mPositionFreqMatrix != null)
{
cloneObj.mPositionFreqMatrix = mPositionFreqMatrix.clone();
}
return cloneObj;
}
//---------------------------------------------------------------------------
public void clearCachedData()
{
mPositionFreqMatrix = null;
}
//---------------------------------------------------------------------------
public MultipleSequenceAlignment subset(Collection inSeqIds)
{
MultipleSequenceAlignment subset = clone();
subset.mAlignedSeqs.clear();
for (String seqId : inSeqIds)
{
subset.mAlignedSeqs.add(getSequence(seqId));
}
// Force recalculation of the position freq. data
subset.mPositionFreqMatrix = null;
return subset;
}
//---------------------------------------------------------------------------
public void addSequence(T inSeq)
{
if (inSeq != null)
{
if (inSeq.length() != mAlignmentLength
&& CollectionUtil.hasValues(mAlignedSeqs))
{
throw new RuntimeException(inSeq.getID() + "'s length of "
+ inSeq.length() + " is different from the alignment length ("
+ mAlignmentLength + ")! They must be the same.");
}
mAlignedSeqs.add(inSeq);
if (0 == mAlignmentLength) mAlignmentLength = inSeq.length();
}
if (mPositionFreqMatrix != null)
{
mPositionFreqMatrix.addSequence(inSeq);
}
}
//---------------------------------------------------------------------------
public List getSequences()
{
return mAlignedSeqs;
}
//---------------------------------------------------------------------------
public T getSequence(String inSeqId)
{
T requestedSeq = null;
for (T seq : getSequences())
{
if (seq.getID().equals(inSeqId))
{
requestedSeq = seq;
break;
}
}
return requestedSeq;
}
//---------------------------------------------------------------------------
public void removeSequence(T inSeq)
{
getSequences().remove(inSeq);
clearCachedData();
}
//---------------------------------------------------------------------------
public void addInsert(int inIndex)
{
for (T seq : getSequences())
{
StringBuilder buffer = new StringBuilder(seq.getSequence());
buffer.insert(inIndex, "-");
seq.setSequence(buffer);
}
mAlignmentLength++;
mPositionFreqMatrix = null;
}
//---------------------------------------------------------------------------
public int size()
{
return (getSequences() != null ? getSequences().size() : 0);
}
//---------------------------------------------------------------------------
public BioSequenceType getBioSequenceType()
{
if (null == mBioSequenceType
&& CollectionUtil.hasValues(mAlignedSeqs))
{
mBioSequenceType = mAlignedSeqs.get(0).getType();
}
return mBioSequenceType;
}
//---------------------------------------------------------------------------
public int length()
{
return mAlignmentLength;
}
//---------------------------------------------------------------------------
public PositionalFrequencyMatrix getPositionFreqMatrix()
{
return getPositionFreqMatrix(null);
}
//---------------------------------------------------------------------------
public PositionalFrequencyMatrix getPositionFreqMatrix(PositionalFrequencyMatrix.Flag[] inFlags)
{
if (mPositionFreqMatrix != null)
{
Set currentFlags = mPositionFreqMatrix.getFlags();
boolean flagsDiffer = true;
if (null == inFlags)
{
if (! CollectionUtil.hasValues(currentFlags))
{
flagsDiffer = false;
}
}
else if (CollectionUtil.hasValues(currentFlags)
&& inFlags.length == currentFlags.size())
{
for (PositionalFrequencyMatrix.Flag flag : inFlags)
{
flagsDiffer = false;
if (! currentFlags.contains(flag))
{
flagsDiffer = true;
break;
}
}
}
if (flagsDiffer)
{
mPositionFreqMatrix = new PositionalFrequencyMatrix(this, inFlags);
}
}
else
{
if (CollectionUtil.hasValues(getSequences()))
{
mPositionFreqMatrix = new PositionalFrequencyMatrix(this, inFlags);
}
}
return mPositionFreqMatrix;
}
/*
//---------------------------------------------------------------------------
public SparseMatrix getPositionProbabilityMatrix()
{
SparseMatrix probabilityMatrix = null;
int size = size();
PositionalFrequencyMatrix freqMatrix = getPositionFreqMatrix();
if (freqMatrix != null)
{
probabilityMatrix = new SparseMatrix();
for (Character residue : freqMatrix.getResidueKeys())
{
for (Integer position : freqMatrix.getPositionKeys())
{
probabilityMatrix.put(residue, position, freqMatrix.getCount(residue, position) / (float) size);
}
}
}
return probabilityMatrix;
}
*/
//---------------------------------------------------------------------------
/**
Returns a percent identity matrix adjusted for any terminal gaps.
@return the generated percent identity matrix
*/
public SparseMatrix getPctIdentityMatrix()
{
int matrixWidth = mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10;
SparseMatrix matrix = new SparseMatrix<>(matrixWidth, matrixWidth);
if (mAlignedSeqs != null)
{
// Calculate the pct. id between ea. pair of sequences.
// (The A-B pct. id is the not necessarily the same as the B-A pct. id.)
for (int i = 0; i < mAlignedSeqs.size(); i++)
{
BioSequence seq1 = mAlignedSeqs.get(i);
matrix.put(seq1.getID(), seq1.getID(), 100f); // Identity diagonal
for (int j = i + 1; j < mAlignedSeqs.size(); j++)
{
BioSequence seq2 = mAlignedSeqs.get(j);
AlignedQuery query = new AlignedQuery(seq1, seq1.getSequence(), 1);
AlignedSubject subject = new AlignedSubject(seq2, seq2.getSequence(), 1);
PairwiseSeqAlignment pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject);
matrix.put(seq1.getID(), seq2.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity());
// Invert
query = new AlignedQuery(seq2, seq2.getSequence(), 1);
subject = new AlignedSubject(seq1, seq1.getSequence(), 1);
pairwiseSeqAlignment = new PairwiseSeqAlignment(query, subject);
matrix.put(seq2.getID(), seq1.getID(), pairwiseSeqAlignment.getAdjustedPctIdentity());
}
}
}
return matrix;
}
//---------------------------------------------------------------------------
/**
Returns a distance matrix using the specified model. For a simple distance matrix
based on mismatches and without any evolutionary compensation, use the UncorrectedModel.
@param inAlgorithm the distance matrix mode to use when calculating the distance matrix
@return the generated DistanceMatrix
*/
public DistanceMatrix getDistanceMatrix(DistanceMatrixModel inAlgorithm)
{
if (null == inAlgorithm)
{
throw new RuntimeException("A DistanceMatrixAlgorithm must be specified!");
}
DistanceMatrix matrix = new DistanceMatrix(mAlignedSeqs != null ? mAlignedSeqs.size() + 10 : 10);
if (mAlignedSeqs != null)
{
// Calculate the distance between ea. pair of sequences.
// (The A-B distance is the same as the B-A distance.)
for (int i = 0; i < mAlignedSeqs.size() - 1; i++)
{
BioSequence seq1 = mAlignedSeqs.get(i);
for (int j = i + 1; j < mAlignedSeqs.size(); j++)
{
BioSequence seq2 = mAlignedSeqs.get(j);
matrix.setDistance(seq1.getID(), seq2.getID(), inAlgorithm.calculateDistance(seq1, seq2));
}
}
}
return matrix;
}
//---------------------------------------------------------------------------
public void orderByDistanceTo(String inSeqID)
{
if (null == getSequence(inSeqID))
{
throw new RuntimeException("The MSA does not contain a sequence with id " + StringUtil.singleQuote(inSeqID) + "!");
}
DistanceMatrix matrix = getDistanceMatrix(new UncorrectedModel());
List> sortedEdges = matrix.getSortedEdges(inSeqID);
List resortedAlignedSeqs = new ArrayList(getSequences().size());
for (Edge edge : sortedEdges)
{
resortedAlignedSeqs.add(getSequence(edge.getTo()));
}
mAlignedSeqs = resortedAlignedSeqs;
}
//---------------------------------------------------------------------------
public List getPositionResidues(int inPosition)
{
List positionResidues = new ArrayList<>(size());
for (T seq : getSequences())
{
positionResidues.add(seq.residueAt(inPosition));
}
return positionResidues;
}
//---------------------------------------------------------------------------
public Set getPositionResidueSet(int inPosition)
{
Set positionResidues = new HashSet<>(20);
for (T seq : getSequences())
{
positionResidues.add(seq.residueAt(inPosition));
}
return positionResidues;
}
//--------------------------------------------------------------------------
public void setAttribute(String inName, Object inValue)
{
if (null == mAttributes)
{
mAttributes = new HashMap<>();
}
mAttributes.put(inName, inValue);
}
//--------------------------------------------------------------------------
public boolean hasAttribute(String inName)
{
return mAttributes != null && mAttributes.containsKey(inName);
}
//--------------------------------------------------------------------------
public Object getAttribute(String inName)
{
Object attr = null;
if (mAttributes != null)
{
attr = mAttributes.get(inName);
}
return attr;
}
//--------------------------------------------------------------------------
public Collection getAttributeNames()
{
Collection attrNames = null;
if (mAttributes != null)
{
attrNames = mAttributes.keySet();
}
return attrNames;
}
//--------------------------------------------------------------------------
public void clearAttributes()
{
if (mAttributes != null)
{
mAttributes.clear();
}
}
//--------------------------------------------------------------------------
public Object removeAttribute(String inName)
{
Object attr = null;
if (mAttributes != null)
{
attr = mAttributes.remove(inName);
}
return attr;
}
//**************************************************************************
// PROTECTED METHODS
//**************************************************************************
//---------------------------------------------------------------------------
protected void setLength(int inValue)
{
mAlignmentLength = inValue;
}
}