com.hfg.bio.phylogeny.DistanceMatrix Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com_hfg Show documentation
Show all versions of com_hfg Show documentation
com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.bio.phylogeny;
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import com.hfg.util.StringUtil;
import com.hfg.network.Edge;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.MatrixCell;
import com.hfg.util.collection.SymmetricNumberMatrix;
//------------------------------------------------------------------------------
/**
Simple distance matrix container. Can read Phylip-format matrices or general
matrices where the identifiers do not contain whitespace.
Most commonly constructed via a multiple sequence alignment:
DistanceMatrix matrix = msa.getDistanceMatrix(new UncorrectedModel());
Phylip-formatted matrix example:
String testMatrix =
"    8\n" +
"Mouse     \n" +
"Bovine      1.7043\n" +
"Lemur       2.0235  1.1901\n" +
"Tarsier     2.1378  1.3287  1.2905\n" +
"Squir Monk  1.5232  1.2423  1.3199  1.7878\n" +
"Jpn Macaq   1.8261  1.2508  1.3887  1.3137  1.0642\n" +
"Rhesus Mac  1.9182  1.2536  1.4658  1.3788  1.1124  0.1022\n" +
"Crab-E.Mac  2.0039  1.3066  1.4826  1.3826  0.9832  0.2061  0.2681\n";
DistanceMatrix matrix = new DistanceMatrix(testMatrix);
General matrix example:
String testMatrix =
"# Subunit distance matrix\n" +
"Alpha 0.000 1.000 2.000 3.000 3.000\n" +
"Beta 1.000 0.000 2.000 3.000 3.000\n" +
"Gamma 2.000 2.000 0.000 3.000 3.000\n" +
"Delta 3.000 3.000 3.000 0.000 1.000\n" +
"Epsilon 3.000 3.000 3.000 1.000 0.000\n\n";
DistanceMatrix matrix = new DistanceMatrix(testMatrix);
See the Phylip "Distance matrix programs" documentation page for the
description of the Phylip distance matrix format.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class DistanceMatrix implements Cloneable
{
// Backing store for the pairwise distances; every public method delegates to it.
// NOTE(review): the type arguments appear to be missing here (the constructor uses the
// diamond operator and callers read String keys / Float values) - confirm the declaration.
private SymmetricNumberMatrix mMatrix;
// Flag that is only stored and returned by setIsConsumable()/isConsumable(); defaults to true.
private boolean mIsConsumable = true;
// Flag set by setIsConsumed() and returned by isConsumed(); nothing in this class resets it.
private boolean mIsConsumed;
// One Phylip data row: group 1 is the first 1-10 characters (the name field) and the
// optional group 2 is whatever follows once any separating whitespace has been skipped.
private static final Pattern sPhylipPattern = Pattern.compile("(.{1,10})(?:\\s*(.+))?");
// Phylip header detector: the input must start with a line holding only an integer
// (optionally surrounded by whitespace) that is followed by a line terminator.
private static final Pattern sPhylipFirstLinePattern = Pattern.compile("^\\s*\\d+\\s*[\\n\\r]");
//##########################################################################
// CONSTRUCTORS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Creates an empty matrix by delegating to the int constructor with an initial size of 100.
 */
public DistanceMatrix()
{
this(100);
}
//--------------------------------------------------------------------------
/**
 Creates an empty matrix.
 @param inInitialSize value handed to the SymmetricNumberMatrix constructor
 (presumably an initial capacity - confirm against SymmetricNumberMatrix)
 */
public DistanceMatrix(int inInitialSize)
{
mMatrix = new SymmetricNumberMatrix<>(inInitialSize);
}
//--------------------------------------------------------------------------
/**
 Creates a matrix by parsing the specified matrix text. The format (Phylip or
 general) is detected by parseMatrix().
 @param inMatrix the matrix text. If StringUtil.isSet() reports it as not set,
 nothing is parsed and the matrix is left empty.
 @throws RuntimeException wrapping any IOException raised while reading the text
 */
public DistanceMatrix(String inMatrix)
{
   this();

   if (StringUtil.isSet(inMatrix))
   {
      // try-with-resources closes the reader on every exit path, replacing the
      // hand-written null-check / finally bookkeeping.
      try (BufferedReader reader = new BufferedReader(new StringReader(inMatrix)))
      {
         parseMatrix(reader);
      }
      catch (IOException e)
      {
         throw new RuntimeException("Problem parsing distance matrix!", e);
      }
   }
}
//--------------------------------------------------------------------------
/**
 Creates a matrix by parsing the contents of the specified file. The format
 (Phylip or general) is detected by parseMatrix().
 @param inMatrix the file containing the matrix text
 @throws IOException if inMatrix is null, does not exist, or cannot be read
 */
public DistanceMatrix(File inMatrix)
   throws IOException
{
   this();

   if (null == inMatrix)
   {
      throw new IOException("No file was specified!");
   }

   if (! inMatrix.exists())
   {
      throw new IOException("The file " + StringUtil.singleQuote(inMatrix.getPath()) + " does not exist!");
   }

   // try-with-resources: a failure in close() can no longer replace an exception
   // thrown while parsing (it is attached as a suppressed exception instead).
   // NOTE(review): FileReader decodes with its default charset; left unchanged on purpose.
   try (BufferedReader reader = new BufferedReader(new FileReader(inMatrix)))
   {
      parseMatrix(reader);
   }
}
//##########################################################################
// PUBLIC METHODS
//##########################################################################
//--------------------------------------------------------------------------
/**
 Stores the consumable flag. This class only records the value; how it is
 interpreted is decided by callers.
 @param inValue the new flag value
 @return this matrix, to allow call chaining
 */
public DistanceMatrix setIsConsumable(boolean inValue)
{
mIsConsumable = inValue;
return this;
}
//--------------------------------------------------------------------------
/**
 Returns the consumable flag (true unless changed via setIsConsumable()).
 @return the current value of the consumable flag
 */
public boolean isConsumable()
{
return mIsConsumable;
}
//--------------------------------------------------------------------------
/**
 Marks this matrix as consumed. There is no corresponding way to clear the flag in this class.
 @return this matrix, to allow call chaining
 */
public DistanceMatrix setIsConsumed()
{
mIsConsumed = true;
return this;
}
//--------------------------------------------------------------------------
/**
 Reports whether setIsConsumed() has been called on this matrix.
 @return the consumed flag
 */
public boolean isConsumed()
{
return mIsConsumed;
}
//--------------------------------------------------------------------------
/**
 Adds a key to the underlying matrix by delegating to its addKey().
 @param inKey the key to add
 */
public void addKey(String inKey)
{
mMatrix.addKey(inKey);
}
//--------------------------------------------------------------------------
/**
 Removes a key from the underlying matrix by delegating to its removeKey().
 @param inKey the key to remove
 */
public void removeKey(String inKey)
{
mMatrix.removeKey(inKey);
}
//--------------------------------------------------------------------------
/**
 Removes several keys from the underlying matrix by delegating to its removeKeys().
 @param inKeys the keys to remove
 */
// NOTE(review): inKeys is a raw Set; the other key-based methods take String keys, so
// this was presumably a Set of String - confirm before tightening the signature.
public void removeKeys(Set inKeys)
{
mMatrix.removeKeys(inKeys);
}
//--------------------------------------------------------------------------
/**
 Renames the key inOldKey to inNewKey by delegating to the underlying matrix.
 Can be useful for re-expanding names after they were shortened in order to
 comply with the 10-character Phylip format limitation.
 @param inOldKey the old matrix key
 @param inNewKey the new matrix key
 */
public void changeKey(String inOldKey, String inNewKey)
{
mMatrix.changeKey(inOldKey, inNewKey);
}
//--------------------------------------------------------------------------
/**
 Stores the distance between two different keys. A call where both keys are
 equal is silently ignored, so this method never writes a self-distance.
 @param inKey1 the first key (must not be null)
 @param inKey2 the second key
 @param inDistance the distance to store for the key pair
 */
public void setDistance(String inKey1, String inKey2, Float inDistance)
{
   if (inKey1.equals(inKey2))
   {
      return; // Identity cell: nothing to store.
   }

   mMatrix.put(inKey1, inKey2, inDistance);
}
//--------------------------------------------------------------------------
/**
 Returns the distance between two keys.
 @param inKey1 the first key (must not be null)
 @param inKey2 the second key
 @return 0.0 when both keys are equal (the matrix is not consulted); otherwise
 whatever the underlying matrix returns for the pair (presumably null when no
 value is stored - getNearestNeighbor() guards against null results)
 */
public Float getDistance(String inKey1, String inKey2)
{
   // The distance from a key to itself is defined as zero regardless of matrix content.
   if (inKey1.equals(inKey2))
   {
      return 0.0f;
   }

   return mMatrix.get(inKey1, inKey2);
}
//--------------------------------------------------------------------------
/**
 Returns the size reported by the underlying matrix.
 @return the value of the underlying matrix's size()
 (NOTE(review): how this differs from numKeys() is defined by SymmetricNumberMatrix - confirm)
 */
public int size()
{
return mMatrix.size();
}
//--------------------------------------------------------------------------
/**
 Returns the number of keys reported by the underlying matrix.
 @return the value of the underlying matrix's numKeys()
 */
public int numKeys()
{
return mMatrix.numKeys();
}
//--------------------------------------------------------------------------
/**
 Returns the matrix keys wrapped so that callers cannot modify them.
 The element type is declared as String because the keys are added as Strings
 and callers in this class iterate the result with a String loop variable.
 @return an unmodifiable collection of the matrix keys
 */
public Collection<String> keySet()
{
   return Collections.unmodifiableCollection(mMatrix.keySet());
}
//--------------------------------------------------------------------------
/**
 Returns the net divergence for the specified key, which this class defines as
 the underlying matrix's getSumForKey() result for that key.
 @param inKey the key whose distances should be summed
 @return the sum reported by the underlying matrix for inKey
 */
public float getNetDivergence(String inKey)
{
return mMatrix.getSumForKey(inKey);
}
//--------------------------------------------------------------------------
/**
 Finds the key with the smallest stored distance to the specified key.
 Pairs without a stored value (null) are skipped, and inKey itself is never a
 candidate. On ties the key encountered first while iterating the matrix keys wins.
 @param inKey the key whose nearest neighbor is wanted
 @return the nearest key, or null if no other key has a distance below Float.MAX_VALUE
 */
public String getNearestNeighbor(String inKey)
{
   String nearestKey = null;
   float smallestDistance = Float.MAX_VALUE;

   for (String candidate : mMatrix.keySet())
   {
      if (! candidate.equals(inKey))
      {
         Float distance = mMatrix.get(inKey, candidate);
         // Strict comparison: an equal distance never displaces an earlier candidate.
         if (distance != null
             && distance < smallestDistance)
         {
            smallestDistance = distance;
            nearestKey = candidate;
         }
      }
   }

   return nearestKey;
}
//--------------------------------------------------------------------------
/**
 Returns the Edge with the shortest distance, built from the cell that the
 underlying matrix reports as its smallest non-identity cell. If multiple edges
 share the same distance, no guarantee is made as to which one will be returned.
 @return the Edge with the shortest distance, or null if the matrix reports no such cell
 */
public Edge getShortestEdge()
{
   MatrixCell smallestValueCell = mMatrix.getNonIdentityCellWithSmallestValue();
   if (null == smallestValueCell)
   {
      return null;
   }

   // The edge runs from the cell's column key to its row key.
   return new Edge<>(smallestValueCell.getColKey(), smallestValueCell.getRowKey(), smallestValueCell.getValue());
}
//--------------------------------------------------------------------------
/**
Returns the Edges for the specified sequence sorted shortest to longest.
@param inSeqId the id (key) of the sequence for which Edges should be retrieved
@return the Edges for the specified sequence sorted shortest to longest
*/
public List> getSortedEdges(String inSeqId)
{
Collection keySet = keySet();
Map distanceMap = new HashMap<>(keySet.size());
for (String key : keySet)
{
distanceMap.put(key, getDistance(inSeqId, key));
}
Map sortedRowMap = CollectionUtil.sortMapByValue(distanceMap);
List> edges = new ArrayList<>(sortedRowMap.size());
for (String key : sortedRowMap.keySet())
{
edges.add(new Edge<>(inSeqId, key, sortedRowMap.get(key)));
}
return edges;
}
//--------------------------------------------------------------------------
/**
 Creates a copy of this matrix. The flags are copied by Object.clone() and the
 underlying matrix is replaced with the result of its own clone(), so the copy
 does not share the matrix object with the original.
 @return the cloned DistanceMatrix
 @throws RuntimeException wrapping CloneNotSupportedException, should Object.clone() raise it
 */
@Override
public DistanceMatrix clone()
{
   try
   {
      DistanceMatrix copy = (DistanceMatrix) super.clone();
      copy.mMatrix = mMatrix.clone();
      return copy;
   }
   catch (CloneNotSupportedException e)
   {
      throw new RuntimeException(e);
   }
}
//**************************************************************************
// PRIVATE METHODS
//**************************************************************************
//--------------------------------------------------------------------------
/**
 Detects the matrix format and hands the reader to the matching parser.
 Up to 256 characters are read ahead and the reader is then reset, so the chosen
 parser sees the input from its beginning.
 @param inMatrixReader reader positioned at the start of the matrix text
 @throws IOException if reading from or resetting the reader fails
 */
private void parseMatrix(BufferedReader inMatrixReader)
   throws IOException
{
   char[] preview = new char[256];
   inMatrixReader.mark(preview.length);
   int numCharsRead = inMatrixReader.read(preview, 0, preview.length);
   inMatrixReader.reset();

   // Only look at what was actually read; the rest of the buffer is unfilled
   // (read() returns -1 for an empty input).
   String previewText = (numCharsRead > 0 ? new String(preview, 0, numCharsRead) : "");

   // In Phylip format, the first line should be the number of entries.
   if (sPhylipFirstLinePattern.matcher(previewText).find())
   {
      parsePhylipFormatMatrix(inMatrixReader);
   }
   else
   {
      parseGeneralFormatMatrix(inMatrixReader);
   }
}
//--------------------------------------------------------------------------
/**
 Parses a Phylip-format matrix. Each data row starts with a name field of up to
 10 characters followed by distances to the keys read so far, in the order those
 keys were read. Values beyond the current row's own position (as in a full
 square table) are ignored, and a row's values may continue on following lines.
 @param inMatrixReader reader positioned at the start of the matrix text
 @throws IOException if reading fails
 @throws RuntimeException if a row does not match the expected layout
 @throws NumberFormatException if a distance value cannot be parsed as a float
 */
private void parsePhylipFormatMatrix(BufferedReader inMatrixReader)
   throws IOException
{
   // Compiled once up front instead of once per line via String.matches().
   final Pattern entryCountPattern = Pattern.compile("\\s*\\d+\\s*");
   final Pattern valuesOnlyPattern = Pattern.compile("\\s*((?:\\s*[\\-\\d\\.]+)*)");

   List<String> keys = new ArrayList<>();
   String newKey = null;
   int distancesParsed = 0;
   int numEntries = 0;
   int lineCount = 1;
   boolean needMoreValues = false;

   String line;
   while ((line = inMatrixReader.readLine()) != null)
   {
      // The first line should be the number of entries. The value only serves to
      // recognize that the header has been seen; it is not checked against the row count.
      if (numEntries == 0
          && entryCountPattern.matcher(line).matches())
      {
         numEntries = Integer.parseInt(line.trim());
      }
      else if (needMoreValues)
      {
         // Continuation of a wrapped row: keep pairing values with the earlier keys.
         String[] distances = line.trim().split("\\s+");
         for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++)
         {
            setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i]));
         }

         // keys.size() - 1 is the number of keys preceding the current row's key.
         if (distancesParsed + distances.length < keys.size() - 1)
         {
            distancesParsed += distances.length;
            needMoreValues = true;
         }
         else
         {
            needMoreValues = false;
         }
      }
      else if (valuesOnlyPattern.matcher(line).matches())
      {
         // Continued full-table values we don't need (this also matches blank lines).
      }
      else
      {
         Matcher m = sPhylipPattern.matcher(line);
         if (! m.matches())
         {
            throw new RuntimeException("Unexpected format of matrix file line " + lineCount + ": '" + line + "'");
         }

         newKey = m.group(1).trim();
         keys.add(newKey);
         // Registers the key with a null identity value (the general parser stores 0.0f instead).
         mMatrix.put(newKey, newKey, null);

         if (m.group(2) != null)
         {
            String[] distances = m.group(2).split("\\s+");
            int i;
            // keys already contains newKey, so a value in the row's own column reaches
            // setDistance() with identical keys and is ignored there.
            for (i = 0; i < distances.length && i < keys.size(); i++)
            {
               setDistance(newKey, keys.get(i), Float.parseFloat(distances[i]));
            }

            if (i < keys.size() - 1)
            {
               // Line must be wrapped
               distancesParsed = distances.length;
               needMoreValues = true;
            }
         }
      }

      lineCount++;
   }
}
//--------------------------------------------------------------------------
/**
 Parses a general-format matrix: one whitespace-separated row per key, where the
 first token is the key (so keys cannot contain whitespace) and the following
 tokens are distances to the keys read so far, in the order those keys were read.
 Blank lines and lines starting with "//" or "#" are skipped. Extra values on a
 row (as in a full square table) are ignored, and a row's values may continue on
 following lines.
 @param inMatrixReader reader positioned at the start of the matrix text
 @throws IOException if reading fails
 @throws NumberFormatException if a distance value cannot be parsed as a float
 */
private void parseGeneralFormatMatrix(BufferedReader inMatrixReader)
   throws IOException
{
   // Compiled once up front instead of once per line via String.matches().
   final Pattern leadingWhitespacePattern = Pattern.compile("^\\s+.*");

   List<String> keys = new ArrayList<>();
   String newKey = null;
   int distancesParsed = 0;
   boolean needMoreValues = false;

   String line;
   while ((line = inMatrixReader.readLine()) != null)
   {
      if (! StringUtil.isSet(line)
          || line.startsWith("//")
          || line.startsWith("#"))
      {
         continue; // Skip blank lines or comment lines.
      }

      if (needMoreValues)
      {
         // Continuation of a wrapped row: keep pairing values with the earlier keys.
         String[] distances = line.trim().split("\\s+");
         for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++)
         {
            setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i]));
         }

         // keys.size() - 1 is the number of keys preceding the current row's key.
         if (distancesParsed + distances.length < keys.size() - 1)
         {
            distancesParsed += distances.length;
            needMoreValues = true;
         }
         else
         {
            needMoreValues = false;
         }
      }
      else if (leadingWhitespacePattern.matcher(line).matches())
      {
         // Continued full-table values we don't need.
      }
      else
      {
         String[] pieces = line.split("\\s+");
         newKey = pieces[0];
         keys.add(newKey);
         mMatrix.put(newKey, newKey, 0.0f);

         // pieces[i] is the distance to keys[i - 1]; stop before the row's own column.
         int i;
         for (i = 1; i < pieces.length && i < keys.size(); i++)
         {
            setDistance(newKey, keys.get(i - 1), Float.parseFloat(pieces[i]));
         }

         if (i < keys.size())
         {
            // Line must be wrapped
            distancesParsed = pieces.length - 1;
            needMoreValues = true;
         }
      }
   }
}
//--------------------------------------------------------------------------
/**
 Determines the length of the longest key in the matrix.
 @return the largest key length, or 0 if there are no keys
 */
private int getMaxKeyLength()
{
   int longest = 0;
   for (String key : keySet())
   {
      longest = Math.max(longest, key.length());
   }

   return longest;
}
}