All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hfg.bio.phylogeny.DistanceMatrix Maven / Gradle / Ivy

There is a newer version: 20240423
Show newest version
package com.hfg.bio.phylogeny;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import com.hfg.util.StringUtil;
import com.hfg.network.Edge;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.collection.MatrixCell;
import com.hfg.util.collection.SymmetricNumberMatrix;

//------------------------------------------------------------------------------
/**
 Simple distance matrix container. Can read Phylip-format matrices or general
 matrices where the identifiers do not contain whitespace.

 
Most commonly constructed via a multiple sequence alignment:
    DistanceMatrix matrix = msa.getDistanceMatrix(new UncorrectedModel());
   
Phylip-formatted matrix example:
    String testMatrix =
       "   14\n" +
       "Mouse     \n" +
       "Bovine      1.7043\n" +
       "Lemur       2.0235  1.1901\n" +
       "Tarsier     2.1378  1.3287  1.2905\n" +
       "Squir Monk  1.5232  1.2423  1.3199  1.7878\n" +
       "Jpn Macaq   1.8261  1.2508  1.3887  1.3137  1.0642\n" +
       "Rhesus Mac  1.9182  1.2536  1.4658  1.3788  1.1124  0.1022\n" +
       "Crab-E.Mac  2.0039  1.3066  1.4826  1.3826  0.9832  0.2061  0.2681\n";

    DistanceMatrix matrix = new DistanceMatrix(testMatrix);
   
General matrix example:
    String testMatrix =
             "# Subunit distance matrix\n" +
             "Alpha      0.000 1.000 2.000 3.000 3.000\n" +
             "Beta       1.000 0.000 2.000 3.000 3.000\n" +
             "Gamma      2.000 2.000 0.000 3.000 3.000\n" +
             "Delta      3.000 3.000 0.000 0.000 1.000\n" +
             "Epsilon    3.000 3.000 3.000 1.000 0.000\n\n";

    DistanceMatrix matrix = new DistanceMatrix(testMatrix);
   
See the Phylip Distance matrix programs page for the description of the Phylip distance matrix format.
@author J. Alex Taylor, hairyfatguy.com
*/ //------------------------------------------------------------------------------ // com.hfg Library // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com // [email protected] //------------------------------------------------------------------------------ public class DistanceMatrix implements Cloneable { private SymmetricNumberMatrix mMatrix; private boolean mIsConsumable = true; private boolean mIsConsumed; private static final Pattern sPhylipPattern = Pattern.compile("(.{1,10})(?:\\s*(.+))?"); private static final Pattern sPhylipFirstLinePattern = Pattern.compile("^\\s*\\d+\\s*[\\n\\r]"); //########################################################################## // CONSTRUCTORS //########################################################################## //-------------------------------------------------------------------------- public DistanceMatrix() { this(100); } //-------------------------------------------------------------------------- public DistanceMatrix(int inInitialSize) { mMatrix = new SymmetricNumberMatrix<>(inInitialSize); } //-------------------------------------------------------------------------- public DistanceMatrix(String inMatrix) { this(); if (StringUtil.isSet(inMatrix)) { try { BufferedReader reader = null; try { reader = new BufferedReader(new StringReader(inMatrix)); parseMatrix(reader); } finally { if (reader != null) { reader.close(); } } } catch (IOException e) { throw new RuntimeException("Problem parsing distance matrix!", e); } } } //-------------------------------------------------------------------------- public DistanceMatrix(File inMatrix) throws IOException { this(); if (null == inMatrix) { throw new IOException("No file was specified!"); } else if (!inMatrix.exists()) { throw new IOException("The file " + StringUtil.singleQuote(inMatrix.getPath()) + " does not exist!"); } BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(inMatrix)); parseMatrix(reader); } finally { if (reader != null) reader.close(); } } //########################################################################## // PUBLIC METHODS //########################################################################## //-------------------------------------------------------------------------- public DistanceMatrix setIsConsumable(boolean inValue) { mIsConsumable = inValue; return this; } //-------------------------------------------------------------------------- public boolean isConsumable() { return mIsConsumable; } //-------------------------------------------------------------------------- public DistanceMatrix setIsConsumed() { mIsConsumed = true; return this; } //-------------------------------------------------------------------------- public boolean isConsumed() { return mIsConsumed; } //-------------------------------------------------------------------------- public void addKey(String inKey) { mMatrix.addKey(inKey); } //-------------------------------------------------------------------------- public void removeKey(String inKey) { mMatrix.removeKey(inKey); } //-------------------------------------------------------------------------- public void removeKeys(Set inKeys) { mMatrix.removeKeys(inKeys); } //-------------------------------------------------------------------------- /** Changes the key name inOldKey to inNewKey. Can be useful in re-expanding names after replacing them in order to comply with the 10-character Phylip format limitation. @param inOldKey the old matrix key @param inNewKey the new matrix key */ public void changeKey(String inOldKey, String inNewKey) { mMatrix.changeKey(inOldKey, inNewKey); } //-------------------------------------------------------------------------- public void setDistance(String inKey1, String inKey2, Float inDistance) { if (! inKey1.equals(inKey2)) { mMatrix.put(inKey1, inKey2, inDistance); } } //-------------------------------------------------------------------------- public Float getDistance(String inKey1, String inKey2) { Float distance; if (inKey1.equals(inKey2)) { distance = 0.0f; } else { distance = mMatrix.get(inKey1, inKey2); } return distance; } //-------------------------------------------------------------------------- public int size() { return mMatrix.size(); } //-------------------------------------------------------------------------- public int numKeys() { return mMatrix.numKeys(); } //-------------------------------------------------------------------------- public Collection keySet() { return Collections.unmodifiableCollection(mMatrix.keySet()); } //-------------------------------------------------------------------------- public float getNetDivergence(String inKey) { return mMatrix.getSumForKey(inKey); } //-------------------------------------------------------------------------- public String getNearestNeighbor(String inKey) { Float minValue = Float.MAX_VALUE; String minKey = null; for (String key2 : mMatrix.keySet()) { if (key2.equals(inKey)) { continue; } Float value = mMatrix.get(inKey, key2); if (value != null && value < minValue) { minValue = value; minKey = key2; } } return minKey; } //-------------------------------------------------------------------------- /** Returns the Edge with the shortest distance. If multiple edges are found with the same distance, no guarantee is made as to which one will be returned. @return the Edge with the shortest distance */ public Edge getShortestEdge() { Edge shortestEdge = null; MatrixCell smallestValueCell = mMatrix.getNonIdentityCellWithSmallestValue(); if (smallestValueCell != null) { shortestEdge = new Edge<>(smallestValueCell.getColKey(), smallestValueCell.getRowKey(), smallestValueCell.getValue()); } return shortestEdge; } //-------------------------------------------------------------------------- /** Returns the Edges for the specified sequence sorted shortest to longest. @param inSeqId the id (key) of the sequence for which Edges should be retrieved @return the Edges for the specified sequence sorted shortest to longest */ public List> getSortedEdges(String inSeqId) { Collection keySet = keySet(); Map distanceMap = new HashMap<>(keySet.size()); for (String key : keySet) { distanceMap.put(key, getDistance(inSeqId, key)); } Map sortedRowMap = CollectionUtil.sortMapByValue(distanceMap); List> edges = new ArrayList<>(sortedRowMap.size()); for (String key : sortedRowMap.keySet()) { edges.add(new Edge<>(inSeqId, key, sortedRowMap.get(key))); } return edges; } //-------------------------------------------------------------------------- @Override public DistanceMatrix clone() { DistanceMatrix clone; try { clone = (DistanceMatrix) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.mMatrix = mMatrix.clone(); return clone; } //************************************************************************** // PRIVATE METHODS //************************************************************************** //-------------------------------------------------------------------------- private void parseMatrix(BufferedReader inMatrixReader) throws IOException { char[] preview = new char[256]; inMatrixReader.mark(preview.length); inMatrixReader.read(preview, 0, preview.length); inMatrixReader.reset(); // In Phylip format, the first line should be the number of entries. if (sPhylipFirstLinePattern.matcher(new String(preview)).find()) { parsePhylipFormatMatrix(inMatrixReader); } else { parseGeneralFormatMatrix(inMatrixReader); } } //-------------------------------------------------------------------------- private void parsePhylipFormatMatrix(BufferedReader inMatrixReader) throws IOException { List keys = new ArrayList<>(); String newKey = null; int distancesParsed = 0; int numEntries = 0; int lineCount = 1; boolean needMoreValues = false; String line; while ((line = inMatrixReader.readLine()) != null) { // The first line should be the number of entries. if (numEntries == 0 && line.matches("\\s*\\d+\\s*")) { numEntries = Integer.parseInt(line.trim()); } else if (needMoreValues) { String[] distances = line.trim().split("\\s+"); for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++) { setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i])); } if (distancesParsed + distances.length < keys.size() - 1) { distancesParsed += distances.length; needMoreValues = true; } else { needMoreValues = false; } } else if (line.matches("\\s*((?:\\s*[\\-\\d\\.]+)*)")) { // Continued full-table values we don't need. } else { Matcher m = sPhylipPattern.matcher(line); if (! m.matches()) { throw new RuntimeException("Unexpected format of matrix file line " + lineCount + ": '" + line + "'"); } newKey = m.group(1).trim(); keys.add(newKey); mMatrix.put(newKey, newKey, null); if (m.group(2) != null) { String[] distances = m.group(2).split("\\s+"); int i; for (i = 0; i < distances.length && i < keys.size(); i++) { setDistance(newKey, keys.get(i), Float.parseFloat(distances[i])); } if (i < keys.size() - 1) { // Line must be wrapped distancesParsed = distances.length; needMoreValues = true; } } } lineCount++; } } //-------------------------------------------------------------------------- private void parseGeneralFormatMatrix(BufferedReader inMatrixReader) throws IOException { List keys = new ArrayList<>(); String newKey = null; int distancesParsed = 0; int lineCount = 1; boolean needMoreValues = false; String line; while ((line = inMatrixReader.readLine()) != null) { if (! StringUtil.isSet(line) || line.startsWith("//") || line.startsWith("#")) { continue; // Skip blank lines or comment lines. } if (needMoreValues) { String[] distances = line.trim().split("\\s+"); for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++) { setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i])); } if (distancesParsed + distances.length < keys.size() - 1) { distancesParsed += distances.length; needMoreValues = true; } else { needMoreValues = false; } } else if (line.matches("^\\s+.*")) { // Continued full-table values we don't need. } else { String[] pieces = line.split("\\s+"); newKey = pieces[0]; keys.add(newKey); mMatrix.put(newKey, newKey, 0.0f); int i; for (i = 1; i < pieces.length && i < keys.size(); i++) { setDistance(newKey, keys.get(i - 1), Float.parseFloat(pieces[i])); } if (i < keys.size()) { // Line must be wrapped distancesParsed = pieces.length - 1; needMoreValues = true; } } lineCount++; } } //-------------------------------------------------------------------------- private int getMaxKeyLength() { int maxLength = 0; for (String key : keySet()) { if (key.length() > maxLength) { maxLength = key.length(); } } return maxLength; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy