org.forester.evoinference.parsimony.FitchParsimony Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of forester Show documentation
Show all versions of forester Show documentation
Applications and software libraries for evolutionary biology and comparative genomics research
The newest version!
// $Id:
//
// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester
package org.forester.evoinference.parsimony;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.SortedSet;
import java.util.TreeSet;
import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.FailedConditionCheckException;
import org.forester.util.ForesterUtil;
public class FitchParsimony {
final static private BinaryStates ABSENT = BinaryStates.ABSENT;
final static private GainLossStates GAIN = GainLossStates.GAIN;
final static private GainLossStates LOSS = GainLossStates.LOSS;
final static private BinaryStates PRESENT = BinaryStates.PRESENT;
private static final long RANDOM_NUMBER_SEED_DEFAULT = 21;
private static final boolean RANDOMIZE_DEFAULT = false;
private static final boolean RETURN_GAIN_LOSS_MATRIX_DEFAULT = false;
private static final boolean RETURN_INTERNAL_STATES_DEFAULT = false;
final static private GainLossStates UNCHANGED_ABSENT = GainLossStates.UNCHANGED_ABSENT;
final static private GainLossStates UNCHANGED_PRESENT = GainLossStates.UNCHANGED_PRESENT;
private static final boolean USE_LAST_DEFAULT = false;
private int _cost;
private CharacterStateMatrix _gain_loss_matrix;
private CharacterStateMatrix _internal_states_matrix_after_traceback;
private CharacterStateMatrix> _internal_states_matrix_prior_to_traceback;
private Random _random_generator;
private long _random_number_seed;
private boolean _randomize;
private boolean _return_gain_loss = false;
private boolean _return_internal_states = false;
private int _total_gains;
private int _total_losses;
private int _total_unchanged;
private boolean _use_last;
private boolean _verbose = false;
public FitchParsimony() {
init();
}
public void execute( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix ) {
execute( p, external_node_states_matrix, false );
}
public void execute( final Phylogeny p,
final CharacterStateMatrix external_node_states_matrix,
final boolean verbose ) {
if ( !p.isRooted() ) {
throw new IllegalArgumentException( "attempt to execute Fitch parsimony on unroored phylogeny" );
}
if ( external_node_states_matrix.isEmpty() ) {
throw new IllegalArgumentException( "character matrix is empty" );
}
if ( external_node_states_matrix.getNumberOfIdentifiers() != p.getNumberOfExternalNodes() ) {
throw new IllegalArgumentException( "number of external nodes in phylogeny ["
+ p.getNumberOfExternalNodes() + "] and number of indentifiers ["
+ external_node_states_matrix.getNumberOfIdentifiers() + "] in matrix are not equal" );
}
setVerbose( verbose );
reset();
if ( isReturnInternalStates() ) {
initializeInternalStates( p, external_node_states_matrix );
}
if ( isReturnGainLossMatrix() ) {
initializeGainLossMatrix( p, external_node_states_matrix );
}
final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
if ( isVerbose() ) {
System.out.println( "Number of characters: " + external_node_states_matrix.getNumberOfCharacters() );
}
for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
if ( isVerbose() ) {
ForesterUtil.updateProgress( character_index, pf );
}
executeForOneCharacter( p,
getStatesForCharacter( p, external_node_states_matrix, character_index ),
getStatesForCharacterForTraceback( p, external_node_states_matrix, character_index ),
character_index );
}
if ( isVerbose() ) {
System.out.println();
}
if ( external_node_states_matrix.getState( 0, 0 ) instanceof BinaryStates ) {
if ( ( external_node_states_matrix.getNumberOfCharacters() * p.getNumberOfBranches() ) != ( getTotalGains()
+ getTotalLosses() + getTotalUnchanged() ) ) {
throw new FailedConditionCheckException( "this should not have happened: something is deeply wrong with Fitch parsimony implementation" );
}
}
}
public int getCost() {
return _cost;
}
public CharacterStateMatrix getGainLossMatrix() {
if ( !isReturnGainLossMatrix() ) {
throw new RuntimeException( "creation of gain-loss matrix has not been enabled" );
}
return _gain_loss_matrix;
}
public CharacterStateMatrix getInternalStatesMatrix() {
if ( !isReturnInternalStates() ) {
throw new RuntimeException( "creation of internal state matrix has not been enabled" );
}
return _internal_states_matrix_after_traceback;
}
/**
* Returns a view of the internal states prior to trace-back.
*
* @return
*/
public CharacterStateMatrix> getInternalStatesMatrixPriorToTraceback() {
if ( !isReturnInternalStates() ) {
throw new RuntimeException( "creation of internal state matrix has not been enabled" );
}
return _internal_states_matrix_prior_to_traceback;
}
public int getTotalGains() {
return _total_gains;
}
public int getTotalLosses() {
return _total_losses;
}
public int getTotalUnchanged() {
return _total_unchanged;
}
public boolean isVerbose() {
return _verbose;
}
public void setRandomize( final boolean randomize ) {
if ( randomize && isUseLast() ) {
throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" );
}
_randomize = randomize;
}
public void setRandomNumberSeed( final long random_number_seed ) {
if ( !isRandomize() ) {
throw new IllegalArgumentException( "attempt to set random number generator seed without randomization enabled" );
}
_random_number_seed = random_number_seed;
}
public void setReturnGainLossMatrix( final boolean return_gain_loss ) {
_return_gain_loss = return_gain_loss;
}
public void setReturnInternalStates( final boolean return_internal_states ) {
_return_internal_states = return_internal_states;
}
/**
* This sets whether to use the first or last state in the sorted
* states at the undecided internal nodes.
* For randomized choices set randomize to true (and this to false).
*
* Note. It might be advisable to set this to false
* for BinaryStates if absence at the root is preferred
* (given the enum BinaryStates sorts in the following order:
* ABSENT, UNKNOWN, PRESENT).
*
*
* @param use_last
*/
public void setUseLast( final boolean use_last ) {
if ( use_last && isRandomize() ) {
throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" );
}
_use_last = use_last;
}
public void setVerbose( final boolean verbose ) {
_verbose = verbose;
}
private int determineIndex( final SortedSet current_node_states, int i ) {
if ( isRandomize() ) {
i = getRandomGenerator().nextInt( current_node_states.size() );
}
else if ( isUseLast() ) {
i = current_node_states.size() - 1;
}
return i;
}
private void executeForOneCharacter( final Phylogeny p,
final Map> states,
final Map traceback_states,
final int character_state_column ) {
postOrderTraversal( p, states );
preOrderTraversal( p, states, traceback_states, character_state_column );
}
private SortedSet getIntersectionOfStatesOfChildNodes( final Map> states,
final PhylogenyNode node ) throws AssertionError {
final SortedSet states_in_child_nodes = new TreeSet();
for( int i = 0; i < node.getNumberOfDescendants(); ++i ) {
final PhylogenyNode node_child = node.getChildNode( i );
if ( !states.containsKey( node_child ) ) {
throw new AssertionError( "this should not have happened: node [" + node_child.getName()
+ "] not found in node state map" );
}
if ( i == 0 ) {
states_in_child_nodes.addAll( states.get( node_child ) );
}
else {
states_in_child_nodes.retainAll( states.get( node_child ) );
}
}
return states_in_child_nodes;
}
private Random getRandomGenerator() {
return _random_generator;
}
private long getRandomNumberSeed() {
return _random_number_seed;
}
private STATE_TYPE getStateAt( final int i, final SortedSet states ) {
final Iterator it = states.iterator();
for( int j = 0; j < i; ++j ) {
it.next();
}
return it.next();
}
private Map> getStatesForCharacter( final Phylogeny p,
final CharacterStateMatrix matrix,
final int character_index ) {
final Map> states = new HashMap>( matrix
.getNumberOfIdentifiers() );
for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) {
final STATE_TYPE state = matrix.getState( indentifier_index, character_index );
if ( state == null ) {
throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index
+ "] is null" );
}
final SortedSet l = new TreeSet();
l.add( state );
states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), l );
}
return states;
}
private Map getStatesForCharacterForTraceback( final Phylogeny p,
final CharacterStateMatrix matrix,
final int character_index ) {
final Map states = new HashMap( matrix.getNumberOfIdentifiers() );
for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) {
final STATE_TYPE state = matrix.getState( indentifier_index, character_index );
if ( state == null ) {
throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index
+ "] is null" );
}
states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), state );
}
return states;
}
private SortedSet getUnionOfStatesOfChildNodes( final Map> states,
final PhylogenyNode node ) throws AssertionError {
final SortedSet states_in_child_nodes = new TreeSet();
for( int i = 0; i < node.getNumberOfDescendants(); ++i ) {
final PhylogenyNode node_child = node.getChildNode( i );
if ( !states.containsKey( node_child ) ) {
throw new AssertionError( "this should not have happened: node [" + node_child.getName()
+ "] not found in node state map" );
}
states_in_child_nodes.addAll( states.get( node_child ) );
}
return states_in_child_nodes;
}
private void increaseCost() {
++_cost;
}
private void init() {
setReturnInternalStates( RETURN_INTERNAL_STATES_DEFAULT );
setReturnGainLossMatrix( RETURN_GAIN_LOSS_MATRIX_DEFAULT );
setRandomize( RANDOMIZE_DEFAULT );
setUseLast( USE_LAST_DEFAULT );
_random_number_seed = RANDOM_NUMBER_SEED_DEFAULT;
reset();
}
private void initializeGainLossMatrix( final Phylogeny p,
final CharacterStateMatrix external_node_states_matrix ) {
final List nodes = new ArrayList();
for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
nodes.add( postorder.next() );
}
setGainLossMatrix( new BasicCharacterStateMatrix( nodes.size(),
external_node_states_matrix
.getNumberOfCharacters() ) );
int identifier_index = 0;
for( final PhylogenyNode node : nodes ) {
getGainLossMatrix().setIdentifier( identifier_index++,
ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node
.getName() );
}
for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
getGainLossMatrix().setCharacter( character_index,
external_node_states_matrix.getCharacter( character_index ) );
}
}
private void initializeInternalStates( final Phylogeny p,
final CharacterStateMatrix external_node_states_matrix ) {
final List internal_nodes = new ArrayList();
for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
final PhylogenyNode node = postorder.next();
if ( node.isInternal() ) {
internal_nodes.add( node );
}
}
setInternalStatesMatrixPriorToTraceback( new BasicCharacterStateMatrix>( internal_nodes.size(),
external_node_states_matrix
.getNumberOfCharacters() ) );
setInternalStatesMatrixTraceback( new BasicCharacterStateMatrix( internal_nodes.size(),
external_node_states_matrix
.getNumberOfCharacters() ) );
int identifier_index = 0;
for( final PhylogenyNode node : internal_nodes ) {
getInternalStatesMatrix().setIdentifier( identifier_index,
ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node
.getName() );
getInternalStatesMatrixPriorToTraceback().setIdentifier( identifier_index,
ForesterUtil.isEmpty( node.getName() ) ? node
.getId() + "" : node.getName() );
++identifier_index;
}
for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
getInternalStatesMatrix().setCharacter( character_index,
external_node_states_matrix.getCharacter( character_index ) );
getInternalStatesMatrixPriorToTraceback().setCharacter( character_index,
external_node_states_matrix
.getCharacter( character_index ) );
}
}
private boolean isRandomize() {
return _randomize;
}
private boolean isReturnGainLossMatrix() {
return _return_gain_loss;
}
private boolean isReturnInternalStates() {
return _return_internal_states;
}
private boolean isUseLast() {
return _use_last;
}
private void postOrderTraversal( final Phylogeny p, final Map> states ) {
for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
final PhylogenyNode node = postorder.next();
if ( !node.isExternal() ) {
SortedSet states_in_children = getIntersectionOfStatesOfChildNodes( states, node );
if ( states_in_children.isEmpty() ) {
states_in_children = getUnionOfStatesOfChildNodes( states, node );
}
states.put( node, states_in_children );
}
}
}
private void preOrderTraversal( final Phylogeny p,
final Map> states,
final Map traceback_states,
final int character_state_column ) {
for( final PhylogenyNodeIterator preorder = p.iteratorPreorder(); preorder.hasNext(); ) {
final PhylogenyNode current_node = preorder.next();
final SortedSet current_node_states = states.get( current_node );
STATE_TYPE parent_state = null;
if ( current_node.isRoot() ) {
int i = 0;
i = determineIndex( current_node_states, i );
traceback_states.put( current_node, getStateAt( i, current_node_states ) );
}
else {
parent_state = traceback_states.get( current_node.getParent() );
if ( current_node_states.contains( parent_state ) ) {
traceback_states.put( current_node, parent_state );
}
else {
increaseCost();
int i = 0;
i = determineIndex( current_node_states, i );
traceback_states.put( current_node, getStateAt( i, current_node_states ) );
}
}
if ( isReturnInternalStates() ) {
if ( !current_node.isExternal() ) {
setInternalNodeStatePriorToTraceback( states, character_state_column, current_node );
setInternalNodeState( traceback_states, character_state_column, current_node );
}
}
if ( isReturnGainLossMatrix() && !current_node.isRoot() ) {
if ( !( parent_state instanceof BinaryStates ) ) {
throw new RuntimeException( "attempt to create gain loss matrix for not binary states" );
}
final BinaryStates parent_binary_state = ( BinaryStates ) parent_state;
final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node );
if ( ( parent_binary_state == PRESENT ) && ( current_binary_state == ABSENT ) ) {
++_total_losses;
setGainLossState( character_state_column, current_node, LOSS );
}
else if ( ( ( parent_binary_state == ABSENT ) || ( parent_binary_state == null ) )
&& ( current_binary_state == PRESENT ) ) {
++_total_gains;
setGainLossState( character_state_column, current_node, GAIN );
}
else {
++_total_unchanged;
if ( current_binary_state == PRESENT ) {
setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT );
}
else if ( current_binary_state == ABSENT ) {
setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT );
}
}
}
else if ( isReturnGainLossMatrix() && current_node.isRoot() ) {
final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node );
++_total_unchanged; //new
if ( current_binary_state == PRESENT ) {//new
setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT );//new
}//new
else if ( current_binary_state == ABSENT ) {//new
setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT );//new
}//new
// setGainLossState( character_state_column, current_node, UNKNOWN_GAIN_LOSS );
}
}
}
private void reset() {
setCost( 0 );
setTotalLosses( 0 );
setTotalGains( 0 );
setTotalUnchanged( 0 );
setRandomGenerator( new Random( getRandomNumberSeed() ) );
}
private void setCost( final int cost ) {
_cost = cost;
}
private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) {
_gain_loss_matrix = gain_loss_matrix;
}
private void setGainLossState( final int character_state_column,
final PhylogenyNode node,
final GainLossStates state ) {
getGainLossMatrix().setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(),
character_state_column,
state );
}
private void setInternalNodeState( final Map states,
final int character_state_column,
final PhylogenyNode node ) {
getInternalStatesMatrix()
.setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(),
character_state_column,
states.get( node ) );
}
private void setInternalNodeStatePriorToTraceback( final Map> states,
final int character_state_column,
final PhylogenyNode node ) {
getInternalStatesMatrixPriorToTraceback()
.setState( ForesterUtil.isEmpty( node.getName() ) ? String.valueOf( node.getId() ) : node.getName(),
character_state_column,
toListSorted( states.get( node ) ) );
}
private void setInternalStatesMatrixPriorToTraceback( final CharacterStateMatrix> internal_states_matrix_prior_to_traceback ) {
_internal_states_matrix_prior_to_traceback = internal_states_matrix_prior_to_traceback;
}
private void setInternalStatesMatrixTraceback( final CharacterStateMatrix internal_states_matrix_after_traceback ) {
_internal_states_matrix_after_traceback = internal_states_matrix_after_traceback;
}
private void setRandomGenerator( final Random random_generator ) {
_random_generator = random_generator;
}
private void setTotalGains( final int total_gains ) {
_total_gains = total_gains;
}
private void setTotalLosses( final int total_losses ) {
_total_losses = total_losses;
}
private void setTotalUnchanged( final int total_unchanged ) {
_total_unchanged = total_unchanged;
}
private List toListSorted( final SortedSet states ) {
final List l = new ArrayList( states.size() );
for( final STATE_TYPE state : states ) {
l.add( state );
}
return l;
}
}