All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.forester.evoinference.parsimony.FitchParsimony Maven / Gradle / Ivy

Go to download

Applications and software libraries for evolutionary biology and comparative genomics research

The newest version!
// $Id:
//
// FORESTER -- software libraries and applications
// for evolutionary biology research and applications.
//
// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: https://sites.google.com/site/cmzmasek/home/software/forester

package org.forester.evoinference.parsimony;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.SortedSet;
import java.util.TreeSet;

import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.FailedConditionCheckException;
import org.forester.util.ForesterUtil;

public class FitchParsimony {

    final static private BinaryStates              ABSENT                          = BinaryStates.ABSENT;
    final static private GainLossStates            GAIN                            = GainLossStates.GAIN;
    final static private GainLossStates            LOSS                            = GainLossStates.LOSS;
    final static private BinaryStates              PRESENT                         = BinaryStates.PRESENT;
    private static final long                      RANDOM_NUMBER_SEED_DEFAULT      = 21;
    private static final boolean                   RANDOMIZE_DEFAULT               = false;
    private static final boolean                   RETURN_GAIN_LOSS_MATRIX_DEFAULT = false;
    private static final boolean                   RETURN_INTERNAL_STATES_DEFAULT  = false;
    final static private GainLossStates            UNCHANGED_ABSENT                = GainLossStates.UNCHANGED_ABSENT;
    final static private GainLossStates            UNCHANGED_PRESENT               = GainLossStates.UNCHANGED_PRESENT;
    private static final boolean                   USE_LAST_DEFAULT                = false;
    private int                                    _cost;
    private CharacterStateMatrix   _gain_loss_matrix;
    private CharacterStateMatrix       _internal_states_matrix_after_traceback;
    private CharacterStateMatrix> _internal_states_matrix_prior_to_traceback;
    private Random                                 _random_generator;
    private long                                   _random_number_seed;
    private boolean                                _randomize;
    private boolean                                _return_gain_loss               = false;
    private boolean                                _return_internal_states         = false;
    private int                                    _total_gains;
    private int                                    _total_losses;
    private int                                    _total_unchanged;
    private boolean                                _use_last;
    private boolean                                _verbose                        = false;

    public FitchParsimony() {
        init();
    }

    public void execute( final Phylogeny p, final CharacterStateMatrix external_node_states_matrix ) {
        execute( p, external_node_states_matrix, false );
    }

    public void execute( final Phylogeny p,
                         final CharacterStateMatrix external_node_states_matrix,
                         final boolean verbose ) {
        if ( !p.isRooted() ) {
            throw new IllegalArgumentException( "attempt to execute Fitch parsimony on unroored phylogeny" );
        }
        if ( external_node_states_matrix.isEmpty() ) {
            throw new IllegalArgumentException( "character matrix is empty" );
        }
        if ( external_node_states_matrix.getNumberOfIdentifiers() != p.getNumberOfExternalNodes() ) {
            throw new IllegalArgumentException( "number of external nodes in phylogeny ["
                    + p.getNumberOfExternalNodes() + "] and number of indentifiers ["
                    + external_node_states_matrix.getNumberOfIdentifiers() + "] in matrix are not equal" );
        }
        setVerbose( verbose );
        reset();
        if ( isReturnInternalStates() ) {
            initializeInternalStates( p, external_node_states_matrix );
        }
        if ( isReturnGainLossMatrix() ) {
            initializeGainLossMatrix( p, external_node_states_matrix );
        }
        final DecimalFormat pf = new java.text.DecimalFormat( "000000" );
        if ( isVerbose() ) {
            System.out.println( "Number of characters: " + external_node_states_matrix.getNumberOfCharacters() );
        }
        for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
            if ( isVerbose() ) {
                ForesterUtil.updateProgress( character_index, pf );
            }
            executeForOneCharacter( p,
                                    getStatesForCharacter( p, external_node_states_matrix, character_index ),
                                    getStatesForCharacterForTraceback( p, external_node_states_matrix, character_index ),
                                    character_index );
        }
        if ( isVerbose() ) {
            System.out.println();
        }
        if ( external_node_states_matrix.getState( 0, 0 ) instanceof BinaryStates ) {
            if ( ( external_node_states_matrix.getNumberOfCharacters() * p.getNumberOfBranches() ) != ( getTotalGains()
                    + getTotalLosses() + getTotalUnchanged() ) ) {
                throw new FailedConditionCheckException( "this should not have happened: something is deeply wrong with Fitch parsimony implementation" );
            }
        }
    }

    public int getCost() {
        return _cost;
    }

    public CharacterStateMatrix getGainLossMatrix() {
        if ( !isReturnGainLossMatrix() ) {
            throw new RuntimeException( "creation of gain-loss matrix has not been enabled" );
        }
        return _gain_loss_matrix;
    }

    public CharacterStateMatrix getInternalStatesMatrix() {
        if ( !isReturnInternalStates() ) {
            throw new RuntimeException( "creation of internal state matrix has not been enabled" );
        }
        return _internal_states_matrix_after_traceback;
    }

    /**
     * Returns a view of the internal states prior to trace-back.
     *
     * @return
     */
    public CharacterStateMatrix> getInternalStatesMatrixPriorToTraceback() {
        if ( !isReturnInternalStates() ) {
            throw new RuntimeException( "creation of internal state matrix has not been enabled" );
        }
        return _internal_states_matrix_prior_to_traceback;
    }

    public int getTotalGains() {
        return _total_gains;
    }

    public int getTotalLosses() {
        return _total_losses;
    }

    public int getTotalUnchanged() {
        return _total_unchanged;
    }

    public boolean isVerbose() {
        return _verbose;
    }

    public void setRandomize( final boolean randomize ) {
        if ( randomize && isUseLast() ) {
            throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" );
        }
        _randomize = randomize;
    }

    public void setRandomNumberSeed( final long random_number_seed ) {
        if ( !isRandomize() ) {
            throw new IllegalArgumentException( "attempt to set random number generator seed without randomization enabled" );
        }
        _random_number_seed = random_number_seed;
    }

    public void setReturnGainLossMatrix( final boolean return_gain_loss ) {
        _return_gain_loss = return_gain_loss;
    }

    public void setReturnInternalStates( final boolean return_internal_states ) {
        _return_internal_states = return_internal_states;
    }

    /**
     * This sets whether to use the first or last state in the sorted
     * states at the undecided internal nodes.
     * For randomized choices set randomize to true (and this to false).
     *
     * Note. It might be advisable to set this to false
     * for BinaryStates if absence at the root is preferred
     * (given the enum BinaryStates sorts in the following order:
     * ABSENT, UNKNOWN, PRESENT).
     *
     *
     * @param use_last
     */
    public void setUseLast( final boolean use_last ) {
        if ( use_last && isRandomize() ) {
            throw new IllegalArgumentException( "attempt to allways use last state (ordered) if more than one choices and randomization at the same time" );
        }
        _use_last = use_last;
    }

    public void setVerbose( final boolean verbose ) {
        _verbose = verbose;
    }

    private int determineIndex( final SortedSet current_node_states, int i ) {
        if ( isRandomize() ) {
            i = getRandomGenerator().nextInt( current_node_states.size() );
        }
        else if ( isUseLast() ) {
            i = current_node_states.size() - 1;
        }
        return i;
    }

    private void executeForOneCharacter( final Phylogeny p,
                                         final Map> states,
                                         final Map traceback_states,
                                         final int character_state_column ) {
        postOrderTraversal( p, states );
        preOrderTraversal( p, states, traceback_states, character_state_column );
    }

    private SortedSet getIntersectionOfStatesOfChildNodes( final Map> states,
                                                                       final PhylogenyNode node ) throws AssertionError {
        final SortedSet states_in_child_nodes = new TreeSet();
        for( int i = 0; i < node.getNumberOfDescendants(); ++i ) {
            final PhylogenyNode node_child = node.getChildNode( i );
            if ( !states.containsKey( node_child ) ) {
                throw new AssertionError( "this should not have happened: node [" + node_child.getName()
                                          + "] not found in node state map" );
            }
            if ( i == 0 ) {
                states_in_child_nodes.addAll( states.get( node_child ) );
            }
            else {
                states_in_child_nodes.retainAll( states.get( node_child ) );
            }
        }
        return states_in_child_nodes;
    }

    private Random getRandomGenerator() {
        return _random_generator;
    }

    private long getRandomNumberSeed() {
        return _random_number_seed;
    }

    private STATE_TYPE getStateAt( final int i, final SortedSet states ) {
        final Iterator it = states.iterator();
        for( int j = 0; j < i; ++j ) {
            it.next();
        }
        return it.next();
    }

    private Map> getStatesForCharacter( final Phylogeny p,
                                                                             final CharacterStateMatrix matrix,
                                                                             final int character_index ) {
        final Map> states = new HashMap>( matrix
                .getNumberOfIdentifiers() );
        for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) {
            final STATE_TYPE state = matrix.getState( indentifier_index, character_index );
            if ( state == null ) {
                throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index
                                                    + "] is null" );
            }
            final SortedSet l = new TreeSet();
            l.add( state );
            states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), l );
        }
        return states;
    }

    private Map getStatesForCharacterForTraceback( final Phylogeny p,
                                                                              final CharacterStateMatrix matrix,
                                                                              final int character_index ) {
        final Map states = new HashMap( matrix.getNumberOfIdentifiers() );
        for( int indentifier_index = 0; indentifier_index < matrix.getNumberOfIdentifiers(); ++indentifier_index ) {
            final STATE_TYPE state = matrix.getState( indentifier_index, character_index );
            if ( state == null ) {
                throw new IllegalArgumentException( "value at [" + indentifier_index + ", " + character_index
                                                    + "] is null" );
            }
            states.put( p.getNode( matrix.getIdentifier( indentifier_index ) ), state );
        }
        return states;
    }

    private SortedSet getUnionOfStatesOfChildNodes( final Map> states,
                                                                final PhylogenyNode node ) throws AssertionError {
        final SortedSet states_in_child_nodes = new TreeSet();
        for( int i = 0; i < node.getNumberOfDescendants(); ++i ) {
            final PhylogenyNode node_child = node.getChildNode( i );
            if ( !states.containsKey( node_child ) ) {
                throw new AssertionError( "this should not have happened: node [" + node_child.getName()
                                          + "] not found in node state map" );
            }
            states_in_child_nodes.addAll( states.get( node_child ) );
        }
        return states_in_child_nodes;
    }

    private void increaseCost() {
        ++_cost;
    }

    private void init() {
        setReturnInternalStates( RETURN_INTERNAL_STATES_DEFAULT );
        setReturnGainLossMatrix( RETURN_GAIN_LOSS_MATRIX_DEFAULT );
        setRandomize( RANDOMIZE_DEFAULT );
        setUseLast( USE_LAST_DEFAULT );
        _random_number_seed = RANDOM_NUMBER_SEED_DEFAULT;
        reset();
    }

    private void initializeGainLossMatrix( final Phylogeny p,
                                           final CharacterStateMatrix external_node_states_matrix ) {
        final List nodes = new ArrayList();
        for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
            nodes.add( postorder.next() );
        }
        setGainLossMatrix( new BasicCharacterStateMatrix( nodes.size(),
                external_node_states_matrix
                .getNumberOfCharacters() ) );
        int identifier_index = 0;
        for( final PhylogenyNode node : nodes ) {
            getGainLossMatrix().setIdentifier( identifier_index++,
                                               ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node
                                                       .getName() );
        }
        for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
            getGainLossMatrix().setCharacter( character_index,
                                              external_node_states_matrix.getCharacter( character_index ) );
        }
    }

    private void initializeInternalStates( final Phylogeny p,
                                           final CharacterStateMatrix external_node_states_matrix ) {
        final List internal_nodes = new ArrayList();
        for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
            final PhylogenyNode node = postorder.next();
            if ( node.isInternal() ) {
                internal_nodes.add( node );
            }
        }
        setInternalStatesMatrixPriorToTraceback( new BasicCharacterStateMatrix>( internal_nodes.size(),
                external_node_states_matrix
                .getNumberOfCharacters() ) );
        setInternalStatesMatrixTraceback( new BasicCharacterStateMatrix( internal_nodes.size(),
                external_node_states_matrix
                .getNumberOfCharacters() ) );
        int identifier_index = 0;
        for( final PhylogenyNode node : internal_nodes ) {
            getInternalStatesMatrix().setIdentifier( identifier_index,
                                                     ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node
                                                             .getName() );
            getInternalStatesMatrixPriorToTraceback().setIdentifier( identifier_index,
                                                                     ForesterUtil.isEmpty( node.getName() ) ? node
                                                                             .getId() + "" : node.getName() );
            ++identifier_index;
        }
        for( int character_index = 0; character_index < external_node_states_matrix.getNumberOfCharacters(); ++character_index ) {
            getInternalStatesMatrix().setCharacter( character_index,
                                                    external_node_states_matrix.getCharacter( character_index ) );
            getInternalStatesMatrixPriorToTraceback().setCharacter( character_index,
                                                                    external_node_states_matrix
                                                                    .getCharacter( character_index ) );
        }
    }

    private boolean isRandomize() {
        return _randomize;
    }

    private boolean isReturnGainLossMatrix() {
        return _return_gain_loss;
    }

    private boolean isReturnInternalStates() {
        return _return_internal_states;
    }

    private boolean isUseLast() {
        return _use_last;
    }

    private void postOrderTraversal( final Phylogeny p, final Map> states ) {
        for( final PhylogenyNodeIterator postorder = p.iteratorPostorder(); postorder.hasNext(); ) {
            final PhylogenyNode node = postorder.next();
            if ( !node.isExternal() ) {
                SortedSet states_in_children = getIntersectionOfStatesOfChildNodes( states, node );
                if ( states_in_children.isEmpty() ) {
                    states_in_children = getUnionOfStatesOfChildNodes( states, node );
                }
                states.put( node, states_in_children );
            }
        }
    }

    private void preOrderTraversal( final Phylogeny p,
                                    final Map> states,
                                    final Map traceback_states,
                                    final int character_state_column ) {
        for( final PhylogenyNodeIterator preorder = p.iteratorPreorder(); preorder.hasNext(); ) {
            final PhylogenyNode current_node = preorder.next();
            final SortedSet current_node_states = states.get( current_node );
            STATE_TYPE parent_state = null;
            if ( current_node.isRoot() ) {
                int i = 0;
                i = determineIndex( current_node_states, i );
                traceback_states.put( current_node, getStateAt( i, current_node_states ) );
            }
            else {
                parent_state = traceback_states.get( current_node.getParent() );
                if ( current_node_states.contains( parent_state ) ) {
                    traceback_states.put( current_node, parent_state );
                }
                else {
                    increaseCost();
                    int i = 0;
                    i = determineIndex( current_node_states, i );
                    traceback_states.put( current_node, getStateAt( i, current_node_states ) );
                }
            }
            if ( isReturnInternalStates() ) {
                if ( !current_node.isExternal() ) {
                    setInternalNodeStatePriorToTraceback( states, character_state_column, current_node );
                    setInternalNodeState( traceback_states, character_state_column, current_node );
                }
            }
            if ( isReturnGainLossMatrix() && !current_node.isRoot() ) {
                if ( !( parent_state instanceof BinaryStates ) ) {
                    throw new RuntimeException( "attempt to create gain loss matrix for not binary states" );
                }
                final BinaryStates parent_binary_state = ( BinaryStates ) parent_state;
                final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node );
                if ( ( parent_binary_state == PRESENT ) && ( current_binary_state == ABSENT ) ) {
                    ++_total_losses;
                    setGainLossState( character_state_column, current_node, LOSS );
                }
                else if ( ( ( parent_binary_state == ABSENT ) || ( parent_binary_state == null ) )
                        && ( current_binary_state == PRESENT ) ) {
                    ++_total_gains;
                    setGainLossState( character_state_column, current_node, GAIN );
                }
                else {
                    ++_total_unchanged;
                    if ( current_binary_state == PRESENT ) {
                        setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT );
                    }
                    else if ( current_binary_state == ABSENT ) {
                        setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT );
                    }
                }
            }
            else if ( isReturnGainLossMatrix() && current_node.isRoot() ) {
                final BinaryStates current_binary_state = ( BinaryStates ) traceback_states.get( current_node );
                ++_total_unchanged; //new
                if ( current_binary_state == PRESENT ) {//new
                    setGainLossState( character_state_column, current_node, UNCHANGED_PRESENT );//new
                }//new
                else if ( current_binary_state == ABSENT ) {//new
                    setGainLossState( character_state_column, current_node, UNCHANGED_ABSENT );//new
                }//new
                // setGainLossState( character_state_column, current_node, UNKNOWN_GAIN_LOSS );
            }
        }
    }

    private void reset() {
        setCost( 0 );
        setTotalLosses( 0 );
        setTotalGains( 0 );
        setTotalUnchanged( 0 );
        setRandomGenerator( new Random( getRandomNumberSeed() ) );
    }

    private void setCost( final int cost ) {
        _cost = cost;
    }

    private void setGainLossMatrix( final CharacterStateMatrix gain_loss_matrix ) {
        _gain_loss_matrix = gain_loss_matrix;
    }

    private void setGainLossState( final int character_state_column,
                                   final PhylogenyNode node,
                                   final GainLossStates state ) {
        getGainLossMatrix().setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(),
                character_state_column,
                state );
    }

    private void setInternalNodeState( final Map states,
                                       final int character_state_column,
                                       final PhylogenyNode node ) {
        getInternalStatesMatrix()
        .setState( ForesterUtil.isEmpty( node.getName() ) ? node.getId() + "" : node.getName(),
                character_state_column,
                states.get( node ) );
    }

    private void setInternalNodeStatePriorToTraceback( final Map> states,
                                                       final int character_state_column,
                                                       final PhylogenyNode node ) {
        getInternalStatesMatrixPriorToTraceback()
        .setState( ForesterUtil.isEmpty( node.getName() ) ? String.valueOf( node.getId() ) : node.getName(),
                character_state_column,
                toListSorted( states.get( node ) ) );
    }

    private void setInternalStatesMatrixPriorToTraceback( final CharacterStateMatrix> internal_states_matrix_prior_to_traceback ) {
        _internal_states_matrix_prior_to_traceback = internal_states_matrix_prior_to_traceback;
    }

    private void setInternalStatesMatrixTraceback( final CharacterStateMatrix internal_states_matrix_after_traceback ) {
        _internal_states_matrix_after_traceback = internal_states_matrix_after_traceback;
    }

    private void setRandomGenerator( final Random random_generator ) {
        _random_generator = random_generator;
    }

    private void setTotalGains( final int total_gains ) {
        _total_gains = total_gains;
    }

    private void setTotalLosses( final int total_losses ) {
        _total_losses = total_losses;
    }

    private void setTotalUnchanged( final int total_unchanged ) {
        _total_unchanged = total_unchanged;
    }

    private List toListSorted( final SortedSet states ) {
        final List l = new ArrayList( states.size() );
        for( final STATE_TYPE state : states ) {
            l.add( state );
        }
        return l;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy