All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.data.PrincipalComponentsPlugin Maven / Gradle / Ivy

package net.maizegenetics.analysis.data;

import java.awt.Frame;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.Range;

import net.maizegenetics.analysis.numericaltransform.ImputationPlugin;
import net.maizegenetics.analysis.numericaltransform.NumericalGenotypePlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.score.ReferenceProbability;
import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrix;
import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrixFactory;
import net.maizegenetics.phenotype.NumericAttribute;
import net.maizegenetics.phenotype.Phenotype;
import net.maizegenetics.phenotype.PhenotypeBuilder;
import net.maizegenetics.phenotype.TaxaAttribute;
import net.maizegenetics.phenotype.Phenotype.ATTRIBUTE_TYPE;
import net.maizegenetics.phenotype.PhenotypeAttribute;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.stats.PCA.PrinComp;
import net.maizegenetics.stats.PCA.PrinComp.PC_TYPE;
import net.maizegenetics.util.OpenBitSet;
import net.maizegenetics.util.SimpleTableReport;

public class PrincipalComponentsPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(PrincipalComponentsPlugin.class);
    public static enum PCA_LIMIT {number_of_components, min_eigenvalue, total_variance};
    
    private PluginParameter useCovariance = new PluginParameter.Builder<>("covariance", true, Boolean.class)
    		.description("If the box is checked, then the analysis will do an eigenvalue decomposition of the covariance matrix. "
    				+ "If the box is unchecked, it will use a correlation matrix. Using the covariance matrix "
    				+ "is recommended for genotypes while the correlation matrix is often used for phenotypes.")
    		.guiName("covariance (alternative = correlation)")
    		.build();
    private PluginParameter limitBy = new PluginParameter.Builder<>("limitBy", PCA_LIMIT.number_of_components, PCA_LIMIT.class)
    		.description("This parameter determines the type of value that will be used to limit the number of principal components (axes) returned. "
    				+ "The possible choices are number_of_components, min_eigenvalue, and total_variance.")
    		.guiName("limit number of components by")
    		.build();
    private PluginParameter numberOfComponents = new PluginParameter.Builder<>("ncomponents", 5, Integer.class)
    		.description("The analysis will return this many principal components up to the number of taxa.")
    		.guiName("number of components")
    		.dependentOnParameter(limitBy, PCA_LIMIT.number_of_components)
    		.build();
    private PluginParameter minEigenval = new PluginParameter.Builder<>("minEigenval", 0.0, Double.class)
    		.description("All principal components with an eigenvalue greater than or equal to this value will be returned.")
    		.guiName("minimum eigenvalue")
    		.dependentOnParameter(limitBy, PCA_LIMIT.min_eigenvalue)
    		.build();
    private PluginParameter totalVar = new PluginParameter.Builder<>("totalVar", 0.5, Double.class)
    		.description("The first principal components that together explain this proportion of the total variance will be returned.")
    		.range(Range.closed(0.0, 1.0))
    		.guiName("total variance")
    		.dependentOnParameter(limitBy, PCA_LIMIT.total_variance)
    		.build();
    private PluginParameter reportEigenvalues = new PluginParameter.Builder<>("reportEigenvalues", true, Boolean.class)
    		.description("Returns a list of eigenvalues sorted high to low.")
    		.guiName("Return Eigenvalues")
    		.build();
    private PluginParameter reportEigenvectors = new PluginParameter.Builder<>("reportEigenvectors", true, Boolean.class)
    		.description("Returns the eigenvectors calculated from a Singular Value Decomposition of the data. The resulting table can be quite large if the number of variants and taxa are big.")
    		.guiName("Return Eigenvectors")
    		.build();
    
	public PrincipalComponentsPlugin(Frame parentFrame, boolean isInteractive) {
		super(parentFrame, isInteractive);
	}
	
	public DataSet processData(DataSet input){
		List myResults = new ArrayList<>();
		List myData = input.getDataOfType(new Class[]{Phenotype.class, GenotypeTable.class});
		for (Datum aDatum : myData) {
			if (aDatum.getData() instanceof Phenotype) {
				//check for missing values, throw an IllegalArgumentException if there is any missing data
				Phenotype myPhenotype = (Phenotype) aDatum.getData();
				if (areAnyPhenotypesMissing(myPhenotype.dataAttributeStream())) {
					StringBuilder msgBuilder = new StringBuilder();
					msgBuilder.append("There are missing values in ")
						.append(aDatum.getName())
						.append(". PCA will not be run.");
					throw new IllegalArgumentException(msgBuilder.toString());
				}

				//create the matrix, rows are observations, columns are data attributes
				List dataAttributes = myPhenotype.attributeListOfType(ATTRIBUTE_TYPE.data);
				int nAttributes = dataAttributes.size();
				int nobs = myPhenotype.numberOfObservations();
				DoubleMatrix dataMatrix = DoubleMatrixFactory.DEFAULT.make(nobs, nAttributes);
				int colCount = 0;
				for (PhenotypeAttribute attr: dataAttributes) {
					float[] colData = ((NumericAttribute) attr).floatValues();
					for (int i = 0; i < nobs; i++) dataMatrix.set(i, colCount, colData[i]);
					colCount++;
				}

				//run PCA
				PC_TYPE pctype;
				if (useCovariance.value()) pctype = PC_TYPE.cov;
				else pctype = PC_TYPE.corr;
				PrinComp pca = new PrinComp(dataMatrix, pctype);
				
				//get results
				myResults.addAll(addResultsToDatumList(pca, myPhenotype.taxaAttribute(), dataAttributes, aDatum.getName()));
				
			} else {
				GenotypeTable myGenotype = (GenotypeTable) aDatum.getData();
				
				//is there a reference probability? If not, create one and impute missing values
				if (!myGenotype.hasReferenceProbablity()) {
					myGenotype = NumericalGenotypePlugin.setAlternateMinorAllelesToMinor(myGenotype);
					DataSet myDataset = new DataSet(new Datum("name", myGenotype, "comment"), this);
					ImputationPlugin myImputer = new ImputationPlugin(null, false);
					myImputer.by_mean(true);
					DataSet imputedDataset = myImputer.performFunction(myDataset);
					myGenotype = (GenotypeTable) imputedDataset.getData(0).getData();
				} else if (areAnyGenotypesMissingInReferenceProbability(myGenotype)) {
					StringBuilder msgBuilder = new StringBuilder();
					msgBuilder.append("There are missing values in ")
						.append(aDatum.getName())
						.append(". PCA will not be run.");
					throw new IllegalArgumentException(msgBuilder.toString());
				}
				
				//create the matrix, rows are taxa, columns are sites
				int ntaxa = myGenotype.numberOfTaxa();
				int nsites = myGenotype.numberOfSites();
				DoubleMatrix dataMatrix = DoubleMatrixFactory.DEFAULT.make(ntaxa, nsites);
				for (int t = 0; t < ntaxa; t++) {
					for (int s = 0; s < nsites; s++) {
						dataMatrix.set(t, s, myGenotype.referenceProbability(t, s));
					}
				}

				//run PCA
				PC_TYPE pctype;
				if (useCovariance.value()) pctype = PC_TYPE.cov;
				else pctype = PC_TYPE.corr;
				PrinComp pca = new PrinComp(dataMatrix, pctype);

				//get results
				myResults.addAll(addResultsToDatumList(pca, myGenotype, aDatum.getName()));
				
			}
		}

		return new DataSet(myResults, this);
	}
	
	private boolean areAnyPhenotypesMissing(Stream attributes) {
		Optional na = attributes.filter(a -> a.missing().cardinality() > 0).findAny();
		return na.isPresent();
	}
	
	private boolean areAnyGenotypesMissingInReferenceProbability(GenotypeTable myGenotype) {
		int ntaxa = myGenotype.numberOfTaxa();
		int nsites = myGenotype.numberOfSites();
		ReferenceProbability refprob = myGenotype.referenceProbability();
		for (int s = 0; s < nsites; s++) {
			for (int t = 0; t < ntaxa; t++) {
				if (Float.isNaN(refprob.value(t, s))) return true;
			}
		}
		return false;
	}
	
	private List addResultsToDatumList(PrinComp pca, TaxaAttribute myTaxa, List data, String datasetName) {
		List results = new ArrayList<>();
		
		//determine how many pc's to return
		int numberOfPCs;
		double[] eigenvalues = pca.getEigenValues();
		int nvalues = eigenvalues.length;
		double[] cumulativeEigenvalues = Arrays.copyOf(eigenvalues, nvalues);
		for (int i = 1; i < nvalues; i++) {
			cumulativeEigenvalues[i] += cumulativeEigenvalues[i - 1];
		}
		
		if (limitBy.value() == PCA_LIMIT.number_of_components) {
			numberOfPCs = Math.min(numberOfComponents.value(), nvalues);
		} else if (limitBy.value() == PCA_LIMIT.total_variance) {
			double limit = totalVar.value() * cumulativeEigenvalues[nvalues - 1];
			int ndx = Arrays.binarySearch(cumulativeEigenvalues, limit);
			if (ndx < -1) numberOfPCs = - ndx;
			else numberOfPCs = ndx + 1;
			numberOfPCs = Math.min(numberOfPCs, nvalues);
		} else {    //min_eigenvalue
			int ndx = Arrays.binarySearch(eigenvalues, minEigenval.value());
			if (ndx < -1) numberOfPCs = - ndx;
			else numberOfPCs = ndx + 1;
			numberOfPCs = Math.min(numberOfPCs, nvalues);
		}
		
		//create a Phenotype with the requested number of PCs
		DoubleMatrix pcs = pca.getPrincipalComponents();
		List attributes = new ArrayList<>();
		List types = new ArrayList<>();
		attributes.add(myTaxa);
		types.add(ATTRIBUTE_TYPE.taxa);
		int ntaxa = myTaxa.size();
		for (int i = 0; i < numberOfPCs; i++) {
			String pcname = "PC" + (i + 1);
			float[] pcvalue = new float[ntaxa];
			for (int t = 0; t < ntaxa; t++) {
				pcvalue[t] = (float) pcs.get(t, i);
			}
			NumericAttribute na = new NumericAttribute(pcname, pcvalue, new OpenBitSet(ntaxa));
			attributes.add(na);
			types.add(ATTRIBUTE_TYPE.covariate);
		}
		Phenotype pcPhenotype = new PhenotypeBuilder().fromAttributeList(attributes, types).build().get(0);
		StringBuilder nameBuilder = new StringBuilder();
		nameBuilder.append("PC_").append(datasetName);
		StringBuilder commentBuilder = new StringBuilder("\nPrincipalComponents stored as covariates.\n");
		commentBuilder.append("calculated from ").append(datasetName);
		
		results.add(new Datum(nameBuilder.toString(), pcPhenotype, commentBuilder.toString()));
		
		//create a tableReport with eigenvalues, if requested
		if (reportEigenvalues.value()) {
			String name = "Proportion of Variance Explained";
			String[] columnNames = new String[]{"PC","eigenvalue","proportion of total","cumulative proportion"};
			int nEigenvalues = eigenvalues.length;
			Object[][] tableData = new Object[nEigenvalues][4];
			double sumvalues = cumulativeEigenvalues[nEigenvalues - 1];
			for (int i = 0; i < nEigenvalues; i++) {
				tableData[i][0] = String.format("%d",i+1);
				tableData[i][1] = new Double(eigenvalues[i]);
				tableData[i][2] = new Double(eigenvalues[i]/sumvalues);
				tableData[i][3] = new Double(cumulativeEigenvalues[i]/sumvalues);
			}
			
			nameBuilder = new StringBuilder();
			nameBuilder.append("Eigenvalues_").append(datasetName);
			commentBuilder = new StringBuilder("\nEigenvalues and proportion of variance explained by PCs.\n");
			commentBuilder.append("calculated from ").append(datasetName);
			SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
			results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
		}
		
		//create a tableReport with eigenvectors, if requested
		DoubleMatrix eigenvectors = pca.getEigenVectors();
		if (reportEigenvectors.value()) {
			String name = "Eigenvectors";
			int ncol = numberOfPCs + 1;
			int nrows = data.size();
			String[] columnNames = new String[ncol];
			columnNames[0] = "Trait";
			for (int c = 1; c < ncol; c++) columnNames[c] = String.format("Eigenvector%d",c);
			Object[][] tableData = new Object[nrows][ncol];
			for (int r = 0; r < nrows; r++) {
				tableData[r][0] = data.get(r).name();
				for (int c = 1; c < ncol; c++) tableData[r][c] = new Double(eigenvectors.get(r, c - 1));
			}
			
			nameBuilder = new StringBuilder();
			nameBuilder.append("Eigenvectors_").append(datasetName);
			commentBuilder = new StringBuilder("\nEigenvectors for requested PCs.\n");
			commentBuilder.append("calculated from ").append(datasetName);
			SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
			results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
			
		}
		
		return results;
	}
	
	private List addResultsToDatumList(PrinComp pca, GenotypeTable myGenotype, String datasetName) {
		List results = new ArrayList<>();
		
		//determine how many pc's to return
		int numberOfPCs;
		double[] eigenvalues = pca.getEigenValues();
		int nvalues = eigenvalues.length;
		double[] cumulativeEigenvalues = Arrays.copyOf(eigenvalues, nvalues);
		for (int i = 1; i < nvalues; i++) {
			cumulativeEigenvalues[i] += cumulativeEigenvalues[i - 1];
		}
		
		if (limitBy.value() == PCA_LIMIT.number_of_components) {
			numberOfPCs = Math.min(numberOfComponents.value(), nvalues);
		} else if (limitBy.value() == PCA_LIMIT.total_variance) {
			double limit = totalVar.value() * cumulativeEigenvalues[nvalues - 1];
			int ndx = Arrays.binarySearch(cumulativeEigenvalues, limit);
			if (ndx < -1) numberOfPCs = - ndx;
			else numberOfPCs = ndx + 1;
			numberOfPCs = Math.min(numberOfPCs, nvalues);
		} else {    //min_eigenvalue
			int ndx = Arrays.binarySearch(eigenvalues, minEigenval.value());
			if (ndx < -1) numberOfPCs = - ndx;
			else numberOfPCs = ndx + 1;
			numberOfPCs = Math.min(numberOfPCs, nvalues);
		}
		
		//create a Phenotype with the requested number of PCs
		DoubleMatrix pcs = pca.getPrincipalComponents();
		List attributes = new ArrayList<>();
		List types = new ArrayList<>();
		attributes.add(new TaxaAttribute(myGenotype.taxa()));
		types.add(ATTRIBUTE_TYPE.taxa);
		int ntaxa = myGenotype.numberOfTaxa();
		for (int i = 0; i < numberOfPCs; i++) {
			String pcname = "PC" + (i + 1);
			float[] pcvalue = new float[ntaxa];
			for (int t = 0; t < ntaxa; t++) {
				pcvalue[t] = (float) pcs.get(t, i);
			}
			NumericAttribute na = new NumericAttribute(pcname, pcvalue, new OpenBitSet(ntaxa));
			attributes.add(na);
			types.add(ATTRIBUTE_TYPE.covariate);
		}
		Phenotype pcPhenotype = new PhenotypeBuilder().fromAttributeList(attributes, types).build().get(0);
		StringBuilder nameBuilder = new StringBuilder();
		nameBuilder.append("PC_").append(datasetName);
		StringBuilder commentBuilder = new StringBuilder("\nPrincipalComponents stored as covariates.\n");
		commentBuilder.append("calculated from ").append(datasetName);
		
		results.add(new Datum(nameBuilder.toString(), pcPhenotype, commentBuilder.toString()));
		
		//create a tableReport with eigenvalues, if requested
		if (reportEigenvalues.value()) {
			String name = "Proportion of Variance Explained";
			String[] columnNames = new String[]{"PC","eigenvalue","proportion of total","cumulative proportion"};
			int nEigenvalues = eigenvalues.length;
			Object[][] tableData = new Object[nEigenvalues][4];
			double sumvalues = cumulativeEigenvalues[nEigenvalues - 1];
			for (int i = 0; i < nEigenvalues; i++) {
				tableData[i][0] = String.format("%d",i);
				tableData[i][1] = new Double(eigenvalues[i]);
				tableData[i][2] = new Double(eigenvalues[i]/sumvalues);
				tableData[i][3] = new Double(cumulativeEigenvalues[i]/sumvalues);
			}
			
			nameBuilder = new StringBuilder();
			nameBuilder.append("Eigenvalues_").append(datasetName);
			commentBuilder = new StringBuilder("\nEigenvalues and proportion of variance explained by PCs.\n");
			commentBuilder.append("calculated from ").append(datasetName);
			SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
			results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
		}
		
		//create a tableReport with eigenvectors, if requested
		DoubleMatrix eigenvectors = pca.getEigenVectors();
		if (reportEigenvectors.value()) {
			String name = "Eigenvectors";
			int ncol = numberOfPCs + 1;
			int nrows = myGenotype.numberOfSites();
			String[] columnNames = new String[ncol];
			columnNames[0] = "Trait";
			for (int c = 1; c < ncol; c++) columnNames[c] = String.format("Eigenvector%d",c);
			Object[][] tableData = new Object[nrows][ncol];
			for (int r = 0; r < nrows; r++) {
				tableData[r][0] = myGenotype.positions().siteName(r);
				for (int c = 1; c < ncol; c++) tableData[r][c] = new Double(eigenvectors.get(r, c - 1));
			}
			
			nameBuilder = new StringBuilder();
			nameBuilder.append("Eigenvectors_").append(datasetName);
			commentBuilder = new StringBuilder("\nEigenvectors for requested PCs.\n");
			commentBuilder.append("calculated from ").append(datasetName);
			SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
			results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
			
		}
		
		return results;
	}

	@Override
	public String pluginDescription() {
		return "This plugin performs principal components analysis and returns the requested number of PC axes (components), and, optionally, the eigenvalues and eigenvectors. "
				+ "It can take as input either phenotype data or ReferenceProbability from a GenotypeTable.";
	}

	@Override
	public ImageIcon getIcon() {
        URL imageURL = FileLoadPlugin.class.getResource("/net/maizegenetics/analysis/images/pca.gif");
        if (imageURL == null) {
            return null;
        } else {
            return new ImageIcon(imageURL);
        }
	}

	@Override
	public String getButtonName() {
		return "PCA";
	}

	@Override
	public String getToolTipText() {
		return "Performs principal components analysis";
	}

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(PrincipalComponentsPlugin.class);
    // }

    /**
     * Convenience method to run plugin with one return object.
     */
    public Phenotype runPlugin(DataSet input) {
        return (Phenotype) performFunction(input).getData(0).getData();
    }

    /**
     * If the box is checked, then the analysis will do an
     * eigenvalue decomposition of the covariance matrix.
     * If the box is unchecked, it will use a correlation
     * matrix. Using the covariance matrix is recommended
     * for genotypes while the correlation matrix is often
     * used for phenotypes.
     *
     * @return covariance (alternative = correlation)
     */
    public Boolean covariance() {
        return useCovariance.value();
    }

    /**
     * Set covariance (alternative = correlation). If the
     * box is checked, then the analysis will do an eigenvalue
     * decomposition of the covariance matrix. If the box
     * is unchecked, it will use a correlation matrix. Using
     * the covariance matrix is recommended for genotypes
     * while the correlation matrix is often used for phenotypes.
     *
     * @param value covariance (alternative = correlation)
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin covariance(Boolean value) {
        useCovariance = new PluginParameter<>(useCovariance, value);
        return this;
    }

    /**
     * This parameter determines the type of value that will
     * be used to limit the number of principal components
     * (axes) returned. The possible choices are number_of_components,
     * min_eigenvalue, and total_variance.
     *
     * @return limit number of components by
     */
    public PCA_LIMIT limitNumberOfComponentsBy() {
        return limitBy.value();
    }

    /**
     * Set limit number of components by. This parameter determines
     * the type of value that will be used to limit the number
     * of principal components (axes) returned. The possible
     * choices are number_of_components, min_eigenvalue, and
     * total_variance.
     *
     * @param value limit number of components by
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin limitNumberOfComponentsBy(PCA_LIMIT value) {
        limitBy = new PluginParameter<>(limitBy, value);
        return this;
    }

    /**
     * The analysis will return this many principal components
     * up to the number of taxa.
     *
     * @return number of components
     */
    public Integer numberOfComponents() {
        return numberOfComponents.value();
    }

    /**
     * Set number of components. The analysis will return
     * this many principal components up to the number of
     * taxa.
     *
     * @param value number of components
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin numberOfComponents(Integer value) {
        numberOfComponents = new PluginParameter<>(numberOfComponents, value);
        return this;
    }

    /**
     * All principal components with an eigenvalue greater
     * than or equal to this value will be returned.
     *
     * @return minimum eigenvalue
     */
    public Double minimumEigenvalue() {
        return minEigenval.value();
    }

    /**
     * Set minimum eigenvalue. All principal components with
     * an eigenvalue greater than or equal to this value will
     * be returned.
     *
     * @param value minimum eigenvalue
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin minimumEigenvalue(Double value) {
        minEigenval = new PluginParameter<>(minEigenval, value);
        return this;
    }

    /**
     * The first principal components that together explain
     * this proportion of the total variance will be returned.
     *
     * @return total variance
     */
    public Double totalVariance() {
        return totalVar.value();
    }

    /**
     * Set total variance. The first principal components
     * that together explain this proportion of the total
     * variance will be returned.
     *
     * @param value total variance
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin totalVariance(Double value) {
        totalVar = new PluginParameter<>(totalVar, value);
        return this;
    }

    /**
     * Returns a list of eigenvalues sorted high to low.
     *
     * @return Return Eigenvalues
     */
    public Boolean returnEigenvalues() {
        return reportEigenvalues.value();
    }

    /**
     * Set Return Eigenvalues. Returns a list of eigenvalues
     * sorted high to low.
     *
     * @param value Return Eigenvalues
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin returnEigenvalues(Boolean value) {
        reportEigenvalues = new PluginParameter<>(reportEigenvalues, value);
        return this;
    }

    /**
     * Returns the eigenvectors calculated from a Singular
     * Value Decomposition of the data. The resulting table
     * can be quite large if the number of variants and taxa
     * are big.
     *
     * @return Return Eigenvectors
     */
    public Boolean returnEigenvectors() {
        return reportEigenvectors.value();
    }

    /**
     * Set Return Eigenvectors. Returns the eigenvectors calculated
     * from a Singular Value Decomposition of the data. The
     * resulting table can be quite large if the number of
     * variants and taxa are big.
     *
     * @param value Return Eigenvectors
     *
     * @return this plugin
     */
    public PrincipalComponentsPlugin returnEigenvectors(Boolean value) {
        reportEigenvectors = new PluginParameter<>(reportEigenvectors, value);
        return this;
    }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy