net.maizegenetics.analysis.data.PrincipalComponentsPlugin Maven / Gradle / Ivy
package net.maizegenetics.analysis.data;
import java.awt.Frame;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import net.maizegenetics.analysis.numericaltransform.ImputationPlugin;
import net.maizegenetics.analysis.numericaltransform.NumericalGenotypePlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.score.ReferenceProbability;
import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrix;
import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrixFactory;
import net.maizegenetics.phenotype.NumericAttribute;
import net.maizegenetics.phenotype.Phenotype;
import net.maizegenetics.phenotype.PhenotypeBuilder;
import net.maizegenetics.phenotype.TaxaAttribute;
import net.maizegenetics.phenotype.Phenotype.ATTRIBUTE_TYPE;
import net.maizegenetics.phenotype.PhenotypeAttribute;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.stats.PCA.PrinComp;
import net.maizegenetics.stats.PCA.PrinComp.PC_TYPE;
import net.maizegenetics.util.OpenBitSet;
import net.maizegenetics.util.SimpleTableReport;
public class PrincipalComponentsPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(PrincipalComponentsPlugin.class);
public static enum PCA_LIMIT {number_of_components, min_eigenvalue, total_variance};
private PluginParameter useCovariance = new PluginParameter.Builder<>("covariance", true, Boolean.class)
.description("If the box is checked, then the analysis will do an eigenvalue decomposition of the covariance matrix. "
+ "If the box is unchecked, it will use a correlation matrix. Using the covariance matrix "
+ "is recommended for genotypes while the correlation matrix is often used for phenotypes.")
.guiName("covariance (alternative = correlation)")
.build();
private PluginParameter limitBy = new PluginParameter.Builder<>("limitBy", PCA_LIMIT.number_of_components, PCA_LIMIT.class)
.description("This parameter determines the type of value that will be used to limit the number of principal components (axes) returned. "
+ "The possible choices are number_of_components, min_eigenvalue, and total_variance.")
.guiName("limit number of components by")
.build();
private PluginParameter numberOfComponents = new PluginParameter.Builder<>("ncomponents", 5, Integer.class)
.description("The analysis will return this many principal components up to the number of taxa.")
.guiName("number of components")
.dependentOnParameter(limitBy, PCA_LIMIT.number_of_components)
.build();
private PluginParameter minEigenval = new PluginParameter.Builder<>("minEigenval", 0.0, Double.class)
.description("All principal components with an eigenvalue greater than or equal to this value will be returned.")
.guiName("minimum eigenvalue")
.dependentOnParameter(limitBy, PCA_LIMIT.min_eigenvalue)
.build();
private PluginParameter totalVar = new PluginParameter.Builder<>("totalVar", 0.5, Double.class)
.description("The first principal components that together explain this proportion of the total variance will be returned.")
.range(Range.closed(0.0, 1.0))
.guiName("total variance")
.dependentOnParameter(limitBy, PCA_LIMIT.total_variance)
.build();
private PluginParameter reportEigenvalues = new PluginParameter.Builder<>("reportEigenvalues", true, Boolean.class)
.description("Returns a list of eigenvalues sorted high to low.")
.guiName("Return Eigenvalues")
.build();
private PluginParameter reportEigenvectors = new PluginParameter.Builder<>("reportEigenvectors", true, Boolean.class)
.description("Returns the eigenvectors calculated from a Singular Value Decomposition of the data. The resulting table can be quite large if the number of variants and taxa are big.")
.guiName("Return Eigenvectors")
.build();
public PrincipalComponentsPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
public DataSet processData(DataSet input){
List myResults = new ArrayList<>();
List myData = input.getDataOfType(new Class[]{Phenotype.class, GenotypeTable.class});
for (Datum aDatum : myData) {
if (aDatum.getData() instanceof Phenotype) {
//check for missing values, throw an IllegalArgumentException if there is any missing data
Phenotype myPhenotype = (Phenotype) aDatum.getData();
if (areAnyPhenotypesMissing(myPhenotype.dataAttributeStream())) {
StringBuilder msgBuilder = new StringBuilder();
msgBuilder.append("There are missing values in ")
.append(aDatum.getName())
.append(". PCA will not be run.");
throw new IllegalArgumentException(msgBuilder.toString());
}
//create the matrix, rows are observations, columns are data attributes
List dataAttributes = myPhenotype.attributeListOfType(ATTRIBUTE_TYPE.data);
int nAttributes = dataAttributes.size();
int nobs = myPhenotype.numberOfObservations();
DoubleMatrix dataMatrix = DoubleMatrixFactory.DEFAULT.make(nobs, nAttributes);
int colCount = 0;
for (PhenotypeAttribute attr: dataAttributes) {
float[] colData = ((NumericAttribute) attr).floatValues();
for (int i = 0; i < nobs; i++) dataMatrix.set(i, colCount, colData[i]);
colCount++;
}
//run PCA
PC_TYPE pctype;
if (useCovariance.value()) pctype = PC_TYPE.cov;
else pctype = PC_TYPE.corr;
PrinComp pca = new PrinComp(dataMatrix, pctype);
//get results
myResults.addAll(addResultsToDatumList(pca, myPhenotype.taxaAttribute(), dataAttributes, aDatum.getName()));
} else {
GenotypeTable myGenotype = (GenotypeTable) aDatum.getData();
//is there a reference probability? If not, create one and impute missing values
if (!myGenotype.hasReferenceProbablity()) {
myGenotype = NumericalGenotypePlugin.setAlternateMinorAllelesToMinor(myGenotype);
DataSet myDataset = new DataSet(new Datum("name", myGenotype, "comment"), this);
ImputationPlugin myImputer = new ImputationPlugin(null, false);
myImputer.by_mean(true);
DataSet imputedDataset = myImputer.performFunction(myDataset);
myGenotype = (GenotypeTable) imputedDataset.getData(0).getData();
} else if (areAnyGenotypesMissingInReferenceProbability(myGenotype)) {
StringBuilder msgBuilder = new StringBuilder();
msgBuilder.append("There are missing values in ")
.append(aDatum.getName())
.append(". PCA will not be run.");
throw new IllegalArgumentException(msgBuilder.toString());
}
//create the matrix, rows are taxa, columns are sites
int ntaxa = myGenotype.numberOfTaxa();
int nsites = myGenotype.numberOfSites();
DoubleMatrix dataMatrix = DoubleMatrixFactory.DEFAULT.make(ntaxa, nsites);
for (int t = 0; t < ntaxa; t++) {
for (int s = 0; s < nsites; s++) {
dataMatrix.set(t, s, myGenotype.referenceProbability(t, s));
}
}
//run PCA
PC_TYPE pctype;
if (useCovariance.value()) pctype = PC_TYPE.cov;
else pctype = PC_TYPE.corr;
PrinComp pca = new PrinComp(dataMatrix, pctype);
//get results
myResults.addAll(addResultsToDatumList(pca, myGenotype, aDatum.getName()));
}
}
return new DataSet(myResults, this);
}
private boolean areAnyPhenotypesMissing(Stream attributes) {
Optional na = attributes.filter(a -> a.missing().cardinality() > 0).findAny();
return na.isPresent();
}
private boolean areAnyGenotypesMissingInReferenceProbability(GenotypeTable myGenotype) {
int ntaxa = myGenotype.numberOfTaxa();
int nsites = myGenotype.numberOfSites();
ReferenceProbability refprob = myGenotype.referenceProbability();
for (int s = 0; s < nsites; s++) {
for (int t = 0; t < ntaxa; t++) {
if (Float.isNaN(refprob.value(t, s))) return true;
}
}
return false;
}
private List addResultsToDatumList(PrinComp pca, TaxaAttribute myTaxa, List data, String datasetName) {
List results = new ArrayList<>();
//determine how many pc's to return
int numberOfPCs;
double[] eigenvalues = pca.getEigenValues();
int nvalues = eigenvalues.length;
double[] cumulativeEigenvalues = Arrays.copyOf(eigenvalues, nvalues);
for (int i = 1; i < nvalues; i++) {
cumulativeEigenvalues[i] += cumulativeEigenvalues[i - 1];
}
if (limitBy.value() == PCA_LIMIT.number_of_components) {
numberOfPCs = Math.min(numberOfComponents.value(), nvalues);
} else if (limitBy.value() == PCA_LIMIT.total_variance) {
double limit = totalVar.value() * cumulativeEigenvalues[nvalues - 1];
int ndx = Arrays.binarySearch(cumulativeEigenvalues, limit);
if (ndx < -1) numberOfPCs = - ndx;
else numberOfPCs = ndx + 1;
numberOfPCs = Math.min(numberOfPCs, nvalues);
} else { //min_eigenvalue
int ndx = Arrays.binarySearch(eigenvalues, minEigenval.value());
if (ndx < -1) numberOfPCs = - ndx;
else numberOfPCs = ndx + 1;
numberOfPCs = Math.min(numberOfPCs, nvalues);
}
//create a Phenotype with the requested number of PCs
DoubleMatrix pcs = pca.getPrincipalComponents();
List attributes = new ArrayList<>();
List types = new ArrayList<>();
attributes.add(myTaxa);
types.add(ATTRIBUTE_TYPE.taxa);
int ntaxa = myTaxa.size();
for (int i = 0; i < numberOfPCs; i++) {
String pcname = "PC" + (i + 1);
float[] pcvalue = new float[ntaxa];
for (int t = 0; t < ntaxa; t++) {
pcvalue[t] = (float) pcs.get(t, i);
}
NumericAttribute na = new NumericAttribute(pcname, pcvalue, new OpenBitSet(ntaxa));
attributes.add(na);
types.add(ATTRIBUTE_TYPE.covariate);
}
Phenotype pcPhenotype = new PhenotypeBuilder().fromAttributeList(attributes, types).build().get(0);
StringBuilder nameBuilder = new StringBuilder();
nameBuilder.append("PC_").append(datasetName);
StringBuilder commentBuilder = new StringBuilder("\nPrincipalComponents stored as covariates.\n");
commentBuilder.append("calculated from ").append(datasetName);
results.add(new Datum(nameBuilder.toString(), pcPhenotype, commentBuilder.toString()));
//create a tableReport with eigenvalues, if requested
if (reportEigenvalues.value()) {
String name = "Proportion of Variance Explained";
String[] columnNames = new String[]{"PC","eigenvalue","proportion of total","cumulative proportion"};
int nEigenvalues = eigenvalues.length;
Object[][] tableData = new Object[nEigenvalues][4];
double sumvalues = cumulativeEigenvalues[nEigenvalues - 1];
for (int i = 0; i < nEigenvalues; i++) {
tableData[i][0] = String.format("%d",i+1);
tableData[i][1] = new Double(eigenvalues[i]);
tableData[i][2] = new Double(eigenvalues[i]/sumvalues);
tableData[i][3] = new Double(cumulativeEigenvalues[i]/sumvalues);
}
nameBuilder = new StringBuilder();
nameBuilder.append("Eigenvalues_").append(datasetName);
commentBuilder = new StringBuilder("\nEigenvalues and proportion of variance explained by PCs.\n");
commentBuilder.append("calculated from ").append(datasetName);
SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
}
//create a tableReport with eigenvectors, if requested
DoubleMatrix eigenvectors = pca.getEigenVectors();
if (reportEigenvectors.value()) {
String name = "Eigenvectors";
int ncol = numberOfPCs + 1;
int nrows = data.size();
String[] columnNames = new String[ncol];
columnNames[0] = "Trait";
for (int c = 1; c < ncol; c++) columnNames[c] = String.format("Eigenvector%d",c);
Object[][] tableData = new Object[nrows][ncol];
for (int r = 0; r < nrows; r++) {
tableData[r][0] = data.get(r).name();
for (int c = 1; c < ncol; c++) tableData[r][c] = new Double(eigenvectors.get(r, c - 1));
}
nameBuilder = new StringBuilder();
nameBuilder.append("Eigenvectors_").append(datasetName);
commentBuilder = new StringBuilder("\nEigenvectors for requested PCs.\n");
commentBuilder.append("calculated from ").append(datasetName);
SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
}
return results;
}
private List addResultsToDatumList(PrinComp pca, GenotypeTable myGenotype, String datasetName) {
List results = new ArrayList<>();
//determine how many pc's to return
int numberOfPCs;
double[] eigenvalues = pca.getEigenValues();
int nvalues = eigenvalues.length;
double[] cumulativeEigenvalues = Arrays.copyOf(eigenvalues, nvalues);
for (int i = 1; i < nvalues; i++) {
cumulativeEigenvalues[i] += cumulativeEigenvalues[i - 1];
}
if (limitBy.value() == PCA_LIMIT.number_of_components) {
numberOfPCs = Math.min(numberOfComponents.value(), nvalues);
} else if (limitBy.value() == PCA_LIMIT.total_variance) {
double limit = totalVar.value() * cumulativeEigenvalues[nvalues - 1];
int ndx = Arrays.binarySearch(cumulativeEigenvalues, limit);
if (ndx < -1) numberOfPCs = - ndx;
else numberOfPCs = ndx + 1;
numberOfPCs = Math.min(numberOfPCs, nvalues);
} else { //min_eigenvalue
int ndx = Arrays.binarySearch(eigenvalues, minEigenval.value());
if (ndx < -1) numberOfPCs = - ndx;
else numberOfPCs = ndx + 1;
numberOfPCs = Math.min(numberOfPCs, nvalues);
}
//create a Phenotype with the requested number of PCs
DoubleMatrix pcs = pca.getPrincipalComponents();
List attributes = new ArrayList<>();
List types = new ArrayList<>();
attributes.add(new TaxaAttribute(myGenotype.taxa()));
types.add(ATTRIBUTE_TYPE.taxa);
int ntaxa = myGenotype.numberOfTaxa();
for (int i = 0; i < numberOfPCs; i++) {
String pcname = "PC" + (i + 1);
float[] pcvalue = new float[ntaxa];
for (int t = 0; t < ntaxa; t++) {
pcvalue[t] = (float) pcs.get(t, i);
}
NumericAttribute na = new NumericAttribute(pcname, pcvalue, new OpenBitSet(ntaxa));
attributes.add(na);
types.add(ATTRIBUTE_TYPE.covariate);
}
Phenotype pcPhenotype = new PhenotypeBuilder().fromAttributeList(attributes, types).build().get(0);
StringBuilder nameBuilder = new StringBuilder();
nameBuilder.append("PC_").append(datasetName);
StringBuilder commentBuilder = new StringBuilder("\nPrincipalComponents stored as covariates.\n");
commentBuilder.append("calculated from ").append(datasetName);
results.add(new Datum(nameBuilder.toString(), pcPhenotype, commentBuilder.toString()));
//create a tableReport with eigenvalues, if requested
if (reportEigenvalues.value()) {
String name = "Proportion of Variance Explained";
String[] columnNames = new String[]{"PC","eigenvalue","proportion of total","cumulative proportion"};
int nEigenvalues = eigenvalues.length;
Object[][] tableData = new Object[nEigenvalues][4];
double sumvalues = cumulativeEigenvalues[nEigenvalues - 1];
for (int i = 0; i < nEigenvalues; i++) {
tableData[i][0] = String.format("%d",i);
tableData[i][1] = new Double(eigenvalues[i]);
tableData[i][2] = new Double(eigenvalues[i]/sumvalues);
tableData[i][3] = new Double(cumulativeEigenvalues[i]/sumvalues);
}
nameBuilder = new StringBuilder();
nameBuilder.append("Eigenvalues_").append(datasetName);
commentBuilder = new StringBuilder("\nEigenvalues and proportion of variance explained by PCs.\n");
commentBuilder.append("calculated from ").append(datasetName);
SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
}
//create a tableReport with eigenvectors, if requested
DoubleMatrix eigenvectors = pca.getEigenVectors();
if (reportEigenvectors.value()) {
String name = "Eigenvectors";
int ncol = numberOfPCs + 1;
int nrows = myGenotype.numberOfSites();
String[] columnNames = new String[ncol];
columnNames[0] = "Trait";
for (int c = 1; c < ncol; c++) columnNames[c] = String.format("Eigenvector%d",c);
Object[][] tableData = new Object[nrows][ncol];
for (int r = 0; r < nrows; r++) {
tableData[r][0] = myGenotype.positions().siteName(r);
for (int c = 1; c < ncol; c++) tableData[r][c] = new Double(eigenvectors.get(r, c - 1));
}
nameBuilder = new StringBuilder();
nameBuilder.append("Eigenvectors_").append(datasetName);
commentBuilder = new StringBuilder("\nEigenvectors for requested PCs.\n");
commentBuilder.append("calculated from ").append(datasetName);
SimpleTableReport str = new SimpleTableReport(name, columnNames, tableData);
results.add(new Datum(nameBuilder.toString(), str, commentBuilder.toString()));
}
return results;
}
@Override
public String pluginDescription() {
return "This plugin performs principal components analysis and returns the requested number of PC axes (components), and, optionally, the eigenvalues and eigenvectors. "
+ "It can take as input either phenotype data or ReferenceProbability from a GenotypeTable.";
}
@Override
public ImageIcon getIcon() {
URL imageURL = FileLoadPlugin.class.getResource("/net/maizegenetics/analysis/images/pca.gif");
if (imageURL == null) {
return null;
} else {
return new ImageIcon(imageURL);
}
}
@Override
public String getButtonName() {
return "PCA";
}
@Override
public String getToolTipText() {
return "Performs principal components analysis";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(PrincipalComponentsPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
public Phenotype runPlugin(DataSet input) {
return (Phenotype) performFunction(input).getData(0).getData();
}
/**
* If the box is checked, then the analysis will do an
* eigenvalue decomposition of the covariance matrix.
* If the box is unchecked, it will use a correlation
* matrix. Using the covariance matrix is recommended
* for genotypes while the correlation matrix is often
* used for phenotypes.
*
* @return covariance (alternative = correlation)
*/
public Boolean covariance() {
return useCovariance.value();
}
/**
* Set covariance (alternative = correlation). If the
* box is checked, then the analysis will do an eigenvalue
* decomposition of the covariance matrix. If the box
* is unchecked, it will use a correlation matrix. Using
* the covariance matrix is recommended for genotypes
* while the correlation matrix is often used for phenotypes.
*
* @param value covariance (alternative = correlation)
*
* @return this plugin
*/
public PrincipalComponentsPlugin covariance(Boolean value) {
useCovariance = new PluginParameter<>(useCovariance, value);
return this;
}
/**
* This parameter determines the type of value that will
* be used to limit the number of principal components
* (axes) returned. The possible choices are number_of_components,
* min_eigenvalue, and total_variance.
*
* @return limit number of components by
*/
public PCA_LIMIT limitNumberOfComponentsBy() {
return limitBy.value();
}
/**
* Set limit number of components by. This parameter determines
* the type of value that will be used to limit the number
* of principal components (axes) returned. The possible
* choices are number_of_components, min_eigenvalue, and
* total_variance.
*
* @param value limit number of components by
*
* @return this plugin
*/
public PrincipalComponentsPlugin limitNumberOfComponentsBy(PCA_LIMIT value) {
limitBy = new PluginParameter<>(limitBy, value);
return this;
}
/**
* The analysis will return this many principal components
* up to the number of taxa.
*
* @return number of components
*/
public Integer numberOfComponents() {
return numberOfComponents.value();
}
/**
* Set number of components. The analysis will return
* this many principal components up to the number of
* taxa.
*
* @param value number of components
*
* @return this plugin
*/
public PrincipalComponentsPlugin numberOfComponents(Integer value) {
numberOfComponents = new PluginParameter<>(numberOfComponents, value);
return this;
}
/**
* All principal components with an eigenvalue greater
* than or equal to this value will be returned.
*
* @return minimum eigenvalue
*/
public Double minimumEigenvalue() {
return minEigenval.value();
}
/**
* Set minimum eigenvalue. All principal components with
* an eigenvalue greater than or equal to this value will
* be returned.
*
* @param value minimum eigenvalue
*
* @return this plugin
*/
public PrincipalComponentsPlugin minimumEigenvalue(Double value) {
minEigenval = new PluginParameter<>(minEigenval, value);
return this;
}
/**
* The first principal components that together explain
* this proportion of the total variance will be returned.
*
* @return total variance
*/
public Double totalVariance() {
return totalVar.value();
}
/**
* Set total variance. The first principal components
* that together explain this proportion of the total
* variance will be returned.
*
* @param value total variance
*
* @return this plugin
*/
public PrincipalComponentsPlugin totalVariance(Double value) {
totalVar = new PluginParameter<>(totalVar, value);
return this;
}
/**
* Returns a list of eigenvalues sorted high to low.
*
* @return Return Eigenvalues
*/
public Boolean returnEigenvalues() {
return reportEigenvalues.value();
}
/**
* Set Return Eigenvalues. Returns a list of eigenvalues
* sorted high to low.
*
* @param value Return Eigenvalues
*
* @return this plugin
*/
public PrincipalComponentsPlugin returnEigenvalues(Boolean value) {
reportEigenvalues = new PluginParameter<>(reportEigenvalues, value);
return this;
}
/**
* Returns the eigenvectors calculated from a Singular
* Value Decomposition of the data. The resulting table
* can be quite large if the number of variants and taxa
* are big.
*
* @return Return Eigenvectors
*/
public Boolean returnEigenvectors() {
return reportEigenvectors.value();
}
/**
* Set Return Eigenvectors. Returns the eigenvectors calculated
* from a Singular Value Decomposition of the data. The
* resulting table can be quite large if the number of
* variants and taxa are big.
*
* @param value Return Eigenvectors
*
* @return this plugin
*/
public PrincipalComponentsPlugin returnEigenvectors(Boolean value) {
reportEigenvectors = new PluginParameter<>(reportEigenvectors, value);
return this;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy