
smile.projection.PCA

/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.projection;

import smile.math.Math;
import smile.math.matrix.EigenValueDecomposition;
import smile.math.matrix.SingularValueDecomposition;

/**
 * Principal component analysis. PCA is an orthogonal
 * linear transformation that transforms a number of possibly correlated
 * variables into a smaller number of uncorrelated variables called principal
 * components. The first principal component accounts for as much of the
 * variability in the data as possible, and each succeeding component accounts
 * for as much of the remaining variability as possible. PCA is theoretically
 * the optimum transform for given data in least squares terms.
 * PCA can be thought of as revealing the internal structure of the data in
 * a way which best explains the variance in the data. If a multivariate
 * dataset is visualized as a set of coordinates in a high-dimensional data
 * space, PCA supplies the user with a lower-dimensional picture when viewed
 * from its (in some sense) most informative viewpoint.
 *
 * PCA is mostly used as a tool in exploratory data analysis and for making
 * predictive models. PCA involves the calculation of the eigenvalue
 * decomposition of a data covariance matrix or singular value decomposition
 * of a data matrix, usually after mean centering the data for each attribute.
 * The results of a PCA are usually discussed in terms of component scores and
 * loadings.
 *
 * As a linear technique, PCA is built for several purposes: first, it enables us to
 * decorrelate the original variables; second, to carry out data compression,
 * where we pay decreasing attention to the numerical accuracy by which we
 * encode the sequence of principal components; third, to reconstruct the
 * original input data using a reduced number of variables according to a
 * least-squares criterion; and fourth, to identify potential clusters in the data.
 *
 * In certain applications, PCA can be misleading. PCA is heavily influenced
 * when there are outliers in the data. In other situations, the linearity
 * of PCA may be an obstacle to successful data reduction and compression.
 *
 * @see KPCA
 * @see PPCA
 * @see GHA
 *
 * @author Haifeng Li
 */
public class PCA implements Projection<double[]> {

    /**
     * The dimension of feature space.
     */
    private int p;
    /**
     * The dimension of input space.
     */
    private int n;
    /**
     * The sample mean.
     */
    private double[] mu;
    /**
     * The projected sample mean.
     */
    private double[] pmu;
    /**
     * The matrix of variable loadings, whose columns contain the eigenvectors.
     */
    private double[][] eigvectors;
    /**
     * Eigenvalues of principal components.
     */
    private double[] eigvalues;
    /**
     * The proportion of variance contained in each principal component.
     */
    private double[] proportion;
    /**
     * The cumulative proportion of variance contained in principal components.
     */
    private double[] cumulativeProportion;
    /**
     * Projection matrix.
     */
    private double[][] projection;

    /**
     * Constructor. Learn principal component analysis with covariance matrix.
     */
    public PCA(double[][] data) {
        this(data, false);
    }

    /**
     * Constructor. Learn principal component analysis.
     * @param data training data of which each row is a sample. If the sample size
     * is larger than the data dimension and cor = false, SVD is employed for
     * efficiency. Otherwise, eigen decomposition on the covariance or correlation
     * matrix is performed.
     * @param cor true if the correlation matrix is used instead of the covariance matrix.
     */
    public PCA(double[][] data, boolean cor) {
        int m = data.length;
        n = data[0].length;

        mu = Math.colMean(data);
        double[][] x = Math.clone(data);
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                x[i][j] = data[i][j] - mu[j];
            }
        }

        if (m > n && !cor) {
            SingularValueDecomposition svd = SingularValueDecomposition.decompose(x);
            eigvalues = svd.getSingularValues();
            for (int i = 0; i < eigvalues.length; i++) {
                eigvalues[i] *= eigvalues[i];
            }
            eigvectors = svd.getV();
        } else {
            double[][] cov = new double[n][n];
            for (int k = 0; k < m; k++) {
                for (int i = 0; i < n; i++) {
                    for (int j = 0; j <= i; j++) {
                        cov[i][j] += x[k][i] * x[k][j];
                    }
                }
            }

            for (int i = 0; i < n; i++) {
                for (int j = 0; j <= i; j++) {
                    cov[i][j] /= m; // divide by m instead of m-1 for S-PLUS compatibility
                    cov[j][i] = cov[i][j];
                }
            }

            double[] sd = null;
            if (cor) {
                sd = new double[n];
                for (int i = 0; i < n; i++) {
                    sd[i] = Math.sqrt(cov[i][i]);
                }

                for (int i = 0; i < n; i++) {
                    for (int j = 0; j <= i; j++) {
                        cov[i][j] /= sd[i] * sd[j];
                        cov[j][i] = cov[i][j];
                    }
                }
            }

            EigenValueDecomposition eigen = EigenValueDecomposition.decompose(cov, true);
            double[][] loadings = eigen.getEigenVectors();
            if (cor) {
                for (int i = 0; i < n; i++) {
                    for (int j = 0; j < n; j++) {
                        loadings[i][j] /= sd[i];
                    }
                }
            }

            eigvalues = eigen.getEigenValues();
            eigvectors = loadings;
        }

        proportion = eigvalues.clone();
        Math.unitize1(proportion);

        cumulativeProportion = new double[eigvalues.length];
        cumulativeProportion[0] = proportion[0];
        for (int i = 1; i < eigvalues.length; i++) {
            cumulativeProportion[i] = cumulativeProportion[i - 1] + proportion[i];
        }

        setProjection(0.95);
    }

    /**
     * Returns the center of data.
     */
    public double[] getCenter() {
        return mu;
    }

    /**
     * Returns the variable loading matrix, ordered from largest to smallest
     * by corresponding eigenvalues. The matrix columns contain the eigenvectors.
     */
    public double[][] getLoadings() {
        return eigvectors;
    }

    /**
     * Returns the principal component variances, ordered from largest to smallest,
     * which are the eigenvalues of the covariance or correlation matrix of learning data.
     */
    public double[] getVariance() {
        return eigvalues;
    }

    /**
     * Returns the proportion of variance contained in each principal component,
     * ordered from largest to smallest.
     */
    public double[] getVarianceProportion() {
        return proportion;
    }

    /**
     * Returns the cumulative proportion of variance contained in principal components,
     * ordered from largest to smallest.
     */
    public double[] getCumulativeVarianceProportion() {
        return cumulativeProportion;
    }

    /**
     * Returns the projection matrix W. The dimension reduced data can be obtained
     * by y = W' * x.
     */
    public double[][] getProjection() {
        return projection;
    }

    /**
     * Set the projection matrix with given number of principal components.
     * @param p choose top p principal components used for projection.
     */
    public PCA setProjection(int p) {
        if (p < 1 || p > n) {
            throw new IllegalArgumentException("Invalid dimension of feature space: " + p);
        }

        this.p = p;
        projection = new double[p][n];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < p; j++) {
                projection[j][i] = eigvectors[i][j];
            }
        }

        pmu = new double[p];
        Math.ax(projection, mu, pmu);

        return this;
    }

    /**
     * Set the projection matrix with top principal components that contain
     * (more than) the given percentage of variance.
     * @param p the required percentage of variance.
     */
    public PCA setProjection(double p) {
        if (p <= 0 || p > 1) {
            throw new IllegalArgumentException("Invalid percentage of variance: " + p);
        }

        for (int k = 0; k < n; k++) {
            if (cumulativeProportion[k] >= p) {
                setProjection(k + 1);
                break;
            }
        }

        return this;
    }

    @Override
    public double[] project(double[] x) {
        if (x.length != n) {
            throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x.length, n));
        }

        double[] y = new double[p];
        Math.ax(projection, x, y);
        Math.minus(y, pmu);
        return y;
    }

    @Override
    public double[][] project(double[][] x) {
        if (x[0].length != mu.length) {
            throw new IllegalArgumentException(String.format("Invalid input vector size: %d, expected: %d", x[0].length, n));
        }

        double[][] y = new double[x.length][p];
        for (int i = 0; i < x.length; i++) {
            Math.ax(projection, x[i], y[i]);
            Math.minus(y[i], pmu);
        }
        return y;
    }
}
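The class above learns the components in the constructor and then exposes them through setProjection and project. Below is a minimal usage sketch, not part of the library source: the PCAExample class name, the toy data values, and the 0.90 variance threshold are invented for illustration, and it assumes the smile artifacts providing smile.projection.PCA are on the classpath.

import smile.projection.PCA;

public class PCAExample {
    public static void main(String[] args) {
        // Toy data (hypothetical values): 5 samples with 3 possibly correlated attributes.
        double[][] data = {
            {2.5, 2.4, 0.5},
            {0.5, 0.7, 1.9},
            {2.2, 2.9, 0.4},
            {1.9, 2.2, 0.8},
            {3.1, 3.0, 0.2}
        };

        // Learn PCA on the covariance matrix; pass true to use the correlation matrix instead.
        PCA pca = new PCA(data, false);

        // Variance explained by the first component, individually and cumulatively.
        System.out.println("PC1 variance proportion: " + pca.getVarianceProportion()[0]);
        System.out.println("PC1 cumulative proportion: " + pca.getCumulativeVarianceProportion()[0]);

        // Keep enough components to cover 90% of the variance, then project a new sample
        // into the reduced space. setProjection(int) would fix the component count directly.
        pca.setProjection(0.90);
        double[] y = pca.project(new double[] {2.0, 2.1, 0.7});
        System.out.println("Projected dimension: " + y.length);
    }
}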




