All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.feature.imputation.SVDImputer Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 */

package smile.feature.imputation;

import smile.math.MathEx;
import smile.math.matrix.Matrix;

/**
 * Missing value imputation with singular value decomposition. Given SVD
 * A = U &Sigma; V<sup>T</sup>, we use the most significant eigenvectors of
 * V<sup>T</sup> to linearly estimate missing values. Although it has been
 * shown that several significant eigenvectors are sufficient to describe
 * the data with small errors, the exact fraction of eigenvectors best for
 * estimation needs to be determined empirically. Once k most significant
 * eigenvectors from V<sup>T</sup> are selected, we estimate a missing value j
 * in row i by first regressing this row against the k eigenvectors and then use
 * the coefficients of the regression to reconstruct j from a linear combination
 * of the k eigenvectors. The j th value of row i and the j th values of the k
 * eigenvectors are not used in determining these regression coefficients.
 * It should be noted that SVD can only be performed on complete matrices;
 * therefore we initially fill all missing values by other methods in
 * matrix A, obtaining A'. We then utilize an expectation maximization method to
 * arrive at the final estimate, as follows. Each missing value in A is estimated
 * using the above algorithm, and then the procedure is repeated on the newly
 * obtained matrix. The original formulation repeats until the total change in
 * the matrix falls below an empirically determined threshold (say 0.01);
 * this implementation instead runs a fixed, caller-supplied number of iterations.
 *
 * @author Haifeng Li
 */
public interface SVDImputer {
    /**
     * Imputes missing values by repeatedly regressing each incomplete row on
     * the leading {@code k} columns of {@code V} from the SVD of the current
     * completed matrix.
     * <p>
     * The input is first completed with {@code SimpleImputer.impute}. Then,
     * for exactly {@code maxIter} rounds, the completed matrix is decomposed
     * by SVD and, for every row that has missing entries, a least-squares fit
     * of the observed entries against the matching rows of the first
     * {@code k} columns of {@code V} is solved by QR. Each missing entry is
     * then replaced by the fitted linear combination. There is no early-exit
     * convergence test.
     *
     * @param data a data set with missing values (represented as Double.NaN).
     * @param k the number of eigenvectors used for imputation; must lie in
     *          {@code [1, min(rows, columns)]}.
     * @param maxIter the maximum number of iterations; must be at least 1.
     * @return the imputed data.
     * @throws IllegalArgumentException when the data set has no rows or no
     *         columns, when {@code k} or {@code maxIter} is out of range,
     *         or when the whole row or column is missing.
     */
    static double[][] impute(double[][] data, int k, int maxIter) {
        // Guard before touching data[0]: an empty data set used to surface as
        // an ArrayIndexOutOfBoundsException from the k-range check below.
        if (data.length == 0 || data[0].length == 0) {
            throw new IllegalArgumentException("Empty data set");
        }

        if (k < 1 || k > Math.min(data.length, data[0].length)) {
            throw new IllegalArgumentException("Invalid number of eigenvectors for imputation: " + k);
        }

        if (maxIter < 1) {
            throw new IllegalArgumentException("Invalid maximum number of iterations: " + maxIter);
        }

        int n = data.length;
        int d = data[0].length;

        // Initial completion so that the SVD has no NaN entries to deal with.
        double[][] full = SimpleImputer.impute(data);

        for (int iter = 0; iter < maxIter; iter++) {
            // Decompose a matrix built from the current estimate; both flags
            // are passed as true, exactly as before.
            Matrix.SVD svd = Matrix.of(full).svd(true, true);
            Matrix V = svd.V;

            for (int i = 0; i < n; i++) {
                double[] row = data[i];

                // Count the gaps in this row and re-pin the observed values.
                int missing = 0;
                for (int j = 0; j < d; j++) {
                    if (Double.isNaN(row[j])) {
                        missing++;
                    } else {
                        full[i][j] = row[j];
                    }
                }

                if (missing == 0) {
                    continue;
                }

                // Regression system restricted to the observed columns:
                // A holds the matching rows of the first k columns of V,
                // b holds the observed values themselves.
                // NOTE(review): when a row has fewer than k observed values
                // this system is underdetermined; confirm how QR.solve
                // behaves in that case.
                int observed = d - missing;
                Matrix A = new Matrix(observed, k);
                double[] b = new double[observed];

                for (int j = 0, m = 0; j < d; j++) {
                    if (!Double.isNaN(row[j])) {
                        for (int l = 0; l < k; l++) {
                            A.set(m, l, V.get(j, l));
                        }
                        b[m++] = row[j];
                    }
                }

                Matrix.QR qr = A.qr(true);
                double[] s = qr.solve(b);

                // Rebuild each missing entry from the fitted coefficients.
                for (int j = 0; j < d; j++) {
                    if (Double.isNaN(row[j])) {
                        full[i][j] = 0;
                        for (int l = 0; l < k; l++) {
                            full[i][j] += s[l] * V.get(j, l);
                        }
                    }
                }
            }
        }

        return full;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy