package hex.glm;
import hex.DataInfo;
import water.Job;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.NewChunk;
import water.util.ArrayUtils;
import java.util.Arrays;
import static hex.glm.GLMUtils.removeRedCols;
import static hex.util.LinearAlgebraUtils.matrixMultiply;
import static water.util.ArrayUtils.*;
/***
 * Classes defined here implement the various pieces of regression influence diagnostics described in this
 * doc: https://h2oai.atlassian.net/browse/PUBDEV-8638. Hence, whenever I refer to the document, I mean the
 * one at that link.
 */
public class RegressionInfluenceDiagnosticsTasks {
  public static class RegressionInfluenceDiagBinomial extends MRTask<RegressionInfluenceDiagBinomial> {
final double[] _beta;
final double[][] _gramInv; // could be with standardized or non-standardized predictors, not scaled with obj_reg
final Job _j;
final int _betaSize;
final int _reducedBetaSize;
final GLMModel.GLMParameters _parms;
final DataInfo _dinfo;
final double[] _stdErr;
final boolean _foundRedCols;
final double[] _oneOverStdErr;
public RegressionInfluenceDiagBinomial(Job j, double[] beta, double[][] gramInv, GLMModel.GLMParameters parms,
DataInfo dinfo, double[] stdErr) {
_j = j;
_beta = beta; // denormalized beta
_betaSize = beta.length;
_reducedBetaSize = gramInv.length;
_foundRedCols = !(_betaSize == _reducedBetaSize);
_gramInv = gramInv; // not scaled by parms._obj_reg
_parms = parms;
_dinfo = dinfo;
_stdErr = stdErr;
_oneOverStdErr = Arrays.stream(_stdErr).map(x -> 1.0/x).toArray();
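      // redundant predictors carry NaN in _stdErr, so the corresponding entries here are NaN as well;
      // they are skipped via the Double.isNaN(_stdErr[index]) check in genDfBetas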
}
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
      if (isCancelled() || (_j != null && _j.stop_requested())) return;
double[] dfbetas = new double[_betaSize];
double[] dfbetasReduced = new double[_reducedBetaSize];
double[] row2Array = new double[_betaSize];
double[] row2ArrayReduced = new double[_reducedBetaSize];
double[] xTimesGramInv = new double[_reducedBetaSize];
DataInfo.Row r = _dinfo.newDenseRow();
for (int rid = 0; rid < chks[0]._len; ++rid) {
_dinfo.extractDenseRow(chks, rid, r);
genDfBetasRow(r, nc, row2Array, row2ArrayReduced, dfbetas, dfbetasReduced, xTimesGramInv);
}
if (_j != null)
_j.update(1);
}
private void genDfBetasRow(DataInfo.Row r, NewChunk[] nc, double[] row2Array, double[] row2ArrayRed,
double[] dfbetas, double[] dfbetasRed, double[] xTimesGramInv) {
if (r.response_bad) {
Arrays.fill(dfbetas, Double.NaN);
} else if (r.weight == 0) {
Arrays.fill(dfbetas, 0.0);
} else {
r.expandCatsPredsOnly(row2Array); // change Row to array
if (_foundRedCols) {
removeRedCols(row2Array, row2ArrayRed, _stdErr);
genDfBeta(r, row2ArrayRed, xTimesGramInv, dfbetasRed, nc);
} else {
genDfBeta(r, row2Array, xTimesGramInv, dfbetas, nc);
}
}
}
private void genDfBeta(DataInfo.Row r, double[] row2Array, double[] xTimesGramInv, double[] dfbetas, NewChunk[] nc) {
double mu = _parms.linkInv(r.innerProduct(_beta)+r.offset); // generate p hat
// generate residual
double residual = r.response(0)-mu;
      double oneOverMLL = gen1OverMLL(row2Array, xTimesGramInv, mu, r.weight); // 1.0/(1.0-hjj)
genDfBetas(oneOverMLL, residual, row2Array, dfbetas, r.weight);
for (int c = 0; c < _reducedBetaSize; c++) // copy dfbetas over to new chunks
nc[c].addNum(dfbetas[c]);
}
    /***
     * Implement the operations in and between equations 5 and 6 of the document.
     */
public void genDfBetas(double oneOverMLL, double residual, double[] row2Array, double[] dfbetas, double weight) {
double resOverMLL = oneOverMLL*residual*weight;
int count=0;
for (int index=0; index<_betaSize; index++) {
if (!Double.isNaN(_stdErr[index])) {
dfbetas[count] = resOverMLL * _oneOverStdErr[index] * ArrayUtils.innerProduct(row2Array, _gramInv[count]);
count++;
}
}
}
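    // For reference, a worked form of what genDfBetas computes (notation ours, not from the original
    // file): for data row j and coefficient k,
    //   DFBETAS(j,k) = [w_j * e_j / (1 - h_jj)] * [gramInv * x_j]_k / stdErr(k)
    // where e_j = y_j - mu_j is the raw residual; this matches equations 5 and 6 per the Javadoc above.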
/***
* Generate 1.0/(1.0-hjj) for each data row j. Implement equation 8 of the document for binomial family.
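     * Here hjj = weight * mu * (1 - mu) * x_j^T * gramInv * x_j, the j-th diagonal entry of the weighted
     * hat matrix.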
*/
public double gen1OverMLL(double[] row2Array, double[] xTimesGramInv, double mu, double weight) {
for (int index = 0; index< _reducedBetaSize; index++) { // form X*invGram
xTimesGramInv[index] = ArrayUtils.innerProduct(row2Array, _gramInv[index]);
}
double hjj = weight*mu*(1-mu)*ArrayUtils.innerProduct(xTimesGramInv, row2Array);
return 1.0/(1.0-hjj);
}
}
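  // A hypothetical usage sketch (not from the original file; names such as job, beta, gramInv and
  // coefNames are assumptions, while doAll/outputFrame are the standard water.MRTask plumbing):
  //
  //   RegressionInfluenceDiagBinomial rid = new RegressionInfluenceDiagBinomial(job, beta, gramInv,
  //       parms, dinfo, stdErr);
  //   Frame dfbetas = rid.doAll(gramInv.length, Vec.T_NUM, dinfo._adaptedFrame)
  //       .outputFrame(Key.make("dfbetas"), coefNames, null);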
/***
* generate DFBETAS as in equation 4 of the document.
*/
  public static class RegressionInfluenceDiagGaussian extends MRTask<RegressionInfluenceDiagGaussian> {
final double[] _oneOverSqrtXTXDiag;
final double[] _betas; // Exclude redundant columns if present
final int _betaSize;
final Job _j;
public RegressionInfluenceDiagGaussian(double[][] xTx, double[] betas, Job j) {
_betas = betas;
_betaSize = betas.length;
_j = j;
_oneOverSqrtXTXDiag = new double[_betaSize];
for (int index = 0; index< _betaSize; index++)
_oneOverSqrtXTXDiag[index] = 1.0/Math.sqrt(xTx[index][index]);
}
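    // For reference (notation ours): each output column is the scaled coefficient difference
    //   DFBETAS(i,k) = (beta_k - beta_k(i)) / (s(i) * sqrt(xTx[k][k]))
    // where beta(i) and s(i)^2 are the coefficients and the variance estimate computed with row i
    // removed (see ComputeNewBetaVarEstimatedGaussian below).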
@Override
public void map(Chunk[] chks, NewChunk[] ncs) {
if (isCancelled() || (_j != null && _j.stop_requested()))
return;
double[] betaDiff = new double[_betaSize];
int numCols = chks.length;
double[] row2Array = new double[numCols]; // contains new beta and var estimate of ith row
int len = chks[0]._len;
      for (int index = 0; index < len; index++) { // generate DFBETAS for each row
        for (int cIndex = 0; cIndex < numCols; cIndex++)
          row2Array[cIndex] = chks[cIndex].atd(index); // leave-one-out betas plus variance estimate
        double sqrtVar = Math.sqrt(row2Array[_betaSize]); // last input column holds the variance estimate
        for (int bIndex = 0; bIndex < _betaSize; bIndex++) // scaled coefficient difference of equation 4
          betaDiff[bIndex] = (_betas[bIndex] - row2Array[bIndex]) * _oneOverSqrtXTXDiag[bIndex] / sqrtVar;
        for (int bIndex = 0; bIndex < _betaSize; bIndex++)
          ncs[bIndex].addNum(betaDiff[bIndex]);
      }
      if (_j != null)
        _j.update(1);
    }
  }

  /***
   * Generate the new betas and the variance estimate obtained with each data row removed. Refer to
   * equations 9 and 10 of the document.
   */
  public static class ComputeNewBetaVarEstimatedGaussian extends MRTask<ComputeNewBetaVarEstimatedGaussian> {
final double[][] _cholInv; // XTX inverse: store cholInv without redundant predictors, not scaled by parms._obj_reg
final double[] _xTransY; // store XTY of full dataset
    final double[] _xTransYReduced; // same as xTransY, but reduced when there are redundant columns
final int _betaSize;
final int _reducedBetaSize;
final int _newChunkWidth;
final Job _j;
final DataInfo _dinfo;
final double[][] _xTx; // not scaled by parms._obj_reg
final double _weightedNobs;
final double _sumRespSq;
final boolean _foundRedCols;
    final double[] _stdErr; // used to tell which predictor is redundant
public ComputeNewBetaVarEstimatedGaussian(double[][] cholInv, double[] xTY, Job j, DataInfo dinfo, double[][] gram,
double nobs, double sumRespSq, double[] stdErr) {
_cholInv = cholInv;
      _xTransYReduced = xTY; // already reduced if redundant columns are present
_betaSize = stdErr.length;
_reducedBetaSize = cholInv.length;
_foundRedCols = !(_betaSize == _reducedBetaSize);
_newChunkWidth = _betaSize+1; // last one is for estimated variance
_j = j;
_dinfo = dinfo;
_xTx = gram;
_weightedNobs = nobs-_reducedBetaSize; // intercept already included in gram/chol
_sumRespSq = sumRespSq; // YTY
_stdErr = stdErr;
_xTransY = new double[_betaSize];
      if (_foundRedCols) { // expand reduced xTransY back to full size
int count=0;
for (int index=0; index<_betaSize; index++)
if (!Double.isNaN(stdErr[index]))
_xTransY[index] = _xTransYReduced[count++];
} else {
System.arraycopy(_xTransYReduced, 0, _xTransY, 0, _reducedBetaSize);
}
}
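    // For reference (notation ours; a sketch of the algebra behind equations 9 and 10): removing row i
    // with weight w_i from the Gram A = X^T X is a rank-one downdate, so by the Sherman-Morrison identity
    //   (A - w_i x_i x_i^T)^{-1} = A^{-1} + w_i (A^{-1} x_i)(A^{-1} x_i)^T / (1 - w_i x_i^T A^{-1} x_i)
    // and the leave-one-out coefficients follow as beta(i) = (A - w_i x_i x_i^T)^{-1} (X^T Y - w_i y_i x_i).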
@Override
public void map(Chunk[] chks, NewChunk[] nc) {
if (isCancelled() || (_j != null && _j.stop_requested()))
return; // timeout
double[] newBeta = new double[_betaSize];
double[] newBetaRed = new double[_reducedBetaSize];
double[] row2Array = new double[_betaSize];
double[] row2ArrayRed = new double[_reducedBetaSize];
double[][] tmpDoubleArray = new double[_reducedBetaSize][_reducedBetaSize];
double[] tmpArray = new double[_betaSize];
double[] tmpArrayRed = new double[_reducedBetaSize];
final int chkLen = chks[0]._len;
DataInfo.Row r = _dinfo.newDenseRow();
      for (int rowIndex = 0; rowIndex < chkLen; rowIndex++) {
        _dinfo.extractDenseRow(chks, rowIndex, r);
        genNewBetaVarEstimate(r, nc, newBeta, newBetaRed, row2Array, row2ArrayRed, tmpArray, tmpArrayRed,
                tmpDoubleArray); // per-row leave-one-out betas and variance estimate (equations 9 and 10)
      }
      if (_j != null)
        _j.update(1);
    }
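    // The method below is a reconstruction rather than verbatim source: the method name, the exact use of
    // the preallocated temporaries, and the NaN convention for bad or zero-weight rows are assumptions.
    // It follows equations 9 and 10 of the document: downdate the Gram inverse with the Sherman-Morrison
    // identity to remove row i, solve for the leave-one-out betas, and recompute the variance estimate
    // from the downdated residual sum of squares.
    private void genNewBetaVarEstimate(DataInfo.Row r, NewChunk[] nc, double[] newBeta, double[] newBetaRed,
                                       double[] row2Array, double[] row2ArrayRed, double[] tmpArray,
                                       double[] tmpArrayRed, double[][] tmpDoubleArray) {
      if (r.response_bad || r.weight == 0) { // no leave-one-out estimate for bad or zero-weight rows
        for (int index = 0; index < _newChunkWidth; index++)
          nc[index].addNum(Double.NaN);
        return;
      }
      r.expandCatsPredsOnly(row2Array); // change Row to a dense predictor array
      double[] x = row2Array;
      if (_foundRedCols) { // drop entries belonging to redundant predictors
        removeRedCols(row2Array, row2ArrayRed, _stdErr);
        x = row2ArrayRed;
      }
      final double w = r.weight;
      final double y = r.response(0);
      for (int index = 0; index < _reducedBetaSize; index++) // tmpArrayRed = (X^T X)^{-1} * x_i
        tmpArrayRed[index] = innerProduct(_cholInv[index], x);
      double oneMinusH = 1.0 - w * innerProduct(x, tmpArrayRed); // 1 - h_ii
      // Sherman-Morrison downdate: (A - w x x^T)^{-1} = A^{-1} + w (A^{-1} x)(A^{-1} x)^T / (1 - h_ii)
      for (int i = 0; i < _reducedBetaSize; i++)
        for (int j = 0; j < _reducedBetaSize; j++)
          tmpDoubleArray[i][j] = _cholInv[i][j] + w * tmpArrayRed[i] * tmpArrayRed[j] / oneMinusH;
      for (int i = 0; i < _reducedBetaSize; i++) { // equation 9: beta(i) = A(i)^{-1} (X^T Y - w y_i x_i)
        double sum = 0;
        for (int j = 0; j < _reducedBetaSize; j++)
          sum += tmpDoubleArray[i][j] * (_xTransYReduced[j] - w * y * x[j]);
        newBetaRed[i] = sum;
      }
      double rss = _sumRespSq - w * y * y; // leave-one-out Y^T Y
      for (int i = 0; i < _reducedBetaSize; i++) // rss(i) = (Y^T Y - w y^2) - beta(i)^T (X^T Y - w y x)
        rss -= newBetaRed[i] * (_xTransYReduced[i] - w * y * x[i]);
      double varEstimate = rss / (_weightedNobs - 1); // equation 10: divide by (nobs - p - 1)
      if (_foundRedCols) { // scatter reduced betas back to full size, NaN for redundant predictors
        int count = 0;
        for (int index = 0; index < _betaSize; index++)
          newBeta[index] = Double.isNaN(_stdErr[index]) ? Double.NaN : newBetaRed[count++];
      } else {
        System.arraycopy(newBetaRed, 0, newBeta, 0, _betaSize);
      }
      for (int index = 0; index < _betaSize; index++)
        nc[index].addNum(newBeta[index]);
      nc[_betaSize].addNum(varEstimate); // last output column carries the variance estimate
    }
  }
}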