All Downloads are FREE. Search and download functionalities are using the official Maven repository.

hex.modelselection.ModelSelectionUtils Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.modelselection;

import hex.DataInfo;
import hex.Model;
import hex.glm.GLM;
import hex.glm.GLMModel;
import hex.glm.GLMTask;
import hex.gram.Gram;
import jsr166y.ForkJoinTask;
import jsr166y.RecursiveAction;
import water.DKV;
import water.Key;
import water.MemoryManager;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;

import java.lang.reflect.Field;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static hex.glm.GLMModel.GLMParameters.Family.gaussian;

public class ModelSelectionUtils {
    public static Frame[] generateTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms, int predNum, String[] predNames,
                                                 int numModels, String foldColumn) {
        int maxPredNum = predNames.length;
        Frame[] trainFrames = new Frame[numModels];
        int[] predIndices = IntStream.range(0, predNum).toArray();   // contains indices to predictor names
        int zeroBound = maxPredNum-predNum;
        int[] bounds = IntStream.range(zeroBound, maxPredNum).toArray();   // highest combo value
        for (int frameCount = 0; frameCount < numModels; frameCount++) {    // generate one combo
            trainFrames[frameCount] = generateOneFrame(predIndices, parms, predNames, foldColumn);
            DKV.put(trainFrames[frameCount]);
            updatePredIndices(predIndices, bounds);
        }
        return trainFrames;
    }

    /***
     * Given predictor indices stored in currentPredIndices, we need to find the next combination of predictor indices
     * to use to generate the next combination.  For example, if we have 4 predictors and we are looking to take two 
     * predictors, predictor indices can change in the following sequence [0,1]->[0,2]->[0,3]->[1,2]->[1,2]->[2,3]. 
     *
     * @param currentPredIndices
     * @param indicesBounds
     */
    public static void updatePredIndices(int[] currentPredIndices, int[] indicesBounds) {
        int lastPredInd = currentPredIndices.length-1;
        for (int index = lastPredInd; index >= 0; index--) {
            if (currentPredIndices[index] < indicesBounds[index]) { // increase LSB first
                currentPredIndices[index]++;
                updateLaterIndices(currentPredIndices, index, lastPredInd);
                break;
            } 
        }
    }

    /***
     * Give 5 predictors and say we want the combo of 3 predictors, this function will properly reset the prediction
     * combination indices say from [0, 1, 4] -> [0, 2, 3] or [0, 3, 4] -> [1, 2, 3].  Given an index that was just
     * updated, it will update the indices that come later in the list correctly.
     * 
     * @param currentPredIndices
     * @param indexUpdated
     * @param lastPredInd
     */
    public static void updateLaterIndices(int[] currentPredIndices, int indexUpdated, int lastPredInd) {
        for (int index = indexUpdated; index < lastPredInd; index++) {
            currentPredIndices[index+1] = currentPredIndices[index]+1;
        }
    }
    
    /***
     *     Given a predictor indices set, this function will generate a training frame containing the predictors with
     *     indices in predIndices.
     *     
     * @param predIndices
     * @param parms
     * @param predNames
     * @return
     */
    public static Frame generateOneFrame(int[] predIndices, Model.Parameters parms, String[] predNames,
                                         String foldColumn) {
        final Frame predVecs = new Frame(Key.make());
        final Frame train = parms.train();
        boolean usePredIndices = predIndices != null;
        int numPreds = usePredIndices? predIndices.length : predNames.length;
        for (int index = 0; index < numPreds; index++) {
            int predVecNum = usePredIndices ? predIndices[index] : index;
            predVecs.add(predNames[predVecNum], train.vec(predNames[predVecNum]));
        }
        if (parms._weights_column != null)
            predVecs.add(parms._weights_column, train.vec(parms._weights_column));
        if (parms._offset_column != null)
            predVecs.add(parms._offset_column, train.vec(parms._offset_column));
        if (foldColumn != null)
            predVecs.add(foldColumn, train.vec(foldColumn));
        predVecs.add(parms._response_column, train.vec(parms._response_column));
        return predVecs;
    }
    
    public static void setBitSet(BitSet predBitSet, int[] currIndices) {
        for (int predIndex : currIndices)
            predBitSet.set(predIndex);
    }

    /***
     * Class to store the CPM produced, predNames and pred2CPMMapping after the removal of redundant predictors
     * if present
     */
    static class CPMnPredNames {
        double[][] _cpm;
        String[] _predNames;
        String[] _coefNames;
        int[][] _pred2CPMMapping;
        
        public CPMnPredNames(double[][] cpm, String[] predNames, String[] coeffNames, int[][] pred2CPMM) {
            _cpm = cpm; // cpm with duplicated predictors removed
            _predNames = predNames; // predictor names with duplicated predictors removed
            _pred2CPMMapping = pred2CPMM;   // mapping of predictors to cpm indices
            _coefNames = coeffNames;
        }
    }

    public static CPMnPredNames genCPMPredNamesIndex(Key jobKey, DataInfo dinfo, String[] predictorNames, 
                                                     ModelSelectionModel.ModelSelectionParameters parms) {
        // check if there are redundant predictors
        ArrayList ignoredCols = new ArrayList<>();
        GLMTask.GLMIterationTask gtask = genGramCheckDup(jobKey, dinfo, ignoredCols, parms);
        double[] xTransposey;
        Gram gram = gtask.getGram();
        List ignoredFullPredCols = new ArrayList<>();
        String[] coefNames = dinfo.coefNames();
        // drop unwanted predictors
        if (ignoredCols.size() > 0) {
            List ignoredPredNames = new ArrayList<>();
            List ignoredCoefNames = new ArrayList<>();
            ignoredFullPredCols = findFullDupPred(dinfo, ignoredCols, ignoredPredNames, ignoredCoefNames, predictorNames);
            coefNames = Arrays.stream(coefNames).filter(x -> !ignoredCoefNames.contains(x)).toArray(String[]::new);
            predictorNames = Arrays.stream(predictorNames).filter(x -> !ignoredPredNames.contains(x)).toArray(String[]::new);
            // drop cols from gram and XTY
            xTransposey = dropIgnoredCols(gtask, ignoredFullPredCols);
        } else {
            xTransposey = gtask.getXY();
        }
        return new CPMnPredNames(formCPM(gram, xTransposey, gtask.getYY()), predictorNames, coefNames, 
                mapPredIndex2CPMIndices(dinfo, predictorNames.length, ignoredFullPredCols));
    }

    /**
     *  This method attempts to map all predictors into the corresponding cpm indices that refer to that predictor.
     *  This is complicated by two things:
     *  a. the presence of duplicated predictors that are removed;
     *  b. the presence of enum predictors that will map one predictor to multiple consecutive cpm indices.
     *  Note that ignoredPredInd is at the level of coefficient indexing and not predictor indexing
     */
    public static int[][] mapPredIndex2CPMIndices(DataInfo dinfo, int numPreds, List ignoredPredInd) {
        int[][] pred2CPMMapping = new int[numPreds][];
        int offset = 0;
        int countPreds = 0;

        for (int index=0; index < dinfo._cats; index++) {  // take care of categorical columns
            int catStartLevel = dinfo._catOffsets[index];
            if (!ignoredPredInd.contains(catStartLevel)) {  // enum pred not ignored
                int numLevels = dinfo._catOffsets[index + 1] - dinfo._catOffsets[index];    // number of catLevels
                pred2CPMMapping[countPreds++] = IntStream.iterate(offset, n -> n + 1).limit(numLevels).toArray();
                offset += numLevels;
            }
            if (countPreds >= numPreds)
                break;
        }
        int totPreds = dinfo._catOffsets[dinfo._cats]+dinfo._nums;
        for (int index=dinfo._catOffsets[dinfo._cats]; index < totPreds; index++) {
            if (countPreds >= numPreds)
                break;
            if (!ignoredPredInd.contains(index))
                pred2CPMMapping[countPreds++] = new int[]{offset++};
        }
        return pred2CPMMapping;
    }
    
    
    public static double[][] formCPM(Gram gram, double[] xTransposey, double yy) {
        int coeffSize = xTransposey.length;
        int cPMsize = coeffSize+1;
        double[][] crossProductMatrix = MemoryManager.malloc8d(cPMsize, cPMsize);
        gram.getXXCPM(crossProductMatrix, false, false);
        // copy xZTransposex, xTransposey, yy to crossProductMatrix
        for (int rowIndex=0; rowIndex ignoredCols) {
        Gram gram = gtask.getGram();
        int[] droppedCols = ignoredCols.stream().mapToInt(x->x).toArray();
        gram.dropCols(droppedCols);
        return ArrayUtils.removeIds(gtask.getXY(), droppedCols);
    }
    
    /***
     * The duplicated columns generated by qr-cholesky is at the level of coefficients.  This means for enum predictors
     * there could be multiple coefficients, one for each level of the enum level.  For enum predictors, I will 
     * remove the predictor only if all its columns are duplicated.  This make it a little more complicated.  The
     * returned ignored list is at the level of coefficients.
     */
    public static List findFullDupPred(DataInfo dinfo, List ignoredCols, List ignoredPredNames, 
                                                List ignoredCoefNames, String[] prednames) {
        List ignoredColsCopy = new ArrayList<>(ignoredCols);
        List fullIgnoredCols = new ArrayList<>();
        int[] catOffsets = dinfo._catOffsets;
        String[] allCoefNames = dinfo.coefNames();
        
        if (dinfo._cats > 0) {   // there are enum columns in dataset
            int catOffsetsLen = catOffsets.length;
            for (int index = 1; index < catOffsetsLen; index++) {
                final int counter = index;
                List discarded = ignoredColsCopy.stream().filter(x -> x < catOffsets[counter]).collect(Collectors.toList());
                if ((discarded != null) && (discarded.size() == (catOffsets[index]-catOffsets[index-1]))) {  // full enum predictors found in ignored columns
                    fullIgnoredCols.addAll(discarded);
                    ignoredPredNames.add(prednames[index-1]);
                    ignoredCoefNames.addAll(discarded.stream().map(x -> allCoefNames[x]).collect(Collectors.toList()));
;                }
                if (discarded != null && discarded.size() > 0) 
                    ignoredColsCopy.removeAll(discarded);
            }
        }
        if (ignoredColsCopy != null && ignoredColsCopy.size()>0) {
            int offsetNum = dinfo._numOffsets[0]-dinfo._cats;
            ignoredPredNames.addAll(ignoredColsCopy.stream().map(x -> prednames[x-offsetNum]).collect(Collectors.toList()));
            ignoredCoefNames.addAll(ignoredColsCopy.stream().map(x -> allCoefNames[x]).collect(Collectors.toList()));
            fullIgnoredCols.addAll(ignoredColsCopy);    // add all remaining numerical ignored predictors columns
        }
        return fullIgnoredCols;
    }
    
    public static GLMTask.GLMIterationTask genGramCheckDup(Key jobKey, DataInfo dinfo, ArrayList ignoredCols, 
                                                    ModelSelectionModel.ModelSelectionParameters parms) {
        double[] beta = new double[dinfo.coefNames().length];
        beta = Arrays.stream(beta).map(x -> 1.0).toArray(); // set coefficient to all 1
        GLMTask.GLMIterationTask gtask = new GLMTask.GLMIterationTask(jobKey, dinfo, new GLMModel.GLMWeightsFun(gaussian,
                GLMModel.GLMParameters.Link.identity, 1, 0.1, 0.1), beta).doAll(dinfo._adaptedFrame);
        Gram gram = gtask.getGram();
        Gram.Cholesky chol = gram.qrCholesky(ignoredCols, parms._standardize);
        if (!chol.isSPD()) throw new Gram.NonSPDMatrixException();
        return gtask;
    }
    

    public static double calR2Scale(Frame train, String resp) {
        Vec respV = train.vec(resp);
        double sigma = respV.sigma();
        double var = sigma*sigma;
        long nobs = train.numRows()-respV.naCnt()-1;
        return nobs*var;
    }
    
    static class CoeffNormalization {
        double[] _sigmaOrOneOSigma; // only for the numerical predictors
        double[] _meanOverSigma;
        boolean _standardize;
        
        public CoeffNormalization(double[] oOSigma, double[] mOSigma, boolean standardize) {
            _sigmaOrOneOSigma = oOSigma;
           _meanOverSigma = mOSigma;
           _standardize = standardize;
        }
    }

    static CoeffNormalization generateScale(DataInfo dinfo, boolean standardize) {
        int numCols = dinfo._nums;
        double[] sigmaOrOneOverSigma = new double[numCols];
        double[] mOverSigma = new double[numCols];
        for (int index = 0; index < numCols; index++) {
            if (standardize) {
                sigmaOrOneOverSigma[index] = dinfo._normMul[index]; // 1/sigma
                mOverSigma[index] = dinfo._numMeans[index] * dinfo._normMul[index];
            } else {
                sigmaOrOneOverSigma[index] = dinfo._normSigmaStandardizationOff[index]; // sigma
                mOverSigma[index] = dinfo._numMeans[index] / dinfo._normSigmaStandardizationOff[index];
            }
        }
        return new CoeffNormalization(sigmaOrOneOverSigma, mOverSigma, standardize);
    }
    
    /**double
     * @param predictorNames
     * @param foldColumn
     * @param currSubsetIndices
     * @param validSubsets Lists containing only valid predictor indices to choose from
     * @return
     */
    public static Frame[] generateMaxRTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms,
                                                     String[] predictorNames, String foldColumn,
                                                     List currSubsetIndices, int newPredPos,
                                                     List validSubsets, Set usedCombo) {
        List trainFramesList = new ArrayList<>();
        List changedSubset = new ArrayList<>(currSubsetIndices);
        changedSubset.add(newPredPos, -1);  // value irrelevant
        int[] predIndices = changedSubset.stream().mapToInt(Integer::intValue).toArray();
        int predNum = predictorNames.length;
        BitSet tempIndices =  new BitSet(predNum);
        int predSizes = changedSubset.size();
        boolean emptyUsedCombo = (usedCombo != null) && (usedCombo.size() == 0);
        for (int predIndex : validSubsets) {  // consider valid predictor indices only
            predIndices[newPredPos] = predIndex;
            if (emptyUsedCombo && predSizes > 1) {   // add all indices set into usedCombo
                tempIndices.clear();
                setBitSet(tempIndices, predIndices);
                usedCombo.add((BitSet) tempIndices.clone());
                Frame trainFrame = generateOneFrame(predIndices, parms, predictorNames, foldColumn);
                DKV.put(trainFrame);
                trainFramesList.add(trainFrame);
                
            } else if (usedCombo != null && predSizes > 1) {   // only need to check for forward and replacement step for maxR
                tempIndices.clear();
                setBitSet(tempIndices, predIndices);
                if (usedCombo.add((BitSet) tempIndices.clone())) {  // returns true if not in keyset
                    Frame trainFrame = generateOneFrame(predIndices, parms, predictorNames, foldColumn);
                    DKV.put(trainFrame);
                    trainFramesList.add(trainFrame);
                }
            } else {     // just build without checking duplicates for other modes
                Frame trainFrame = generateOneFrame(predIndices, parms, predictorNames, foldColumn);
                DKV.put(trainFrame);
                trainFramesList.add(trainFrame);
            }
        }
        return trainFramesList.stream().toArray(Frame[]::new);
    }


    /***
     * Given the original predictor subset, this function will go into a for loop and choose one predictor out of the
     * remaining predictor set validSubsets and put it into the array allPreds.  Then, it will spin off a process to 
     * calculation the error variance with the addition of the new predictor.  All error variances will be stored in
     * an array and returned for further processing.  This is very similar to generateAllErrVar but they are not the
     * same.  For starters, the replaced predictors are included in the predictor subset allPreds. For details, refer to 
     * https://h2oai.atlassian.net/browse/PUBDEV-8954, section VI.
     */
    public static double[] generateAllErrVarR(final double[][] allCPM, final Frame allCPMFrame, double[][] prevCPM, int predPos,
                                              List currSubsetIndices, List validSubsets,
                                              Set usedCombo, BitSet tempIndices,
                                              final int[][] pred2CPMIndices, final boolean hasIntercept,
                                              int[] removedPredSweepInd, SweepVector[][] removedPredSV) {
        int[] allPreds = new int[currSubsetIndices.size() + 1];   // store the bigger predictor subset
        int lastPredInd = allPreds.length - 1;
        System.arraycopy(currSubsetIndices.stream().mapToInt(Integer::intValue).toArray(), 0, allPreds, 0,
                allPreds.length - 1);
        int maxModelCount = validSubsets.size();
        RecursiveAction[] resA = new RecursiveAction[maxModelCount];
        final double[] subsetMSE = new double[maxModelCount];
        Arrays.fill(subsetMSE, Double.MAX_VALUE);
        int modelCount = 0;
        int[] oneLessSub = new int[lastPredInd];
        List oneLessSubset = new ArrayList<>(currSubsetIndices);
        oneLessSubset.remove(predPos);
        System.arraycopy(oneLessSubset.stream().mapToInt(Integer::intValue).toArray(), 0, oneLessSub, 0,
                oneLessSub.length - 1);
        int oneLessSubInd = lastPredInd-1;
        for (int predIndex : validSubsets) {  // consider valid predictor indices only
            allPreds[lastPredInd] = predIndex;
            oneLessSub[oneLessSubInd] = predIndex;
            tempIndices.clear();
            setBitSet(tempIndices, oneLessSub);
            if (usedCombo.add((BitSet) tempIndices.clone())) {
                final int resCount = modelCount++;
                genMSE4MorePredsR(pred2CPMIndices, allCPM, allCPMFrame, prevCPM, allPreds, subsetMSE, resA, resCount,
                        hasIntercept, removedPredSV, removedPredSweepInd);
            }
        }
        ForkJoinTask.invokeAll(Arrays.stream(resA).filter(Objects::nonNull).toArray(RecursiveAction[]::new));
        return subsetMSE;
    }

    /***
     * Generate the error variance for one predictor subset setting in allPreds.  It will do the following:
     * 1. add rows/columns corresponding to new predictor and store the partial CPM in subsetCPM;
     * 2. sweep the new rows/columns from 1 with the sweep vectors generated for the removed predictor and store
     * everything in subsetCPM;
     * 3. sweep the subsetCPM with rows/columns associated with the new predictor;
     * 4. record the new error variance.
     * 
     * For details, refer to https://h2oai.atlassian.net/browse/PUBDEV-8954, section VI.
     */
    public static void genMSE4MorePredsR(final int[][] pred2CPMIndices, final double[][] allCPM, 
                                         final Frame allCPMFrame, double[][] prevCPM, final int[] allPreds, 
                                         final double[] subsetMSE, RecursiveAction[] resA, final int resCount, 
                                         final boolean hasIntercept, SweepVector[][] removePredSV, 
                                         int[] removedPredSweepInd) {
        final int[] subsetIndices = allPreds.clone();
        resA[resCount] = new RecursiveAction() {
            @Override
            protected void compute() {
                double[][] subsetCPM = addNewPred2CPM(allCPM, allCPMFrame, prevCPM, subsetIndices, pred2CPMIndices,
                        hasIntercept);  // new pred added but swept with removed predictor
                // swept just new pred with sweep vector of removed pred to undo its effect
                int newPredInd = subsetIndices[subsetIndices.length - 1];
                int newPredCPMLength = pred2CPMIndices[newPredInd].length;
                int lastSweepIndex = prevCPM.length-1;
                if (newPredCPMLength == 1) {
                    applySweepVectors2NewPred(removePredSV, subsetCPM, newPredCPMLength, removedPredSweepInd);
                } else {
                    SweepVector[][] newSV = mapBasicVector2Multiple(removePredSV, newPredCPMLength);
                    applySweepVectors2NewPred(newSV, subsetCPM, newPredCPMLength, removedPredSweepInd);
                }
                // sweep subsetCPM with newly added predictor
                int[] newPredSweepInd = IntStream.range(0,newPredCPMLength).map(x -> x+lastSweepIndex).toArray();
                sweepCPM(subsetCPM, newPredSweepInd, false);
                // only apply the sweepIndices to only the last element of the CPM
                int lastInd = subsetCPM.length-1;
                subsetMSE[resCount] = subsetCPM[lastInd][lastInd];
            }
        };
    }

    /***
     * Given the original predictor subset, this function will go into a for loop and choose one predictor out of the
     * remaining predictor set validSubsets and put it into the array allPreds.  Then, it will spin off a process to 
     * calculation the error variance with the addition of the new predictor.  All error variances will be stored in
     * an array and returned for further processing.  For details, refer to 
     * https://h2oai.atlassian.net/browse/PUBDEV-8954, section V.
     */
    public static double[] generateAllErrVar(final double[][] allCPM, Frame allCPMFrame, int prevCPMSize, 
                                             List currSubsetIndices, List validSubsets, 
                                             Set usedCombo, BitSet tempIndices, final int[][] pred2CPMIndices, 
                                             final boolean hasIntercept) {
        int[] allPreds = new int[currSubsetIndices.size() + 1];   // store the bigger predictor subset
        int lastPredInd = allPreds.length - 1;

        if (currSubsetIndices.size() > 0)   // copy over last best predictor subset with smaller subset size
            System.arraycopy(currSubsetIndices.stream().mapToInt(Integer::intValue).toArray(), 0, allPreds,
                    0, allPreds.length - 1);
        int predSizes = allPreds.length;
        int maxModelCount = validSubsets.size();

        RecursiveAction[] resA = new RecursiveAction[maxModelCount];
        final double[] subsetMSE = Arrays.stream(new double[maxModelCount]).map(x -> Double.MAX_VALUE).toArray();
        int modelCount = 0;
        for (int predIndex : validSubsets) {  // consider valid predictor indices only
            allPreds[lastPredInd] = predIndex;
            if (predSizes > 1) {
                tempIndices.clear();
                setBitSet(tempIndices, allPreds);
                if (usedCombo.add((BitSet) tempIndices.clone())) {
                    final int resCount = modelCount++;
                    genMSE4MorePreds(pred2CPMIndices, allCPM, allCPMFrame, allPreds, prevCPMSize, subsetMSE, resA, 
                            resCount, hasIntercept);
                }
            } else {    // start from first predictor
                final int resCount = modelCount++;
                genMSE1stPred(pred2CPMIndices, allCPM, allCPMFrame, allPreds, subsetMSE, resA, resCount,
                        hasIntercept);
            }
        }
        ForkJoinTask.invokeAll(Arrays.stream(resA).filter(Objects::nonNull).toArray(RecursiveAction[]::new));
        return subsetMSE;
    }

    /***
     * This method will calculate the error variance value for all predictors in the allPreds.  For details,
     * refer to https://h2oai.atlassian.net/browse/PUBDEV-8954, section V.
     */
    public static void genMSE4MorePreds(final int[][] pred2CPMIndices, final double[][] allCPM, final Frame allCPMFrame, 
                                        final int[] allPreds, int lastSweepIndex, final double[] subsetMSE, 
                                        RecursiveAction[] resA, final int resCount, final boolean hasIntercept) {
        final int[] subsetIndices = allPreds.clone();
        resA[resCount] = new RecursiveAction() {
            @Override
            protected void compute() {
                boolean multinodeMode = allCPM == null && allCPMFrame != null;
                double[][] subsetCPM = multinodeMode ? 
                        extractPredSubsetsCPMFrame(allCPMFrame, subsetIndices, pred2CPMIndices, hasIntercept) : 
                        extractPredSubsetsCPM(allCPM, subsetIndices, pred2CPMIndices, hasIntercept);
                int lastPredInd = subsetIndices[subsetIndices.length - 1];
                int newPredCPMLength = pred2CPMIndices[lastPredInd].length;
                int[] sweepIndices = IntStream.range(0,newPredCPMLength).map(x -> x+lastSweepIndex).toArray();
                sweepCPM(subsetCPM, sweepIndices, false);
                // only apply the sweepIndices to only the last element of the CPM
                int lastInd = subsetCPM.length-1;
                subsetMSE[resCount] = subsetCPM[lastInd][lastInd];
            }
        };
    }

    /***
     * This function performs sweeping on the last row and column only to update the variance error to reduce
     * computation time.
     */
    public static double sweepMSE(double[][] subsetCPM, List sweepIndices) {
        int sweepLen = sweepIndices.size();
        int cpmLen = subsetCPM.length;
        int lastInd = cpmLen-1;
        if (sweepLen == 1) {    // quick stop for one sweep only
            int sweepInd = sweepIndices.get(0);
            return subsetCPM[lastInd][lastInd]
                    -subsetCPM[lastInd][sweepInd]*subsetCPM[sweepInd][lastInd]/subsetCPM[sweepInd][sweepInd];
        } 
        Set[] sweepElements = new Set[sweepLen];
        List tempElements = new ArrayList<>();
        tempElements.add(new SweepElement(lastInd, lastInd, new ArrayList<>(sweepIndices)));
        while (tempElements.size() > 0) {
            SweepElement oneEle = tempElements.remove(0);
            if (oneEle._sweepIndices.size() == 1) {
                if (sweepElements[0] == null)
                    sweepElements[0] = new HashSet<>();
                sweepElements[0].add(oneEle);
            } else { // sweepIndices size > 1
                int arrIndex = oneEle._sweepIndices.size() - 1;
                if (sweepElements[arrIndex] == null)
                    sweepElements[arrIndex] = new HashSet<>();
                sweepElements[arrIndex].add(oneEle);
                process(oneEle, tempElements);
            }
        }
        sweepCPMElements(sweepElements, subsetCPM);
        return subsetCPM[lastInd][lastInd];
    }

    public static void sweepCPMElements(Set[] sweepElements, double[][] subsetCPM) {
        int numSweeps = sweepElements.length;
        int row, col, oneIndex;
        for (int index = 0; index < numSweeps; index++) {
            Set oneSweepAction = sweepElements[index];
            for (SweepElement oneElement : oneSweepAction) {
                oneIndex = oneElement._sweepIndices.get(oneElement._sweepIndices.size() - 1);
                row = oneElement._row;
                col = oneElement._col;
                subsetCPM[row][col] = subsetCPM[row][col] -
                        subsetCPM[row][oneIndex] * subsetCPM[oneIndex][col] / subsetCPM[oneIndex][oneIndex];
            }
        }
    }

    /***
     * This method will generate all the elements that are needed to perform sweeping on the currEle.  The
     * formula for sweeping is:
     *  subsetCPM[row][col] = subsetCPM[row][col]-subsetCPM[row][sweepInd]*subsetCPM[sweepInd][row]/subsetCPM[sweepInd][sweepInd]
     *  
     *  We are not performing the actual sweeping here but rather to remember the elements of subsetCPM that we need
     *  to perform sweeping on.  From the formula, each currEle will generate three more new SweepElements
     */
    public static void process(SweepElement currEle, List tempList) {
        List newSweepIndices = new ArrayList<>(currEle._sweepIndices);
        int sweepIndex = newSweepIndices.remove(newSweepIndices.size()-1);  // remove last sweepIndices
        tempList.add(new SweepElement(currEle._row, currEle._col, newSweepIndices));
        tempList.add(new SweepElement(currEle._row, sweepIndex, newSweepIndices));
        tempList.add(new SweepElement(sweepIndex, currEle._col, newSweepIndices));
        tempList.add(new SweepElement(sweepIndex, sweepIndex, newSweepIndices));
    }
    
    static class SweepElement {
        final int _row;
        final int _col;
        final List _sweepIndices;
        
        public SweepElement(int row, int col, List sweepInd) {
            _row = row;
            _col = col;
            _sweepIndices = sweepInd;
        }
        
        @Override
        public boolean equals(Object o) {
            if (o instanceof SweepElement) {
                if (_row == ((SweepElement) o)._row && _col == ((SweepElement) o)._col) {
                    if (_sweepIndices.equals(((SweepElement) o)._sweepIndices))
                        return true;
                }
            }
            return false;
        }

        @Override
        public int hashCode() {
            return _row+(_col+1)*10+_sweepIndices.hashCode();
        }
    }
    
    static class CPMElement {
        final int _row;
        final int _col;
        
        public CPMElement(int row, int col) {
            _row = row;
            _col = col;
        }

        @Override
        public boolean equals(Object o) {
            if (o instanceof CPMElement) {
                return (_row == ((CPMElement) o)._row && _col == ((CPMElement) o)._col);
            }
            return false;
        }

        @Override
        public int hashCode() {
            Integer rowCol = _row+(_col+1)*10;
            return rowCol.hashCode();
        }
        
    }

    /***
     * This method will calculate the variance variance when only one predictor is considered in allPreds.  For details,
     * refer to https://h2oai.atlassian.net/browse/PUBDEV-8954, section V.
     */
    public static void genMSE1stPred(final int[][] pred2CPMIndices, final double[][] allCPM, final Frame allCPMFrame, 
                                     final int[] allPreds, final double[] subsetMSE, RecursiveAction[] resA, 
                                     final int resCount, final boolean hasIntercept) {
        final int[] subsetIndices = allPreds.clone();
        resA[resCount] = new RecursiveAction() {
            @Override
            protected void compute() {
                // generate CPM corresponding to the subset indices in subsetIndices
                boolean multinodeMode = allCPM == null && allCPMFrame != null;
                double[][] subsetCPM = multinodeMode ? 
                        extractPredSubsetsCPMFrame(allCPMFrame, subsetIndices, pred2CPMIndices, hasIntercept) : 
                        extractPredSubsetsCPM(allCPM, subsetIndices, pred2CPMIndices, hasIntercept);
                int lastSubsetIndex = subsetCPM.length-1;
                // perform sweeping action and record the sweeping vector and save the changed cpm
                subsetMSE[resCount] = sweepMSE(subsetCPM, IntStream.range(1, lastSubsetIndex).boxed().collect(Collectors.toList()));
            }
        };
    }

    /***
     * When multiple rows/columns are added to the CPM due to the new predictor being categorical, we need to map the 
     * old sweep vector arrays to new bigger sweep vector arrays.  See section V.II.V of doc.
     */
    public static SweepVector[][] mapBasicVector2Multiple(SweepVector[][] sweepVec, int newPredCPMLen) {
        int numSweep = sweepVec.length;
        int oldColLen = sweepVec[0].length/2;
        int newColLen = oldColLen+newPredCPMLen-1;  // sweepVector from old CPM was calculated when one new row/col is added
        int lastNewColInd = newColLen-1;
        int lastOldColInd = oldColLen-1;
        SweepVector[][] newSweepVec = new SweepVector[numSweep][newColLen*2];
        for (int sInd = 0; sInd < numSweep; sInd++) {
            double oneOverPivot = sweepVec[sInd][lastOldColInd-1]._value;
            int rowColInd = sweepVec[sInd][0]._column;
            for (int vInd = 0; vInd < lastNewColInd; vInd++) {
                if (vInd < lastOldColInd) { // index within old sweep vector range
                    newSweepVec[sInd][vInd] = new SweepVector(vInd, rowColInd, sweepVec[sInd][vInd]._value);
                    newSweepVec[sInd][vInd+newColLen] = new SweepVector(rowColInd, vInd,
                            sweepVec[sInd][vInd+oldColLen]._value);
                } else if (vInd == lastOldColInd) {  // last sweep index
                    newSweepVec[sInd][lastNewColInd] = new SweepVector(lastNewColInd, rowColInd,
                            sweepVec[sInd][lastOldColInd]._value);
                    newSweepVec[sInd][lastNewColInd + newColLen] = new SweepVector(rowColInd, lastNewColInd,
                            sweepVec[sInd][lastOldColInd + oldColLen]._value);
                    newSweepVec[sInd][vInd] = new SweepVector(vInd, rowColInd, oneOverPivot);
                    newSweepVec[sInd][vInd+newColLen] = new SweepVector(rowColInd, vInd, oneOverPivot);
                } else {    // new sweep vector index exceed old sweep vector index
                    newSweepVec[sInd][vInd] = new SweepVector(vInd, rowColInd, oneOverPivot);
                    newSweepVec[sInd][vInd+newColLen] = new SweepVector(rowColInd, vInd, oneOverPivot);
                }
            }
        }
        return newSweepVec;
    }

    /***
     * This method will sweep the rows/columns added to the CPM due to the addition of the new predictor using sweep
     * vector arrays.  See Step 3 of section V.II.IV of doc.  The sweep vectors should contain sweeping for predictor
     * 0, for predictor 2, .... predictor s of the predictor subset.
     */
    public static void applySweepVectors2NewPred(SweepVector[][] sweepVec, double[][] subsetCPM, int numNewRows, 
                                                 int[] sweepMat) {
        int numSweep = sweepVec.length; // number of sweeps that we need to do
        if (sweepMat == null) {
            for (int sweepInd=0; sweepInd < numSweep; sweepInd++) {
                oneSweepWSweepVector(sweepVec[sweepInd], subsetCPM, sweepInd, numNewRows);
            }
        } else {
            int sweepInd;
            for (int index = 0; index < numSweep; index++) {
                sweepInd = sweepMat[index];
                oneSweepWSweepVector(sweepVec[index], subsetCPM, sweepInd, numNewRows);
            }
        }
    }

    /***
     * This method perform just one sweep of the sweeping action described in Step 3 of section V.II.IV of doc.
     * Note that for a sweep vector array of 2*(N+1), the first N+1 elements describe the changes to the new column.
     * The last N+1 elements describing changes to the new row.  The sweep vector arrays will contain changes to 
     * the same element multiple times.  I only change each new element once by using elementAccessCount to keep track 
     * of which elements have been changed already.  See Step 3 of section V.II.IV of doc for details.
     * 
     * In addition, I did not use two arrays to implement the changes.  Hence, the order of change is important here.
     * I used temporary variable to store element changes that are used by other element updates.  I copied the 
     * temporary elements back to the CPM at the end.
     */
    public static void oneSweepWSweepVector(SweepVector[] sweepVec, double[][] subsetCPM, int sweepIndex,
                                            int colRowsAdded) {
        int sweepVecLen = sweepVec.length / 2;
        int newLastCPMInd = sweepVecLen - 1;
        int oldSweepVec = sweepVecLen - colRowsAdded;
        int oldLastCPMInd = oldSweepVec - 1;    // sweeping index before adding new rows/columns
        double[] colSweeps = new double[colRowsAdded];
        double[] rowSweeps = new double[colRowsAdded];
        Set trackSweep = new HashSet<>();
        CPMElement oneEle;
        for (int rcInd = 0; rcInd < colRowsAdded; rcInd++) {   // for each newly added row/column
            int rowColInd = sweepVec[0]._column + rcInd;
            for (int svInd = 0; svInd < sweepVecLen; svInd++) { // working on each additional row/col
                int svIndOffset = svInd + sweepVecLen;
                if (sweepVec[svInd]._row == sweepIndex) {  // take care of both row and column elements at sweepIndex
                    rowSweeps[rcInd] = sweepVec[svInd]._value * subsetCPM[sweepIndex][rowColInd];
                    colSweeps[rcInd] = sweepVec[svIndOffset]._value * subsetCPM[rowColInd][sweepIndex];
                } else if (sweepVec[svInd]._row == newLastCPMInd) {
                    oneEle = new CPMElement(newLastCPMInd, rowColInd);
                    if (!trackSweep.contains(oneEle)) {
                        trackSweep.add(oneEle);
                        subsetCPM[newLastCPMInd][rowColInd] = subsetCPM[newLastCPMInd][rowColInd] -
                                sweepVec[svInd]._value * subsetCPM[sweepIndex][rowColInd];
                    }
                    oneEle = new CPMElement(rowColInd, newLastCPMInd);
                    if (!trackSweep.contains(oneEle)) {
                        trackSweep.add(oneEle);
                        subsetCPM[rowColInd][newLastCPMInd] = subsetCPM[rowColInd][newLastCPMInd] -
                                sweepVec[svIndOffset]._value * subsetCPM[rowColInd][sweepIndex];
                    }
                } else if (sweepVec[svInd]._row == rowColInd) {
                    oneEle = new CPMElement(rowColInd, rowColInd);
                    if (!trackSweep.contains(oneEle)) {
                        subsetCPM[rowColInd][rowColInd] = subsetCPM[rowColInd][rowColInd] -
                                subsetCPM[rowColInd][sweepIndex] * subsetCPM[sweepIndex][rowColInd] * sweepVec[svInd]._value;
                        trackSweep.add(oneEle);
                    }
                } else if (sweepVec[svInd]._row < oldLastCPMInd) {
                    oneEle = new CPMElement(sweepVec[svInd]._row, rowColInd);
                    if (!trackSweep.contains(oneEle)) {
                        subsetCPM[sweepVec[svInd]._row][rowColInd] = subsetCPM[sweepVec[svInd]._row][rowColInd] -
                                subsetCPM[sweepIndex][rowColInd] * sweepVec[svInd]._value;
                        trackSweep.add(oneEle);
                    }
                    oneEle = new CPMElement(rowColInd, sweepVec[svIndOffset]._column);
                    if (!trackSweep.contains(oneEle)) {
                        trackSweep.add(oneEle);
                        subsetCPM[rowColInd][sweepVec[svIndOffset]._column] =
                                subsetCPM[rowColInd][sweepVec[svIndOffset]._column] - subsetCPM[rowColInd][sweepIndex] *
                                        sweepVec[svIndOffset]._value;
                    }
                } else { // considering rows/columns >= oldSweepVec
                    oneEle = new CPMElement(sweepVec[svInd]._row, rowColInd);
                    if (!trackSweep.contains(oneEle)) {
                        trackSweep.add(oneEle);
                        subsetCPM[sweepVec[svInd]._row][rowColInd] = subsetCPM[sweepVec[svInd]._row][rowColInd] -
                                subsetCPM[sweepVec[svInd]._row][sweepIndex] * subsetCPM[sweepIndex][rowColInd] * sweepVec[svInd]._value;
                    }
                    oneEle = new CPMElement(rowColInd, sweepVec[svIndOffset]._column);
                    if (!trackSweep.contains(oneEle)) {
                        trackSweep.add(oneEle);
                        subsetCPM[rowColInd][sweepVec[svIndOffset]._column] = subsetCPM[rowColInd][sweepVec[svIndOffset]._column]
                                - subsetCPM[rowColInd][sweepIndex] * subsetCPM[sweepIndex][sweepVec[svIndOffset]._column] * sweepVec[svIndOffset]._value;
                    }
                }
            }
        }
        // take care of updating elements that are not updated
        for (int rcInd = 0; rcInd < colRowsAdded; rcInd++) {
            int rowColInd = sweepVec[0]._column + rcInd;
            subsetCPM[sweepIndex][rowColInd] = rowSweeps[rcInd];
            subsetCPM[rowColInd][sweepIndex] = colSweeps[rcInd];
        }
    }

    /**
     * Given current CPM which has been swept already, we need to add the lastest predictor to the current CPM that have
     * not been swept. The new elements belonging to the newest predictor is extracted from the original allCPM.
     * 
     * Basically, I just extract the CPM from the original non-swept CPM that contains rows/columns of CPM 
     * corresponding to the predictors in subsetPredIndex.  Next, I copied over the swept rows/columns of CPM 
     * corresponding to the previous predictor subset while leaving the rows/columns corresponding to the new rows/
     * columns due to the new predictor unchanged.
     * 
     * See Step 2 in section V.II.IV of doc.
     */
    public static double[][] addNewPred2CPM(double[][] allCPM, Frame allCPMFrame, double[][] currentCPM, 
                                            int[] subsetPredIndex, int[][] pred2CPMIndices, boolean hasIntercept) {
        boolean multinodeMode = allCPM == null && allCPMFrame != null;
        double[][] newCPM = multinodeMode ? 
                extractPredSubsetsCPMFrame(allCPMFrame, subsetPredIndex, pred2CPMIndices, hasIntercept) : 
                extractPredSubsetsCPM(allCPM, subsetPredIndex, pred2CPMIndices, hasIntercept);
        
        int oldCPMDim = currentCPM.length-1;    // XTX dimension
        int newCPMDim = newCPM.length;
        int lastnewCPMInd = newCPMDim-1;
        for (int index=0; index currSubsetIndices, int predPos, int predRemoved, 
                                     int[][] predInd2CPMIndices, boolean hasIntercept) {
        int predRemovedLen = predInd2CPMIndices[predRemoved].length;
        int totalSize = IntStream.range(0, predPos).map(x->predInd2CPMIndices[currSubsetIndices.get(x)].length).sum()
                + (hasIntercept ? 1 : 0);
        return IntStream.range(0, predRemovedLen).map(x -> x+totalSize).toArray();
    }
    
    public static List extractCPMIndexFromPred(int cpmLastIndex, int[][] pred2CPMIndices, int[] newPredList, 
                                                        boolean hasIntercept) {
        List CPMIndices = extractCPMIndexFromPredOnly(pred2CPMIndices, newPredList);
        if (hasIntercept)
            CPMIndices.add(0, cpmLastIndex-1);;
        CPMIndices.add(cpmLastIndex);
        return CPMIndices;
    }


    /***
     * Given the predictor in subset newPredList, this function will find the rows/columns in the cpm matrix that
     * are contributed by the predictors in subset newPredList.
     */
    public static List extractCPMIndexFromPredOnly(int[][] pred2CPMIndices, int[] newPredList) {
        List CPMIndices = new ArrayList<>();
        for (int predInd : newPredList) {
            CPMIndices.addAll(Arrays.stream(pred2CPMIndices[predInd]).boxed().collect(Collectors.toList()));
        }
        return CPMIndices;
    }

    /***
     * This method perform the sweeping action described in section II of doc.  In addition, if genSweepVector is set
     * to true, it will also generate the corresponding sweep vector arrays described in section V.II of doc.
     */
    public static SweepVector[][] sweepCPM(double[][] subsetCPM, int[] sweepIndices, boolean genSweepVector) {
        int currSubsetCPMSize = subsetCPM.length;
        int numSweep = sweepIndices.length;
        SweepVector[][] sweepVecs = new SweepVector[numSweep][2*(currSubsetCPMSize+1)];
        for (int index=0; index < numSweep; index++) 
            performOneSweep(subsetCPM, sweepVecs[index], sweepIndices[index], genSweepVector);
        return sweepVecs;
    }
    
    public static void sweepCPMParallel(Frame cpm, int[] sweepIndices, int[] trackPivotSweeps) {
        int numSweep = sweepIndices.length;
        for (int index=0; index < numSweep; index++) {
            new ModelSelectionTasks.SweepFrameParallel(trackPivotSweeps, sweepIndices[index], cpm).doAll(cpm);
            DKV.put(cpm);
            trackPivotSweeps[sweepIndices[index]] *= -1;
        }
    }

    /**
     * store information on sweeping actions that are to be performed to new rows/columns added to CPM due to the 
     * addition of new predcitors.
     */
    public static class SweepVector {
        int _row;
        int _column;
        double _value;
        public SweepVector(int rIndex, int cIndex, double val) {
            _row = rIndex;
            _column = cIndex;
            _value = val;
        }
    }

    /***
     * Perform one sweep according to section II of doc and generate sweep vector according to section V.II of doc.
     */
    public static void performOneSweep(double[][] subsetCPM, SweepVector[] sweepVec, int sweepIndex,
                                                boolean genSweepVector) {
        int subsetCPMLen = subsetCPM.length;
        int lastSubsetInd = subsetCPMLen-1;
        if (subsetCPM[sweepIndex][sweepIndex]==0) { // pivot is zero, set error variance to max value
            subsetCPM[lastSubsetInd][lastSubsetInd] = Double.MAX_VALUE;
            return;
        } else {    // subsetCPM is healthy
            double oneOverPivot = 1.0/subsetCPM[sweepIndex][sweepIndex];
            // generate sweep vector as in section V.II of doc
            if (genSweepVector) {
                int sweepVecLen = sweepVec.length / 2;
                for (int index = 0; index < sweepVecLen; index++) {
                    if (index == sweepIndex) {
                        sweepVec[index] = new SweepVector(index, lastSubsetInd, oneOverPivot);
                        sweepVec[index + sweepVecLen] = new SweepVector(lastSubsetInd, index, -oneOverPivot);
                    } else if (index == subsetCPMLen) {
                        sweepVec[index] = new SweepVector(index, lastSubsetInd, subsetCPM[lastSubsetInd][sweepIndex] * oneOverPivot);
                        sweepVec[index + sweepVecLen] = new SweepVector(lastSubsetInd, index, subsetCPM[sweepIndex][lastSubsetInd] * oneOverPivot);
                    } else if (index==lastSubsetInd) {
                            sweepVec[index] = new SweepVector(index, lastSubsetInd, oneOverPivot);
                            sweepVec[index+sweepVecLen] = new SweepVector(lastSubsetInd, index, oneOverPivot);
                    } else {
                        sweepVec[index] = new SweepVector(index, lastSubsetInd, subsetCPM[index][sweepIndex] * oneOverPivot);
                        sweepVec[index + sweepVecLen] = new SweepVector(lastSubsetInd, index, subsetCPM[sweepIndex][index] * oneOverPivot);
                    }
                }
            }
            // perform sweeping action as in section II of doc.
            for (int rInd = 0; rInd < subsetCPMLen; rInd++) {
                for (int cInd = rInd; cInd < subsetCPMLen; cInd++) {
                    if (rInd != sweepIndex && cInd != sweepIndex) {
                        subsetCPM[rInd][cInd] = subsetCPM[rInd][cInd]-
                                subsetCPM[rInd][sweepIndex]*subsetCPM[sweepIndex][cInd]*oneOverPivot;
                        if (cInd != rInd)
                            subsetCPM[cInd][rInd] = subsetCPM[cInd][rInd]-
                                    subsetCPM[cInd][sweepIndex]*subsetCPM[sweepIndex][rInd]*oneOverPivot;
                    }
                }
            }
            for (int index=0; index < subsetCPMLen; index++) {
                subsetCPM[index][sweepIndex] = -subsetCPM[index][sweepIndex]*oneOverPivot;
                if (sweepIndex != index)
                    subsetCPM[sweepIndex][index] = subsetCPM[sweepIndex][index]*oneOverPivot;
            }
            subsetCPM[sweepIndex][sweepIndex] = oneOverPivot;
        }
    }
    
    public static String[][] shrinkStringArray(String[][] array, int numModels) {
        int offset = array.length - numModels;
        String[][] newArray =new String[numModels][];
        for (int index=0; index < numModels; index++)
            newArray[index] = array[offset+index].clone();
        return newArray;
    }
    
    public static double[][] shrinkDoubleArray(double[][] array, int numModels) {
        int offset = array.length-numModels;
        double[][] newArray =new double[numModels][];
        for (int index=0; index < numModels; index++)
            newArray[index] = array[offset+index].clone();
        return newArray;
    }

    public static Key[] shrinkKeyArray(Key[] array, int numModels) {
        int arrLen = array.length;
        Key[] newArray = new Key[numModels];
        System.arraycopy(array, (arrLen-numModels), newArray, 0, numModels);
        return newArray;
    }
    
    public static String joinDouble(double[] val) {
        int arrLen = val.length; // skip the intercept terms
        String[] strVal = new String[arrLen];
        for (int index=0; index < arrLen; index++)
            strVal[index] = Double.toString(val[index]);
        return String.join(", ", strVal);
    }
    
    public static GLMModel.GLMParameters[] generateGLMParameters(Frame[] trainingFrames,
                                                                 ModelSelectionModel.ModelSelectionParameters parms, 
                                                                 int nfolds, String foldColumn,
                                                                 Model.Parameters.FoldAssignmentScheme foldAssignment) {
        final int numModels = trainingFrames.length;
        GLMModel.GLMParameters[] params = new GLMModel.GLMParameters[numModels];
        final Field[] field1 = ModelSelectionModel.ModelSelectionParameters.class.getDeclaredFields();
        final Field[] field2 = Model.Parameters.class.getDeclaredFields();
        for (int index = 0; index < numModels; index++) {
            params[index] = new GLMModel.GLMParameters();
            setParamField(parms, params[index], false, field1, Collections.emptyList());
            setParamField(parms, params[index], true, field2, Collections.emptyList());
            params[index]._train = trainingFrames[index]._key;
            params[index]._nfolds = nfolds;
            params[index]._fold_column = foldColumn;
            params[index]._fold_assignment = foldAssignment;
        }
        return params;
    }
    
    public static void setParamField(Model.Parameters params, GLMModel.GLMParameters glmParam, boolean superClassParams,
                                     Field[] paramFields, List excludeList) {
        // assign relevant GAMParameter fields to GLMParameter fields
        Field glmField;
        boolean emptyExcludeList = excludeList.size() == 0;
        for (Field oneField : paramFields) {
            try {
                if (emptyExcludeList || !excludeList.contains(oneField.getName())) {
                    if (superClassParams)
                        glmField = glmParam.getClass().getSuperclass().getDeclaredField(oneField.getName());
                    else
                        glmField = glmParam.getClass().getDeclaredField(oneField.getName());
                    glmField.set(glmParam, oneField.get(params));
                }
            } catch (IllegalAccessException|NoSuchFieldException e) { // suppress error printing, only cares about fields that are accessible
                ;
            }
        }    
    }
    
    public static GLM[] buildGLMBuilders(GLMModel.GLMParameters[] trainingParams) {
        int numModels = trainingParams.length;
        GLM[] builders = new GLM[numModels];
        for (int index=0; index 0) {
                int r2Index = Arrays.asList(oneModel._output._cross_validation_metrics_summary.getRowHeaders()).indexOf("r2");
                Float tempR2 = (Float) oneModel._output._cross_validation_metrics_summary.get(r2Index, 0);
                currR2 = tempR2.doubleValue();
            }
            if (currR2 > bestR2Val) {
                bestR2Val = currR2;
                if (bestModel != null)
                    bestModel.delete();
                bestModel = oneModel;
            } else {
                oneModel.delete();
            }
        }
        return bestModel;
    }

    public static String[] extractPredictorNames(Model.Parameters parms, DataInfo dinfo,
                                                 String foldColumn) {
        List frameNames = Arrays.stream(dinfo._adaptedFrame.names()).collect(Collectors.toList());
        String[] nonResponseCols = parms.getNonPredictors();
        for (String col : nonResponseCols)
            frameNames.remove(col);
        if (foldColumn != null && frameNames.contains(foldColumn))
            frameNames.remove(foldColumn);
        return frameNames.stream().toArray(String[]::new);
    }
    
    public static int findMinZValue(GLMModel model, List numPredNames, List catPredNames, 
                                    List predNames) {
        List zValList = Arrays.stream(model._output.zValues()).boxed().map(Math::abs).collect(Collectors.toList());
        List coeffNames = Arrays.stream(model._output.coefficientNames()).collect(Collectors.toList());
        if (coeffNames.contains("Intercept")) { // remove intercept terms
            int interceptIndex = coeffNames.indexOf("Intercept");
            zValList.remove(interceptIndex);
            coeffNames.remove(interceptIndex);
        }
        // grab min z-values for numerical and categorical columns
        PredNameMinZVal numericalPred = findNumMinZVal(numPredNames, zValList, coeffNames);
        PredNameMinZVal categoricalPred = findCatMinZVal(model, zValList);
        
        // choose the min z-value from numerical and categorical predictors and return its index in predNames
        if (categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value
            return predNames.indexOf(categoricalPred._predName);
        } else {    // numerical pred has minimum z-value
            return predNames.indexOf(numericalPred._predName);
        }
    }
    
    public static PredNameMinZVal findNumMinZVal(List numPredNames, List zValList, List coeffNames) {
        double minNumVal = -1;
        String numPredMinZ = null;
        if (numPredNames != null && numPredNames.size() > 0) {
            List numZValues = new ArrayList<>();
            for (String predName : numPredNames) {
                int eleInd = coeffNames.indexOf(predName);
                double oneZValue = zValList.get(eleInd);
                if (Double.isNaN(oneZValue)) {
                    zValList.set(eleInd, Double.POSITIVE_INFINITY);
                    numZValues.add(Double.POSITIVE_INFINITY);    // NaN corresponds to inactive predictors
                } else {
                    numZValues.add(oneZValue);
                }
            }
            minNumVal = numZValues.stream().min(Double::compare).get(); // minimum z-value of numerical predictors
            numPredMinZ = numPredNames.get(numZValues.indexOf(minNumVal));
        }
        return new PredNameMinZVal(numPredMinZ, minNumVal);
    }

    /***
     * This method extracts the categorical coefficient z-value by using the following method:
     * 1. From GLMModel model, it extracts the column names of the dinfo._adaptedFrame that is used to build the glm 
     * model and generate the glm coefficients.  The column names will be in exactly the same order as the coefficient
     * names with the exception that each enum levels will not be given a name in the column names.
     * 2. To figure out which coefficient name corresponds to which column name, we use the catOffsets which will tell
     * us how many enum levels are used in the glm model coefficients.  If the catOffset for the first coefficient
     * says 3, that means that column will have three enum levels represented in the glm model coefficients.
     */
    public static PredNameMinZVal findCatMinZVal(GLMModel model, List zValList) {
        String[] columnNames = model.names(); // column names of dinfo._adaptedFrame
        int[] catOffsets = model._output.getDinfo()._catOffsets;
        double minCatVal = -1;
        String catPredMinZ = null;
        if (catOffsets != null) {
            minCatVal = Double.MAX_VALUE;
            int numCatCol = catOffsets.length - 1;

            int numNaN = (int) zValList.stream().filter(x -> Double.isNaN(x)).count();
            if (numNaN == zValList.size()) {    // if all levels are NaN, this predictor is redundant
                new PredNameMinZVal(catPredMinZ, Double.POSITIVE_INFINITY);
            } else {
                for (int catInd = 0; catInd < numCatCol; catInd++) {    // go through each categorical column
                    List catZValues = new ArrayList<>();
                    int nextCatOffset = catOffsets[catInd + 1];
                    for (int eleInd = catOffsets[catInd]; eleInd < nextCatOffset; eleInd++) {   // check z-value for each level
                        double oneZVal = zValList.get(eleInd);
                        if (Double.isNaN(oneZVal)) {    // one level is inactivity, let other levels be used
                            zValList.set(eleInd, 0.0);
                            catZValues.add(0.0);
                        } else {
                            catZValues.add(oneZVal);
                        }
                    }
                    if (catZValues.size() > 0) {
                        double oneCatMinZ = catZValues.stream().max(Double::compare).get(); // choose the best z-value here
                        if (oneCatMinZ < minCatVal) {
                            minCatVal = oneCatMinZ;
                            catPredMinZ = columnNames[catInd];
                        }
                    }
                }
            }
        }
        return new PredNameMinZVal(catPredMinZ, minCatVal);
    }
    
    static class PredNameMinZVal {
        String _predName;
        double _minZVal;
        
        public PredNameMinZVal(String predName, double minZVal) {
            _predName= predName;
            _minZVal = minZVal;
        }
    }
    
    public static List extraModelColumnNames(List coefNames, GLMModel bestModel) {
        List coefUsed = new ArrayList();
        List modelColumns = new ArrayList<>(Arrays.asList(bestModel.names()));
        for (String coefName : modelColumns) {
            if (coefNames.contains(coefName)) 
                coefUsed.add(coefName);
        }
        return coefUsed;
    }

    /***
     * Given a predictor subset and the complete CPM, we extract the CPM associated with the predictors 
     * specified in the predictor subset (predIndices).  If there is intercept, it will be moved to the first row
     * and column.
     */
    public static double[][] extractPredSubsetsCPM(double[][] allCPM, int[] predIndices, int[][] pred2CPMIndices,
                                                   boolean hasIntercept) {
        List CPMIndices = extractCPMIndexFromPred(allCPM.length-1, pred2CPMIndices, predIndices, hasIntercept);
        int subsetcpmDim = CPMIndices.size();
        double[][] subsetCPM = new double[subsetcpmDim][subsetcpmDim];

        for (int rIndex=0; rIndex < subsetcpmDim; rIndex++) {
            for (int cIndex=rIndex; cIndex < subsetcpmDim; cIndex++) {
                subsetCPM[rIndex][cIndex] = allCPM[CPMIndices.get(rIndex)][CPMIndices.get(cIndex)];
                subsetCPM[cIndex][rIndex] = allCPM[CPMIndices.get(cIndex)][CPMIndices.get(rIndex)];
            }
        }
        return subsetCPM;
    }

    /***
     * Given a predictor subset and the complete CPM, we extract the CPM associated with the predictors 
     * specified in the predictor subset (predIndices).  If there is intercept, it will be moved to the first row
     * and column.
     */
    public static double[][] extractPredSubsetsCPMFrame(Frame allCPM, int[] predIndices, int[][] pred2CPMIndices,
                                                   boolean hasIntercept) {
        List CPMIndices = extractCPMIndexFromPred(allCPM.numCols()-1, pred2CPMIndices, predIndices, hasIntercept);
        int subsetcpmDim = CPMIndices.size();
        double[][] subsetCPM = new double[subsetcpmDim][subsetcpmDim];

        for (int rIndex=0; rIndex < subsetcpmDim; rIndex++) {
            for (int cIndex=rIndex; cIndex < subsetcpmDim; cIndex++) {
                subsetCPM[rIndex][cIndex] = allCPM.vec(CPMIndices.get(cIndex)).at(CPMIndices.get(rIndex));
                subsetCPM[cIndex][rIndex] = allCPM.vec(CPMIndices.get(rIndex)).at(CPMIndices.get(cIndex));
            }
        }
        return subsetCPM;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy