// hex.anovaglm.ANOVAGLMUtils Maven / Gradle / Ivy
package hex.anovaglm;
import hex.*;
import hex.glm.GLM;
import hex.glm.GLMModel;
import water.DKV;
import water.Key;
import water.Scope;
import water.fvec.Frame;
import water.util.TwoDimTable;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static hex.anovaglm.ANOVAGLMModel.ANOVAGLMParameters;
import static hex.gam.MatrixFrameUtils.GAMModelUtils.copyTwoDimTable;
import static hex.gam.MatrixFrameUtils.GAMModelUtils.genCoefficientTable;
import static hex.gam.MatrixFrameUtils.GamUtils.setParamField;
import static hex.glm.GLMModel.GLMParameters;
import static hex.glm.GLMModel.GLMParameters.Family.*;
public class ANOVAGLMUtils {
/**
 * Copies the leading column names of the adapted frame held by a DataInfo.
 *
 * @param dinfo DataInfo whose {@code _adaptedFrame} supplies the column names
 * @param numOfPredictors how many names, counted from the first column, to copy
 * @return a new String array holding the first {@code numOfPredictors} column names
 */
public static String[] extractPredNames(DataInfo dinfo, int numOfPredictors) {
  final String[] selected = new String[numOfPredictors];
  System.arraycopy(dinfo._adaptedFrame.names(), 0, selected, 0, numOfPredictors);
  return selected;
}
/**
 * Builds the list of predictor combos: first every individual predictor, then every combination of
 * 2, 3, ..., {@code maxPredInt} predictors (in that order). For background on why Type III SS needs
 * these terms, see ANOVAGLMTutorial https://h2oai.atlassian.net/browse/PUBDEV-8088 section IV.
 *
 * @param predNamesIndividual names of the individual predictors
 * @param maxPredInt maximum number of predictors allowed in one interaction term; values below 2
 *                   produce individual predictors only
 * @return array indexed first by predictor combo, second by the names of the predictors forming that
 *         combo. Entries for individual predictors contain exactly one name.
 */
public static String[][] generatePredictorCombos(String[] predNamesIndividual, int maxPredInt) {
  // Typed list so that toArray(T[]) yields String[][] rather than the erased Object[].
  final List<String[]> combos = new ArrayList<>();
  addIndividualPred(predNamesIndividual, combos); // single-predictor terms come first
  for (int order = 2; order <= maxPredInt; order++) {
    generateOneCombo(predNamesIndividual, order, combos);
  }
  return combos.toArray(new String[0][]);
}
/**
 * Appends one single-element String array per predictor name to {@code predCombo}, in input order.
 *
 * @param predNames names of the individual predictors
 * @param predCombo list receiving the new {@code String[]} entries
 */
public static void addIndividualPred(String[] predNames, List predCombo) {
  for (String name : predNames) {
    predCombo.add(new String[]{name});
  }
}
/**
 * Appends to {@code predCombo} the name arrays of all combinations of {@code numInteract} predictors.
 * Index combinations start at {0, 1, ..., numInteract-1} and are advanced with
 * {@link #updatePredCombo(int[], int[])} until it reports no further combination or until the count
 * returned by {@code MathUtils.combinatorial} has been reached.
 *
 * @param predNames names of the individual predictors
 * @param numInteract number of predictors per combination
 * @param predCombo list receiving one {@code String[]} per combination
 */
public static void generateOneCombo(String[] predNames, int numInteract, List predCombo) {
  final int totalPreds = predNames.length;
  final int[] chosen = IntStream.range(0, numInteract).toArray();
  // largest value each position may take: {n-k, ..., n-1}
  final int[] upperLimits = IntStream.range(totalPreds - numInteract, totalPreds).toArray();
  final int expected = hex.genmodel.utils.MathUtils.combinatorial(totalPreds, numInteract);
  int emitted = 0;
  boolean more = true;
  while (more && emitted < expected) {
    predCombo.add(predCombo(predNames, chosen));
    emitted++;
    more = updatePredCombo(chosen, upperLimits);
  }
}
/**
 * Advances {@code predInd} in place to the next index combination. The rightmost position whose value
 * is still below its bound is incremented and all positions after it are reset through
 * {@link #updateLaterBits(int[], int[], int, int)}.
 *
 * @param predInd current index combination, modified in place
 * @param bounds largest allowed value for each position of {@code predInd}
 * @return true if {@code predInd} was advanced, false if every position already sits at its bound
 */
public static boolean updatePredCombo(int[] predInd, int[] bounds) {
  final int lastPos = predInd.length - 1;
  int pos = lastPos;
  // walk left past every position that has reached its bound
  while (pos >= 0 && predInd[pos] >= bounds[pos]) {
    pos--;
  }
  if (pos < 0) {
    return false;
  }
  predInd[pos]++;
  updateLaterBits(predInd, bounds, pos, lastPos);
  return true;
}
/**
 * Rewrites the positions after {@code index}, up to and including {@code predNum}, so that each one
 * holds its left neighbour's value plus one.
 *
 * @param predInd index combination, modified in place
 * @param bounds not read by this method
 * @param index position that was just changed
 * @param predNum last position of {@code predInd} to rewrite
 */
public static void updateLaterBits(int[] predInd, int[] bounds, int index, int predNum) {
  if (index >= predNum) {
    return; // nothing to the right of index
  }
  int pos = index + 1;
  while (pos <= predNum) {
    predInd[pos] = predInd[pos - 1] + 1;
    pos++;
  }
}
/**
 * Maps an array of predictor indices to the corresponding predictor names.
 *
 * @param predNames names of the individual predictors
 * @param predInd indices into {@code predNames}
 * @return new array where entry i is {@code predNames[predInd[i]]}
 */
public static String[] predCombo(String[] predNames, int[] predInd) {
  return Arrays.stream(predInd)
      .mapToObj(position -> predNames[position])
      .toArray(String[]::new);
}
/**
 * Counts the predictor combos of the full model: the individual predictors plus, for every
 * interaction order from 2 up to {@code highestInteractionTerms}, the value returned by
 * {@code MathUtils.combinatorial(numPred, order)}.
 *
 * @param numPred number of individual predictors
 * @param highestInteractionTerms highest number of predictors allowed in one interaction term
 * @return total number of predictor combos
 */
public static int calculatePredComboNumber(int numPred, int highestInteractionTerms) {
  final int interactionCombos = IntStream.rangeClosed(2, highestInteractionTerms)
      .map(order -> hex.genmodel.utils.MathUtils.combinatorial(numPred, order))
      .sum();
  return numPred + interactionCombos;
}
/**
 * Builds the training frames for all GLM models. The frame stored under {@code transformedCols} is
 * used as the last (full-model) training frame. Every other frame i is assembled by
 * {@link #buildSpecificFrame} from all predictor combos except combo i, and is put into the DKV.
 *
 * Example with predictors A and B and their interaction: the frames leave out, in turn, A, then B,
 * then A:B, and the final frame keeps everything.
 *
 * @param transformedCols key of the frame holding the transformed columns
 * @param numberOfModels number of models to build (number of predictor combos plus one). For two
 *                       factors with interaction this is 4.
 * @param transformedColNames transformed column names, one array per predictor combo
 * @param parms ANOVAGLM parameters, passed on to {@link #buildSpecificFrame}
 * @return array of training frames; the last element is the frame fetched from the DKV
 */
public static Frame[] buildTrainingFrames(Key transformedCols, int numberOfModels,
                                          String[][] transformedColNames, ANOVAGLMParameters parms) {
  final Frame[] frames = new Frame[numberOfModels];
  final int reducedCount = numberOfModels - 1;
  final Frame fullFrame = DKV.getGet(transformedCols);
  frames[reducedCount] = fullFrame;
  for (int leftOut = 0; leftOut < reducedCount; leftOut++) {
    final int[] kept = oneIndexOut(leftOut, reducedCount); // every combo index except leftOut
    frames[leftOut] = buildSpecificFrame(kept, fullFrame, transformedColNames, parms);
    DKV.put(frames[leftOut]);
  }
  return frames;
}
/**
 * Lists the integers 0 .. indexRange-1 in ascending order with {@code currIndex} skipped.
 *
 * @param currIndex value to leave out
 * @param indexRange exclusive upper end of the range
 * @return array of length {@code indexRange-1}
 */
public static int[] oneIndexOut(int currIndex, int indexRange) {
  final int[] remaining = new int[indexRange - 1];
  int next = 0;
  for (int candidate = 0; candidate < indexRange; candidate++) {
    if (candidate == currIndex) {
      continue;
    }
    remaining[next] = candidate;
    next++;
  }
  return remaining;
}
/**
 * Copies the metrics of the full GLM model onto the ANOVAGLM model: the training metrics reference,
 * a deep clone of every model-metrics object registered on the GLM model, and a copy of the scoring
 * history table. (Approach taken from code by Zuzana Olajcova.)
 *
 * @param aModel ANOVAGLM model receiving the metrics
 * @param glmModel full GLM model supplying the metrics
 * @param trainingFrame frame passed to {@code deepCloneWithDifferentModelAndFrame}
 */
public static void fillModelMetrics(ANOVAGLMModel aModel, GLMModel glmModel, Frame trainingFrame) {
  aModel._output._training_metrics = glmModel._output._training_metrics;
  // The key must be typed: on a raw Key, get() does not expose ModelMetrics methods.
  for (Key<ModelMetrics> modelMetricsKey : glmModel._output.getModelMetrics()) {
    // NOTE(review): the clone is created with glmModel (not aModel) as its model argument — confirm intended.
    aModel.addModelMetrics(modelMetricsKey.get().deepCloneWithDifferentModelAndFrame(glmModel, trainingFrame));
  }
  aModel._output._scoring_history = copyTwoDimTable(glmModel._output._scoring_history, "glm scoring history");
}
/**
 * Fetches the GLM model from each GLM ModelBuilder via {@code get()} and registers every model with
 * {@code Scope.track_generic}.
 *
 * @param glmResults array of GLM ModelBuilders
 * @return array of GLMModels, in the same order as the builders
 */
public static GLMModel[] extractGLMModels(GLM[] glmResults) {
  final GLMModel[] extracted = new GLMModel[glmResults.length];
  int slot = 0;
  for (GLM builder : glmResults) {
    extracted[slot] = builder.get();
    Scope.track_generic(extracted[slot]);
    slot++;
  }
  return extracted;
}
/**
 * Removes the keys of the first {@code numFrame2Delete} training frames from the DKV.
 *
 * @param trainingFrames frames whose keys are removed
 * @param numFrame2Delete number of leading frames to process
 */
public static void removeFromDKV(Frame[] trainingFrames, int numFrame2Delete) {
  int position = 0;
  while (position < numFrame2Delete) {
    DKV.remove(trainingFrames[position]._key);
    position++;
  }
}
/**
 * Assembles one training frame under a freshly made key. It contains, in order: the transformed
 * columns of every selected predictor combo, the weights column if set, the offset column if set,
 * and the response column. All vecs are taken from {@code allCols}.
 *
 * @param predNums indices into {@code transformedColNames} of the predictor combos to include
 * @param allCols frame supplying the transformed, weights, offset and response columns
 * @param transformedColNames transformed column names, one array per predictor combo
 * @param parms ANOVAGLM parameters naming the weights, offset and response columns
 * @return training frame restricted to the selected predictor combos
 */
public static Frame buildSpecificFrame(int[] predNums, Frame allCols, String[][] transformedColNames,
                                       ANOVAGLMParameters parms) {
  final Frame assembled = new Frame(Key.make());
  for (int comboIndex : predNums) {
    assembled.add(allCols.subframe(transformedColNames[comboIndex]));
  }
  final String weightsName = parms._weights_column;
  if (weightsName != null) {
    assembled.add(weightsName, allCols.vec(weightsName));
  }
  final String offsetName = parms._offset_column;
  if (offsetName != null) {
    assembled.add(offsetName, allCols.vec(offsetName));
  }
  final String responseName = parms._response_column;
  assembled.add(responseName, allCols.vec(responseName));
  return assembled;
}
/**
 * Creates one GLMParameters object per training frame. Each is filled from {@code parms} with
 * {@code setParamField} (once for the fields declared on ANOVAGLMParameters, excluding
 * "save_transformed_framekeys" and "type", and once for the fields declared on Model.Parameters),
 * then given the frame's key as training frame and the family of {@code parms}.
 *
 * @param trainingFrames training frames, one per GLM model
 * @param parms ANOVAGLM parameters to copy from
 * @return GLM parameters in the same order as {@code trainingFrames}
 */
public static GLMParameters[] buildGLMParameters(Frame[] trainingFrames, ANOVAGLMParameters parms) {
  final GLMParameters[] result = new GLMParameters[trainingFrames.length];
  final List<String> anovaOnlyFields = Arrays.asList("save_transformed_framekeys", "type");
  final Field[] anovaFields = ANOVAGLMParameters.class.getDeclaredFields();
  final Field[] baseFields = Model.Parameters.class.getDeclaredFields();
  for (int i = 0; i < result.length; i++) {
    final GLMParameters glm = new GLMParameters();
    result[i] = glm;
    setParamField(parms, glm, false, anovaFields, anovaOnlyFields);
    setParamField(parms, glm, true, baseFields, Collections.emptyList());
    glm._train = trainingFrames[i]._key;
    glm._family = parms._family;
  }
  return result;
}
/**
 * Derives the model sum of squares from the residual deviance of each GLM model's training metrics.
 * For every model except the last, the result is its residual deviance minus the residual deviance of
 * the last (full) model; the last entry holds the full model's residual deviance itself. Refer to
 * AnovaGLMTutorial https://h2oai.atlassian.net/browse/PUBDEV-8088 section V.
 *
 * @param glmModels GLM models, with the full model last
 * @param family GLM family; binomial, quasibinomial and fractionalbinomial read the metrics as
 *               ModelMetricsBinomialGLM, every other family as ModelMetricsRegressionGLM
 * @return array with one entry per model as described above
 */
public static double[] generateGLMSS(GLMModel[] glmModels, GLMParameters.Family family) {
  final int count = glmModels.length;
  final int fullIdx = count - 1;
  final double[] modelSS = new double[count];
  final double[] deviance = new double[count];
  final boolean binomialLike =
      binomial.equals(family) || quasibinomial.equals(family) || fractionalbinomial.equals(family);
  for (int i = 0; i < count; i++) {
    deviance[i] = binomialLike
        ? ((ModelMetricsBinomialGLM) glmModels[i]._output._training_metrics).residual_deviance()
        : ((ModelMetricsRegressionGLM) glmModels[i]._output._training_metrics).residual_deviance();
  }
  final double fullDeviance = deviance[fullIdx];
  for (int i = 0; i < fullIdx; i++) {
    modelSS[i] = deviance[i] - fullDeviance;
  }
  modelSS[fullIdx] = fullDeviance;
  return modelSS;
}
/**
 * Wraps each GLMParameters object in a new GLM ModelBuilder.
 *
 * @param glmParams parameters, one per model
 * @return GLM builders in the same order as {@code glmParams}
 */
public static GLM[] buildGLMBuilders(GLMParameters[] glmParams) {
  return Arrays.stream(glmParams)
      .map(params -> new GLM(params))
      .toArray(GLM[]::new);
}
/**
 * Fills in the transformed column names, the degrees of freedom and the starting column of every
 * predictor combo.
 *
 * A combo with a single predictor keeps its own name when the column has no domain (numeric);
 * otherwise its names come from {@link #transformOneCol}, e.g. enum "ABC" with levels "0", "1", "2"
 * yields ABC_0, ABC_1. Every other combo is an interaction and is named by
 * {@link #transformMultipleCols}, which joins the parts with ":".
 *
 * @param predComboNames predictor combos; each entry lists the predictor names forming the combo
 * @param predictorNames output: transformed column names per predictor combo
 * @param predColumnStart output: first column index of each predictor combo in the transformed frame
 * @param degreeOfFreedom output: number of transformed columns of each predictor combo
 * @param dinfo DataInfo whose {@code _adaptedFrame} supplies the column domains
 */
public static void generatePredictorNames(String[][] predComboNames, String[][] predictorNames, int[] predColumnStart,
                                          int[] degreeOfFreedom, DataInfo dinfo) {
  final int comboCount = predComboNames.length;
  int nextColumn = 0;
  for (int combo = 0; combo < comboCount; combo++) {
    final String[] members = predComboNames[combo];
    final String[] expanded;
    if (members.length != 1) { // interaction columns
      expanded = transformMultipleCols(dinfo._adaptedFrame, predComboNames, combo, predictorNames);
    } else {
      final String name = members[0];
      final boolean numeric = dinfo._adaptedFrame.vec(name).domain() == null;
      expanded = numeric ? new String[]{name} : transformOneCol(dinfo._adaptedFrame, name);
    }
    predictorNames[combo] = expanded;
    nextColumn = updateDOFColInfo(combo, expanded, degreeOfFreedom, predColumnStart, nextColumn);
  }
}
/**
 * Records, for predictor combo {@code predInd}, its number of transformed columns and its starting
 * column.
 *
 * @param predInd index of the predictor combo
 * @param predComboNames transformed column names of that combo
 * @param dof output: receives the number of names at {@code predInd}
 * @param predCS output: receives {@code offset} at {@code predInd}
 * @param offset starting column of this combo
 * @return starting column for the next combo ({@code dof[predInd] + offset})
 */
public static int updateDOFColInfo(int predInd, String[] predComboNames, int[] dof, int[] predCS, int offset) {
  final int columnCount = predComboNames.length;
  dof[predInd] = columnCount;
  predCS[predInd] = offset;
  return offset + dof[predInd];
}
/**
 * Looks for an earlier predictor combo equal to a trailing part of the combo at {@code currIndex}.
 * Trailing parts are tried from longest (everything after the first name) to shortest (empty), and
 * for each one the earlier combos are scanned from {@code currIndex-1} down to 0.
 *
 * @param predComboNames all predictor combos
 * @param currIndex index of the combo to match
 * @return index of the first matching earlier combo, or -1 if there is none
 */
public static int findComboMatch(String[][] predComboNames, int currIndex) {
  final String[] current = predComboNames[currIndex];
  final int length = current.length;
  for (int from = 1; from <= length; from++) {
    final String[] suffix = Arrays.copyOfRange(current, from, length);
    int candidate = currIndex - 1;
    while (candidate >= 0) {
      if (Arrays.equals(suffix, predComboNames[candidate])) {
        return candidate;
      }
      candidate--;
    }
  }
  return -1;
}
/**
 * Flattens every predictor combo into one string by joining its predictor names with ":".
 *
 * @param predictComboNames predictor combos; every combo must contain at least one name
 * @return one joined name per combo, in input order
 */
public static String[] combineAndFlat(String[][] predictComboNames) {
  final String[] flattened = new String[predictComboNames.length];
  for (int i = 0; i < flattened.length; i++) {
    final String[] combo = predictComboNames[i];
    final String first = combo[0];
    flattened[i] = combo.length > 1 ? String.join(":", combo) : first;
  }
  return flattened;
}
/**
 * Generates the transformed column names of an interaction combo. The names already generated for
 * the earlier combo found by {@link #findComboMatch} are combined with the first predictor of the
 * current combo through {@link #transformTwoCols}.
 *
 * @param vec2Transform frame containing the predictors
 * @param predComboNames all predictor combos
 * @param currIndex index of the interaction combo to name
 * @param predNames transformed names generated so far, indexed like {@code predComboNames}
 * @return transformed column names of the combo at {@code currIndex}
 */
public static String[] transformMultipleCols(Frame vec2Transform, String[][] predComboNames, int currIndex,
                                             String[][] predNames) {
  final String[] combo = predComboNames[currIndex];
  // NOTE(review): a -1 result from findComboMatch is not checked before indexing predNames.
  final String[] earlierNames = predNames[findComboMatch(predComboNames, currIndex)];
  return transformTwoCols(vec2Transform, new String[]{combo[0], combo[1]}, earlierNames);
}
/**
 * Generates the transformed column names of two interacting columns. Refer to AnovaGLMTutorial
 * https://h2oai.atlassian.net/browse/PUBDEV-8088 sections III.II. and IV.
 *
 * The first column contributes its own name when it has no domain, otherwise "name_level" for every
 * level except the last. The second part is either taken verbatim from {@code lastComboNames} or,
 * when that is null, derived from the second column in the same way as the first. Parts are joined
 * with ":", with the first column varying slowest.
 *
 * @param vec2Transform frame containing the two predictors to transform
 * @param vecNames names of the two predictors; both entries must be present
 * @param lastComboNames already-transformed names to use as the second part, or null to derive the
 *                       second part from column {@code vecNames[1]}. This is how interactions of more
 *                       than two predictors are built up.
 * @return transformed column names
 */
public static String[] transformTwoCols(Frame vec2Transform, String[] vecNames, String[] lastComboNames) {
  final String[] domains1 = vec2Transform.vec(vecNames[0]).domain();
  final boolean deriveSecond = lastComboNames == null;
  final String[] domains2 = deriveSecond ? vec2Transform.vec(vecNames[1]).domain() : lastComboNames;
  final String colName1 = vecNames[0];
  final String colName2 = vecNames[1];
  final int degOfFreedomC1 = domains1 == null ? 1 : (domains1.length - 1);
  final int degOfFreedomC2;
  if (deriveSecond) {
    // A numeric second column has no domain and contributes exactly one column, like column 1 does.
    degOfFreedomC2 = domains2 == null ? 1 : (domains2.length - 1);
  } else {
    degOfFreedomC2 = domains2.length;
  }
  final String[] newColNames = new String[degOfFreedomC1 * degOfFreedomC2];
  int colIndex = 0;
  for (int col1 = 0; col1 < degOfFreedomC1; col1++) {
    final String part1 = domains1 == null ? colName1 : colName1 + "_" + domains1[col1];
    for (int col2 = 0; col2 < degOfFreedomC2; col2++) {
      final String part2;
      if (!deriveSecond) {
        part2 = domains2[col2];
      } else if (domains2 == null) {
        part2 = colName2;
      } else {
        part2 = colName2 + "_" + domains2[col2];
      }
      newColNames[colIndex++] = part1 + ":" + part2;
    }
  }
  return newColNames;
}
/**
 * Generates the transformed column names of one categorical predictor as described in
 * AnovaGLMTutorial https://h2oai.atlassian.net/browse/PUBDEV-8088 section III.II: one name
 * "vecName_level" for every level of the column's domain except the last.
 *
 * @param vec2Transform frame containing the predictor; the column must have a domain
 * @param vecName name of the predictor
 * @return transformed predictor column names
 */
public static String[] transformOneCol(Frame vec2Transform, String vecName) {
  final String[] levels = vec2Transform.vec(vecName).domain();
  final String[] names = new String[levels.length - 1];
  Arrays.setAll(names, level -> vecName + "_" + levels[level]);
  return names;
}
/**
 * Produces a descriptive name for every GLM model: one per predictor combo, stating which predictor
 * or interaction (names joined with ":") was left out, followed by a final entry for the model built
 * with all predictors.
 *
 * @param predictComboNames predictor combos; each entry lists the predictor names forming the combo
 * @return model names, of length {@code predictComboNames.length + 1}
 */
public static String[] generateModelNames(String[][] predictComboNames) {
  final int comboCount = predictComboNames.length;
  final String[] names = new String[comboCount + 1];
  for (int i = 0; i < comboCount; i++) {
    final String[] combo = predictComboNames[i];
    names[i] = combo.length == 1
        ? "GLM model built without predictor " + combo[0]
        : "GLM model built without predictors interactions " + String.join(":", combo);
  }
  names[comboCount] = "GLM model built with all predictors";
  return names;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy