All downloads are free. Search and download functionality uses the official Maven repository.

hex.schemas.ANOVAGLMV3 Maven / Gradle / Ivy

There is a newer version: 3.46.0.6
Show newest version
package hex.schemas;

import hex.anovaglm.ANOVAGLM;
import hex.anovaglm.ANOVAGLMModel;
import hex.glm.GLMModel;
import water.api.API;
import water.api.schemas3.KeyV3;
import water.api.schemas3.ModelParametersSchemaV3;

/**
 * Schema class for the ANOVA GLM model builder.
 *
 * <p>The class body contains nothing but the nested {@link ANOVAGLMParametersV3} parameter schema.
 *
 * <p>NOTE(review): both superclasses are referenced here without type arguments, while this file
 * imports {@code hex.anovaglm.ANOVAGLM} and {@code hex.anovaglm.ANOVAGLMModel} without using them.
 * The type arguments may have been lost somewhere along the way; confirm against the declarations
 * of {@code ModelBuilderSchema} and {@code ModelParametersSchemaV3}.
 */
public class ANOVAGLMV3 extends ModelBuilderSchema {

  /**
   * Parameters accepted by the ANOVA GLM builder, declared as public fields carrying {@code @API}
   * metadata (help text, allowed values, level, direction, gridability).
   */
  public static final class ANOVAGLMParametersV3 extends ModelParametersSchemaV3 {

    /**
     * Parameter names for this schema. Every field declared in this class is listed; the remaining
     * names (e.g. {@code model_id}, {@code training_frame}) are not declared here and are
     * presumably inherited from the parent schema; verify against {@code ModelParametersSchemaV3}.
     * Element order is kept exactly as originally written.
     */
    public static final String[] fields = {
        "model_id", "training_frame", "seed", "response_column",
        "ignored_columns", "ignore_const_cols", "score_each_iteration",
        "offset_column", "weights_column",
        "family", "tweedie_variance_power", "tweedie_link_power",
        "theta", // equal to 1/r; should be > 0 and <= 1; used by negative binomial
        "solver", "missing_values_handling", "plug_values", "compute_p_values",
        "standardize", "non_negative", "max_iterations", "link", "prior",
        "alpha", "lambda", "lambda_search",
        "stopping_rounds", "stopping_metric", "early_stopping", "stopping_tolerance",
        "balance_classes", "class_sampling_factors", "max_after_balance_size",
        "max_runtime_secs", "save_transformed_framekeys", "highest_interaction_term",
        "nparallelism",
        "type" // GLM SS type; only 3 is supported right now
    };

    /** Seed for the pseudo random number generator. */
    @API(
        help = "Seed for pseudo random number generator (if applicable)",
        gridable = true)
    public long seed;

    /** Whether numeric columns are standardized to zero mean and unit variance. */
    @API(
        help = "Standardize numeric columns to have zero mean and unit variance",
        level = API.Level.critical)
    public boolean standardize;

    // ---- Input fields ----

    /** GLM family; the accepted names are enumerated in the annotation's {@code values}. */
    @API(
        help = "Family. Use binomial for classification with logistic regression, others are for regression problems.",
        values = {
            "AUTO", "gaussian", "binomial", "fractionalbinomial", "quasibinomial",
            "poisson", "gamma", "tweedie", "negativebinomial"
        },
        level = API.Level.critical)
    public GLMModel.GLMParameters.Family family;

    /** Tweedie variance power. */
    @API(
        help = "Tweedie variance power",
        level = API.Level.critical,
        gridable = true)
    public double tweedie_variance_power;

    /** Tweedie link power. */
    @API(
        help = "Tweedie link power",
        level = API.Level.critical,
        gridable = true)
    public double tweedie_link_power;

    /** Theta; used by the negative binomial distribution family. */
    @API(
        help = "Theta",
        level = API.Level.critical,
        gridable = true)
    public double theta;

    /** Mixing between the L1 (Lasso) and L2 (Ridge) penalties. */
    @API(
        help = "Distribution of regularization between the L1 (Lasso) and L2 (Ridge) penalties. "
            + "A value of 1 for alpha represents Lasso regression, a value of 0 produces Ridge regression, "
            + "and anything in between specifies the amount of mixing between the two. "
            + "Default value of alpha is 0 when SOLVER = 'L-BFGS'; 0.5 otherwise.",
        level = API.Level.critical,
        gridable = true)
    public double[] alpha;

    /** Regularization strength. */
    @API(
        help = "Regularization strength",
        required = false,
        level = API.Level.critical,
        gridable = true)
    public double[] lambda;

    /** Enables lambda search; a supplied lambda is then treated as the minimum lambda. */
    @API(
        help = "Use lambda search starting at lambda max, given lambda is then interpreted as lambda min",
        level = API.Level.critical)
    public boolean lambda_search;

    /** Solver selection; the accepted names are enumerated in the annotation's {@code values}. */
    // The help text below is reproduced verbatim, including the duplicated word "on".
    @API(
        help = "AUTO will set the solver based on given data and the other parameters. "
            + "IRLSM is fast on on problems with small number of predictors and for lambda-search with L1 penalty, "
            + "L_BFGS scales better for datasets with many columns.",
        values = {
            "AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE",
            "COORDINATE_DESCENT", "GRADIENT_DESCENT_LH", "GRADIENT_DESCENT_SQERR"
        },
        level = API.Level.critical)
    public GLMModel.GLMParameters.Solver solver;

    /** Missing-value strategy: MeanImputation, Skip or PlugValues. */
    @API(
        help = "Handling of missing values. Either MeanImputation, Skip or PlugValues.",
        values = {"MeanImputation", "Skip", "PlugValues"},
        level = API.Level.expert,
        direction = API.Direction.INOUT,
        gridable = true)
    public GLMModel.GLMParameters.MissingValuesHandling missing_values_handling;

    /** Key of a single-row frame with imputation values, paired with PlugValues handling. */
    @API(
        help = "Plug Values (a single row frame containing values that will be used to impute missing values "
            + "of the training/validation frame, use with conjunction missing_values_handling = PlugValues)",
        direction = API.Direction.INPUT)
    public KeyV3.FrameKeyV3 plug_values;

    /** Restricts coefficients (not the intercept) to be non-negative. */
    @API(help = "Restrict coefficients (not intercept) to be non-negative")
    public boolean non_negative;

    /** Requests p-value computation. */
    @API(
        help = "Request p-values computation, p-values work only with IRLSM solver and no regularization",
        level = API.Level.secondary,
        direction = API.Direction.INPUT)
    public boolean compute_p_values; // _remove_collinear_columns

    /** Maximum number of iterations. */
    @API(
        help = "Maximum number of iterations",
        level = API.Level.secondary)
    public int max_iterations;

    /** Link function; the accepted names are enumerated in the annotation's {@code values}. */
    // "oprobit" and "ologlog" are not listed yet: will be supported.
    @API(
        help = "Link function.",
        level = API.Level.secondary,
        values = {"family_default", "identity", "logit", "log", "inverse", "tweedie", "ologit"})
    public GLMModel.GLMParameters.Link link;

    /** Prior probability for y==1. */
    @API(
        help = "Prior probability for y==1. "
            + "To be used only for logistic regression iff the data has been sampled "
            + "and the mean of response does not reflect reality.",
        level = API.Level.expert)
    public double prior;

    // ---- Dead, unused args, formerly inherited from the supervised model schema ----

    /**
     * For imbalanced data, balance training data class counts via
     * over/under-sampling. This can result in improved predictive accuracy.
     */
    @API(
        help = "Balance training data class counts via over/under-sampling (for imbalanced data).",
        level = API.Level.secondary,
        direction = API.Direction.INOUT)
    public boolean balance_classes;

    /**
     * Desired over/under-sampling ratios per class (lexicographic order).
     * Only when balance_classes is enabled.
     * If not specified, they will be automatically computed to obtain class balance during training.
     */
    @API(
        help = "Desired over/under-sampling ratios per class (in lexicographic order). "
            + "If not specified, sampling factors will be automatically computed to obtain class balance "
            + "during training. Requires balance_classes.",
        level = API.Level.expert,
        direction = API.Direction.INOUT)
    public float[] class_sampling_factors;

    /**
     * When classes are balanced, limit the resulting dataset size to the
     * specified multiple of the original dataset size.
     */
    @API(
        help = "Maximum relative size of the training data after balancing class counts "
            + "(can be less than 1.0). Requires balance_classes.",
        /* dmin=1e-3, */
        level = API.Level.expert,
        direction = API.Direction.INOUT)
    public float max_after_balance_size;

    /** Highest order of interaction terms (2 = pairs of columns, 3 = triples, and so on). */
    @API(
        help = "Limit the number of interaction terms, if 2 means interaction between 2 columns only, "
            + "3 for three columns and so on...  Default to 2.",
        level = API.Level.critical)
    public int highest_interaction_term;

    /** GLM SS type; only 3 is supported. */
    @API(
        help = "Refer to the SS type 1, 2, 3, or 4.  We are currently only supporting 3",
        level = API.Level.critical)
    public int type;

    /** Stops early when there is no more relative improvement on train or validation. */
    @API(help = "Stop early when there is no more relative improvement on train or validation (if provided).")
    public boolean early_stopping;

    /** When true, the keys of the transformed predictors and interaction column are saved. */
    @API(help = "true to save the keys of transformed predictors and interaction column.")
    public boolean save_transformed_framekeys;

    /** Number of models to build in parallel. */
    @API(help = "Number of models to build in parallel.  Default to 4.  Adjust according to your system.")
    public int nparallelism;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy