package hex.glm;
import hex.DataInfo;
import hex.glm.GLM.BetaConstraint;
import hex.glm.GLM.GLMGradientInfo;
import hex.glm.GLM.GLMGradientSolver;
import hex.glm.GLMModel.GLMParameters;
import hex.glm.GLMModel.GLMParameters.Family;
import hex.gram.Gram;
import hex.optimization.ADMM;
import hex.optimization.OptimizationUtils.GradientInfo;
import hex.optimization.OptimizationUtils.GradientSolver;
import water.Job;
import water.MemoryManager;
import water.fvec.Frame;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.MathUtils;
import java.util.Arrays;
import java.util.Comparator;
import java.util.stream.IntStream;
import static hex.glm.GLMModel.GLMParameters.Family.gaussian;
import static hex.glm.GLMUtils.calSmoothNess;
import static hex.glm.GLMUtils.copyGInfo;
public final class ComputationState {
final boolean _intercept;
final int _nbetas;
private final GLMParameters _parms;
private BetaConstraint _bc;
double _alpha;
double[] _ymu;
double [] _u;
private double [] _zValues;
private boolean _dispersionEstimated;
boolean _allIn;
int _iter;
int _iterHGLM_GLMMME; // keep track of iterations used in estimating fixed/random coefficients
private double _lambda = 0;
private double _lambdaMax = Double.NaN;
private GLMGradientInfo _ginfo; // gradient info excluding l1 penalty
private double _likelihood;
private double _gradientErr;
private boolean _lambdaNull; // true if lambda was not provided by user
private double _gMax; // store max value of original gradient without dividing by Math.max(1e-2, _parms._alpha[0])
private DataInfo _activeData;
private BetaConstraint _activeBC = null;
private final GLM.BetaInfo _modelBetaInfo;
private double[] _beta; // vector of coefficients corresponding to active data
private double[] _ubeta; // HGLM, store coefficients of random effects;
private double[] _psi; // HGLM, psi
private double[] _phi; // HGLM, size random columns levels
private double _tau; // HGLM for ei
private double _correction_HL; // HGLM
double[] _sumEtaSquareConvergence; // HGLM: store sumEtaSquare, convergence
double[] _likelihoodInfo; // HGLM: stores 4 elements: hlik, pvh, pbvh, cAIC
public String[] _randCoeffNames; // store random coefficient names
private Frame _priorw_wpsi; // weight calculated for psi
final DataInfo _dinfo;
private GLMGradientSolver _gslvr;
private final Job _job;
private int _activeClass = -1;
double[][][] _penaltyMatrix;
int[][] _gamBetaIndices;
int _totalBetaLength; // total coefficient length, not restricted to active columns only
int _betaLengthPerClass;
public ComputationState(Job job, GLMParameters parms, DataInfo dinfo, BetaConstraint bc, GLM.BetaInfo bi){
_job = job;
_parms = parms;
_bc = bc;
_activeBC = _bc;
_dinfo = dinfo;
_activeData = _dinfo;
_intercept = _parms._intercept;
_alpha = _parms._alpha[0];
_nbetas = bi._nBetas;
_betaLengthPerClass = dinfo.fullN()+1;
_totalBetaLength = _betaLengthPerClass * _nbetas;
if (_parms._HGLM) {
_sumEtaSquareConvergence = new double[2];
if (_parms._calc_like)
_likelihoodInfo = new double[4];
}
_modelBetaInfo = bi;
}
public ComputationState(Job job, GLMParameters parms, DataInfo dinfo, BetaConstraint bc, GLM.BetaInfo bi,
double[][][] penaltyMat, int[][] gamColInd){
this (job, parms, dinfo, bc, bi);
_penaltyMatrix = penaltyMat;
_gamBetaIndices = gamColInd;
_lambdaNull = (_parms._lambda==null) && !(_parms._lambda_search);
}
// copy over parameters from _model to _state for checkpointing
// gist of this method is to restore the _state to be the same as before
void copyCheckModel2State(GLMModel model, int[][] _gamColIndices) {
GLMModel.GLMOutput modelOutput = model._output;
int submodelInd;
int coefLen = _nbetas > 2 ? (_dinfo.fullN() + 1) * _nbetas : (_dinfo.fullN() + 1);
if (modelOutput._submodels.length > 1) // lambda search or multiple alpha/lambda cases
submodelInd = modelOutput._submodels.length - 1; // submodel where the model building ends
else // no lambda search or multiple alpha/lambda case
submodelInd = 0;
setIter(modelOutput._submodels[submodelInd].iteration);
setAlpha(modelOutput._submodels[submodelInd].alpha_value);
if (submodelInd > 0) {
int preCurrSubmodelInd = gaussian.equals(_parms._family) ? submodelInd : (submodelInd - 1);
_activeData._activeCols = modelOutput._submodels[preCurrSubmodelInd].idxs;
double[] betaExpand = Family.multinomial.equals(_parms._family)
? ArrayUtils.expandAndScatter(modelOutput._submodels[preCurrSubmodelInd].beta, coefLen, _activeData._activeCols)
: expandBeta(modelOutput._submodels[preCurrSubmodelInd].beta);
GLMGradientInfo ginfo = new GLMGradientSolver(_job, _parms, _dinfo, 0, activeBC(), _modelBetaInfo, _penaltyMatrix,
_gamColIndices).getGradient(betaExpand); // gradient obtained with zero penalty
_activeData._activeCols = null;
updateState(betaExpand, ginfo);
setLambdaSimple(_parms._lambda[preCurrSubmodelInd]);
}
// this part must be done for single model before setting coefficients
if (!gaussian.equals(_parms._family)) // will build for new lambda for gaussian
setLambda(modelOutput._submodels[submodelInd].lambda_value);
// update _state with last submodelInd coefficients
double[] expandedBeta = modelOutput._submodels[submodelInd].idxs == null
? modelOutput._submodels[submodelInd].beta
: ArrayUtils.expandAndScatter(modelOutput._submodels[submodelInd].beta, coefLen,
modelOutput._submodels[submodelInd].idxs);
GLMGradientInfo ginfo = new GLMGradientSolver(_job, _parms, _dinfo, 0, activeBC(), _modelBetaInfo,
_penaltyMatrix, _gamColIndices).getGradient(expandedBeta); // gradient obtained with zero penalty
updateState(expandedBeta, ginfo);
// make sure model._betaCndCheckpoint is of the right size
if (model._betaCndCheckpoint != null) {
if (_activeData._activeCols == null || (_activeData._activeCols.length != model._betaCndCheckpoint.length)) {
double[] betaCndCheckpoint = ArrayUtils.expandAndScatter(model._betaCndCheckpoint, coefLen,
modelOutput._submodels[submodelInd].idxs); // expand betaCndCheckpoint out
if (_activeData._activeCols != null) // contract the betaCndCheckpoint to the right activeCol length
betaCndCheckpoint = extractSubRange(betaCndCheckpoint.length, 0, activeData()._activeCols, betaCndCheckpoint);
model._betaCndCheckpoint = betaCndCheckpoint;
}
}
}
public void set_sumEtaSquareConvergence(double[] sumInfo) {
_sumEtaSquareConvergence = sumInfo;
}
/***
 * Copy GLM coefficients stored in beta to _beta of the ComputationState.
 * @param beta source array holding the coefficients to copy
 * @param startIdx first index of beta to copy from
 * @param len number of coefficients to copy from beta
 * @param interceptFirst true if the intercept term is stored at the first index of beta
 */
public void set_beta_HGLM(double[] beta, int startIdx, int len, boolean interceptFirst) {
if (_beta==null)
_beta = new double[len];
if (interceptFirst) {
int lastIndex = len-1;
System.arraycopy(beta, startIdx+1, _beta, 0, lastIndex);
_beta[lastIndex] = beta[startIdx];
} else {
System.arraycopy(beta, startIdx, _beta, 0, len);
}
}
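// Illustrative example (added note, not in the original source): with beta = {b0, b1, b2, b3},
// startIdx = 0, len = 3 and interceptFirst = true, the arraycopy above moves the intercept to
// the end: _beta = {b1, b2, b0}. With interceptFirst = false the copy is positional:
// _beta = {b0, b1, b2}.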
public void set_likelihoodInfo(double hlik, double pvh, double pbvh, double cAIC) {
_likelihoodInfo[0] = hlik;
_likelihoodInfo[1] = pvh;
_likelihoodInfo[2] = pbvh;
_likelihoodInfo[3] = cAIC;
}
public void set_ubeta_HGLM(double[] ubeta, int startIdx, int len) {
if (_ubeta==null)
_ubeta = new double[len];
System.arraycopy(ubeta, startIdx, _ubeta, 0, len);
}
public void setZValues(double[] zValues, boolean dispersionEstimated) {
_zValues = zValues;
_dispersionEstimated = dispersionEstimated;
}
public double[] get_psi() {
return _psi;
}
public double get_correction_HL() {
return _correction_HL;
}
public double[] get_phi() {
return _phi;
}
public Frame get_priorw_wpsi() {
return _priorw_wpsi;
}
public double get_tau() {
return _tau;
}
public boolean getLambdaNull() { return _lambdaNull; }
public void set_tau(double tau) {
_tau=tau;
}
public void set_psi(double[] psi) {
assert _psi.length==psi.length:"Length of _psi and psi should be the same.";
System.arraycopy(psi, 0, _psi, 0, psi.length);
}
public void set_phi(double[] phi) {
assert _phi.length==phi.length:"Length of _phi and phi should be the same.";
System.arraycopy(phi, 0, _phi, 0, phi.length);
}
public GLMGradientSolver gslvr(){return _gslvr;}
public double lambda(){return _lambda;}
public double alpha() {return _alpha;}
public double[] zValues() {return _zValues;}
public boolean dispersionEstimated() {return _dispersionEstimated;}
public void setLambdaMax(double lmax) {
_lambdaMax = lmax;
}
public void setgMax(double gmax) {
_gMax = gmax;
}
public void setAlpha(double alpha) {
_alpha=alpha;
setLambdaMax(_gMax/Math.max(1e-2,alpha)); // need to set _lmax every time alpha value changes
}
public void setLambda(double lambda) {
adjustToNewLambda(0, _lambda);
// strong rules are to be applied on the gradient with no l2 penalty
// NOTE: we start with lambdaOld being 0, not lambda_max
// non-recursive strong rules should use lambdaMax instead of _lambda
// However, it seems to be working nicely to use 0 instead and be more aggressive on the predictor pruning
// (should be safe as we check the KKTs anyway)
applyStrongRules(lambda, _lambda);
_lambda = lambda;
if (_penaltyMatrix == null)
_gslvr = new GLMGradientSolver(_job, _parms, _activeData, l2pen(), _activeBC, _modelBetaInfo);
else
_gslvr = new GLMGradientSolver(_job, _parms, _activeData, l2pen(), _activeBC, _modelBetaInfo, _penaltyMatrix, _gamBetaIndices);
adjustToNewLambda(lambda, 0);
}
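// Note on the sequence in setLambda (my reading of the code): adjustToNewLambda(0, _lambda)
// first removes the old lambda's l2 contribution from the cached gradient/objective, the
// strong rules are then applied on that penalty-free gradient, and adjustToNewLambda(lambda, 0)
// re-applies the contribution for the new lambda.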
public double [] beta(){
if(_activeClass != -1)
return betaMultinomial(_activeClass,_beta);
return _beta;
}
public double[] ubeta(){
return _ubeta; // could be null. Be careful
}
public GLMGradientInfo ginfo(){return _ginfo == null?(_ginfo = gslvr().getGradient(beta())):_ginfo;}
public BetaConstraint activeBC(){return _activeBC;}
public double likelihood() {return _likelihood;}
public boolean ginfoNull() {return _ginfo==null;}
public DataInfo activeData(){
if(_activeClass != -1)
return activeDataMultinomial(_activeClass);
return _activeData;
}
public DataInfo activeDataMultinomial(){return _activeData;}
public void dropActiveData(){_activeData = null;}
public String toString() {
return "iter=" + _iter + " lmb=" + GLM.lambdaFormatter.format(_lambda) + " alpha=" +
GLM.lambdaFormatter.format(_alpha)+ " obj=" + MathUtils.roundToNDigits(objective(),4) + " imp=" +
GLM.lambdaFormatter.format(_relImprovement) + " bdf=" + GLM.lambdaFormatter.format(_betaDiff);
}
private void adjustToNewLambda(double lambdaNew, double lambdaOld) {
double ldiff = lambdaNew - lambdaOld;
if(ldiff == 0 || l2pen() == 0) return;
double l2pen = .5*ArrayUtils.l2norm2(_beta,true);
if (_parms._family==Family.ordinal)
l2pen = l2pen/ _nbetas; // need only one set of parameters
if(l2pen > 0) {
if (_ginfo == null) _ginfo = ginfo();
if(_parms._family == Family.multinomial || _parms._family == Family.ordinal) {
l2pen = 0;
int off = 0;
for(int c = 0; c < _nbetas; ++c) {
DataInfo activeData = activeDataMultinomial(c);
for (int i = 0; i < activeData.fullN(); ++i) {
double b = _beta[off + i];
_ginfo._gradient[off + i] += ldiff * b;
l2pen += b*b;
}
if (_parms._family == Family.ordinal) // one beta for all classes
break;
off += activeData.fullN()+1;
}
l2pen *= .5;
} else for(int i = 0; i < _activeData.fullN(); ++i)
_ginfo._gradient[i] += ldiff*_beta[i];
}
_ginfo = new GLMGradientInfo(_ginfo._likelihood, _ginfo._objVal + ldiff * l2pen, _ginfo._gradient);
}
public double l1pen() {return _alpha*_lambda;}
public double l2pen() {return (1-_alpha)*_lambda;}
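// For reference (summary, not from the original source): the elastic-net penalty used here is
//     penalty(beta) = lambda * ( alpha * ||beta||_1 + 0.5 * (1 - alpha) * ||beta||_2^2 ),
// so l1pen() and l2pen() are the two mixing weights; penalty(double[]) below applies them as
// l1pen()*l1norm + .5*l2pen()*l2norm, with intercepts excluded from both norms.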
/**
 * Apply strong rules to filter out expected inactive (with zero coefficient) predictors;
 * updates the active data, beta, and gradient state in place.
 */
protected void applyStrongRules(double lambdaNew, double lambdaOld) {
lambdaNew = Math.min(_lambdaMax,lambdaNew);
lambdaOld = Math.min(_lambdaMax,lambdaOld);
if (_parms._family == Family.multinomial || _parms._family == Family.ordinal/* && _parms._solver != GLMParameters.Solver.L_BFGS */) {
applyStrongRulesMultinomial(lambdaNew, lambdaOld);
return;
}
int P = _dinfo.fullN();
_activeBC = _bc;
_activeData = _activeData != null?_activeData:_dinfo;
_allIn = _allIn || _alpha*lambdaNew == 0 || _activeBC.hasBounds();
if (!_allIn) {
int newlySelected = 0;
final double rhs = Math.max(0,_alpha * (2 * lambdaNew - lambdaOld));
int [] newCols = MemoryManager.malloc4(P);
int j = 0;
int[] oldActiveCols = _activeData._activeCols == null ? new int[]{P} : _activeData.activeCols();
for (int i = 0; i < P; ++i) {
if(j < oldActiveCols.length && oldActiveCols[j] == i)
j++;
else if (_ginfo._gradient[i] > rhs || -_ginfo._gradient[i] > rhs)
newCols[newlySelected++] = i; // choose active columns here
}
if(_parms._max_active_predictors != -1 && (oldActiveCols.length + newlySelected -1) > _parms._max_active_predictors){
Integer [] bigInts = ArrayUtils.toIntegers(newCols, 0, newlySelected);
Arrays.sort(bigInts, new Comparator<Integer>() {
@Override
public int compare(Integer o1, Integer o2) {
return (int)Math.signum(_ginfo._gradient[o2.intValue()]*_ginfo._gradient[o2.intValue()] - _ginfo._gradient[o1.intValue()]*_ginfo._gradient[o1.intValue()]);
}
});
newCols = ArrayUtils.toInt(bigInts,0,_parms._max_active_predictors - oldActiveCols.length + 1);
Arrays.sort(newCols);
} else newCols = Arrays.copyOf(newCols,newlySelected);
newCols = ArrayUtils.sortedMerge(oldActiveCols,newCols);
// merge already active columns in
int active = newCols.length;
_allIn = active == P;
if(!_allIn) {
int [] cols = newCols;
assert cols[active-1] == P; // the intercept column is always selected, even when _intercept is false (it is dropped later; it is also needed elsewhere)
_beta = ArrayUtils.select(_beta, cols);
if(_u != null) _u = ArrayUtils.select(_u,cols);
_activeData = _dinfo.filterExpandedColumns(cols);
assert _activeData.activeCols().length == _beta.length;
assert _u == null || _activeData.activeCols().length == _u.length;
_ginfo = new GLMGradientInfo(_ginfo._likelihood, _ginfo._objVal, ArrayUtils.select(_ginfo._gradient, cols));
_activeBC = _bc.filterExpandedColumns(_activeData.activeCols());
_gslvr = _penaltyMatrix == null ? new GLMGradientSolver(_job,_parms,_activeData,(1-_alpha)*_lambda,_bc,_modelBetaInfo)
: new GLMGradientSolver(_job, _parms, _dinfo, (1 - _alpha) * _lambda, _bc, _modelBetaInfo, _penaltyMatrix,
_gamBetaIndices);
assert _beta.length == cols.length;
return;
}
}
_activeData = _dinfo;
}
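// Background note (my summary): the filter above is the sequential strong rule of
// Tibshirani et al. (2012) -- predictor i stays a candidate when
//     |gradient[i]| > alpha * (2*lambdaNew - lambdaOld),
// everything else is presumed to have a zero coefficient at lambdaNew. The rule can be wrong,
// which is why checkKKTs() later re-adds any column whose KKT condition fails.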
public boolean _lsNeeded = false;
public DataInfo [] _activeDataMultinomial;
public DataInfo activeDataMultinomial(int c) {return _activeDataMultinomial != null?_activeDataMultinomial[c]:_dinfo;}
/**
 * Return a double array extracted from src (which includes active and non-active columns)
 * that contains only the active columns listed in ids.
 *
 * @param N number of coefficients per class (including the intercept)
 * @param c class index whose block is extracted
 * @param ids active column indices within the class block; null means all columns are active
 * @param src source array to extract from
 * @return the extracted coefficients of class c
 */
public static double [] extractSubRange(int N, int c, int [] ids, double [] src) {
if(ids == null) return Arrays.copyOfRange(src,c*N,c*N+N);
double [] res = MemoryManager.malloc8d(ids.length);
int j = 0;
int off = c*N;
for(int i:ids)
res[j++] = src[off+i];
return res;
}
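// Illustrative example: with N = 4, c = 1, ids = {0, 2} and src = {a0,a1,a2,a3, b0,b1,b2,b3},
// extractSubRange returns {b0, b2}, the active entries of class 1's block; with ids == null it
// returns the whole block {b0, b1, b2, b3}.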
/**
 * Inverse of extractSubRange: scatter one class's (active-column) coefficients from src into
 * class c's block of the full array dst.
 *
 * @param N number of coefficients per class (including the intercept)
 * @param c class index whose block is filled
 * @param ids active column indices within the class block; null means all columns are active
 * @param src source array holding class c's coefficients
 * @param dst destination array spanning all classes
 */
static void fillSubRange(int N, int c, int [] ids, double [] src, double [] dst) {
if(ids == null) {
System.arraycopy(src,0,dst,c*N,N);
} else {
int j = 0;
int off = c * N;
for (int i : ids)
dst[off + i] = src[j++];
}
}
public double [] betaMultinomial(){return _beta;}
public double [] betaMultinomial(int c, double [] beta) {
return extractSubRange(_activeData.fullN()+1,c,_activeDataMultinomial[c].activeCols(),beta);
}
public double [] betaMultinomialFull(int c, double [] beta) {
if (_parms._remove_collinear_columns)
return extractSubRange(_betaLengthPerClass,c,_activeDataMultinomial[c].activeCols(),beta);
else
return extractSubRange(_activeData.fullN()+1,c,_activeDataMultinomial[c].activeCols(),beta);
}
public double[] shrinkFullArray(double[] fullArray) {
if (_activeData.activeCols() == null)
return fullArray;
int[] activeColsAllClass = genActiveColsAllClass(_activeData.activeCols().length* _nbetas,
_betaLengthPerClass, _activeData.activeCols(), _nbetas);
return ArrayUtils.select(fullArray, activeColsAllClass);
}
public static double[] expandToFullArray(double[] shortenArr, int[] activeCols, int _totalBetaLength, int nclasses,
int betaLengthPerClass) {
if (activeCols == null)
return shortenArr;
int[] activeColsAllClass = genActiveColsAllClass(activeCols.length*nclasses,
betaLengthPerClass, activeCols, nclasses);
double[] fullArray = new double[_totalBetaLength];
fillSubRange(_totalBetaLength, 0, activeColsAllClass, shortenArr, fullArray);
return fullArray;
}
public static int[] genActiveColsAllClass(int activeColsLen, int numBetaPerClass, int[] activeColsOrig, int nclasses) {
int[] activeCols = new int[activeColsLen];
int offset = 0;
int[] activeColsOneClass = activeColsOrig;
for (int classIndex=0; classIndex < nclasses; classIndex++) {
int finalOffset = numBetaPerClass*classIndex;
int[] activeCols1Class = IntStream.of(activeColsOneClass).map(i->i+finalOffset).toArray();
int num2Copy = activeColsOneClass.length;
System.arraycopy(activeCols1Class, 0, activeCols, offset, num2Copy);
offset += num2Copy;
}
return activeCols;
}
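// Illustrative example: activeColsOrig = {0, 2}, numBetaPerClass = 3, nclasses = 2 and
// activeColsLen = 4 yield {0, 2, 3, 5} -- each class's active indices shifted by
// classIndex * numBetaPerClass and concatenated.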
public int[] genActiveColsIndClass(int activeColsLen, int numBetaPerClass, int[] activeColsOrig, int activeClass,
int nclasses) {
int[] activeCols = new int[activeColsLen];// total length
int offset = 0;
int[] activeColsOneClass = activeColsOrig;
for (int classIndex = 0; classIndex < activeClass; classIndex++) {
int finalOffset = numBetaPerClass*classIndex;
int num2Copy = activeColsOneClass.length;
int[] activeCols1Class = IntStream.of(activeColsOneClass).map(i->i+finalOffset).toArray();
System.arraycopy(activeCols1Class, 0, activeCols, offset, num2Copy);
offset += num2Copy;
}
for (int classInd = activeClass; classInd < nclasses; classInd++) {
int finalOffset = numBetaPerClass*classInd;
int[] activeCols1Class = IntStream.range(0, numBetaPerClass).map(i->i+finalOffset).toArray();
System.arraycopy(activeCols1Class, 0, activeCols, offset, numBetaPerClass);
offset += numBetaPerClass;
}
return activeCols;
}
public GLMSubsetGinfo ginfoMultinomial(int c) {
return new GLMSubsetGinfo(_ginfo,(_activeData.fullN()+1),c,_activeDataMultinomial[c].activeCols());
}
public GLMSubsetGinfo ginfoMultinomialRCC(int c) {
if (_activeData.fullN() + 1 == _activeData.activeCols().length)
return new GLMSubsetGinfo(_ginfo, (_activeData.fullN() + 1), c, IntStream.range(0,
_activeData.activeCols().length).toArray());
else
return new GLMSubsetGinfo(_ginfo, (_activeData.fullN() + 1), c, _activeData.activeCols());
}
public void setBC(BetaConstraint bc) {
_bc = bc;
_activeBC = _bc;
}
public void setActiveClass(int activeClass) {_activeClass = activeClass;}
public double deviance() {
switch (_parms._family) {
case gaussian:
case binomial:
case quasibinomial:
case ordinal:
case multinomial:
case fractionalbinomial:
return 2*likelihood();
case poisson:
case gamma:
case negativebinomial:
case tweedie:
return likelihood();
default:
throw new RuntimeException("unknown family " + _parms._family);
}
}
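// Hedged reading of the switch above: for the first group of families _likelihood stores half
// the deviance (hence the factor of 2), while for poisson/gamma/negativebinomial/tweedie the
// stored likelihood already equals the deviance.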
/***
 * This method will grab a subset of the gradient for each multinomial class. However, if remove_collinear_columns is
 * on, fullInfo will only contain the gradient of the active columns.
 */
public static class GLMSubsetGinfo extends GLMGradientInfo {
public final GLMGradientInfo _fullInfo;
public GLMSubsetGinfo(GLMGradientInfo fullInfo, int N, int c, int [] ids) {
super(fullInfo._likelihood, fullInfo._objVal, extractSubRange(N,c,ids,fullInfo._gradient));
_fullInfo = fullInfo; // fullInfo._gradient may not be full
}
}
public GradientSolver gslvrMultinomial(final int c) {
double[] betaCopy = new double[_totalBetaLength]; // make sure fullbeta is full length
if (_beta.length < _totalBetaLength) {
if (_beta.length == _activeData.activeCols().length* _nbetas) { // all classes converted
int[] activeCols = genActiveColsAllClass(_beta.length, _betaLengthPerClass, _activeData.activeCols(), _nbetas);
fillSubRange(_totalBetaLength, 0, activeCols, _beta, betaCopy);
} else {
int[] activeCols = genActiveColsIndClass(_beta.length, _betaLengthPerClass, _activeData.activeCols(), c, _nbetas);
fillSubRange(_totalBetaLength, 0, activeCols, _beta, betaCopy);
}
} else {
System.arraycopy(_beta, 0, betaCopy, 0, _totalBetaLength);
}
final double [] fullbeta = betaCopy; // make sure fullbeta contains everything
return new GradientSolver() {
// beta is full coeff Per class. Need to return gradient with full columns
@Override
public GradientInfo getGradient(double[] beta) {
// fill fullbeta with new values of beta for class c
fillSubRange(_dinfo.fullN()+1,c,_activeDataMultinomial[c].activeCols(),beta,fullbeta); // fullbeta contains everything
GLMGradientInfo fullGinfo = _gslvr.getGradient(fullbeta); // beta contains all columns
if (fullbeta.length > fullGinfo._gradient.length) { // fullGinfo only contains gradient for active columns here
double[] fullGinfoGradient = expandToFullArray(fullGinfo._gradient, _activeData.activeCols(),
_totalBetaLength, _nbetas, _betaLengthPerClass);
fullGinfo._gradient = fullGinfoGradient; // make sure fullGinfo contains full gradient
}
return new GLMSubsetGinfo(fullGinfo,_betaLengthPerClass,c,_activeData.activeCols());// fullGinfo has full gradient
//return new GLMSubsetGinfo(fullGinfo,_activeData.fullN()+1,c,_activeDataMultinomial[c].activeCols());
}
@Override
public GradientInfo getObjective(double[] beta) {return getGradient(beta);}
};
}
public void setBetaMultinomial(int c, double [] beta, double [] bc) {
if(_u != null) Arrays.fill(_u,0);
if (_parms._remove_collinear_columns)
fillSubRange(_betaLengthPerClass,c,_activeDataMultinomial[c].activeCols(),bc,beta);
else
fillSubRange(_activeData.fullN()+1,c,_activeDataMultinomial[c].activeCols(),bc,beta);
}
/**
 * Apply strong rules to filter out expected inactive (with zero coefficient) predictors.
 *
 * @return number of selected predictor columns (including the per-class intercepts).
 */
protected int applyStrongRulesMultinomial_old(double lambdaNew, double lambdaOld) {
int P = _dinfo.fullN();
int N = P+1;
int selected = 0;
_activeBC = _bc;
_activeData = _dinfo;
if (!_allIn) {
if(_activeDataMultinomial == null)
_activeDataMultinomial = new DataInfo[_nbetas];
final double rhs = _alpha * (2 * lambdaNew - lambdaOld);
int[] oldActiveCols = _activeData._activeCols == null ? new int[0] : _activeData.activeCols();
int [] cols = MemoryManager.malloc4(N* _nbetas);
int j = 0;
for(int c = 0; c < _nbetas; ++c) {
int start = selected;
for (int i = 0; i < P; ++i) {
if (j < oldActiveCols.length && i == oldActiveCols[j]) {
cols[selected++] = i;
++j;
} else if (_ginfo._gradient[c*N+i] > rhs || _ginfo._gradient[c*N+i] < -rhs) {
cols[selected++] = i;
}
}
cols[selected++] = P;// intercept
_activeDataMultinomial[c] = _dinfo.filterExpandedColumns(Arrays.copyOfRange(cols,start,selected));
for(int i = start; i < selected; ++i)
cols[i] += c*N;
}
_allIn = selected == cols.length;
}
return selected;
}
/**
 * Apply strong rules to filter out expected inactive (with zero coefficient) predictors;
 * updates _activeDataMultinomial for each class in place.
 */
protected void applyStrongRulesMultinomial(double lambdaNew, double lambdaOld) {
int P = _dinfo.fullN();
int N = P+1;
int selected = 0;
_activeBC = _bc;
_activeData = _dinfo;
if (!_allIn) {
if(_activeDataMultinomial == null)
_activeDataMultinomial = new DataInfo[_nbetas];
final double rhs = _alpha * (2 * lambdaNew - lambdaOld);
int [] cols = MemoryManager.malloc4(N* _nbetas);
int oldActiveColsTotal = 0;
for(int c = 0; c < _nbetas; ++c) {
int j = 0;
int[] oldActiveCols = _activeDataMultinomial[c] == null ? new int[]{P} : _activeDataMultinomial[c]._activeCols;
oldActiveColsTotal += oldActiveCols.length;
for (int i = 0; i < P; ++i) {
if (j < oldActiveCols.length && i == oldActiveCols[j]) {
++j;
} else { // need access to _ginfo
if (_ginfo == null) _ginfo = ginfo();
if (_ginfo._gradient[c * N + i] > rhs || _ginfo._gradient[c * N + i] < -rhs) {
cols[selected++] = c * N + i;
}
}
}
}
if(_parms._max_active_predictors != -1 && _parms._max_active_predictors - oldActiveColsTotal + _nbetas < selected) {
Integer[] bigInts = ArrayUtils.toIntegers(cols, 0, selected);
Arrays.sort(bigInts, new Comparator<Integer>() {
@Override
public int compare(Integer o1, Integer o2) {
return (int) Math.signum(_ginfo._gradient[o2.intValue()] * _ginfo._gradient[o2.intValue()] - _ginfo._gradient[o1.intValue()] * _ginfo._gradient[o1.intValue()]);
}
});
cols = ArrayUtils.toInt(bigInts, 0, _parms._max_active_predictors - oldActiveColsTotal + _nbetas);
Arrays.sort(cols);
selected = cols.length;
}
int i = 0;
int [] cs = new int[P+1];
int sum = 0;
for(int c = 0; c < _nbetas; ++c){
int [] classcols = cs;
int[] oldActiveCols = _activeDataMultinomial[c] == null ? new int[]{P} : _activeDataMultinomial[c]._activeCols;
int k = 0;
while(i < selected && cols[i] < (c+1)*N)
classcols[k++] = cols[i++]-c*N;
classcols = ArrayUtils.sortedMerge(oldActiveCols,Arrays.copyOf(classcols,k));
sum += classcols.length;
_activeDataMultinomial[c] = _dinfo.filterExpandedColumns(classcols);
}
assert _parms._max_active_predictors == -1 || sum <= _parms._max_active_predictors + _nbetas :"sum = " + sum + " max_active_preds = " + _parms._max_active_predictors + ", nclasses = " + _nbetas;
_allIn = sum == N* _nbetas;
}
}
protected boolean checkKKTsMultinomial(){
return true;
//if(_activeData._activeCols == null) return true;
// throw H2O.unimpl();
}
protected boolean checkKKTs() {
if(_parms._family == Family.multinomial || _parms._family == Family.ordinal) // always return true?
return checkKKTsMultinomial();
double [] beta = _beta;
double [] u = _u;
if(_activeData._activeCols != null) {
beta = ArrayUtils.expandAndScatter(beta, _dinfo.fullN() + 1, _activeData._activeCols);
if(_u != null)
u = ArrayUtils.expandAndScatter(_u, _dinfo.fullN() + 1, _activeData._activeCols);
}
int [] activeCols = _activeData.activeCols();
if(beta != _beta || _ginfo == null) {
_gslvr = _penaltyMatrix == null ? new GLMGradientSolver(_job, _parms, _dinfo, (1 - _alpha) * _lambda, _bc, _modelBetaInfo)
: new GLMGradientSolver(_job, _parms, _dinfo, (1 - _alpha) * _lambda, _bc, _modelBetaInfo, _penaltyMatrix, _gamBetaIndices);
_ginfo = _gslvr.getGradient(beta);
}
double[] grad = _ginfo._gradient.clone();
double err = 1e-4;
if(u != null && u != _u){ // fill in u for missing variables
int k = 0;
for(int i = 0; i < u.length; ++i) {
if(_activeData._activeCols[k] == i){
++k; continue;
}
assert u[i] == 0;
u[i] = -grad[i];
}
}
ADMM.subgrad(_alpha * _lambda, beta, grad);
for (int c : activeCols) // set the error tolerance to the highest error of included columns
if (grad[c] > err) err = grad[c];
else if (grad[c] < -err) err = -grad[c];
_gradientErr = err;
_beta = beta;
_u = u;
_activeBC = null;
if(_parms._max_active_predictors == _activeData.fullN()){
Log.info("skipping KKT check, reached maximum number of active predictors (" + _parms._max_active_predictors + ")");
} else if(!_allIn) {
int[] failedCols = new int[64];
int fcnt = 0;
for (int i = 0; i < grad.length - 1; ++i) {
if (Arrays.binarySearch(activeCols, i) >= 0) continue; // always include all previously active columns
if (grad[i] > err || -grad[i] > err) {
if (fcnt == failedCols.length)
failedCols = Arrays.copyOf(failedCols, failedCols.length << 1);
failedCols[fcnt++] = i;
}
}
if (fcnt > 0) {
Log.info(fcnt + " variables failed KKT conditions, adding them to the model and recomputing.");
final int n = activeCols.length;
int[] newCols = Arrays.copyOf(activeCols, activeCols.length + fcnt);
for (int i = 0; i < fcnt; ++i)
newCols[n + i] = failedCols[i];
Arrays.sort(newCols);
_beta = ArrayUtils.select(beta, newCols);
if(_u != null) _u = ArrayUtils.select(_u,newCols);
_ginfo = new GLMGradientInfo(_ginfo._likelihood, _ginfo._objVal, ArrayUtils.select(_ginfo._gradient, newCols));
_activeData = _dinfo.filterExpandedColumns(newCols);
_activeBC = _bc.filterExpandedColumns(_activeData.activeCols());
_gslvr = _penaltyMatrix == null ? new GLMGradientSolver(_job, _parms, _activeData,
(1 - _alpha) * _lambda, _activeBC, _modelBetaInfo) : new GLMGradientSolver(_job, _parms, _activeData,
(1 - _alpha) * _lambda, _activeBC, _modelBetaInfo, _penaltyMatrix, _gamBetaIndices);
return false;
}
}
return true;
}
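// Summary of the check above (my reading): ADMM.subgrad folds the l1 subgradient into grad;
// the KKT conditions hold when every inactive column's entry of grad stays within the
// tolerance err. Violating columns are appended to the active set and the caller re-solves,
// which is the safety net for the aggressive strong-rule screening in applyStrongRules.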
public void addOffset2Cols(int[] cols) {
int offset = _activeClass*_activeData.activeCols().length;
int colsLen = cols.length;
for (int index = 0; index < colsLen; index++)
cols[index] = cols[index]+offset;
}
public int [] removeCols(int [] cols) { // cols is per class, not overall
int[] activeCols;
int[] colsWOffset = cols.clone();
if (_nbetas > 2 && _parms._remove_collinear_columns) {
activeCols = ArrayUtils.removeIds(_activeDataMultinomial[_activeClass].activeCols(), cols);
addOffset2Cols(colsWOffset);
} else {
activeCols = ArrayUtils.removeIds(_activeData.activeCols(), cols);
}
if (_beta != null)
_beta = ArrayUtils.removeIds(_beta, colsWOffset);
if(_u != null)
_u = ArrayUtils.removeIds(_u,colsWOffset);
if(_ginfo != null && _ginfo._gradient != null)
_ginfo._gradient = ArrayUtils.removeIds(_ginfo._gradient,colsWOffset);
_activeData = _dinfo.filterExpandedColumns(activeCols); // rebuilds _adaptedFrame to exclude inactive columns
_activeBC = _bc.filterExpandedColumns(activeCols);
_gslvr = _penaltyMatrix == null ? new GLMGradientSolver(_job, _parms, _activeData,
(1 - _alpha) * _lambda, _activeBC, _modelBetaInfo) : new GLMGradientSolver(_job, _parms, _activeData,
(1 - _alpha) * _lambda, _activeBC, _modelBetaInfo, _penaltyMatrix, _gamBetaIndices);
_currGram = null;
return activeCols;
}
private double penalty(double [] beta) {
if(_lambda == 0) return 0;
double l1norm = 0, l2norm = 0;
if(_parms._family == Family.multinomial || _parms._family == Family.ordinal) {
int len = beta.length/ _nbetas;
assert len* _nbetas == beta.length;
for(int c = 0; c < _nbetas; ++c) {
for(int i = c*len; i < (c+1)*len-1; ++i) {
double d = beta[i];
l1norm += d >= 0?d:-d;
l2norm += d*d;
}
if (_parms._family == Family.ordinal) // done for ordinal, only one set of beta but numclass-1 intercepts
break;
}
} else
for(int i = 0; i < beta.length-1; ++i) {
double d = beta[i];
l1norm += d >= 0?d:-d;
l2norm += d*d;
}
return l1pen()*l1norm + .5*l2pen()*l2norm;
}
public double objective() {return _beta == null?Double.MAX_VALUE:objective(_beta,_likelihood);}
public double objective(double [] beta, double likelihood) {
double gamVal = 0;
if (_parms._glmType == GLMParameters.GLMType.gam) {
if (beta.length == _totalBetaLength)
gamVal = calSmoothNess(beta, _penaltyMatrix, _gamBetaIndices);
else
gamVal = calSmoothNess(expandBeta(beta), _penaltyMatrix, _gamBetaIndices); // take up memory
}
return likelihood * _parms._obj_reg + gamVal + penalty(beta) + (_activeBC == null?0:_activeBC.proxPen(beta));
}
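// Hedged reading of objective(beta, likelihood): the value minimized is
//     likelihood * obj_reg + GAM smoothness (gam models only) + penalty(beta)
//         + proximal penalty from the active beta constraints (if any),
// where penalty(beta) is the elastic-net term described next to l1pen()/l2pen().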
protected double updateState(double [] beta, double likelihood) {
_betaDiff = ArrayUtils.linfnorm(_beta == null?beta:ArrayUtils.subtract(_beta,beta),false);
double objOld = objective();
_beta = beta;
_ginfo = null;
_likelihood = likelihood;
return (_relImprovement = (objOld - objective())/Math.abs(objOld));
}
private double _betaDiff;
private double _relImprovement;
String convergenceMsg = "";
public boolean converged(){
boolean converged = false;
if(_betaDiff < _parms._beta_epsilon) {
convergenceMsg = "betaDiff < eps; betaDiff = " + _betaDiff + ", eps = " + _parms._beta_epsilon;
converged = true;
} else if(_relImprovement < _parms._objective_epsilon) {
convergenceMsg = "relImprovement < eps; relImprovement = " + _relImprovement + ", eps = " + _parms._objective_epsilon;
converged = true;
} else convergenceMsg = "not converged, betaDiff = " + _betaDiff + ", relImprovement = " + _relImprovement;
return converged;
}
protected double updateState(double [] beta,GLMGradientInfo ginfo){
double objOld;
if (_beta != null && beta.length > _beta.length) { // beta is full while _beta only contains active columns
double[] shortBeta = shrinkFullArray(beta);
_betaDiff = ArrayUtils.linfnorm(_beta == null ? beta : ArrayUtils.subtract(_beta, shortBeta), false);
objOld = objective();
if(_beta == null)_beta = shortBeta.clone();
else System.arraycopy(shortBeta,0,_beta,0,shortBeta.length);
} else {
_betaDiff = ArrayUtils.linfnorm(_beta == null ? beta : ArrayUtils.subtract(_beta, beta), false);
objOld = objective();
if(_beta == null)_beta = beta.clone();
else System.arraycopy(beta,0,_beta,0,beta.length);
}
_ginfo = ginfo;
_likelihood = ginfo._likelihood;
return (_relImprovement = (objOld - objective())/Math.abs(objOld));
}
double getBetaDiff() {return _betaDiff;}
double getGradientErr() {return _gradientErr;}
protected void setBetaDiff(double betaDiff) { _betaDiff = betaDiff; }
protected void setGradientErr(double gErr) { _gradientErr = gErr; }
protected void setGinfo(GLMGradientInfo ginfo) {
_ginfo = copyGInfo(ginfo);
}
protected void setBeta(double[] beta) {
if(_beta == null)_beta = beta.clone();
else System.arraycopy(beta,0, _beta, 0, beta.length);
}
protected void setIter(int iteration) {
_iter = iteration;
}
protected void setLikelihood(double llk) { _likelihood = llk; }
protected void setAllIn(boolean val) { _allIn = val; }
protected void setGslvrNull() { _gslvr = null; }
protected void setActiveDataMultinomialNull() { _activeDataMultinomial = null; }
protected void setActiveDataNull() { _activeData = null; }
protected void setLambdaSimple(double lambda) { _lambda=lambda; }
protected void setHGLMComputationState(double [] beta, double[] ubeta, double[] psi, double[] phi,
double hlcorrection, double tau, Frame wpsi, String[] randCoeffNames){
_beta = Arrays.copyOf(beta, beta.length);
_ubeta = Arrays.copyOf(ubeta, ubeta.length);
_randCoeffNames = Arrays.copyOf(randCoeffNames, randCoeffNames.length);
_psi = Arrays.copyOf(psi, psi.length);
_phi = Arrays.copyOf(phi, phi.length);
_correction_HL = hlcorrection;
_tau = tau;
_priorw_wpsi = wpsi; // store prior_weight and calculated wpsi value for coefficients of random columns
_iterHGLM_GLMMME = 0;
}
public double [] expandBeta(double [] beta) { // for multinomials
int fullCoefLen = (_dinfo.fullN() + 1) * _nbetas;
if(_activeData._activeCols == null || beta.length == fullCoefLen)
return beta;
if (_nbetas <= 2 || !_parms._remove_collinear_columns)
return ArrayUtils.expandAndScatter(beta, (_dinfo.fullN() + 1) * _nbetas,_activeData._activeCols);
else
return expandToFullArray(beta, _activeData.activeCols(), _totalBetaLength, _nbetas, _betaLengthPerClass);
}
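// Illustrative example: with fullN()+1 = 4, _nbetas = 1 and _activeCols = {0, 2, 3},
// expandBeta({b0, b2, b3}) returns {b0, 0, b2, b3}; inactive positions are zero-filled by
// ArrayUtils.expandAndScatter.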
/**
* Cached state of COD (with covariate updates) solver.
*/
public static final class GramXY {
public final Gram gram;
final double[] beta;
final int[] activeCols;
int [] newCols;
public final double[] xy;
private double [] grads;
public double yy;
public final double likelihood;
public double sumOfRowWeights; // sum of all r.weight
public GramXY(Gram gram, double[] xy, double [] grads, double[] beta, int[] activeCols, int [] newActiveCols, double yy, double likelihood) {
this.gram = gram;
this.xy = xy;
this.grads = grads;
this.beta = beta == null ? null : beta.clone();
this.activeCols = activeCols == null ? null : activeCols.clone();
this.newCols = newActiveCols;
this.yy = yy;
this.likelihood = likelihood;
}
public final double [] getCODGradients(){
if(grads == null){
double [][] xx = gram.getXX();
grads = new double[xy.length];
for(int i = 0; i < grads.length; ++i)
grads[i] = xy[i] - ArrayUtils.innerProduct(xx[i], beta) + xx[i][i] * beta[i];
}
if(newCols != null) {
double [][] xx = gram.getXX();
for (int i : newCols)
grads[i] = xy[i] - ArrayUtils.innerProduct(xx[i], beta) + xx[i][i] * beta[i];
}
return grads;
}
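// Reading of the cached formula above (my note): because coordinate i's own contribution is
// added back,
//     grads[i] = xy[i] - sum_{j != i} xx[i][j] * beta[j],
// i.e. the partial-residual inner product that coordinate descent needs when updating beta[i]
// (essentially soft-threshold(grads[i]) / xx[i][i]; the exact update lives in the COD solver,
// not here).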
public boolean match(double[] beta, int[] activeCols) {
return Arrays.equals(this.beta, beta) && Arrays.equals(this.activeCols, activeCols);
}
static double [] mergeRow(int k, double [] xrowOld, double [] xrow,int [] newColsIds, double [][] xxUpdate){
for(int i = 0; i < newColsIds.length; ++i){
int j = newColsIds[i];
xrow[j] = xxUpdate[i][k];
for(int l = i == 0?0:newColsIds[i-1]+1; l < j; ++l)
xrow[l] = xrowOld[l-i];
}
int l = newColsIds.length;
for(int j = newColsIds[newColsIds.length-1]+1; j < xrow.length; ++j)
xrow[j] = xrowOld[j-l];
return xrow;
}
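// Illustrative example: merging one new column at expanded index 1 into an old row {a, b, c}
// (newColsIds = {1}) produces {a, u, b, c}, where u = xxUpdate[0][k] is entry k of the newly
// computed column; old entries shift right past every inserted index.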
public static GramXY addCols(double[] beta, final int[] newActiveCols, final int[] newColsIds, final GramXY oldGram, final double[][] xxUpdate, final double[] xyUpdate) {
// update the expanded matrix cache
final double[][] xxCacheNew = new double[newActiveCols.length][];
final double[] xyNew = new double[xxCacheNew.length];
final double[] gradsNew = oldGram.grads == null?null:new double[xxCacheNew.length];
double [][] xx = oldGram.gram.getXX();
for (int k = 0; k < newColsIds.length; ++k) {
int j = newColsIds[k];
xxCacheNew[j] = xxUpdate[k];
xyNew[j] = xyUpdate[k];
for (int i = k == 0 ? 0 : newColsIds[k - 1] + 1; i < j; i++) {
xxCacheNew[i] = mergeRow(i, xx[i - k], new double[newActiveCols.length], newColsIds, xxUpdate);
xyNew[i] = oldGram.xy[i - k];
if(oldGram.grads != null)gradsNew[i] = oldGram.grads[i - k];
}
}
int k = newColsIds.length;
for (int i = newColsIds[newColsIds.length - 1] + 1; i < xyNew.length; ++i) {
xxCacheNew[i] = mergeRow(i, xx[i - k], new double[newActiveCols.length], newColsIds, xxUpdate);
xyNew[i] = oldGram.xy[i - k];
if(oldGram.grads != null)gradsNew[i] = oldGram.grads[i - k];
}
return new GramXY(new Gram(xxCacheNew), xyNew, gradsNew, beta, newActiveCols, newColsIds, oldGram.yy, oldGram.likelihood);
}
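// Sketch of the incremental update (my summary): instead of recomputing the full Gram matrix
// when the strong rules admit new columns, only the new columns' rows (xxUpdate) and xy
// entries (xyUpdate) are computed and spliced into the cached Gram via mergeRow. This is what
// keeps COD with lambda search cheap per lambda step.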
}
protected GramXY computeNewGram(DataInfo activeData, double [] beta, GLMParameters.Solver s){
double obj_reg = _parms._obj_reg;
if(_glmw == null) _glmw = new GLMModel.GLMWeightsFun(_parms);
GLMTask.GLMIterationTask gt = new GLMTask.GLMIterationTask(_job._key, activeData, _glmw, beta,
_activeClass).doAll(activeData._adaptedFrame);
gt._gram.mul(obj_reg);
if (_parms._glmType.equals(GLMParameters.GLMType.gam)) { // add contribution from GAM smoothness factor
Integer[] activeCols=null;
int[] activeColumns = activeData.activeCols();
if (activeColumns.length<_dinfo.fullN()) { // columns are deleted
activeCols = ArrayUtils.toIntegers(activeColumns, 0, activeColumns.length);
}
gt._gram.addGAMPenalty(activeCols , _penaltyMatrix, _gamBetaIndices);
}
ArrayUtils.mult(gt._xy,obj_reg);
int [] activeCols = activeData.activeCols();
int [] zeros = gt._gram.findZeroCols();
GramXY res;
if(_parms._family != Family.multinomial && zeros.length > 0 && zeros.length <= activeData.activeCols().length) {
gt._gram.dropCols(zeros);
removeCols(zeros);
res = new ComputationState.GramXY(gt._gram,ArrayUtils.removeIds(gt._xy, zeros),null,gt._beta == null?null:ArrayUtils.removeIds(gt._beta, zeros),activeData().activeCols(),null,gt._yy,gt._likelihood);
} else res = new GramXY(gt._gram,gt._xy,null,beta == null?null:beta,activeCols,null,gt._yy,gt._likelihood);
if (gaussian.equals(_parms._family))
res.sumOfRowWeights = gt.sumOfRowWeights;
return res;
}
GramXY _currGram;
GLMModel.GLMWeightsFun _glmw;
/***
 * This method is used only for the multinomial family. It differs from computeGram because it calls on _activeData,
 * whose _adaptedFrame contains only the active columns. Note that activeDataMultinomial(_activeClass) always
 * contains all predictors in its _adaptedFrame.
 * @param beta full-length coefficients; only the active columns are selected and used
 * @param s solver in use
 * @return Gram matrix info computed on the active columns
 */
public GramXY computeGramRCC(double[] beta, GLMParameters.Solver s) {
return computeNewGram(_activeData, ArrayUtils.select(beta, _activeData.activeCols()), s);
}
// get cached gram or incrementally update or compute new one
public GramXY computeGram(double [] beta, GLMParameters.Solver s){
double obj_reg = _parms._obj_reg;
boolean weighted = !gaussian.equals(_parms._family) || !GLMParameters.Link.identity.equals(_parms._link);
if(Family.multinomial.equals(_parms._family)) // no caching
return computeNewGram(activeDataMultinomial(_activeClass),beta,s); // activeDataMultinomial(_activeClass) returns all predictors
if(s != GLMParameters.Solver.COORDINATE_DESCENT)
// only cache for solver==COD
// caching only makes difference when running with lambda search
// and COD and IRLSM need matrix in different shape
// and COD is better for lambda search
return computeNewGram(activeData(),beta,s);
if(_currGram == null) // no cached value, compute new one and store
return _currGram = computeNewGram(activeData(),beta,s);
DataInfo activeData = activeData();
assert beta == null || beta.length == activeData.fullN()+1;
int [] activeCols = activeData.activeCols();
if (Arrays.equals(_currGram.activeCols,activeCols))
return (!weighted || Arrays.equals(_currGram.beta, beta)) ? _currGram : (_currGram = computeNewGram(activeData,
beta, s));
if(_glmw == null) _glmw = new GLMModel.GLMWeightsFun(_parms);
// check if we need full or just incremental update
if(_currGram != null){
int [] newCols = ArrayUtils.sorted_set_diff(activeCols,_currGram.activeCols);
int [] newColsIds = newCols.clone();
int jj = 0;
boolean matches = true;
int k = 0;
for (int i = 0; i < activeCols.length; ++i) {
if (jj < newCols.length && activeCols[i] == newCols[jj]) {
newColsIds[jj++] = i;
matches = matches && (beta == null || beta[i] == 0);
} else {
matches = matches && (beta == null || beta[i] == _currGram.beta[k++]);
}
}
if(!weighted || matches) {
GLMTask.GLMIncrementalGramTask gt = new GLMTask.GLMIncrementalGramTask(newColsIds, activeData, _glmw, beta).doAll(activeData._adaptedFrame); // dense
for (double[] d : gt._gram)
ArrayUtils.mult(d, obj_reg);
ArrayUtils.mult(gt._xy, obj_reg);
// glue the update and old gram together
return _currGram = GramXY.addCols(beta, activeCols, newColsIds, _currGram, gt._gram, gt._xy);
}
}
return _currGram = computeNewGram(activeData,beta,s);
}
}