
package hex;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.Job.JobCancelledException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.RandomUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
public abstract class FrameTask<T extends FrameTask<T>> extends MRTask<T> {
protected transient DataInfo _dinfo;
public DataInfo dinfo() { return _dinfo; }
final Key _dinfoKey;
final int [] _activeCols;
final protected Key _jobKey;
// double _ymu = Double.NaN; // mean of the response
// size of the expanded vector of parameters
protected float _useFraction = 1.0f;
protected boolean _shuffle = false;
protected boolean _skipMissing = true;
protected boolean skipMissing() { return _skipMissing; }
public FrameTask(Key jobKey, DataInfo dinfo) {
this(jobKey, dinfo._key, dinfo._activeCols,null);
}
public FrameTask(Key jobKey, DataInfo dinfo, H2OCountedCompleter cmp) {
this(jobKey, dinfo._key, dinfo._activeCols,cmp);
}
public FrameTask(Key jobKey, Key dinfoKey, int [] activeCols) {
this(jobKey,dinfoKey, activeCols,null);
}
public FrameTask(Key jobKey, Key dinfoKey, int [] activeCols, H2OCountedCompleter cmp) {
super(cmp);
assert dinfoKey == null || DKV.get(dinfoKey) != null;
_jobKey = jobKey;
_dinfoKey = dinfoKey;
_activeCols = activeCols;
}
protected FrameTask(FrameTask ft){
_dinfo = ft._dinfo;
_jobKey = ft._jobKey;
_useFraction = ft._useFraction;
_shuffle = ft._shuffle;
_activeCols = ft._activeCols;
_dinfoKey = ft._dinfoKey;
assert DKV.get(_dinfoKey) != null;
}
@Override protected void setupLocal(){
DataInfo dinfo = DKV.get(_dinfoKey).get();
_dinfo = _activeCols == null?dinfo:dinfo.filterExpandedColumns(_activeCols);
}
@Override protected void closeLocal(){ _dinfo = null;}
public final double [] normMul(){return _dinfo._normMul;}
public final double [] normSub(){return _dinfo._normSub;}
public final double [] normRespMul(){return _dinfo._normRespMul;}
public final double [] normRespSub(){return _dinfo._normRespSub;}
/**
* Method to process one row of the data for GLM functions.
* Numeric and categorical values are passed separately, as is the response.
* Categoricals are passed as absolute indexes into the expanded beta vector; 0-levels are skipped
* (so the number of categoricals passed is not the same for every row).
*
* Categorical expansion/indexing:
* Categoricals are placed in the beginning of the beta vector.
* Each cat variable with n levels is expanded into n-1 independent binary variables.
* Indexes in cats[] will point to the appropriate coefficient in the beta vector, so e.g.
* assume we have 2 categorical columns, both with values A,B,C; then the following rows have the following indexes:
* A,A - ncats = 0, we do not pass any categorical here
* A,B - ncats = 1, indexes = [2]
* B,B - ncats = 2, indexes = [0,2]
* and so on
*
* @param gid - global id of this row, in [0,_adaptedFrame.numRows())
* @param nums - numeric values of this row
* @param ncats - number of passed (non-zero) categoricals
* @param cats - indexes of categoricals into the expanded beta-vector.
* @param response - numeric value for the response
*/
protected void processRow(long gid, double [] nums, int ncats, int [] cats, double [] response){throw new RuntimeException("should've been overridden!");}
protected void processRow(long gid, double [] nums, int ncats, int [] cats, double [] response, NewChunk [] outputs){throw new RuntimeException("should've been overridden!");}
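// Illustrative sketch (not part of the original API): how a single categorical value maps to an
// index in the expanded beta vector, mirroring the indexing logic used in map() below. 'level' is
// the raw factor level, 'catOffset' is the column's entry in DataInfo._catOffsets; returns -1 when
// the 0-level is skipped (useAllFactorLevels == false), i.e. the reference level has no coefficient.
static int exampleExpandedCatIndex(int level, int catOffset, boolean useAllFactorLevels) {
if (useAllFactorLevels) return catOffset + level; // every level gets its own coefficient
return level == 0 ? -1 : catOffset + level - 1; // 0-level is the implicit reference level
}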
public static class DataInfo extends Keyed {
public int [] _activeCols;
public Frame _adaptedFrame;
public int _responses; // number of responses
@Override
public long checksum() {throw H2O.unimpl();} // don't really need checksum
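// Transforms applied to numeric columns, expressed via (_normSub, _normMul) below:
// STANDARDIZE: x -> (x - mean) / sigma
// NORMALIZE: x -> (x - mean) / (max - min)
// DEMEAN: x -> x - mean
// DESCALE: x -> x / sigma
// NONE: x -> x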
public enum TransformType { NONE, STANDARDIZE, NORMALIZE, DEMEAN, DESCALE };
public TransformType _predictor_transform;
public TransformType _response_transform;
public boolean _useAllFactorLevels;
public int _nums;
public int _cats;
public int [] _catOffsets;
public int [] _catMissing;
public double [] _normMul;
public double [] _normSub;
public double [] _normRespMul;
public double [] _normRespSub;
public int _foldId;
public int _nfolds;
public Key _frameKey;
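// Deep copy via a serialization round-trip: write this DataInfo into an AutoBuffer, then read it
// back as an independent instance.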
public DataInfo deep_clone() {
AutoBuffer ab = new AutoBuffer();
this.write(ab);
ab.flipForReading();
return (DataInfo)new DataInfo().read(ab);
}
private DataInfo() {super(null);_catLvls = null;}
private DataInfo(Key selfKey, DataInfo dinfo, int foldId, int nfolds){
super(selfKey);
assert dinfo._catLvls == null:"Should not be called with filtered levels (assuming the selected levels may change with fold id) ";
assert dinfo._predictor_transform != null;
assert dinfo._response_transform != null;
_predictor_transform = dinfo._predictor_transform;
_response_transform = dinfo._response_transform;
_responses = dinfo._responses;
_nums = dinfo._nums;
_cats = dinfo._cats;
_adaptedFrame = dinfo._adaptedFrame;
_catOffsets = dinfo._catOffsets;
_catMissing = dinfo._catMissing;
_normMul = dinfo._normMul;
_normSub = dinfo._normSub;
_normRespMul = dinfo._normRespMul;
_normRespSub = dinfo._normRespSub;
_foldId = foldId;
_nfolds = nfolds;
_useAllFactorLevels = dinfo._useAllFactorLevels;
_catLvls = null;
}
public DataInfo(Key selfKey, Frame train, Frame valid, int hasResponses, boolean useAllFactorLvls, double [] normSub, double [] normMul, TransformType predictor_transform, double [] normRespSub, double [] normRespMul){
this(selfKey, train, valid, hasResponses,useAllFactorLvls,
normMul != null && normSub != null ? predictor_transform : TransformType.NONE, //just allocate, doesn't matter whether standardize or normalize is used (will be overwritten below)
normRespMul != null && normRespSub != null ? TransformType.STANDARDIZE : TransformType.NONE);
assert predictor_transform != null;
assert (normSub == null) == (normMul == null);
assert (normRespSub == null) == (normRespMul == null);
if(normSub != null) {
System.arraycopy(normSub, 0, _normSub, 0, normSub.length);
System.arraycopy(normMul, 0, _normMul, 0, normMul.length);
}
if(normRespSub != null) {
System.arraycopy(normRespSub, 0, _normRespSub, 0, normRespSub.length);
System.arraycopy(normRespMul, 0, _normRespMul, 0, normRespMul.length);
}
}
final int [][] _catLvls;
public DataInfo(Key selfKey, Frame train, Frame valid, int nResponses, boolean useAllFactors, TransformType predictor_transform) {
this(selfKey, train, valid, nResponses, useAllFactors, predictor_transform, TransformType.NONE);
}
//new DataInfo(f,catLvls, _responses, _standardize, _response_transform);
public DataInfo(Key selfKey, Frame fr, int[][] catLevels, int responses, TransformType predictor_transform, TransformType response_transform, int foldId, int nfolds){
super(selfKey);
assert predictor_transform != null;
assert response_transform != null;
_predictor_transform = predictor_transform;
_response_transform = response_transform;
_adaptedFrame = fr;
_catOffsets = MemoryManager.malloc4(catLevels.length+1);
_catMissing = new int[catLevels.length];
int s = 0;
for(int i = 0; i < catLevels.length; ++i){
_catOffsets[i] = s;
s += catLevels[i].length;
}
_catLvls = catLevels;
_catOffsets[_catOffsets.length-1] = s;
_responses = responses;
_cats = catLevels.length;
_nums = fr.numCols()-_cats - responses;
if(_nums > 0){
switch(_predictor_transform) {
case STANDARDIZE:
_normMul = MemoryManager.malloc8d(_nums);
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
_normSub[i] = v.mean();
}
break;
case NORMALIZE:
_normMul = MemoryManager.malloc8d(_nums);
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
_normSub[i] = v.mean();
}
break;
case DEMEAN:
_normMul = null;
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normSub[i] = v.mean();
}
break;
case DESCALE:
_normMul = MemoryManager.malloc8d(_nums);
_normSub = null;
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
}
break;
case NONE:
_normMul = null;
_normSub = null;
break;
default:
throw H2O.unimpl();
}
}
if(responses > 0){
switch(_response_transform) {
case STANDARDIZE:
_normRespMul = MemoryManager.malloc8d(responses);
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
_normRespSub[i] = v.mean();
}
break;
case NORMALIZE:
_normRespMul = MemoryManager.malloc8d(responses);
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
_normRespSub[i] = v.mean();
}
break;
case DEMEAN:
_normRespMul = null;
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespSub[i] = v.mean();
}
break;
case DESCALE:
_normRespSub = null;
_normRespMul = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
}
break;
case NONE:
_normRespMul = null;
_normRespSub = null;
break;
default:
throw H2O.unimpl();
}
}
_useAllFactorLevels = false;
_adaptedFrame.reloadVecs();
_nfolds = nfolds;
_foldId = foldId;
}
// Modify the train & valid frames directly; sort the categorical columns
// up front according to size; compute the mean/sigma for each column for
// later normalization.
public DataInfo(Key selfKey, Frame train, Frame valid, int nResponses, boolean useAllFactorLevels, TransformType predictor_transform, TransformType response_transform) {
super(selfKey);
assert predictor_transform != null;
assert response_transform != null;
_nfolds = _foldId = 0;
_predictor_transform = predictor_transform;
_response_transform = response_transform;
_responses = nResponses;
_useAllFactorLevels = useAllFactorLevels;
_catLvls = null;
final Vec[] tvecs = train.vecs();
final Vec[] vvecs = valid.vecs();
// Count categorical-vs-numerical
final int n = tvecs.length-_responses;
assert n >= 1; // Checked in init() before
int [] nums = MemoryManager.malloc4(n);
int [] cats = MemoryManager.malloc4(n);
int nnums = 0, ncats = 0;
for(int i = 0; i < n; ++i)
if(tvecs[i].isEnum()) cats[ncats++] = i;
else nums[nnums++] = i;
_nums = nnums;
_cats = ncats;
// sort the cats in the decreasing order according to their size
for(int i = 0; i < ncats; ++i)
for(int j = i+1; j < ncats; ++j)
if( tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length ) {
int x = cats[i];
cats[i] = cats[j];
cats[j] = x;
}
String[] names = new String[train.numCols()];
Vec[] tvecs2 = new Vec[train.numCols()];
Vec[] vvecs2 = new Vec[train.numCols()];
// Compute the cardinality of each cat
_catOffsets = MemoryManager.malloc4(ncats+1);
_catMissing = new int[ncats];
int len = _catOffsets[0] = 0;
for(int i = 0; i < ncats; ++i) {
names[i] = train._names[cats[i]];
vvecs2 [i] = vvecs[cats[i]];
Vec v = (tvecs2[i] = tvecs[cats[i]]);
_catMissing[i] = v.naCnt() > 0 ? 1 : 0; //needed for test time
_catOffsets[i+1] = (len += v.domain().length - (useAllFactorLevels?0:1) + (v.naCnt()>0?1:0)); //missing values turn into a new factor level
}
// Compute the mean/sigma for each predictor
switch(predictor_transform) {
case STANDARDIZE:
case NORMALIZE:
_normSub = MemoryManager.malloc8d(nnums);
_normMul = MemoryManager.malloc8d(nnums); Arrays.fill(_normMul, 1);
break;
case DEMEAN:
_normSub = MemoryManager.malloc8d(nnums);
_normMul = null;
break;
case DESCALE:
_normSub = null;
_normMul = MemoryManager.malloc8d(nnums);
break;
case NONE:
_normSub = _normMul = null;
break;
default:
break;
}
for(int i = 0; i < nnums; ++i){
names[ncats+i] = train._names[nums[i]];
vvecs2 [ncats+i] = vvecs[nums[i]];
Vec v = (tvecs2[ncats+i] = tvecs[nums[i]]);
switch(predictor_transform){
case STANDARDIZE:
_normSub[i] = v.mean();
_normMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NORMALIZE:
_normSub[i] = v.mean();
_normMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
break;
case DEMEAN:
_normSub[i] = v.mean();
break;
case DESCALE:
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
break;
case NONE:
break;
default:
break;
}
}
// Compute the mean/sigma for each response
if (_responses > 0) {
switch(response_transform){
case STANDARDIZE:
case NORMALIZE:
_normRespSub = MemoryManager.malloc8d(_responses);
_normRespMul = MemoryManager.malloc8d(_responses); Arrays.fill(_normRespMul, 1);
break;
case DEMEAN:
_normRespSub = MemoryManager.malloc8d(_responses);
_normRespMul = null;
break;
case DESCALE:
_normRespSub = null;
_normRespMul = MemoryManager.malloc8d(_responses);
break;
case NONE:
_normRespSub = _normRespMul = null;
break;
default:
throw H2O.unimpl();
}
for(int i = 0; i < _responses; ++i){
names[ncats+nnums+i] = train._names[ncats+nnums+i];
vvecs2 [ncats+nnums+i] = vvecs[ncats+nnums+i];
Vec v = (tvecs2[ncats+nnums+i] = tvecs[ncats+nnums+i]);
switch(response_transform){
case STANDARDIZE:
_normRespSub[i] = v.mean();
_normRespMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NORMALIZE:
_normRespSub[i] = v.mean();
_normRespMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
break;
case DEMEAN:
_normRespSub[i] = v.mean();
break;
case DESCALE:
_normRespMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NONE:
break;
default:
throw H2O.unimpl();
}
}
}
train.restructure(names,tvecs2);
valid.restructure(names,vvecs2);
_adaptedFrame = train;
}
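// Example of the reordering above (illustrative column names): for train columns
// [age(num), state(cat, 50 levels), gender(cat, 2 levels), response], the adapted frame becomes
// [state, gender, age, response] - categoricals first (larger domains first), then numerics,
// then responses.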
public DataInfo filterExpandedColumns(int [] cols){
assert _predictor_transform != null;
assert _response_transform != null;
if(cols == null)return this;
int i = 0, j = 0, ignoredCnt = 0;
//public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub, double [] normMul, double [] normRespSub, double [] normRespMul){
int [][] catLvls = new int[_cats][];
int [] ignoredCols = MemoryManager.malloc4(_nums + _cats);
// first do categoricals...
if(_catOffsets != null)
while(i < cols.length && cols[i] < _catOffsets[_catOffsets.length-1]){
int [] levels = MemoryManager.malloc4(_catOffsets[j+1] - _catOffsets[j]);
int k = 0;
while(i < cols.length && cols[i] < _catOffsets[j+1])
levels[k++] = cols[i++]-_catOffsets[j];
if(k > 0)
catLvls[j] = Arrays.copyOf(levels, k);
++j;
}
for(int k =0; k < catLvls.length; ++k)
if(catLvls[k] == null)ignoredCols[ignoredCnt++] = k;
if(ignoredCnt > 0){
int [][] c = new int[_cats-ignoredCnt][];
int y = 0;
for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
assert y == c.length;
catLvls = c;
}
// now numerics
int prev = j = 0;
for(; i < cols.length; ++i){
for(int k = prev; k < (cols[i]-numStart()); ++k ){
ignoredCols[ignoredCnt++] = k+_cats;
++j;
}
prev = ++j;
}
for(int k = prev; k < _nums; ++k)
ignoredCols[ignoredCnt++] = k+_cats;
Frame f = new Frame(_adaptedFrame.names().clone(),_adaptedFrame.vecs().clone());
if(ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols,ignoredCnt));
assert catLvls.length < f.numCols():"cats = " + catLvls.length + " numcols = " + f.numCols();
DataInfo dinfo = new DataInfo(_key,f,catLvls, _responses, _predictor_transform, _response_transform, _foldId, _nfolds);
dinfo._activeCols = cols;
return dinfo;
}
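// Example (illustrative): with two 3-level categoricals expanded to indexes [0..3] and two
// numerics at [4,5] (useAllFactorLevels == false), cols = {0,1,4} keeps both expanded levels of
// the first categorical, drops the second categorical entirely (no surviving levels), keeps the
// first numeric, and removes the dropped columns from the adapted frame.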
public String toString(){
return "DataInfo(cats=" + _cats + ", nums=" + _nums + ", responses=" + _responses + ")";
}
public DataInfo getFold(int foldId, int nfolds){
return new DataInfo(Key.make(),this, foldId, nfolds);
}
public final int fullN(){return _nums + _catOffsets[_cats];}
public final int largestCat(){return _cats > 0?_catOffsets[1]:0;}
public final int numStart(){return _catOffsets[_cats];}
public final String [] coefNames(){
int k = 0;
final int n = fullN();
String [] res = new String[n];
final Vec [] vecs = _adaptedFrame.vecs();
for(int i = 0; i < _cats; ++i) {
for (int j = _useAllFactorLevels ? 0 : 1; j < vecs[i].domain().length; ++j)
res[k++] = _adaptedFrame._names[i] + "." + vecs[i].domain()[j];
if (vecs[i].naCnt() > 0) res[k++] = _adaptedFrame._names[i] + ".missing(NA)";
}
final int nums = n-k;
System.arraycopy(_adaptedFrame._names, _cats, res, k, nums);
return res;
}
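// Example (illustrative): a categorical column "state" with domain {AK,AL,AR} and
// _useAllFactorLevels == false yields "state.AL", "state.AR" (plus "state.missing(NA)" if the
// column contains NAs), followed by the numeric column names.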
/**
* Normalize horizontalized categoricals to become probabilities per factor level.
* This is done with the SoftMax function.
* @param in input values
* @param out output values (can be the same as input)
*/
public final void softMaxCategoricals(float[] in, float[] out) {
if (_cats == 0) return;
if (!_useAllFactorLevels) throw new UnsupportedOperationException("All factor levels must be present for re-scaling with SoftMax.");
assert (in.length == out.length);
assert (in.length == fullN());
final Vec[] vecs = _adaptedFrame.vecs();
int k = 0;
for (int i = 0; i < _cats; ++i) {
final int factors = vecs[i].domain().length;
final float max = ArrayUtils.maxValue(in, k, k + factors);
float scale = 0;
for (int j = 0; j < factors; ++j) {
out[k + j] = (float) Math.exp(in[k + j] - max);
scale += out[k + j];
}
for (int j = 0; j < factors; ++j)
out[k + j] /= scale;
k += factors;
}
assert(k == numStart());
}
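// Worked example (illustrative): a single 3-level categorical with in = {1,2,3}:
// max = 3, exp(in - max) = {0.135, 0.368, 1.000}, scale = 1.503,
// out = {0.090, 0.245, 0.665} - a probability vector over the factor levels.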
/**
* Undo the standardization/normalization of numerical columns
* @param in input values
* @param out output values (can be the same as input)
*/
public final void unScaleNumericals(float[] in, float[] out) {
if (_nums == 0) return;
assert (in.length == out.length);
assert (in.length == fullN());
for (int k=numStart(); k < fullN(); ++k) // invert x -> (x - sub)*mul; a null array means no shift/scale was applied
out[k] = (float)(in[k] / (_normMul == null ? 1 : _normMul[k-numStart()]) + (_normSub == null ? 0 : _normSub[k-numStart()]));
}
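// Worked inverse (illustrative): under STANDARDIZE a value was scaled as s = (x - mean)*(1/sigma),
// so x = s / (1/sigma) + mean = s*sigma + mean, which is exactly the expression above.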
}
@Override
public T dfork(Frame fr){
assert fr == _dinfo._adaptedFrame;
return super.dfork(fr);
}
/**
* Override this to initialize at the beginning of chunk processing.
*/
protected void chunkInit(){}
/**
* Override this to do post-chunk processing work.
* @param n Number of processed rows
*/
protected void chunkDone(long n){}
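// Usage sketch (hypothetical names, for illustration only): a subclass typically overrides
// processRow() and reduce(), e.g. to accumulate the sum of the first numeric predictor:
//
// class SumTask extends FrameTask<SumTask> {
// double _sum;
// SumTask(Key jobKey, DataInfo dinfo) { super(jobKey, dinfo); }
// @Override protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response) { _sum += nums[0]; }
// @Override public void reduce(SumTask other) { _sum += other._sum; }
// }
// double sum = new SumTask(jobKey, dinfo).doAll(dinfo._adaptedFrame)._sum;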
/**
* Extracts the values, applies the configured standardization/normalization to numerics,
* adds appropriate offsets to categoricals, and applies the response transform if one is set;
* rows with a missing response are skipped.
*/
@Override public final void map(Chunk [] chunks, NewChunk [] outputs){
if(_jobKey != null && !Job.isRunning(_jobKey))throw new JobCancelledException();
final int nrows = chunks[0]._len;
final long offset = chunks[0].start();
chunkInit();
double [] nums = MemoryManager.malloc8d(_dinfo._nums);
int [] cats = MemoryManager.malloc4(_dinfo._cats);
double [] response = _dinfo._responses == 0 ? null : MemoryManager.malloc8d(_dinfo._responses);
int start = 0;
int end = nrows;
Random skip_rng = null; //random generator for skipping rows
//Example:
// _useFraction = 0.8 -> 1 repeat with fraction = 0.8
// _useFraction = 1.0 -> 1 repeat with fraction = 1.0
// _useFraction = 1.1 -> 2 repeats with fraction = 0.55
// _useFraction = 2.1 -> 3 repeats with fraction = 0.7
// _useFraction = 3.0 -> 3 repeats with fraction = 1.0
final int repeats = (int)Math.ceil(_useFraction);
final float fraction = _useFraction / repeats;
if (fraction < 1.0) {
skip_rng = RandomUtils.getDeterRNG(new Random().nextLong());
}
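// Note: the skip RNG is seeded from new Random().nextLong(), so the sampled subset of rows
// differs between runs (row sampling is not reproducible).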
long[] shuf_map = null;
if (_shuffle) {
shuf_map = new long[end-start];
for (int i=0;i<shuf_map.length;++i)
shuf_map[i] = start + i;
ArrayUtils.shuffleArray(shuf_map, new Random().nextLong());
}
long num_processed_rows = 0;
for(int rrr = 0; rrr < repeats; ++rrr) {
OUTER:
for(int rr = start; rr < end; ++rr){
final int r = shuf_map != null ? (int)shuf_map[rr-start] : rr;
final long lr = r + offset;
if((_dinfo._nfolds > 0 && (lr % _dinfo._nfolds) == _dinfo._foldId)
|| (skip_rng != null && skip_rng.nextFloat() > fraction))continue;
++num_processed_rows; //count rows with missing values even if they are skipped
for(Chunk c:chunks)if(skipMissing() && c.isNA0(r))continue OUTER; // skip rows with NAs!
int i = 0, ncats = 0;
for(; i < _dinfo._cats; ++i){
int c;
if (chunks[i].isNA0(r)) {
cats[ncats++] = (_dinfo._catOffsets[i+1]-1); //missing value turns into extra (last) factor
} else {
c = (int) chunks[i].at80(r);
if (_dinfo._catLvls != null) { // some levels are ignored?
c = Arrays.binarySearch(_dinfo._catLvls[i], c);
if (c >= 0)
cats[ncats++] = c + _dinfo._catOffsets[i];
} else if (_dinfo._useAllFactorLevels)
cats[ncats++] = c + _dinfo._catOffsets[i];
else if (c != 0)
cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
}
}
final int n = chunks.length-_dinfo._responses;
for(;i < n;++i){
double d = chunks[i].at0(r); //can be NA if skipMissing() == false
final double sub = _dinfo._normSub == null ? 0 : _dinfo._normSub[i-_dinfo._cats]; // null for DESCALE/NONE
final double mul = _dinfo._normMul == null ? 1 : _dinfo._normMul[i-_dinfo._cats]; // null for DEMEAN/NONE
d = (d - sub)*mul;
nums[i-_dinfo._cats] = d;
}
for(i = 0; i < _dinfo._responses; ++i) {
response[i] = chunks[chunks.length-_dinfo._responses + i].at0(r);
final double rsub = _dinfo._normRespSub == null ? 0 : _dinfo._normRespSub[i];
final double rmul = _dinfo._normRespMul == null ? 1 : _dinfo._normRespMul[i];
response[i] = (response[i] - rsub)*rmul;
if(Double.isNaN(response[i]))continue OUTER; // skip rows without a valid response (no supervised training possible)
}
long seed = offset + rrr*(end-start) + r;
if (outputs != null && outputs.length > 0)
processRow(seed, nums, ncats, cats, response, outputs);
else
processRow(seed, nums, ncats, cats, response);
}
}
chunkDone(num_processed_rows);
}
}