
water.util.Tabulate Maven / Gradle / Ivy
package water.util;
import hex.Interaction;
import water.DKV;
import water.Job;
import water.Key;
import water.MRTask;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.C0DChunk;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
/**
* Simple Co-Occurrence based tabulation of X vs Y, where X and Y are two Vecs in a given dataset
* Uses histogram of given resolution in X and Y
* Handles numerical/categorical data and missing values
* Supports observation weights
*
* Fills up two double[][] arrays:
* _count_data[xbin][ybin] contains the sum of observation weights (or 1) for co-occurrences in bins xbin/ybin
* _response_data[xbin][2] contains the mean value of Y and the sum of observation weights for a given bin for X
*/
public class Tabulate extends Job {
// Frame in which the predictor, response and (optional) weight columns are looked up by name
public Frame _dataset;
// Keys of the Vecs actually tabulated: [0] = predictor, [1] = response (filled in by execImpl)
public Key[] _vecs = new Key[2];
// Name of the predictor (X) column in _dataset
public String _predictor;
// Name of the response (Y) column in _dataset
public String _response;
// Name of the observation-weight column in _dataset; the looked-up Vec may be null (no weights)
public String _weight;
// Number of histogram bins for the predictor; execImpl rejects values < 1
int _nbins_predictor = 20;
// Number of histogram bins for the response; execImpl rejects values < 1
int _nbins_response = 10;
// result
// Allocated by CoOccurrence.setupLocal as [res(0)][res(1)], i.e. indexed [xbin][ybin]
double[][] _count_data;
// Allocated by CoOccurrence.setupLocal as [res(0)][2], i.e. two values per predictor bin
double[][] _response_data;
// Table produced by tabulationTwoDimTable() at the end of execImpl
public TwoDimTable _count_table;
// Table produced by responseCharTwoDimTable() at the end of execImpl
public TwoDimTable _response_table;
/**
 * Snapshot of the per-column properties used by the binning and labelling code.
 * All values are read from the Vec once, in the constructor, and kept in final fields.
 */
private static class Stats {
  /** Smallest value of the column, as reported by the Vec. */
  final double _min;
  /** Largest value of the column, as reported by the Vec. */
  final double _max;
  /** True if the Vec reports itself as an enum (categorical) column. */
  final boolean _isEnum;
  /** True if the Vec reports itself as integer-valued. */
  final boolean _isInt;
  /** Cardinality reported by the Vec. */
  final int _cardinality;
  /** 1 if the column contains at least one NA, otherwise 0. */
  final int _missing;
  /** Domain (level names) reported by the Vec. */
  final String[] _domain;

  Stats(Vec v) {
    this._min = v.min();
    this._max = v.max();
    this._isEnum = v.isEnum();
    this._isInt = v.isInt();
    this._cardinality = v.cardinality();
    final boolean hasMissing = v.naCnt() > 0;
    this._missing = hasMissing ? 1 : 0;
    this._domain = v.domain();
  }
}
// Cached column statistics: [0] = predictor, [1] = response (filled in by execImpl)
final private Stats[] _stats = new Stats[2];
/**
 * Creates the job under a freshly made key with the description "Tabulate job".
 */
public Tabulate() {
super(Key.make(), "Tabulate job");
}
/**
 * Returns the configured number of histogram bins for a column.
 *
 * @param v column index: 1 selects the response, any other value the predictor
 * @return _nbins_response for v == 1, otherwise _nbins_predictor
 */
private int bins(int v) {
  if (v == 1) {
    return _nbins_response;
  }
  return _nbins_predictor;
}
/**
 * Returns the number of result slots needed for a column.
 * Enum columns get one slot per level, other columns one slot per histogram bin;
 * one extra slot is added when the column has missing values.
 *
 * @param v column index into _stats (0 = predictor, 1 = response)
 * @return slot count including the optional missing-value slot
 */
private int res(final int v) {
  final Stats s = _stats[v];
  final int slots = s._isEnum ? s._cardinality : bins(v);
  return slots + s._missing;
}
/**
 * Maps a value of column v to its slot index.
 * NaN always maps to slot 0. Otherwise the index is the enum level (enum columns) or the
 * equal-width histogram bin (other columns), shifted by one when the column has a
 * missing-value slot.
 *
 * @param v   column index into _stats (0 = predictor, 1 = response)
 * @param val value to place
 * @return slot index for val
 */
private int bin(final int v, final double val) {
  if (Double.isNaN(val)) {
    return 0;
  }
  final Stats s = _stats[v];
  if (s._isEnum) {
    final int level = (int) val;
    assert level == val;
    return level + s._missing;
  }
  final int nbins = bins(v);
  final double width = (s._max - s._min) / nbins;
  final int raw = (int) ((val - s._min) / width);
  assert raw >= 0 && raw <= nbins;
  // val == max lands exactly on the upper edge; fold it into the last bin
  final int clamped = Math.min(raw, nbins - 1);
  return clamped + s._missing;
}
/**
 * Builds the display label for slot b of column v.
 * Slot 0 of a column with missing values is labelled "missing(NA)"; the remaining slots
 * are labelled with the enum level name, the integer value (integer columns whose value
 * range fits into the bin count) or the formatted bin midpoint.
 *
 * @param v column index into _stats (0 = predictor, 1 = response)
 * @param b slot index, including the optional missing-value slot
 * @return label for the slot
 */
private String labelForBin(final int v, int b) {
  final Stats s = _stats[v];
  if (s._missing == 1) {
    if (b == 0) {
      return "missing(NA)";
    }
    // skip over the missing-value slot to get the real bin/level index
    b--;
  }
  if (s._isEnum) {
    return s._domain[b];
  }
  final int nbins = bins(v);
  final double span = s._max - s._min;
  if (s._isInt && span + 1 <= nbins) {
    return Integer.toString((int) (s._min + b));
  }
  final double width = span / nbins;
  final double midpoint = s._min + (b + 0.5) * width;
  return String.format("%5f", midpoint);
}
/**
 * Validates the parameters, prepares the predictor and response columns, runs the
 * co-occurrence task and builds the two result tables.
 *
 * Validation problems are collected via error(...) and reported together. A missing
 * dataset is reported right after the bin-count checks, because no column can be
 * resolved without it.
 *
 * @return the Tabulate instance carried by the finished CoOccurrence task
 * @throws H2OIllegalArgumentException if any validation error was recorded
 */
public Tabulate execImpl() {
  if (_dataset == null) {
    error("_dataset", "Dataset not found");
  }
  if (_nbins_predictor < 1) {
    error("_binsPredictor", "Number of bins for predictor must be >= 1");
  }
  if (_nbins_response < 1) {
    error("_binsResponse", "Number of bins for response must be >= 1");
  }
  // Without a frame the column lookups below would fail with a NullPointerException;
  // surface the collected validation errors instead.
  if (_dataset == null) {
    Tabulate.this.updateValidationMessages();
    throw new H2OIllegalArgumentException(validationErrors());
  }
  Vec x = _dataset.vec(_predictor);
  if (x == null) {
    error("_predictor", "Predictor column " + _predictor + " not found");
  } else {
    x = prepareVec(x, _predictor, _nbins_predictor);
  }
  Vec y = _dataset.vec(_response);
  if (y == null) {
    error("_response", "Response column " + _response + " not found");
  } else {
    y = prepareVec(y, _response, _nbins_response);
  }
  if (y != null && y.cardinality() > 2) {
    warn("_response", "Response column has more than two factor levels - mean response depends on lexicographic order of factors!");
  }
  Vec w = _dataset.vec(_weight); //can be null
  // Reject the weights if EITHER requirement is violated: they must be numeric and
  // non-negative. min() is only consulted for numeric columns (short-circuit).
  if (w != null && (!w.isNumeric() || w.min() < 0)) {
    error("_weight", "Observation weights must be numeric with values >= 0");
  }
  if (error_count() > 0) {
    Tabulate.this.updateValidationMessages();
    throw new H2OIllegalArgumentException(validationErrors());
  }
  if (x != null) {
    _vecs[0] = x._key;
    _stats[0] = new Stats(x);
  }
  if (y != null) {
    _vecs[1] = y._key;
    _stats[1] = new Stats(y);
  }
  Tabulate sp = w != null
      ? new CoOccurrence(this).doAll(x, y, w)._sp
      : new CoOccurrence(this).doAll(x, y)._sp;
  _count_table = sp.tabulationTwoDimTable();
  _response_table = sp.responseCharTwoDimTable();
  Log.info(_count_table.toString(2, false));
  Log.info(_response_table.toString(2, false));
  return sp;
}

/**
 * Returns the Vec to tabulate for one column: a column with more levels than bins is
 * reduced through an Interaction limited to nbins - 1 factors, an integer column whose
 * value range fits into the bins is converted to an enum, anything else is used as is.
 *
 * @param v      non-null source Vec of the column
 * @param column name of the column in _dataset
 * @param nbins  number of bins configured for the column
 * @return the Vec to use for tabulation
 */
private Vec prepareVec(final Vec v, final String column, final int nbins) {
  if (v.cardinality() > nbins) {
    Interaction in = new Interaction();
    in._source_frame = _dataset._key;
    in._factor_columns = new String[]{column};
    in._max_factors = nbins - 1;
    in._dest = Key.make();
    in.execImpl();
    // fetch the result column before the Interaction job itself is removed
    Vec reduced = ((Frame) DKV.getGet(in._dest)).anyVec();
    in.remove();
    return reduced;
  }
  if (v.isInt() && (v.max() - v.min() + 1) <= nbins) {
    return v.toEnum();
  }
  return v;
}
private static class CoOccurrence extends MRTask {
// Tabulate instance whose bin settings are read and whose result arrays are allocated by this task
final Tabulate _sp;
// Binds the task to the given Tabulate instance
CoOccurrence(Tabulate sp) {_sp = sp;}
/**
 * Allocates the result arrays on the bound Tabulate instance: the count matrix with one
 * row per predictor slot and one column per response slot, and the response matrix with
 * two values per predictor slot.
 */
@Override
protected void setupLocal() {
  final int xSlots = _sp.res(0);
  final int ySlots = _sp.res(1);
  _sp._count_data = new double[xSlots][ySlots];
  _sp._response_data = new double[xSlots][2];
}
/**
 * Unweighted variant: delegates to the weighted map with a constant chunk of value 1
 * that has the same length as x.
 */
@Override
public void map(Chunk x, Chunk y) {
  final C0DChunk unitWeights = new C0DChunk(1, x.len());
  map(x, y, unitWeights);
}
@Override
public void map(Chunk x, Chunk y, Chunk w) {
for (int r=0; r
© 2015 - 2025 Weber Informatics LLC | Privacy Policy