
water.rapids.ASTddply Maven / Gradle / Ivy
package water.rapids;
import water.*;
import water.fvec.*;
import water.util.IcedHashMap;
import water.util.IcedInt;
import java.util.ArrayList;
import java.util.HashSet;
/** plyr's ddply: GroupBy by any other name.
* Sample AST: (h2o.ddply $frame {1;5;10} $fun)
*
* First arg is the frame we'll be working over.
* Second arg is column selection to group by.
* Third arg is the function to apply to each group.
*/
public class ASTddply extends ASTOp {
long[] _cols;
String _fun;
AST[] _fun_args;
static final String VARS[] = new String[]{ "ary", "{cols}", "FUN"};
/** Creates the op, handing the fixed formal-argument names in VARS to the superclass. */
public ASTddply() {
  super(VARS);
}
/** @return the operator name of this op, {@code "h2o.ddply"} */
@Override
String opStr() {
  return "h2o.ddply";
}
/** @return a newly constructed {@code ASTddply} with no columns or function set */
@Override
ASTOp make() {
  return new ASTddply();
}
/**
 * Parses the arguments of an {@code h2o.ddply} call from the expression stream.
 *
 * <p>Reads, in order: the frame expression, the group-by column selection
 * (either an {@code ASTLongList} or a single {@code ASTNum}), the identifier of
 * the function to apply, and then every remaining expression up to the end
 * marker as extra arguments for that function.
 *
 * <p>The parsed columns, function name and extra arguments are stored on this
 * instance ({@code _cols}, {@code _fun}, {@code _fun_args}); {@code _fun_args}
 * is {@code null} when no extra arguments were supplied.
 *
 * @param E the parser state to read from
 * @return a clone of this op whose {@code _asts} holds only the frame expression
 * @throws IllegalArgumentException if the column selection is neither a long
 *         list nor a number
 */
@Override ASTddply parse_impl(Exec E) {
  // get the frame to work
  AST ary = E.parse();

  // Get the col ids
  AST s = E.parse();
  if( s instanceof ASTLongList ) _cols = ((ASTLongList)s)._l;
  else if( s instanceof ASTNum ) _cols = new long[]{(long)((ASTNum)s)._d};
  else throw new IllegalArgumentException("Columns expected to be a llist or number. Got: " + s.getClass());

  // get the fun
  _fun = ((ASTId)E.parse())._id;

  // get any fun args; the list must be typed so that toArray yields an AST[]
  ArrayList<AST> fun_args = new ArrayList<>();
  while( !E.isEnd() )
    fun_args.add(E.parse());
  _fun_args = fun_args.isEmpty() ? null : fun_args.toArray(new AST[fun_args.size()]);

  E.eatEnd();

  // the clone carries the parsed fields; only the frame is kept as a child AST
  ASTddply res = (ASTddply)clone();
  res._asts = new AST[]{ary};
  return res;
}
@Override void apply(Env env) {
Frame fr = env.popAry(); // The Frame to work on
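// sanity check cols
// NOTE(review): the test below rejects l > numCols(), so l == numCols() is
// accepted. The error message prints (l+1), which suggests 0-based ids; if so
// the bound probably should be l >= numCols() -- confirm.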
for (long l : _cols) {
if (l > fr.numCols() || l < 0) throw new IllegalArgumentException("Column "+(l+1)+" out of range for frame columns "+fr.numCols());
}
// *** LEGACY *** //
// Was pondering a SIMD-like execution model, running the fcn "once" - but
// in parallel for all groups. But this isn't going to work: each fcn
// execution will take different control paths. Also the function's side
// effects must only happen once, and they will make multiple passes over
// the Frame passed in.
//
// Group IDs can vary from 1 group to 1-per-row. They are formed by the
// cross-product of the selection cols. Will be hashed to find Group - NBHML
// mapping row-contents to group. Index is a sample row. NBHML per-node,
// plus roll-ups. Result/Value is Group structure pointing to NewChunks
// holding row indices.
//
// Pass 1: Find Groups.
// Build a NBHSet of unique double[]'s holding selection cols.
// These are the unique groups, found per-node, rolled-up globally
// Record the rows belonging to each group, locally.
// ddplyPass1 p1 = new ddplyPass1(true,_cols).doAll(fr);
// *** LEGACY *** //
// End up building a "transient" Frame for each group anyhow.
// So finding the groups and the size of each group is relatively cheap!
// Pass1A finds the number of groups and the size of each group, and also
// records the row numbers belonging to each group (stashed inside a hash map
// on the Group instead of in NewChunks).
Pass1A p1a = new Pass1A(_cols).doAll(fr); // pass 1 over all data
Group[] grps = p1a._grps.keySet().toArray(new Group[p1a._grps.size()]);
int ngrps = grps.length;
while( grps[ngrps-1] == null ) ngrps--; // chop out any null groups hanging at the end.
Group[] groups = new Group[ngrps];
System.arraycopy(grps,0,groups,0,ngrps);
grps = groups;
// pass2 here does the nominal work of building all of the groups.
// for lots of tiny groups, this is probably lots of data transfer
// this chokes the H2O cloud and can even cause it to OOM!
// this issue is addressed by ASTGroupBy
Pass2 p2;
H2O.submitTask(p2=new Pass2(fr,grps)).join();
// Pass 3: Send Groups 'round the cluster
Key[] groupFrames = p2._keys;
Pass3 p3;
(p3 = new Pass3(groupFrames,ASTOp.get(_fun).make(), grps,_fun_args)).go();
Vec layoutVec = Vec.makeZero(p3._remoteTasks.length);
final RemoteRapids[] results = p3._remoteTasks;
for( int k=0;k(); }
public Group( double ds[] ) { super(ds); }
IcedHashMap a;
}
private static class Pass1A extends MRTask {
private final long _gbCols[];
IcedHashMap _grps;
Pass1A(long[] cols) { _gbCols=cols; }
@Override public void setupLocal() { }
@Override public void map(Chunk[] c) {
_grps = new IcedHashMap<>();
Group g = new Group(_gbCols.length);
Group gOld;
int start = (int)c[0].start();
for(int i=0;i l = _grps;
IcedHashMap r = t._grps;
if( l.size() < r.size() ) { l=r; r=_grps; }
for( Group rg: r.keySet() ) {
if( l.containsKey(rg) ) { // try to add it to the set on the left.. if left already has it, then combine
Group lg = l.getk(rg);
long L = lg._N;
while(!Group.CAS_N(lg,L,L+rg._N))
L = lg._N;
}
}
_grps=l;
t._grps=null;
}
}
}
/**
 * Driver for pass 2: forks one {@link Pass2Task} per group and records, per
 * group, both the task and the key under which that group's Frame will live.
 * Groups are assigned to cloud members round-robin by group index.
 */
private static class Pass2 extends H2O.H2OCountedCompleter {
  private final Frame _fr;
  private final Group[] _grps;

  Pass2(Frame f, Group[] grps) {
    _fr = f;
    _grps = grps;
  }

  Pass2Task[] _tasks; // want to get out _key from each Pass2Task
  Key[] _keys;        // one group-frame key per group, same index as _grps

  @Override protected void compute2() {
    final int groupCount = _grps.length;
    // Every forked child calls tryComplete() on itself, so account for all
    // but one of them up front.
    addToPendingCount(groupCount - 1);

    // build subset vecs for each group...
    final int cloudSize = H2O.CLOUD.size();
    _tasks = new Pass2Task[groupCount];
    _keys = new Key[groupCount];

    for (int g = 0; g < groupCount; ++g) {
      int nodeID = g % cloudSize;
      H2ONode node = H2O.CLOUD.members()[nodeID];
      Key groupKey = Key.make(node);
      _keys[g] = groupKey;
      Pass2Task task = new Pass2Task(this, nodeID, _grps[g], _fr._key, node, groupKey);
      _tasks[g] = task;
      task.fork();
    }
  }
}
/**
 * Per-group task: collects the group's row numbers and issues a blocking RPC
 * to the target node, where a {@link BuildGroup} constructs the group's Frame.
 * After the RPC returns, the keys of the subset Vecs are copied into
 * {@code _subsetVecKeys}.
 */
private static class Pass2Task extends H2O.H2OCountedCompleter {
  // round robin spread these Vecs
  private final int _nodeID;
  private final Group _g;
  private final Key _frameKey;
  // group frame key
  Key _key;
  H2ONode _n;
  Key[] _subsetVecKeys;

  Pass2Task(H2O.H2OCountedCompleter cc, int nodeID, Group g, Key frameKey, H2ONode n, Key key) {
    super(cc);
    _nodeID = nodeID;
    _g = g;
    _frameKey = frameKey;
    _n = n;
    _key = key;
  }

  @Override protected void compute2() {
    H2ONode target = H2O.CLOUD.members()[_nodeID];
    Futures pending = new Futures();

    // Flatten the group's row set into a plain array for shipping.
    long[] rowIds = new long[_g.a.size()];
    int next = 0;
    for (IcedInt row : _g.a.keySet()) {
      rowIds[next] = row._val;
      next++;
    }

    BuildGroup builder = new BuildGroup(_key, rowIds, _frameKey);
    pending.add(RPC.call(target, builder));
    pending.blockForPending();

    _subsetVecKeys = builder._subsetVecKeys;
    tryComplete();
  }
}
/**
 * Remote task that materializes a single group as its own Frame.
 *
 * <p>It first writes the group's row numbers into a new Vec, then creates one
 * {@code SubsetVec} per column of the source Frame that refers to that rows
 * Vec, and finally stores a Frame made of those subset Vecs under {@code _key}.
 */
private static class BuildGroup extends DTask implements Freezable {
  private final Key _frameKey;   // key of the full Frame being subset
  private final Key _key;        // key under which the group's Frame is stored
  private final long[] _rows;    // row numbers that belong to the group
  private Key[] _subsetVecKeys;  // keys of the subset Vecs, kept for later removal

  BuildGroup(Key key, long[] rows, Key frameKey) {
    _key = key;
    _rows = rows;
    _frameKey = frameKey;
    // Always 1 higher priority than calling thread... because the caller will
    // block & burn a thread waiting for this task to complete.
    Thread caller = Thread.currentThread();
    if (caller instanceof H2O.FJWThr) {
      _priority = (byte) (((H2O.FJWThr) caller)._priority + 1);
    } else {
      _priority = super.priority();
    }
  }

  final private byte _priority;

  @Override public byte priority() {
    return _priority;
  }

  @Override protected void compute2() {
    assert _key.home() : "Key was not homed to this node!";
    Futures pending = new Futures();

    // A throwaway zero Vec is made only to obtain a vector group and a key in it.
    Vec layout = Vec.makeZero(_rows.length);
    Key rowsKey = layout.group().addVec();
    layout.remove();

    // Write the row numbers into a single chunk of a new Vec.
    AppendableVec rowsAppender = new AppendableVec(rowsKey);
    NewChunk chunk = new NewChunk(rowsAppender, 0);
    for (int r = 0; r < _rows.length; r++) {
      chunk.addNum(_rows[r]);
    }
    chunk.close(0, pending);
    Vec rows = rowsAppender.close(pending); // this puts into the DKV!
    pending.blockForPending();

    Frame source = DKV.getGet(_frameKey);      // the Frame we're subsetting
    Vec[] columns = source.vecs();             // full data columns
    Vec[] groupVecs = new Vec[columns.length]; // the group vecs, aligned with the rows Vec
    Key[] subsetKeys = rows.group().addVecs(columns.length);
    _subsetVecKeys = subsetKeys;               // store these for later removal...

    // Subset each column, one at a time, and publish it.
    for (int col = 0; col < columns.length; col++) {
      groupVecs[col] = new SubsetVec(subsetKeys[col], rows.get_espc(), columns[col]._key, rows._key);
      Vec subset = groupVecs[col];
      subset.setDomain(columns[col].domain());
      DKV.put(subset._key, subset);
    }

    // finally put the constructed group into the DKV
    Frame groupFrame = new Frame(_key, source._names, groupVecs);
    DKV.put(_key, groupFrame); // _key is homed to this node!
    assert _key.home(): "Key should be homed to the node! Somehow remapped during this compute2.";
    tryComplete();
  }
}
/**
 * Pass 3: for every group Frame, sends a {@link RemoteRapids} task to the
 * Frame key's home node and waits for all of them. The tasks are retained in
 * {@code _remoteTasks}, indexed like the frame keys.
 */
private static class Pass3 {
  private final Key[] _frameKeys;
  private final ASTOp _FUN;
  private final Group[] _grps;
  private final AST[] _funArgs;
  RemoteRapids[] _remoteTasks;

  Pass3(Key[] frameKeys, ASTOp FUN, Group[] grps, AST[] args) {
    _frameKeys = frameKeys;
    _FUN = FUN;
    _grps = grps;
    _funArgs = args;
    // gather up the remote tasks...
    _remoteTasks = new RemoteRapids[_frameKeys.length];
  }

  // stupid single threaded pass over all groups...
  private void go() {
    Futures pending = new Futures();
    int idx = 0;
    while (idx < _frameKeys.length) {
      Key frameKey = _frameKeys[idx];
      assert DKV.getGet(frameKey) !=null : "Frame #" + idx + " was NULL: " + frameKey;
      H2ONode home = frameKey.home_node();
      RemoteRapids task = new RemoteRapids(frameKey, _FUN, _funArgs, _grps[idx]._ds);
      _remoteTasks[idx] = task;
      pending.add(RPC.call(home, task));
      ++idx;
    }
    pending.blockForPending();
  }
}
/**
 * Remote task that applies the user function to one group's Frame and packs
 * the outcome into a single row of doubles.
 *
 * After compute2, _result holds the group's key values (_ds) followed by the
 * function's result values. _result is left unassigned (null) when the
 * function leaves a nul value on the Env.
 */
private static class RemoteRapids extends DTask implements Freezable {
private final Key _frameKey; // the group to process...
private final ASTOp _FUN; // the ast to execute on the group
private final AST[] _funArgs; // any additional arguments to the _FUN
private final double[] _ds; // the "group" itself
private double[] _result; // result is 1 row per group!
/**
 * @param frameKey key of the group's Frame
 * @param FUN      op to execute on the group; a fresh copy is made via make() before exec
 * @param args     extra arguments passed after the group Frame, or null for none
 * @param ds       the group's key values, copied to the front of the result row
 */
RemoteRapids(Key frameKey, ASTOp FUN, AST[] args, double[] ds) {
_frameKey=frameKey; _FUN=FUN; _funArgs=args; _ds=ds;
// Always 1 higher priority than calling thread... because the caller will
// block & burn a thread waiting for this MRTask to complete.
Thread cThr = Thread.currentThread();
_priority = (byte)((cThr instanceof H2O.FJWThr) ? ((H2O.FJWThr)cThr)._priority+1 : super.priority());
}
final private byte _priority;
/** @return the priority fixed at construction time */
@Override public byte priority() { return _priority; }
/**
 * Runs the function on the group Frame, captures its result, then deletes
 * the group Frame.
 *
 * @throws IllegalArgumentException if the function returns a Frame whose row
 *         count is not 1
 */
@Override public void compute2() {
assert _frameKey.home();
Env e = Env.make(new HashSet());
Frame groupFrame = DKV.getGet(_frameKey);
assert groupFrame!=null : "Frame ID: " + _frameKey;
// Argument list is the group Frame first, then any extra user arguments.
AST[] args = new AST[_funArgs==null?1:_funArgs.length+1];
args[0] = new ASTFrame(groupFrame);
if( _funArgs!=null ) System.arraycopy(_funArgs,0,args,1,_funArgs.length);
_FUN.make().exec(e,args);
if( !e.isNul() ) {
// grab up the results
Frame fr = null;
// A Frame result is popped here; fr stays null for a non-Frame result.
if (e.isAry() && (fr = e.popAry()).numRows() != 1)
throw new IllegalArgumentException("Result of ddply can only return 1 row but instead returned " + fr.numRows());
// A non-Frame result contributes exactly one value.
int ncols = fr == null ? 1 : fr.numCols();
_result = new double[_ds.length + ncols]; // fill in the results
System.arraycopy(_ds, 0, _result, 0, _ds.length);
int j = _ds.length;
for (int i = 0; i < ncols; ++i) {
// A string result is encoded as 1 when it equals "TRUE" and 0 otherwise.
if (e.isStr()) _result[j++] = e.popStr().equals("TRUE") ? 1 : 0;
else if (e.isNum()) _result[j++] = e.popDbl();
// Frame result: take row 0 of column i.
else if (fr != null) _result[j++] = fr.vecs()[i].at(0);
}
}
// NOTE(review): this delete is skipped if exec or the row-count check above
// throws, which would leave the group Frame behind -- confirm whether the
// caller cleans up in that case.
groupFrame.delete();
tryComplete();
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy