
package water.rapids;
import water.*;
import water.fvec.*;
import water.util.*;
import java.util.Arrays;
/** Ddply
* Group the rows of 'data' by unique combinations of '[group-by-cols]'.
* Apply a function 'fcn' to each group Frame; 'fcn' must accept a Frame (and
* any "extra" arguments) and return a single scalar value.
*
* Returns a Frame of the grouping columns plus a single answer column, with
* one row per unique group.
*/
class ASTDdply extends ASTPrim {
@Override public String[] args() { return new String[]{"ary", "groupByCols", "fun"}; }
@Override int nargs() { return 1+3; } // (ddply data [group-by-cols] fcn )
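// Minimal illustrative call (a sketch only; the column indices and the reducer
// body are assumptions, not taken from this file, and the Rapids lambda syntax
// may differ between H2O versions):
//   (ddply data [0 1] {x . (mean (cols x [2]))})
// i.e. group 'data' by columns 0 and 1 and return, per group, the group keys
// plus the mean of column 2.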
@Override public String str() { return "ddply"; }
@Override
public Val apply(Env env, Env.StackHelp stk, AST asts[]) {
Frame fr = stk.track(asts[1].exec(env)).getFrame();
int ncols = fr.numCols();
ASTNumList groupby = ASTGroup.check(ncols, asts[2]);
int[] gbCols = groupby.expand4();
AST fun = asts[3].exec(env).getFun();
ASTFun scope = env._scope; // Current execution scope; needed to lookup variables
// Pass 1: Find all the groups (and count rows-per-group)
IcedHashMap<ASTGroup.G,String> gss = ASTGroup.doGroups(fr,gbCols,ASTGroup.aggNRows());
final ASTGroup.G[] grps = gss.keySet().toArray(new ASTGroup.G[gss.size()]);
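// grps holds one G key per unique combination of the group-by column values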
// apply an ORDER by here...
final int[] ordCols = new ASTNumList(0,gbCols.length).expand4();
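// ordCols covers all of the group-key columns, so groups sort by their keys left to right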
Arrays.sort(grps,new java.util.Comparator<ASTGroup.G>() {
// Compare 2 groups. Iterate down _gs and stop at the first i where
// g1._gs[i] != g2._gs[i]. Order by the columns listed in ordCols.
// NaN is treated as least.
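// e.g. with two order columns, keys (NaN,5), (1,NaN) and (1,3) sort as
// (NaN,5), then (1,NaN), then (1,3): (NaN,5) comes first because NaN is
// least in column 0, and (1,NaN) precedes (1,3) because NaN is least in column 1.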
@Override public int compare( ASTGroup.G g1, ASTGroup.G g2 ) {
for( int i : ordCols ) {
if( Double.isNaN(g1._gs[i]) && !Double.isNaN(g2._gs[i]) ) return -1;
if( !Double.isNaN(g1._gs[i]) && Double.isNaN(g2._gs[i]) ) return 1;
if( g1._gs[i] != g2._gs[i] ) return g1._gs[i] < g2._gs[i] ? -1 : 1;
}
return 0;
}
// I do not believe sort() calls equals() at this time, so no need to implement
@Override public boolean equals( Object o ) { throw H2O.unimpl(); }
});
// Uniquely number the groups
for( int gnum=0; gnum<grps.length; gnum++ ) { /* loop body lost from this listing */ }
// ... (the rest of apply() was lost from this listing; it presumably runs
// BuildGroup below to make one row-number Vec per group, then fires one
// RemoteRapids task per group and assembles the grouping columns plus the
// answer column into the result Frame) ...
}

// Pass-2 helper: build the per-group row-number Vecs.
// (Declaration reconstructed; implied by the map() signature and constructor below.)
private static class BuildGroup extends MRTask<BuildGroup> {
final IcedHashMap<ASTGroup.G,String> _gss;
final int[] _gbCols;
BuildGroup( int[] gbCols, IcedHashMap<ASTGroup.G,String> gss ) { _gbCols = gbCols; _gss = gss; }
@Override public void map( Chunk[] cs, NewChunk[] ncs ) {
ASTGroup.G gWork = new ASTGroup.G(_gbCols.length,null); // Working Group
for( int row=0; row<cs[0]._len; row++ ) {
// ... (the loop body and the rest of BuildGroup were lost from this listing;
// presumably each row's group is looked up via gWork in _gss and the row
// number is appended to that group's NewChunk) ...
}
}
}

// One task per group, run on the node holding that group's row-number Vec.
// (Declaration reconstructed; implied by compute2() and the fields below.)
private static class RemoteRapids extends DTask<RemoteRapids> {
private Frame _data; // Data frame
private Key _vKey; // the group to process...
private AST _fun; // the ast to execute on the group
private ASTFun _scope; // Execution environment
private double[] _result; // result is 1 row per group!
RemoteRapids( Frame data, Key vKey, AST fun, ASTFun scope) {
_data = data; _vKey=vKey; _fun=fun; _scope = scope;
}
@Override public void compute2() {
assert _vKey.home();
final Vec gvec = DKV.getGet(_vKey);
assert gvec.group().equals(_data.anyVec().group());
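// (same VectorGroup means the filter Vec's chunks line up 1-to-1 with the data frame's chunks)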
// Make a group Frame, using wrapped Vecs wrapping the original data
// frame with the filtered Vec passed in. Run the function, getting a
// scalar or a 1-row Frame back out. Delete the group Frame. Return the
// 1-row Frame as a double[] of results for this group.
// Make the subset Frame Vecs, no chunks yet
Key[] groupKeys = gvec.group().addVecs(_data.numCols());
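// addVecs reserves fresh Vec keys in the data frame's VectorGroup; combined
// with gvec._rowLayout below, the new Vecs get the same chunking and homes as the group Vec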
final Vec[] groupVecs = new Vec[_data.numCols()];
Futures fs = new Futures();
for( int i=0; i<_data.numCols(); i++ )
DKV.put(groupVecs[i] = new Vec(groupKeys[i], gvec._rowLayout, gvec.domain(), gvec.get_type()), fs);
fs.blockForPending();
// Fill in the chunks
new MRTask() {
@Override public void setupLocal() {
Vec[] data_vecs = _data.vecs();
for( int i=0; i<data_vecs.length; i++ )
// ... (the remainder of the file was lost from this listing; per the comment
// above, the group Frame is built from wrapped Vecs over the original data,
// _fun is applied to it, the temporary Frame is deleted, and the 1-row answer
// is stored in _result) ...
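// Illustrative driver (a sketch, not part of this file; assumes a Frame named
// "df" is already in the DKV and that water.rapids.Rapids.exec is the Rapids
// entry point in this H2O version):
//   Val v = Rapids.exec("(ddply df [0] {x . (mean (cols x [1]))})");
//   Frame grouped = v.getFrame();  // one row per unique value of column 0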