
package water.rapids;
import water.*;
import water.fvec.*;
import water.util.*;
import java.util.Arrays;
/** Ddply
* Group the rows of 'data' by unique combinations of '[group-by-cols]'.
* Apply a function 'fcn' to each group Frame; 'fcn' must accept a Frame (and
* any "extra" arguments) and return a single scalar value.
*
* Returns a Frame of the grouping columns plus a single answer column, with
* one row per unique group.
*/
class ASTDdply extends ASTPrim {
@Override public String[] args() { return new String[]{"ary", "groupByCols", "fun"}; }
@Override int nargs() { return 1+3; } // (ddply data [group-by-cols] fcn )
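// Minimal illustrative call (a sketch only; the column indices and the reducer
// body are assumptions, not taken from this file, and the Rapids lambda syntax
// may differ between H2O versions):
//   (ddply data [0 1] {x . (mean (cols x [2]))})
// i.e. group 'data' by columns 0 and 1 and return, per group, the group keys
// plus the mean of column 2.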
@Override public String str() { return "ddply"; }
@Override
public Val apply(Env env, Env.StackHelp stk, AST asts[]) {
Frame fr = stk.track(asts[1].exec(env)).getFrame();
int ncols = fr.numCols();
ASTNumList groupby = ASTGroup.check(ncols, asts[2]);
int[] gbCols = groupby.expand4();
AST fun = asts[3].exec(env).getFun();
ASTFun scope = env._scope; // Current execution scope; needed to lookup variables
// Pass 1: Find all the groups (and count rows-per-group)
IcedHashMap<ASTGroup.G,String> gss = ASTGroup.doGroups(fr,gbCols,ASTGroup.aggNRows());
final ASTGroup.G[] grps = gss.keySet().toArray(new ASTGroup.G[gss.size()]);
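// grps holds one G key per unique combination of the group-by column values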
// apply an ORDER by here...
final int[] ordCols = new ASTNumList(0,gbCols.length).expand4();
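// ordCols covers all of the group-key columns, so groups sort by their keys left to right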
Arrays.sort(grps,new java.util.Comparator<ASTGroup.G>() {
// Compare 2 groups. Iterate down _gs and stop at the first i where
// g1._gs[i] != g2._gs[i]. Order by the columns listed in ordCols.
// NaN is treated as least.
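// e.g. with two order columns, keys (NaN,5), (1,NaN) and (1,3) sort as
// (NaN,5), then (1,NaN), then (1,3): (NaN,5) comes first because NaN is
// least in column 0, and (1,NaN) precedes (1,3) because NaN is least in column 1.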
@Override public int compare( ASTGroup.G g1, ASTGroup.G g2 ) {
for( int i : ordCols ) {
if( Double.isNaN(g1._gs[i]) && !Double.isNaN(g2._gs[i]) ) return -1;
if( !Double.isNaN(g1._gs[i]) && Double.isNaN(g2._gs[i]) ) return 1;
if( g1._gs[i] != g2._gs[i] ) return g1._gs[i] < g2._gs[i] ? -1 : 1;
}
return 0;
}
// I do not believe sort() calls equals() at this time, so no need to implement
@Override public boolean equals( Object o ) { throw H2O.unimpl(); }
});
// Uniquely number the groups
for( int gnum=0; gnum<grps.length; gnum++ ) { /* loop body lost from this listing */ }
// ... (the rest of apply() was lost from this listing; it presumably runs
// BuildGroup below to make one row-number Vec per group, then fires one
// RemoteRapids task per group and assembles the grouping columns plus the
// answer column into the result Frame) ...
}

// Pass-2 helper: build the per-group row-number Vecs.
// (Declaration reconstructed; implied by the map() signature and constructor below.)
private static class BuildGroup extends MRTask<BuildGroup> {
final IcedHashMap<ASTGroup.G,String> _gss;
final int[] _gbCols;
BuildGroup( int[] gbCols, IcedHashMap<ASTGroup.G,String> gss ) { _gbCols = gbCols; _gss = gss; }
@Override public void map( Chunk[] cs, NewChunk[] ncs ) {
ASTGroup.G gWork = new ASTGroup.G(_gbCols.length,null); // Working Group
for( int row=0; row<cs[0]._len; row++ ) {
// ... (the loop body and the rest of BuildGroup were lost from this listing;
// presumably each row's group is looked up via gWork in _gss and the row
// number is appended to that group's NewChunk) ...
}
}
}

// One task per group, run on the node holding that group's row-number Vec.
// (Declaration reconstructed; implied by compute2() and the fields below.)
private static class RemoteRapids extends DTask<RemoteRapids> {
private Frame _data; // Data frame
private Key _vKey; // the group to process...
private AST _fun; // the ast to execute on the group
private ASTFun _scope; // Execution environment
private double[] _result; // result is 1 row per group!
RemoteRapids( Frame data, Key vKey, AST fun, ASTFun scope) {
_data = data; _vKey=vKey; _fun=fun; _scope = scope;
}
@Override public void compute2() {
assert _vKey.home();
final Vec gvec = DKV.getGet(_vKey);
assert gvec.group().equals(_data.anyVec().group());
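// (same VectorGroup means the filter Vec's chunks line up 1-to-1 with the data frame's chunks)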
// Make a group Frame, using wrapped Vecs wrapping the original data
// frame with the filtered Vec passed in. Run the function, getting a
// scalar or a 1-row Frame back out. Delete the group Frame. Return the
// 1-row Frame as a double[] of results for this group.
// Make the subset Frame Vecs, no chunks yet
Key[] groupKeys = gvec.group().addVecs(_data.numCols());
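// addVecs reserves fresh Vec keys in the data frame's VectorGroup; combined
// with gvec._rowLayout below, the new Vecs get the same chunking and homes as the group Vec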
final Vec[] groupVecs = new Vec[_data.numCols()];
Futures fs = new Futures();
for( int i=0; i<_data.numCols(); i++ )
DKV.put(groupVecs[i] = new Vec(groupKeys[i], gvec._rowLayout, gvec.domain(), gvec.get_type()), fs);
fs.blockForPending();
// Fill in the chunks
new MRTask() {
@Override public void setupLocal() {
Vec[] data_vecs = _data.vecs();
for( int i=0; i<data_vecs.length; i++ )
// ... (the remainder of the file was lost from this listing; per the comment
// above, the group Frame is built from wrapped Vecs over the original data,
// _fun is applied to it, the temporary Frame is deleted, and the 1-row answer
// is stored in _result) ...
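// Illustrative driver (a sketch, not part of this file; assumes a Frame named
// "df" is already in the DKV and that water.rapids.Rapids.exec is the Rapids
// entry point in this H2O version):
//   Val v = Rapids.exec("(ddply df [0] {x . (mean (cols x [1]))})");
//   Frame grouped = v.getFrame();  // one row per unique value of column 0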