package water.rapids;

import water.*;
import water.fvec.*;
import water.nbhm.*;
import java.util.Arrays;


/** plyr's merge: Join by any other name.
 *  Sample AST: (merge $leftFrame $rightFrame allLeftFlag allRightFlag)
 *
 *  Joins two frames; all columns with the same names will be the join key.  If
 *  you want to join on a subset of identical names, rename the columns first
 *  (otherwise the same column name would appear twice in the result).
 *
 *  If allLeftFlag is true, all rows in the leftFrame will be included, even if
 *  there is no matching row in the rightFrame, and vice-versa for
 *  allRightFlag.  Missing data will appear as NAs.  Both flags can be true.
 */
public class ASTMerge extends ASTOp {
  static final String VARS[] = new String[]{ "ary", "leftary", "rightary", "allleft", "allright"};

  boolean _allLeft, _allRite;
  public ASTMerge( ) { super(VARS); }
  @Override String opStr(){ return "merge";}
  @Override ASTOp make() {return new ASTMerge();}

  @Override ASTMerge parse_impl(Exec E) {
    // get the frames to work with
    AST left = E.parse();
    AST rite = E.parse();

    AST a = E.parse();
    if( a instanceof ASTId  ) a = E._env.lookup((ASTId)a);
    if( a instanceof ASTNum ) _allLeft = ((ASTNum)a)._d==1;
    else throw new IllegalArgumentException("Argument `allLeft` expected to be a boolean.");

    a = E.parse();
    if( a instanceof ASTId ) a = E._env.lookup((ASTId)a);
    if( a instanceof ASTNum ) _allRite = ((ASTNum)a)._d==1;
    else throw new IllegalArgumentException("Argument `allRite` expected to be a boolean.");

    E.eatEnd();
    // Finish the rest
    ASTMerge res = (ASTMerge) clone();
    res._asts = new AST[]{left,rite};
    return res;
  }
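  // Illustrative only (the literal syntax varies across H2O Rapids versions):
  // parse_impl() above consumes a call shaped like the sample AST in the class
  // comment, e.g.
  //   (merge %leftFrame %rightFrame #1 #0)
  // i.e. keep every row of the left frame (allLeft=1) and only matching rows
  // of the right frame (allRight=0); unmatched cells come back as NAs.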
  @Override void exec(Env e, AST[] args) {throw H2O.fail();}
  @Override void apply(Env env) {
    Frame _l = env.popAry();
    Frame _r = env.popAry();

    Frame l = new Frame(_l.names().clone(),_l.vecs().clone());
    Frame r = new Frame(_r.names().clone(),_r.vecs().clone());
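    // Work on shallow copies so that reordering the join-key columns below
    // does not disturb the frames still referenced by the environment.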

    // Look for the set of columns in common; resort left & right to make the
    // leading prefix of column names match.  Bail out if we find any weird
    // column types.
    int ncols=0;                // Number of columns in common
    for( int i=0; i<l.numCols(); i++ ) {     // Reconstructed: gather the join key
      int idx = r.find(l.names()[i]);        // Same column name in both frames?
      if( idx != -1 ) {
        l.swap(i  ,ncols);                   // Move key columns into the leading positions
        r.swap(idx,ncols);
        ncols++;
      }
    }
    if( ncols == 0 )
      throw new IllegalArgumentException("Frames must have at least one column in common to merge them");
    // ... the rest of apply() (key-column type checks, choosing which frame to
    // hash, building the enum id_maps, launching MergeSet/DoJoin and pushing
    // the merged Frame) is missing from this copy of the source ...
  }

  // One row of the hashed frame; hashes and compares only the leading
  // join-key columns so rows can live in a NonBlockingHashSet.
  private static class Row {
    final Chunk _chks[];        // Chunks holding this row
    int[][] _enum_maps;         // Per-key-column enum domain mappings
    int _row;                   // Row number within the chunks
    int _hash;                  // Cached hash over the key fields
    Row( Chunk chks[] ) { _chks = chks; }
    // Sketch: fold the leading ncols key fields into the cached hash (atd()
    // assumed as the numeric getter; only the final fold survives from the
    // original source).
    Row fill( int row, int ncols, int[][] enum_maps ) {
      _row = row;  _enum_maps = enum_maps;
      long hash = 0;
      for( int c=0; c<ncols; c++ )
        hash += Double.doubleToRawLongBits(_chks[c].atd(row));
      _hash = (int)(hash^(hash>>32));
      return this;
    }
    @Override public int hashCode() { return _hash; }
    @Override public boolean equals( Object o ) {
      assert o instanceof Row;
      Row r = (Row)o;
      if( _hash != r._hash ) return false;
      if( _chks == r._chks && _row == r._row ) return true;
      // Now must check field contents
      int len = _enum_maps.length;
      for( int c=0; c<len; c++ )   // Compare key fields (enum remapping via _enum_maps elided)
        if( _chks[c].atd(_row) != r._chks[c].atd(r._row) )
          return false;
      return true;
    }
  }

  // Hash one entire frame per node so the bigger frame can be joined against it
  private static class MergeSet extends MRTask<MergeSet> {
    // All active Merges have a per-Node hashset of one of the datasets
    static NonBlockingHashMap<Key,MergeSet> MERGE_SETS = new NonBlockingHashMap<>();
    final Key _uniq;      // Key to allow sharing of this MergeSet on each Node
    final int _ncols;     // Number of leading columns for the Hash Key
    final int[][] _id_maps;
    final Frame _fr;      // Frame to hash-all-rows locally per-node
    transient NonBlockingHashSet<Row> _rows;

    MergeSet( int ncols, int[][] id_maps, Frame fr ) { 
      _uniq=Key.make();  _ncols = ncols;  _id_maps = id_maps; _fr = fr; 
    }
    // Per-node, hash the entire _fr dataset
    @Override public void setupLocal() {
      MERGE_SETS.put(_uniq,this);
      _rows = new NonBlockingHashSet<>();
      new MakeHash(this).doAll(_fr,true/*run locally*/);
    }
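    // Every node registers its MergeSet under the same _uniq Key, so the
    // distributed DoJoin task can later find the node-local hash set without
    // it ever being serialized over the wire.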

    // Executed locally only, build a local HashSet over the entire given dataset
    private static class MakeHash extends MRTask<MakeHash> {
      transient final MergeSet _ms;
      MakeHash( MergeSet ms ) { _ms = ms; }
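      // _ms can stay transient because MakeHash only ever runs on the local
      // node (doAll with run_local=true above), so it is never serialized.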
      @Override public void map( Chunk chks[] ) {
        int len = chks[0]._len;
        for( int i=0; i<len; i++ )   // Reconstructed: hash every local row into the shared set
          _ms._rows.add(new Row(chks).fill(i,_ms._ncols,_ms._id_maps));
      }
    }
  }

  // Probe the per-node hash set with every row of the bigger, distributed
  // frame, emitting the merged columns.
  private static class DoJoin extends MRTask<DoJoin> {
    private final int _ncols;     // Number of merge columns
    private final Key _uniq;      // Which mergeset being merged
    private final int[][] _enum_maps; // Mapping enum domains
    private final boolean _allLeft;
    DoJoin( int ncols, Key uniq, int[][] enum_maps, boolean allLeft ) {
      _ncols = ncols; _uniq = uniq; _enum_maps = enum_maps; _allLeft = allLeft;
    }
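    // Runs distributed over the bigger frame; each map() call probes the
    // node-local hash set that MergeSet built over the other frame.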
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      // Shared common hash map
      NonBlockingHashSet<Row> rows = MergeSet.MERGE_SETS.get(_uniq)._rows;
      int len = chks[0]._len;
      Row row = new Row(chks);  // Recycled Row object on the bigger dataset
      for( int i=0; i<len; i++ ) {
        // Probe the shared set with this row's key (the Row object above is
        // recycled).  Sketch only: a get()-style lookup is assumed here; the
        // original probe-and-copy body is missing from this copy.
        Row hit = rows.get(row.fill(i,_ncols,_enum_maps));
        if( hit == null ) {           // No matching row in the hashed frame
          if( _allLeft )              // Keep the row anyway, padding the new columns with NAs
            for( NewChunk nc : nchks ) nc.addNA();
        } else {
          // Matched: copy the non-key columns of the matching row into nchks
        }
      }
    }
  }
}



