
package water.rapids;
import water.*;
import water.fvec.*;
import water.parser.BufferedString;
import water.util.IcedHashMap;
import java.util.Arrays;
/** plyr's merge: Join by any other name.
* Sample AST: (merge $leftFrame $rightFrame allLeftFlag allRightFlag byLeft byRight method)
*
* Joins two frames; all columns with the same names will be the join key. If
* you want to join on a subset of identical names, rename the columns first
* (otherwise the same column name would appear twice in the result).
*
* If the client side wants to allow named columns to be merged, the client
* side is responsible for renaming columns as needed to bring the names into
* alignment as above. This can be as simple as renaming the RHS to match the
* LHS column names. Duplicate columns NOT part of the merge are still not
* allowed - because the resulting Frame will end up with duplicate column
* names which blows a Frame invariant (uniqueness of column names).
*
* If allLeftFlag is true, all rows in the leftFrame will be included, even if
* there is no matching row in the rightFrame, and vice-versa for
* allRightFlag. Missing data will appear as NAs. Both flags can be true.
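*
* Hypothetical Rapids call (frame names and literal argument forms are
* illustrative only): join on all shared column names, keep every left row,
* drop unmatched right rows, and pick the join method automatically:
*   (merge leftFrame riteFrame 1 0 [] [] "auto")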
*/
public class ASTMerge extends ASTPrim {
@Override public String[] args() { return new String[]{"left","rite", "all_left", "all_rite", "by_left", "by_right", "method"}; }
@Override public String str(){ return "merge";}
@Override int nargs() { return 1+7; } // (merge left rite all.left all.rite by.left by.rite method)
// Size cutoff before switching between a hashed-join vs a sorting join.
// Hash tables beyond this count are assumed to be inefficient, and we're
// better served by sorting all the join columns and doing a global
// merge-join.
static final int MAX_HASH_SIZE = 120000000;
@Override
public Val apply(Env env, Env.StackHelp stk, AST asts[]) {
Frame l = stk.track(asts[1].exec(env)).getFrame();
Frame r = stk.track(asts[2].exec(env)).getFrame();
boolean allLeft = asts[3].exec(env).getNum() == 1;
boolean allRite = asts[4].exec(env).getNum() == 1;
int[] byLeft = check(asts[5]);
int[] byRight = check(asts[6]);
String method = asts[7].exec(env).getStr();
// Look for the set of columns in common; resort left & right to make the
// leading prefix of column names match. Bail out if we find any weird
// column types.
int ncols=0; // Number of columns in common
for( int i=0; i<l._names.length; i++ )
if( r.find(l._names[i]) != -1 ) ncols++;
// ... (column re-ordering so the join columns lead, join-column type checks,
//      and by_left/by_right/method handling elided in this listing) ...
// Pick which frame to walk and which to hash & replicate: walk the "all"
// side when exactly one flag is set, otherwise walk the larger frame.
boolean walkLeft;
if( allLeft == allRite ) {
walkLeft = l.numRows() > r.numRows();
} else {
walkLeft = allLeft;
}
Frame walked = walkLeft ? l : r;
Frame hashed = walkLeft ? r : l;
if( !walkLeft ) { boolean tmp = allLeft; allLeft = allRite; allRite = tmp; }
// Build categorical mappings, to rapidly convert categoricals from the
// distributed set to the hashed & replicated set.
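// (Illustration, hypothetical domains: if one side codes {"a","b","c"} as
// 0,1,2 and the other codes {"b","c","d"} as 0,1,2, an id_map translates one
// side's integer codes into the other's so raw codes compare equal during
// the hash lookup.)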
int[][] id_maps = new int[ncols][];
for( int i=0; i<ncols; i++ ) {
// ... (per-column categorical domain-map construction elided in this listing) ...
}
// Build the per-node hash set over the hashed frame, keyed on the join
// columns, counting its size as we go.
final MergeSet ms = new MergeSet(ncols,id_maps,allRite).doAll(hashed);
final Key uniq = ms._uniq;
IcedHashMap<Row,String> rows = MergeSet.MERGE_SETS.get(uniq)._rows;
new MRTask() { @Override public void setupLocal() { MergeSet.MERGE_SETS.remove(uniq); } }.doAllNodes();
if (method.equals("auto") && (rows == null || rows.size() > MAX_HASH_SIZE )) // Blew out hash size; switch to a sorting join. Matt: even with 0, rows was size 3 hence added ||
return sortingMerge(l,r,allLeft,allRite,ncols,id_maps);
// All of the walked set, and no dup handling on the right - which means no
// need to replicate rows of the walked dataset. Simple 1-pass over the
// walked set adding in columns (or NAs) from the right.
if( allLeft && !(allRite && ms._dup) ) {
// The lifetime of the distributed dataset is independent of the original
// dataset, so it needs to be a deep copy.
// TODO: COW Optimization
walked = walked.deepCopy(null);
// run a global parallel work: lookup non-hashed rows in hashSet; find
// matching row; append matching column data
String[] names = Arrays.copyOfRange(hashed._names, ncols, hashed._names.length);
String[][] domains = Arrays.copyOfRange(hashed.domains(), ncols, hashed.domains().length);
byte[] types = Arrays.copyOfRange(hashed.types(),ncols,hashed.numCols());
Frame res = new AllLeftNoDupe(ncols,rows,hashed,allRite).doAll(types,walked).outputFrame(names,domains);
return new ValFrame(walked.add(res));
}
// Can be full or partial on the left, but won't necessarily do all of the
// right. Dups on right are OK (left will be replicated or dropped as needed).
if( !allRite ) {
String[] names = Arrays.copyOf(walked.names(),walked.numCols() + hashed.numCols()-ncols);
System.arraycopy(hashed.names(),ncols,names,walked.numCols(),hashed.numCols()-ncols);
String[][] domains = Arrays.copyOf(walked.domains(),walked.numCols() + hashed.numCols()-ncols);
System.arraycopy(hashed.domains(),ncols,domains,walked.numCols(),hashed.numCols()-ncols);
byte[] types = walked.types();
types = Arrays.copyOf(types,types.length+hashed.numCols()-ncols);
System.arraycopy(hashed.types(),ncols,types,walked.numCols(),hashed.numCols()-ncols);
return new ValFrame(new AllRiteWithDupJoin(ncols,rows,hashed,allLeft).doAll(types,walked).outputFrame(names,domains));
}
throw H2O.unimpl();
}
/** Use a sorting merge/join, probably because the hash table size exceeded
* MAX_HASH_SIZE; i.e. the number of unique keys in the hashed Frame exceeds
* MAX_HASH_SIZE. Join is done on the first ncol columns in both frames,
* which are already known to be not-null and have matching names and types.
* The walked and hashed frames are sorted according to allLeft; if allRite
* is set then allLeft will also be set (but not vice-versa).
*
* @param left is the LHS frame; not-null.
* @param right is the RHS frame; not-null.
* @param allLeft all rows in the LHS frame will appear in the result frame.
* @param allRite all rows in the RHS frame will appear in the result frame.
* @param ncols is the number of columns to join on, and these are ordered
* as the first ncols of both the left and right frames.
* @param id_maps if not-null denote simple integer mappings from one
* categorical column to another; the width is ncols
*/
private ValFrame sortingMerge( Frame left, Frame right, boolean allLeft, boolean allRite, int ncols, int[][] id_maps) {
int cols[] = new int[ncols];
for (int i=0; i<ncols; i++) cols[i] = i;
// ... (hand-off to the distributed sorting-merge implementation elided in this listing) ...
}
// One Row of the hash set: the join-key column values, a cached hash code,
// the absolute row number as payload, and (optionally) the absolute row
// numbers of duplicate rows sharing the same key.
private static class Row extends Iced {
final long[] _keys;   // Join-key column values
int _hash;            // Hash of _keys; not part of equality
long _row;            // Payload: absolute row number of this key
long[] _dups;         // Absolute row numbers of rows with duplicate keys
int _dupIdx;          // Number of valid entries in _dups
Row( int ncols ) { _keys = new long[ncols]; }
Row fill( final Chunk[] chks, final int[][] id_maps, final int row ) {
long hash = 0;
// ... (extraction of the join-key values into _keys, folding each into
//      hash, elided in this listing) ...
_hash = (int)(hash^(hash>>32));
_row = chks[0].start()+row; // Payload: actual absolute row number
return this;
}
@Override public int hashCode() { return _hash; }
@Override public boolean equals( Object o ) {
if( !(o instanceof Row) ) return false;
Row r = (Row)o;
return _hash == r._hash && Arrays.equals(_keys,r._keys);
}
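// Record another absolute row number that shares this Row's key; _dups
// starts with the original row and grows by doubling when full.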
private void atomicAddDup(long row) {
synchronized (this) {
if( _dups==null ) {
_dups = new long[]{_row,row};
_dupIdx=2;
} else {
if( _dupIdx==_dups.length )
_dups = Arrays.copyOf(_dups,_dups.length << 1);
_dups[_dupIdx++]=row;
}
}
}
}
// Build a HashSet of one entire Frame, where the Key is the contents of the
// first few columns. One entry-per-row.
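// (For example, with two hypothetical join columns (id, year), the row
// (7, 2012, 3.5) is keyed on (7, 2012) and its absolute row number is the
// payload.)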
private static class MergeSet extends MRTask<MergeSet> {
// All active Merges have a per-Node hashset of one of the datasets. If
// this is missing, it means the HashMap exceeded the size bounds and the
// whole MergeSet is being aborted (gracefully) - and the Merge is
// switching to a sorting merge instead of a hashed merge.
static IcedHashMap<Key,MergeSet> MERGE_SETS = new IcedHashMap<>();
final Key _uniq; // Key to allow sharing of this MergeSet on each Node
final int _ncols; // Number of leading columns for the Hash Key
final int[][] _id_maps; // Rapid mapping between matching enums
final boolean _allRite; // Collect all rows with the same matching Key, or just the first
boolean _dup; // Dups are present at all
IcedHashMap<Row,String> _rows;
MergeSet( int ncols, int[][] id_maps, boolean allRite ) {
_uniq=Key.make(); _ncols = ncols; _id_maps = id_maps; _allRite = allRite;
}
// Per-node, make the empty hashset for later reduction
@Override public void setupLocal() {
_rows = new IcedHashMap<>();
MERGE_SETS.put(_uniq,this);
}
@Override public void map( Chunk chks[] ) {
final IcedHashMap<Row,String> rows = MERGE_SETS.get(_uniq)._rows; // Shared per-node HashMap
if( rows == null ) return; // Missing: Aborted due to exceeding size
final int len = chks[0]._len;
Row row = new Row(_ncols);
for( int i=0; i<len; i++ )                          // For all rows in the chunk
if( add(rows,row.fill(chks,_id_maps,i)) ) {         // Fill the join key & try to add
if( rows.size() > MAX_HASH_SIZE ) { abort(); return; }
row = new Row(_ncols); // If added, need a new row to fill
}
}
private boolean add( IcedHashMap<Row,String> rows, Row row ) {
if( rows.putIfAbsent(row,"")==null )
return true; // Added!
// dup handling: keys are identical
if( _allRite ) { // Collect the dups?
_dup = true; // MergeSet has dups.
rows.getk(row).atomicAddDup(row._row);
}
return false;
}
private void abort( ) { MERGE_SETS.get(_uniq)._rows = _rows = null; }
@Override public void reduce( MergeSet ms ) {
final IcedHashMap<Row,String> rows = _rows; // Shared per-node hashset
if( rows == ms._rows ) return;
if( rows == null || ms._rows == null ) { abort(); return; } // Missing: aborted due to size
for( Row row : ms._rows.keySet() )
add(rows,row); // Merge RHS into LHS, collecting dups as we go
}
}
private static abstract class JoinTask extends MRTask<JoinTask> {
protected final IcedHashMap<Row,String> _rows;
protected final int _ncols; // Number of merge columns
protected final Frame _hashed;
protected final boolean _allLeft, _allRite;
JoinTask( int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allLeft, boolean allRite ) {
_rows = rows; _ncols = ncols; _hashed = hashed; _allLeft = allLeft; _allRite = allRite;
}
protected static void addElem(NewChunk nc, Chunk c, int row) {
if( c.isNA(row) ) nc.addNA();
else if( c instanceof CStrChunk ) nc.addStr(c,row);
else if( c instanceof C16Chunk ) nc.addUUID(c,row);
else if( c.hasFloat() ) nc.addNum(c.atd(row));
else nc.addNum(c.at8(row),0);
}
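// Append one element read by absolute row from a Vec of the hashed frame,
// reusing the caller's BufferedString for string columns.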
protected static void addElem(NewChunk nc, Vec v, long absRow, BufferedString bStr) {
switch( v.get_type() ) {
case Vec.T_NUM : nc.addNum(v.at(absRow)); break;
case Vec.T_CAT :
case Vec.T_TIME: if( v.isNA(absRow) ) nc.addNA(); else nc.addNum(v.at8(absRow)); break;
case Vec.T_STR : nc.addStr(v.atStr(bStr, absRow)); break;
default: throw H2O.unimpl();
}
}
}
// Build the join-set by iterating over all the local Chunks of the walked
// dataset, doing a hash-lookup on the hashed replicated dataset, and adding
// in the matching columns.
private static class AllLeftNoDupe extends JoinTask {
AllLeftNoDupe(int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allRite) {
super(ncols, rows, hashed, true, allRite);
}
@Override public void map( Chunk chks[], NewChunk nchks[] ) {
// Shared common hash map
final IcedHashMap<Row,String> rows = _rows;
Vec[] vecs = _hashed.vecs(); // Data source from hashed set
assert vecs.length == _ncols + nchks.length;
Row row = new Row(_ncols); // Recycled Row object on the bigger dataset
BufferedString bStr = new BufferedString(); // Recycled BufferedString
int len = chks[0]._len;
for( int i=0; i<len; i++ ) {                     // For all rows in the walked chunk
Row h = rows.getk(row.fill(chks,null,i));        // Hash-lookup the join key in the replicated set
for( int c=0; c<nchks.length; c++ )              // Append the matching RHS columns, or NAs
if( h == null ) nchks[c].addNA();
else addElem(nchks[c],vecs[_ncols+c],h._row,bStr);
}
}
}
// Build the join-set by iterating over all the local Chunks of the walked
// dataset, doing a hash-lookup on the hashed replicated dataset; duplicate
// keys on the right are allowed, so walked rows are replicated or dropped as
// needed.
private static class AllRiteWithDupJoin extends JoinTask {
AllRiteWithDupJoin(int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allLeft) {
super(ncols, rows, hashed, allLeft, true);
}
@Override public void map(Chunk[] chks, NewChunk[] nchks) {
// Shared common hash map
final IcedHashMap<Row,String> rows = _rows;
Vec[] vecs = _hashed.vecs(); // Data source from hashed set
// assert vecs.length == _ncols + nchks.length;
Row row = new Row(_ncols); // Recycled Row object on the bigger dataset
BufferedString bStr = new BufferedString(); // Recycled BufferedString
int len = chks[0]._len;
for( int i=0; i<len; i++ ) {   // For all rows in the walked chunk
// ... (hash lookup, duplicate-row expansion and column append elided in this listing) ...
}
}
}
}