
package water.rapids;
import water.*;
import water.fvec.*;
import water.parser.BufferedString;
import water.util.IcedHashMap;
import java.util.Arrays;
/** plyr's merge: Join by any other name.
* Sample AST: (merge $leftFrame $rightFrame allLeftFlag allRightFlag byLeft byRight method)
*
* Joins two frames; all columns with the same names will be the join key. If
* you want to join on a subset of identical names, rename the columns first
* (otherwise the same column name would appear twice in the result).
*
* If the client side wants to allow named columns to be merged, the client
* side is responsible for renaming columns as needed to bring the names into
* alignment as above. This can be as simple as renaming the RHS to match the
* LHS column names. Duplicate columns NOT part of the merge are still not
* allowed - because the resulting Frame will end up with duplicate column
* names which blows a Frame invariant (uniqueness of column names).
*
* If allLeftFlag is true, all rows in the leftFrame will be included, even if
* there is no matching row in the rightFrame, and vice-versa for
* allRightFlag. Missing data will appear as NAs. Both flags can be true.
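*
* Hypothetical Rapids call (frame names and literal argument forms are
* illustrative only): join on all shared column names, keep every left row,
* drop unmatched right rows, and pick the join method automatically:
*   (merge leftFrame riteFrame 1 0 [] [] "auto")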
*/
public class ASTMerge extends ASTPrim {
@Override public String[] args() { return new String[]{"left","rite", "all_left", "all_rite", "by_left", "by_right", "method"}; }
@Override public String str(){ return "merge";}
@Override int nargs() { return 1+7; } // (merge left rite all.left all.rite by.left by.rite method)
// Size cutoff before switching between a hashed-join vs a sorting join.
// Hash tables beyond this count are assumed to be inefficient, and we're
// better served by sorting all the join columns and doing a global
// merge-join.
static final int MAX_HASH_SIZE = 120000000;
@Override
public Val apply(Env env, Env.StackHelp stk, AST asts[]) {
Frame l = stk.track(asts[1].exec(env)).getFrame();
Frame r = stk.track(asts[2].exec(env)).getFrame();
boolean allLeft = asts[3].exec(env).getNum() == 1;
boolean allRite = asts[4].exec(env).getNum() == 1;
int[] byLeft = check(asts[5]);
int[] byRight = check(asts[6]);
String method = asts[7].exec(env).getStr();
// Look for the set of columns in common; resort left & right to make the
// leading prefix of column names match. Bail out if we find any weird
// column types.
int ncols=0; // Number of columns in common
for( int i=0; i<l._names.length; i++ )
if( r.find(l._names[i]) != -1 ) ncols++;
// ... (column re-ordering so the join columns lead, join-column type checks,
//      and by_left/by_right/method handling elided in this listing) ...
// Pick which frame to walk and which to hash & replicate: walk the "all"
// side when exactly one flag is set, otherwise walk the larger frame.
boolean walkLeft;
if( allLeft == allRite ) {
walkLeft = l.numRows() > r.numRows();
} else {
walkLeft = allLeft;
}
Frame walked = walkLeft ? l : r;
Frame hashed = walkLeft ? r : l;
if( !walkLeft ) { boolean tmp = allLeft; allLeft = allRite; allRite = tmp; }
// Build categorical mappings, to rapidly convert categoricals from the
// distributed set to the hashed & replicated set.
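// (Illustration, hypothetical domains: if one side codes {"a","b","c"} as
// 0,1,2 and the other codes {"b","c","d"} as 0,1,2, an id_map translates one
// side's integer codes into the other's so raw codes compare equal during
// the hash lookup.)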
int[][] id_maps = new int[ncols][];
for( int i=0; i<ncols; i++ ) {
// ... (per-column categorical domain-map construction elided in this listing) ...
}
// Build the per-node hash set over the hashed frame, keyed on the join
// columns, counting its size as we go.
final MergeSet ms = new MergeSet(ncols,id_maps,allRite).doAll(hashed);
final Key uniq = ms._uniq;
IcedHashMap<Row,String> rows = MergeSet.MERGE_SETS.get(uniq)._rows;
new MRTask() { @Override public void setupLocal() { MergeSet.MERGE_SETS.remove(uniq); } }.doAllNodes();
if (method.equals("auto") && (rows == null || rows.size() > MAX_HASH_SIZE )) // Blew out hash size; switch to a sorting join. Matt: even with 0, rows was size 3 hence added ||
return sortingMerge(l,r,allLeft,allRite,ncols,id_maps);
// All of the walked set, and no dup handling on the right - which means no
// need to replicate rows of the walked dataset. Simple 1-pass over the
// walked set adding in columns (or NAs) from the right.
if( allLeft && !(allRite && ms._dup) ) {
// The lifetime of the distributed dataset is independent of the original
// dataset, so it needs to be a deep copy.
// TODO: COW Optimization
walked = walked.deepCopy(null);
// run a global parallel work: lookup non-hashed rows in hashSet; find
// matching row; append matching column data
String[] names = Arrays.copyOfRange(hashed._names, ncols, hashed._names.length);
String[][] domains = Arrays.copyOfRange(hashed.domains(), ncols, hashed.domains().length);
byte[] types = Arrays.copyOfRange(hashed.types(),ncols,hashed.numCols());
Frame res = new AllLeftNoDupe(ncols,rows,hashed,allRite).doAll(types,walked).outputFrame(names,domains);
return new ValFrame(walked.add(res));
}
// Can be full or partial on the left, but won't necessarily do all of the
// right. Dups on right are OK (left will be replicated or dropped as needed).
if( !allRite ) {
String[] names = Arrays.copyOf(walked.names(),walked.numCols() + hashed.numCols()-ncols);
System.arraycopy(hashed.names(),ncols,names,walked.numCols(),hashed.numCols()-ncols);
String[][] domains = Arrays.copyOf(walked.domains(),walked.numCols() + hashed.numCols()-ncols);
System.arraycopy(hashed.domains(),ncols,domains,walked.numCols(),hashed.numCols()-ncols);
byte[] types = walked.types();
types = Arrays.copyOf(types,types.length+hashed.numCols()-ncols);
System.arraycopy(hashed.types(),ncols,types,walked.numCols(),hashed.numCols()-ncols);
return new ValFrame(new AllRiteWithDupJoin(ncols,rows,hashed,allLeft).doAll(types,walked).outputFrame(names,domains));
}
throw H2O.unimpl();
}
/** Use a sorting merge/join, probably because the hash table size exceeded
* MAX_HASH_SIZE; i.e. the number of unique keys in the hashed Frame exceeds
* MAX_HASH_SIZE. Join is done on the first ncol columns in both frames,
* which are already known to be not-null and have matching names and types.
* The walked and hashed frames are sorted according to allLeft; if allRite
* is set then allLeft will also be set (but not vice-versa).
*
* @param left is the LHS frame; not-null.
* @param right is the RHS frame; not-null.
* @param allLeft all rows in the LHS frame will appear in the result frame.
* @param allRite all rows in the RHS frame will appear in the result frame.
* @param ncols is the number of columns to join on, and these are ordered
* as the first ncols of both the left and right frames.
* @param id_maps if not-null denote simple integer mappings from one
* categorical column to another; the width is ncols
*/
private ValFrame sortingMerge( Frame left, Frame right, boolean allLeft, boolean allRite, int ncols, int[][] id_maps) {
int cols[] = new int[ncols];
for (int i=0; i<ncols; i++) cols[i] = i;
// ... (hand-off to the distributed sorting-merge implementation elided in this listing) ...
}
// One Row of the hash set: the join-key column values, a cached hash code,
// the absolute row number as payload, and (optionally) the absolute row
// numbers of duplicate rows sharing the same key.
private static class Row extends Iced {
final long[] _keys;   // Join-key column values
int _hash;            // Hash of _keys; not part of equality
long _row;            // Payload: absolute row number of this key
long[] _dups;         // Absolute row numbers of rows with duplicate keys
int _dupIdx;          // Number of valid entries in _dups
Row( int ncols ) { _keys = new long[ncols]; }
Row fill( final Chunk[] chks, final int[][] id_maps, final int row ) {
long hash = 0;
// ... (extraction of the join-key values into _keys, folding each into
//      hash, elided in this listing) ...
_hash = (int)(hash^(hash>>32));
_row = chks[0].start()+row; // Payload: actual absolute row number
return this;
}
@Override public int hashCode() { return _hash; }
@Override public boolean equals( Object o ) {
if( !(o instanceof Row) ) return false;
Row r = (Row)o;
return _hash == r._hash && Arrays.equals(_keys,r._keys);
}
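// Record another absolute row number that shares this Row's key; _dups
// starts with the original row and grows by doubling when full.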
private void atomicAddDup(long row) {
synchronized (this) {
if( _dups==null ) {
_dups = new long[]{_row,row};
_dupIdx=2;
} else {
if( _dupIdx==_dups.length )
_dups = Arrays.copyOf(_dups,_dups.length << 1);
_dups[_dupIdx++]=row;
}
}
}
}
// Build a HashSet of one entire Frame, where the Key is the contents of the
// first few columns. One entry-per-row.
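// (For example, with two hypothetical join columns (id, year), the row
// (7, 2012, 3.5) is keyed on (7, 2012) and its absolute row number is the
// payload.)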
private static class MergeSet extends MRTask<MergeSet> {
// All active Merges have a per-Node hashset of one of the datasets. If
// this is missing, it means the HashMap exceeded the size bounds and the
// whole MergeSet is being aborted (gracefully) - and the Merge is
// switching to a sorting merge instead of a hashed merge.
static IcedHashMap<Key,MergeSet> MERGE_SETS = new IcedHashMap<>();
final Key _uniq; // Key to allow sharing of this MergeSet on each Node
final int _ncols; // Number of leading columns for the Hash Key
final int[][] _id_maps; // Rapid mapping between matching enums
final boolean _allRite; // Collect all rows with the same matching Key, or just the first
boolean _dup; // Dups are present at all
IcedHashMap<Row,String> _rows;
MergeSet( int ncols, int[][] id_maps, boolean allRite ) {
_uniq=Key.make(); _ncols = ncols; _id_maps = id_maps; _allRite = allRite;
}
// Per-node, make the empty hashset for later reduction
@Override public void setupLocal() {
_rows = new IcedHashMap<>();
MERGE_SETS.put(_uniq,this);
}
@Override public void map( Chunk chks[] ) {
final IcedHashMap<Row,String> rows = MERGE_SETS.get(_uniq)._rows; // Shared per-node HashMap
if( rows == null ) return; // Missing: Aborted due to exceeding size
final int len = chks[0]._len;
Row row = new Row(_ncols);
for( int i=0; i<len; i++ )                          // For all rows in the chunk
if( add(rows,row.fill(chks,_id_maps,i)) ) {         // Fill the join key & try to add
if( rows.size() > MAX_HASH_SIZE ) { abort(); return; }
row = new Row(_ncols); // If added, need a new row to fill
}
}
private boolean add( IcedHashMap<Row,String> rows, Row row ) {
if( rows.putIfAbsent(row,"")==null )
return true; // Added!
// dup handling: keys are identical
if( _allRite ) { // Collect the dups?
_dup = true; // MergeSet has dups.
rows.getk(row).atomicAddDup(row._row);
}
return false;
}
private void abort( ) { MERGE_SETS.get(_uniq)._rows = _rows = null; }
@Override public void reduce( MergeSet ms ) {
final IcedHashMap<Row,String> rows = _rows; // Shared per-node hashset
if( rows == ms._rows ) return;
if( rows == null || ms._rows == null ) { abort(); return; } // Missing: aborted due to size
for( Row row : ms._rows.keySet() )
add(rows,row); // Merge RHS into LHS, collecting dups as we go
}
}
private static abstract class JoinTask extends MRTask<JoinTask> {
protected final IcedHashMap<Row,String> _rows;
protected final int _ncols; // Number of merge columns
protected final Frame _hashed;
protected final boolean _allLeft, _allRite;
JoinTask( int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allLeft, boolean allRite ) {
_rows = rows; _ncols = ncols; _hashed = hashed; _allLeft = allLeft; _allRite = allRite;
}
protected static void addElem(NewChunk nc, Chunk c, int row) {
if( c.isNA(row) ) nc.addNA();
else if( c instanceof CStrChunk ) nc.addStr(c,row);
else if( c instanceof C16Chunk ) nc.addUUID(c,row);
else if( c.hasFloat() ) nc.addNum(c.atd(row));
else nc.addNum(c.at8(row),0);
}
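// Append one element read by absolute row from a Vec of the hashed frame,
// reusing the caller's BufferedString for string columns.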
protected static void addElem(NewChunk nc, Vec v, long absRow, BufferedString bStr) {
switch( v.get_type() ) {
case Vec.T_NUM : nc.addNum(v.at(absRow)); break;
case Vec.T_CAT :
case Vec.T_TIME: if( v.isNA(absRow) ) nc.addNA(); else nc.addNum(v.at8(absRow)); break;
case Vec.T_STR : nc.addStr(v.atStr(bStr, absRow)); break;
default: throw H2O.unimpl();
}
}
}
// Build the join-set by iterating over all the local Chunks of the walked
// dataset, doing a hash-lookup on the hashed replicated dataset, and adding
// in the matching columns.
private static class AllLeftNoDupe extends JoinTask {
AllLeftNoDupe(int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allRite) {
super(ncols, rows, hashed, true, allRite);
}
@Override public void map( Chunk chks[], NewChunk nchks[] ) {
// Shared common hash map
final IcedHashMap<Row,String> rows = _rows;
Vec[] vecs = _hashed.vecs(); // Data source from hashed set
assert vecs.length == _ncols + nchks.length;
Row row = new Row(_ncols); // Recycled Row object on the bigger dataset
BufferedString bStr = new BufferedString(); // Recycled BufferedString
int len = chks[0]._len;
for( int i=0; i<len; i++ ) {                     // For all rows in the walked chunk
Row h = rows.getk(row.fill(chks,null,i));        // Hash-lookup the join key in the replicated set
for( int c=0; c<nchks.length; c++ )              // Append the matching RHS columns, or NAs
if( h == null ) nchks[c].addNA();
else addElem(nchks[c],vecs[_ncols+c],h._row,bStr);
}
}
}
// Build the join-set by iterating over all the local Chunks of the walked
// dataset, doing a hash-lookup on the hashed replicated dataset; duplicate
// keys on the right are allowed, so walked rows are replicated or dropped as
// needed.
private static class AllRiteWithDupJoin extends JoinTask {
AllRiteWithDupJoin(int ncols, IcedHashMap<Row,String> rows, Frame hashed, boolean allLeft) {
super(ncols, rows, hashed, allLeft, true);
}
@Override public void map(Chunk[] chks, NewChunk[] nchks) {
// Shared common hash map
final IcedHashMap<Row,String> rows = _rows;
Vec[] vecs = _hashed.vecs(); // Data source from hashed set
// assert vecs.length == _ncols + nchks.length;
Row row = new Row(_ncols); // Recycled Row object on the bigger dataset
BufferedString bStr = new BufferedString(); // Recycled BufferedString
int len = chks[0]._len;
for( int i=0; i<len; i++ ) {   // For all rows in the walked chunk
// ... (hash lookup, duplicate-row expansion and column append elided in this listing) ...
}
}
}
}