
package water.rapids;
// Since we have a single key field in H2O (unlike data.table), bmerge() becomes a lot simpler (no
// need for recursion through join columns), with the downside of transfer cost when we don't need all of the key.
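// Editor's sketch (not part of this class): the heart of a single-key bmerge is one binary
// search per left key over the sorted right keys, widened to cover the run of duplicates.
// A minimal self-contained version over plain long keys (names hypothetical; the real code
// below works on packed byte keys and widens both ends by binary search, not a linear scan):
/*
static long[] bmerge1(long[] sortedRight, long key) {
  int lo = 0, hi = sortedRight.length - 1, first = -1;
  while (lo <= hi) {                          // leftmost-match binary search
    int mid = (lo + hi) >>> 1;
    if (sortedRight[mid] < key) lo = mid + 1;
    else { if (sortedRight[mid] == key) first = mid; hi = mid - 1; }
  }
  if (first == -1) return new long[]{0, 0};   // 0 means 'no match', as in _retFirst below
  int last = first;
  while (last + 1 < sortedRight.length && sortedRight[last + 1] == key) last++;
  return new long[]{first + 1, last - first + 1};  // 1-based first match, match length
}
*/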
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import static water.rapids.SingleThreadRadixOrder.getSortedOXHeaderKey;
import water.util.ArrayUtils;
import java.util.Arrays;
public class BinaryMerge extends DTask<BinaryMerge> {
long _numRowsInResult=0; // returned to caller, so not transient
int _chunkSizes[]; // TODO: only _chunkSizes.length is needed by caller, so return that length only
double _timings[];
transient long _retFirst[/*n2GB*/][]; // The row number of the first right table's index key that matches
transient long _retLen[/*n2GB*/][]; // How many rows does it match to?
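// Example of the encoding (implied by the f==0 checks in createChunksInDKV() below): if sorted
// left row i matches sorted right rows 7,8,9 then _retFirst[..][i]==8 (1-based) and
// _retLen[..][i]==3; _retFirst[..][i]==0 means that left row matches no right row.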
transient byte _leftKey[/*n2GB*/][/*i mod 2GB * _keySize*/];
transient byte _rightKey[][];
transient long _leftOrder[/*n2GB*/][/*i mod 2GB*/];
transient long _rightOrder[][];
transient boolean _oneToManyMatch = false; // does any left row match to more than 1 right row? If not, can allocate and loop more efficiently, and mark the resulting key'd frame with a 'unique' index.
// TODO: implement
int _leftFieldSizes[], _rightFieldSizes[]; // the widths of each column in the key
long _leftColMins[], _rightColMins[]; // the col.min() of each column in the key
transient int _leftKeyNCol, _rightKeyNCol; // the number of columns in the key i.e. length of _leftFieldSizes and _rightFieldSizes
transient int _leftKeySize, _rightKeySize; // the total width in bytes of the key, sum of field sizes
transient int _numJoinCols;
transient long _leftN, _rightN;
transient long _leftBatchSize, _rightBatchSize;
Frame _leftFrame, _rightFrame;
transient long _perNodeNumRightRowsToFetch[];
transient long _perNodeNumLeftRowsToFetch[];
int _leftMSB, _rightMSB;
boolean _allLeft, _allRight;
Vec _leftVec, _rightVec;
transient int _leftChunkNode[], _rightChunkNode[]; // fast lookups to save repeated calls to node.index(), which does a binary search internally.
BinaryMerge(Frame leftFrame, Frame rightFrame, int leftMSB, int rightMSB, int leftFieldSizes[], int rightFieldSizes[], long leftColMins[], long rightColMins[], boolean allLeft) { // In X[Y], 'left'=i and 'right'=x
_leftFrame = leftFrame;
_rightFrame = rightFrame;
_leftMSB = leftMSB;
_rightMSB = rightMSB;
_leftFieldSizes = leftFieldSizes; _rightFieldSizes = rightFieldSizes;
_leftColMins = leftColMins; _rightColMins = rightColMins;
_allLeft = allLeft;
_allRight = false; // TODO: pass through
// TODO: set 2 Frame and 2 int[] to NULL at the end of compute2 to save some traffic back, but should be small and insignificant
}
@Override
public void compute2() {
_timings = new double[20];
long t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
SingleThreadRadixOrder.OXHeader leftSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/true, _leftMSB));
if (leftSortedOXHeader == null) {
if (_allRight) throw H2O.unimpl(); // TODO pass through _allRight and implement
tryComplete(); return;
}
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
SingleThreadRadixOrder.OXHeader rightSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/false, _rightMSB));
if (rightSortedOXHeader == null) {
if (_allLeft == false) { tryComplete(); return; }
rightSortedOXHeader = new SingleThreadRadixOrder.OXHeader(0, 0, 0); // enables general case code to run below without needing new special case code
}
// both left and right MSB have some data to match
_leftBatchSize = leftSortedOXHeader._batchSize;
_rightBatchSize = rightSortedOXHeader._batchSize;
_perNodeNumRightRowsToFetch = new long[H2O.CLOUD.size()];
_perNodeNumLeftRowsToFetch = new long[H2O.CLOUD.size()];
// get left batches
_leftKey = new byte[leftSortedOXHeader._nBatch][];
_leftOrder = new long[leftSortedOXHeader._nBatch][];
_retFirst = new long[leftSortedOXHeader._nBatch][];
_retLen = new long[leftSortedOXHeader._nBatch][];
for (int b=0; b<leftSortedOXHeader._nBatch; b++) {
  // ... (elided in this listing: fetch each batch of sorted left keys and orders into
  // _leftKey/_leftOrder and allocate _retFirst/_retLen; the matching right-side fetch, the
  // key-width bookkeeping, the top-level call into the recursive binary search, and the body
  // of bmerge_r(long lLowIn, long lUppIn, long rLowIn, long rUppIn) down to its tail below) ...
}
// Tail of the recursive bmerge_r(): once this level has resolved its match ranges, recurse
// into the unresolved remainders on either side:
if (lLow > lLowIn && (rLow > rLowIn || _allLeft)) // '|| _allLeft' is needed here in H2O (but not data.table) so that _perNodeNumLeftRowsToFetch above gets populated and the assert near the end of compute2() passes.
bmerge_r(lLowIn, lLow + 1, rLowIn, rLow+1);
if (lUpp < lUppIn && (rUpp < rUppIn || _allLeft))
bmerge_r(lUpp-1, lUppIn, rUpp-1, rUppIn);
// We don't feel tempted to reduce the global _ansN here and make a global frame,
// since we want to process each MSB l/r combo individually without allocating them all.
// Since this method is recursive, no more code should follow these calls (it would run once per recursion).
}
private void createChunksInDKV() {
// Collect all matches
// Create the final frame (part) for this MSB combination
// Cannot use a List as that's restricted to 2Bn items and also isn't an Iced datatype
long t0 = System.nanoTime();
long perNodeRightRows[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeRightRowsFrom[][][] = new long[H2O.CLOUD.size()][][];
long perNodeRightLoc[] = new long[H2O.CLOUD.size()];
long perNodeLeftRows[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeLeftRowsFrom[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeLeftRowsRepeat[][][] = new long[H2O.CLOUD.size()][][];
long perNodeLeftLoc[] = new long[H2O.CLOUD.size()];
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}
*/
// Allocate memory to split this MSB combn's left and right matching rows into contiguous batches sent to the nodes they reside on
int batchSize = 256*1024*1024 / 8; // 256MB DKV value-size limit / sizeof(long)
int thisNode = H2O.SELF.index();
for (int i = 0; i < H2O.CLOUD.size(); i++) {
if (_perNodeNumRightRowsToFetch[i] > 0) {
int nbatch = (int) ((_perNodeNumRightRowsToFetch[i] - 1) / batchSize + 1); // TODO: wrap in a class to avoid this boilerplate
int lastSize = (int) (_perNodeNumRightRowsToFetch[i] - (nbatch - 1) * batchSize);
System.out.println("Sending " +_perNodeNumRightRowsToFetch[i]+ " row requests to node " +i+ " in " +nbatch+ " batches from node " +thisNode+ " for rightMSB " +_rightMSB);
assert nbatch >= 1;
assert lastSize > 0;
perNodeRightRows[i] = new long[nbatch][];
//perNodeRightRowsFrom[i] = new long[nbatch][];
int b;
for (b = 0; b < nbatch - 1; b++) {
perNodeRightRows[i][b] = new long[batchSize]; // TO DO?: use MemoryManager.malloc()
//perNodeRightRowsFrom[i][b] = new long[batchSize];
}
perNodeRightRows[i][b] = new long[lastSize];
//perNodeRightRowsFrom[i][b] = new long[lastSize];
}
if (_perNodeNumLeftRowsToFetch[i] > 0) {
int nbatch = (int) ((_perNodeNumLeftRowsToFetch[i] - 1) / batchSize + 1); // TODO: wrap in a class to avoid this boilerplate
int lastSize = (int) (_perNodeNumLeftRowsToFetch[i] - (nbatch - 1) * batchSize);
System.out.println("Sending " +_perNodeNumLeftRowsToFetch[i]+ " row requests to node " +i+ " in " +nbatch+ " batches from node " +thisNode+ " for leftMSB " + _leftMSB);
assert nbatch >= 1;
assert lastSize > 0;
perNodeLeftRows[i] = new long[nbatch][];
//perNodeLeftRowsFrom[i] = new long[nbatch][];
//perNodeLeftRowsRepeat[i] = new long[nbatch][];
int b;
for (b = 0; b < nbatch - 1; b++) {
perNodeLeftRows[i][b] = new long[batchSize]; // TO DO?: use MemoryManager.malloc()
//perNodeLeftRowsFrom[i][b] = new long[batchSize];
//perNodeLeftRowsRepeat[i][b] = new long[batchSize];
}
perNodeLeftRows[i][b] = new long[lastSize];
//perNodeLeftRowsFrom[i][b] = new long[lastSize];
//perNodeLeftRowsRepeat[i][b] = new long[lastSize];
}
}
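// Editor's sketch: the TODO above ("wrap in a class to avoid this boilerplate") might look
// like the hypothetical helper below, which hides the nbatch/lastSize arithmetic. For example,
// with batchSize = 256*1024*1024/8 = 33,554,432 longs, 70,000,000 rows split into nbatch = 3
// batches of sizes {33,554,432, 33,554,432, 2,891,136}.
/*
static final class BatchedLongs {
  final long[][] _b; final int _batchSize;
  BatchedLongs(long n, int batchSize) {      // n > 0 items, split into arrays of <= batchSize
    _batchSize = batchSize;
    int nbatch = (int) ((n - 1) / batchSize + 1);
    int lastSize = (int) (n - (long) (nbatch - 1) * batchSize);
    _b = new long[nbatch][];
    for (int i = 0; i < nbatch - 1; i++) _b[i] = new long[batchSize];
    _b[nbatch - 1] = new long[lastSize];
  }
  long get(long i)         { return _b[(int)(i / _batchSize)][(int)(i % _batchSize)]; }
  void set(long i, long v) { _b[(int)(i / _batchSize)][(int)(i % _batchSize)] = v; }
}
*/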
_timings[2] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
// Loop over _retFirst and _retLen and populate the batched requests for each node helper
// _retFirst and _retLen are the same shape
long prevf = -1, prevl = -1;
long resultLoc=0; // sweep upwards through the final result, filling it in
long leftLoc=-1; // sweep through left table along the sorted row locations. // TODO: hop back to original order here for [] syntax.
for (int jb=0; jb<_retFirst.length; ++jb) { // jb = j batch
for (int jo=0; jo<_retFirst[jb].length; ++jo) { // jo = j offset
leftLoc++; // to save jb*_retFirst[0].length + jo;
long f = _retFirst[jb][jo]; // TODO: take _retFirst[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0) {
// left row matches to no right row
assert l == 0; // doesn't have to be 0 (could be 1 already if allLeft==true) but currently it should be, so check it
if (!_allLeft) continue;
// now insert the left row once and NA for the right columns i.e. left outer join
}
{ // new scope so 'row' can be declared in the for() loop below and registerized (otherwise 'already defined in this scope' in that scope)
// Fetch the left rows and mark the contiguous from-ranges each left row should be recycled over
// TODO: when single node, not needed
long row = _leftOrder[(int)(leftLoc / _leftBatchSize)][(int)(leftLoc % _leftBatchSize)]; // TODO could loop through batches rather than / and % wastefully
int chkIdx = _leftVec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftChunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location
perNodeLeftRows[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = row; // ask that node for global row number row
//perNodeLeftRowsFrom[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = resultLoc; // TODO: could store the batch and offset separately? If it will be used to assign into a Vec, that would have a different shape/espc, so the single location is better.
//perNodeLeftRowsRepeat[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = Math.max(1,l);
}
if (f==0) { resultLoc++; continue; }
assert l > 0;
if (prevf == f && prevl == l) continue; // don't re-fetch the same matching rows (cartesian). We'll repeat them locally later.
prevf = f; prevl = l;
for (int r=0; r<l; r++) {
  // ... (elided in this listing: batch up the matching right-table row numbers into
  // perNodeRightRows, mirroring the left-row bookkeeping above, and advance resultLoc) ...
}
}
}
_timings[3] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
// Allocate the frame-like result chunks for this MSB combination, again in batches of at most batchSize rows
int nbatch = (int) ((_numRowsInResult - 1) / batchSize + 1);
int lastSize = (int) (_numRowsInResult - (nbatch - 1) * batchSize);
assert nbatch >= 1;
assert lastSize > 0;
_chunkSizes = new int[nbatch];
int _numLeftCols = _leftFrame.numCols();
int _numColsInResult = _leftFrame.numCols() + _rightFrame.numCols() - _numJoinCols;
double[][][] frameLikeChunks = new double[_numColsInResult][nbatch][]; //TODO: compression via int types
for (int col=0; col<_numColsInResult; col++) {
int b;
for (b = 0; b < nbatch - 1; b++) {
frameLikeChunks[col][b] = new double[batchSize];
Arrays.fill(frameLikeChunks[col][b], Double.NaN); // NA by default to save filling with NA for nomatches when allLeft
_chunkSizes[b] = batchSize;
}
frameLikeChunks[col][b] = new double[lastSize];
Arrays.fill(frameLikeChunks[col][b], Double.NaN);
_chunkSizes[b] = lastSize;
}
_timings[4] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
RPC<GetRawRemoteRows> grrrsRiteRPC[][] = new RPC[H2O.CLOUD.size()][];
RPC<GetRawRemoteRows> grrrsLeftRPC[][] = new RPC[H2O.CLOUD.size()][];
GetRawRemoteRows grrrsLeft[][] = new GetRawRemoteRows[H2O.CLOUD.size()][];
GetRawRemoteRows grrrsRite[][] = new GetRawRemoteRows[H2O.CLOUD.size()][];
for (H2ONode node : H2O.CLOUD._memary) {
int ni = node.index();
int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
grrrsRiteRPC[ni] = new RPC[bUppRite];
grrrsLeftRPC[ni] = new RPC[bUppLeft];
grrrsRite[ni] = new GetRawRemoteRows[bUppRite];
grrrsLeft[ni] = new GetRawRemoteRows[bUppLeft];
for (int b = 0; b < bUppRite; b++) {
// Arrays.sort(perNodeRightRows[ni][b]); Simple quick test of fetching in monotonic order. Doesn't seem to help so far. TODO: try again now with a better method.
grrrsRiteRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_rightFrame, perNodeRightRows[ni][b])).call();
}
for (int b = 0; b < bUppLeft; b++) {
// Arrays.sort(perNodeLeftRows[ni][b]);
grrrsLeftRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_leftFrame, perNodeLeftRows[ni][b])).call();
}
}
for (H2ONode node : H2O.CLOUD._memary) {
// TODO: just send and wait for first batch on each node and then .get() next batch as needed.
int ni = node.index();
int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
for (int b = 0; b < bUppRite; b++) {
_timings[5] += (grrrsRite[ni][b] = grrrsRiteRPC[ni][b].get()).timeTaken;
}
int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
for (int b = 0; b < bUppLeft; b++) {
_timings[5] += (grrrsLeft[ni][b] = grrrsLeftRPC[ni][b].get()).timeTaken;
}
}
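// Editor's note: the two loops above fork every RPC with .call() before any .get(), so all
// nodes extract their rows concurrently; calling .get() right after each .call() would
// serialize the round-trips. The same fork-then-join shape in isolation (MyTask and process()
// are hypothetical):
/*
RPC<MyTask> rpcs[] = new RPC[H2O.CLOUD.size()];
for (H2ONode n : H2O.CLOUD._memary)
  rpcs[n.index()] = new RPC<>(n, new MyTask()).call();  // fork: returns immediately
for (H2ONode n : H2O.CLOUD._memary)
  process(rpcs[n.index()].get());                       // join: blocks until that task is done
*/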
_timings[6] += (System.nanoTime() - t0) / 1e9; // all this time is expected to be in [5]
t0 = System.nanoTime();
grrrsRiteRPC = null;
grrrsLeftRPC = null;
// Now loop through _retFirst and _retLen and populate
resultLoc=0; // sweep upwards through the final result, filling it in
leftLoc=-1; // sweep through left table along the sorted row locations. // TODO: hop back to original order here for [] syntax.
prevf = -1; prevl = -1;
for (int jb=0; jb<_retFirst.length; ++jb) { // jb = j batch
for (int jo=0; jo<_retFirst[jb].length; ++jo) { // jo = j offset
leftLoc++; // to save jb*_retFirst[0].length + jo;
long f = _retFirst[jb][jo]; // TODO: take _retFirst[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0 && !_allLeft) continue; // f==0 => left row matches to no right row
// else insert the left row once and NA for the right columns i.e. left outer join
// Fetch the left row and recycle it if it matches more than one row in the right table
long row = _leftOrder[(int)(leftLoc / _leftBatchSize)][(int)(leftLoc % _leftBatchSize)]; // TODO could loop through batches rather than / and % wastefully
// TODO should leftOrder and retFirst/retLen have the same batch size to make this easier?
// TODO: Can we not just loop through _leftOrder only? Why loop through jb and jo as well?
int chkIdx = _leftVec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftChunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location. TODO: batch-increment this rather than incrementing per row
int b = (int)(pnl / batchSize);
int o = (int)(pnl % batchSize);
double[][] chks = grrrsLeft[ni][b]._chk;
for (int rep = 0; rep < Math.max(l,1); rep++) {
long a = resultLoc + rep;
int whichChunk = (int) (a / batchSize); // TODO: loop into batches to save / and % for each repeat and still cater for crossing multiple batch boundaries
int offset = (int) (a % batchSize);
for (int col=0; col<_numLeftCols; col++) {
  frameLikeChunks[col][whichChunk][offset] = chks[col][o]; // write this left row into each of its Math.max(l,1) result rows
}
}
if (f==0) { resultLoc++; continue; } // no right match: the right columns stay NA (left outer join)
assert l > 0;
if (prevf == f && prevl == l) {
// ** just copy from previous batch in the result (populated by for() below). Contiguous easy in-cache copy (other than batches).
for (int r=0; r<l; r++) {
  // ... (elided in this listing: copy the right-column values for this repeated match from
  // the result rows populated on the previous iteration) ...
}
// ... (elided in this listing: the else branch that reads the fetched right rows from
// grrrsRite into frameLikeChunks, the closes of the jo/jb loops, the compression of each
// double[] batch into real Chunks and the DKV.put of the result, and the end of
// createChunksInDKV()) ...

class GetRawRemoteRows extends DTask<GetRawRemoteRows> {
double[/*col*/][] _chk; //null on the way to remote node, non-null on the way back
long[/*rows*/] _rows; //which rows to fetch from remote node, non-null on the way to remote, null on the way back
double timeTaken;
Frame _fr;
GetRawRemoteRows(Frame fr, long[] rows) {
_rows = rows;
_fr = fr;
}
@Override
public void compute2() {
assert(_rows!=null);
assert(_chk ==null);
long t0 = System.nanoTime();
_chk = new double[_fr.numCols()][_rows.length];
int cidx[] = new int[_rows.length];
int offset[] = new int[_rows.length];
Vec anyVec = _fr.anyVec();
for (int row=0; row<_rows.length; row++) {
cidx[row] = anyVec.elem2ChunkIdx(_rows[row]); // binary search of the espc array. TODO: sort input row numbers to avoid repeated binary searches
offset[row] = (int)(_rows[row] - anyVec.espc()[cidx[row]]);
}
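// Editor's sketch of the espc lookup the loop above relies on: espc[k] (element start per
// chunk) is the global row number at which chunk k begins, so a binary search finds the chunk
// and a subtraction gives the within-chunk offset. E.g. espc = {0,4,8,12}: row 9 -> chunk 2,
// offset 9-8 = 1. (Method name hypothetical; mirrors what elem2ChunkIdx does.)
/*
static int chunkIdxForRow(long[] espc, long row) {
  int i = java.util.Arrays.binarySearch(espc, row);
  return i >= 0 ? i : -i - 2;   // not found: insertion point minus one
}
*/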
Chunk c[] = new Chunk[anyVec.nChunks()];
for (int col=0; col<_fr.numCols(); col++) {
Vec v = _fr.vec(col);
for (int i=0; i<_rows.length; i++) {
  if (c[cidx[i]] == null || c[cidx[i]].vec() != v) c[cidx[i]] = v.chunkForChunkIdx(cidx[i]); // fetch each needed chunk once per column
  _chk[col][i] = c[cidx[i]].atd(offset[i]); // read the requested row as a double (NA reads as NaN)
}
}
// ... (listing truncated here; per the field comments above, compute2() goes on to null out
// _fr and _rows for the return trip, record timeTaken = (System.nanoTime() - t0) / 1e9, and
// tryComplete()) ...