
package water.rapids;
// Since we have a single key field in H2O (unlike data.table), bmerge() becomes a lot simpler (no
// need for recursion through join columns), with the downside of transfer cost when we don't need all of the key.
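// Editor's sketch (not part of this class): the heart of a single-key bmerge is one binary
// search per left key over the sorted right keys, widened to cover the run of duplicates.
// A minimal self-contained version over plain long keys (names hypothetical; the real code
// below works on packed byte keys and widens both ends by binary search, not a linear scan):
/*
static long[] bmerge1(long[] sortedRight, long key) {
  int lo = 0, hi = sortedRight.length - 1, first = -1;
  while (lo <= hi) {                          // leftmost-match binary search
    int mid = (lo + hi) >>> 1;
    if (sortedRight[mid] < key) lo = mid + 1;
    else { if (sortedRight[mid] == key) first = mid; hi = mid - 1; }
  }
  if (first == -1) return new long[]{0, 0};   // 0 means 'no match', as in _retFirst below
  int last = first;
  while (last + 1 < sortedRight.length && sortedRight[last + 1] == key) last++;
  return new long[]{first + 1, last - first + 1};  // 1-based first match, match length
}
*/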
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import static water.rapids.SingleThreadRadixOrder.getSortedOXHeaderKey;
import water.util.ArrayUtils;
import java.util.Arrays;
public class BinaryMerge extends DTask<BinaryMerge> {
long _numRowsInResult=0; // returned to caller, so not transient
int _chunkSizes[]; // TODO: only _chunkSizes.length is needed by caller, so return that length only
double _timings[];
transient long _retFirst[/*n2GB*/][]; // The row number of the first right table's index key that matches
transient long _retLen[/*n2GB*/][]; // How many rows does it match to?
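// Example of the encoding (implied by the f==0 checks in createChunksInDKV() below): if sorted
// left row i matches sorted right rows 7,8,9 then _retFirst[..][i]==8 (1-based) and
// _retLen[..][i]==3; _retFirst[..][i]==0 means that left row matches no right row.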
transient byte _leftKey[/*n2GB*/][/*i mod 2GB * _keySize*/];
transient byte _rightKey[][];
transient long _leftOrder[/*n2GB*/][/*i mod 2GB*/];
transient long _rightOrder[][];
transient boolean _oneToManyMatch = false; // does any left row match to more than 1 right row? If not, can allocate and loop more efficiently, and mark the resulting key'd frame with a 'unique' index.
// TODO: implement
int _leftFieldSizes[], _rightFieldSizes[]; // the widths of each column in the key
long _leftColMins[], _rightColMins[]; // the col.min() of each column in the key
transient int _leftKeyNCol, _rightKeyNCol; // the number of columns in the key i.e. length of _leftFieldSizes and _rightFieldSizes
transient int _leftKeySize, _rightKeySize; // the total width in bytes of the key, sum of field sizes
transient int _numJoinCols;
transient long _leftN, _rightN;
transient long _leftBatchSize, _rightBatchSize;
Frame _leftFrame, _rightFrame;
transient long _perNodeNumRightRowsToFetch[];
transient long _perNodeNumLeftRowsToFetch[];
int _leftMSB, _rightMSB;
boolean _allLeft, _allRight;
Vec _leftVec, _rightVec;
transient int _leftChunkNode[], _rightChunkNode[]; // fast lookups to save repeated calls to node.index(), which does a binary search internally.
BinaryMerge(Frame leftFrame, Frame rightFrame, int leftMSB, int rightMSB, int leftFieldSizes[], int rightFieldSizes[], long leftColMins[], long rightColMins[], boolean allLeft) { // In X[Y], 'left'=i and 'right'=x
_leftFrame = leftFrame;
_rightFrame = rightFrame;
_leftMSB = leftMSB;
_rightMSB = rightMSB;
_leftFieldSizes = leftFieldSizes; _rightFieldSizes = rightFieldSizes;
_leftColMins = leftColMins; _rightColMins = rightColMins;
_allLeft = allLeft;
_allRight = false; // TODO: pass through
// TODO: set 2 Frame and 2 int[] to NULL at the end of compute2 to save some traffic back, but should be small and insignificant
}
@Override
public void compute2() {
_timings = new double[20];
long t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
SingleThreadRadixOrder.OXHeader leftSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/true, _leftMSB));
if (leftSortedOXHeader == null) {
if (_allRight) throw H2O.unimpl(); // TODO pass through _allRight and implement
tryComplete(); return;
}
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
SingleThreadRadixOrder.OXHeader rightSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/false, _rightMSB));
if (rightSortedOXHeader == null) {
if (_allLeft == false) { tryComplete(); return; }
rightSortedOXHeader = new SingleThreadRadixOrder.OXHeader(0, 0, 0); // enables general case code to run below without needing new special case code
}
// both left and right MSB have some data to match
_leftBatchSize = leftSortedOXHeader._batchSize;
_rightBatchSize = rightSortedOXHeader._batchSize;
_perNodeNumRightRowsToFetch = new long[H2O.CLOUD.size()];
_perNodeNumLeftRowsToFetch = new long[H2O.CLOUD.size()];
// get left batches
_leftKey = new byte[leftSortedOXHeader._nBatch][];
_leftOrder = new long[leftSortedOXHeader._nBatch][];
_retFirst = new long[leftSortedOXHeader._nBatch][];
_retLen = new long[leftSortedOXHeader._nBatch][];
for (int b=0; b<leftSortedOXHeader._nBatch; b++) {
  // ... (elided in this listing: fetch each batch of sorted left keys and orders into
  // _leftKey/_leftOrder and allocate _retFirst/_retLen; the matching right-side fetch, the
  // key-width bookkeeping, the top-level call into the recursive binary search, and the body
  // of bmerge_r(long lLowIn, long lUppIn, long rLowIn, long rUppIn) down to its tail below) ...
}
// Tail of the recursive bmerge_r(): once this level has resolved its match ranges, recurse
// into the unresolved remainders on either side:
if (lLow > lLowIn && (rLow > rLowIn || _allLeft)) // '|| _allLeft' is needed here in H2O (but not data.table) so that _perNodeNumLeftRowsToFetch above gets populated and the assert near the end of compute2() passes.
bmerge_r(lLowIn, lLow + 1, rLowIn, rLow+1);
if (lUpp < lUppIn && (rUpp < rUppIn || _allLeft))
bmerge_r(lUpp-1, lUppIn, rUpp-1, rUppIn);
// We don't feel tempted to reduce the global _ansN here and make a global frame,
// since we want to process each MSB l/r combo individually without allocating them all.
// Since this method is recursive, no more code should follow these calls (it would run once per recursion).
}
private void createChunksInDKV() {
// Collect all matches
// Create the final frame (part) for this MSB combination
// Cannot use a List as that's restricted to 2Bn items and also isn't an Iced datatype
long t0 = System.nanoTime();
long perNodeRightRows[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeRightRowsFrom[][][] = new long[H2O.CLOUD.size()][][];
long perNodeRightLoc[] = new long[H2O.CLOUD.size()];
long perNodeLeftRows[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeLeftRowsFrom[][][] = new long[H2O.CLOUD.size()][][];
//long perNodeLeftRowsRepeat[][][] = new long[H2O.CLOUD.size()][][];
long perNodeLeftLoc[] = new long[H2O.CLOUD.size()];
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}
*/
// Allocate memory to split this MSB combn's left and right matching rows into contiguous batches sent to the nodes they reside on
int batchSize = 256*1024*1024 / 8; // 256MB DKV value-size limit / sizeof(long)
int thisNode = H2O.SELF.index();
for (int i = 0; i < H2O.CLOUD.size(); i++) {
if (_perNodeNumRightRowsToFetch[i] > 0) {
int nbatch = (int) ((_perNodeNumRightRowsToFetch[i] - 1) / batchSize + 1); // TODO: wrap in a class to avoid this boilerplate
int lastSize = (int) (_perNodeNumRightRowsToFetch[i] - (nbatch - 1) * batchSize);
System.out.println("Sending " +_perNodeNumRightRowsToFetch[i]+ " row requests to node " +i+ " in " +nbatch+ " batches from node " +thisNode+ " for rightMSB " +_rightMSB);
assert nbatch >= 1;
assert lastSize > 0;
perNodeRightRows[i] = new long[nbatch][];
//perNodeRightRowsFrom[i] = new long[nbatch][];
int b;
for (b = 0; b < nbatch - 1; b++) {
perNodeRightRows[i][b] = new long[batchSize]; // TO DO?: use MemoryManager.malloc()
//perNodeRightRowsFrom[i][b] = new long[batchSize];
}
perNodeRightRows[i][b] = new long[lastSize];
//perNodeRightRowsFrom[i][b] = new long[lastSize];
}
if (_perNodeNumLeftRowsToFetch[i] > 0) {
int nbatch = (int) ((_perNodeNumLeftRowsToFetch[i] - 1) / batchSize + 1); // TODO: wrap in a class to avoid this boilerplate
int lastSize = (int) (_perNodeNumLeftRowsToFetch[i] - (nbatch - 1) * batchSize);
System.out.println("Sending " +_perNodeNumLeftRowsToFetch[i]+ " row requests to node " +i+ " in " +nbatch+ " batches from node " +thisNode+ " for leftMSB " + _leftMSB);
assert nbatch >= 1;
assert lastSize > 0;
perNodeLeftRows[i] = new long[nbatch][];
//perNodeLeftRowsFrom[i] = new long[nbatch][];
//perNodeLeftRowsRepeat[i] = new long[nbatch][];
int b;
for (b = 0; b < nbatch - 1; b++) {
perNodeLeftRows[i][b] = new long[batchSize]; // TO DO?: use MemoryManager.malloc()
//perNodeLeftRowsFrom[i][b] = new long[batchSize];
//perNodeLeftRowsRepeat[i][b] = new long[batchSize];
}
perNodeLeftRows[i][b] = new long[lastSize];
//perNodeLeftRowsFrom[i][b] = new long[lastSize];
//perNodeLeftRowsRepeat[i][b] = new long[lastSize];
}
}
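// Editor's sketch: the TODO above ("wrap in a class to avoid this boilerplate") might look
// like the hypothetical helper below, which hides the nbatch/lastSize arithmetic. For example,
// with batchSize = 256*1024*1024/8 = 33,554,432 longs, 70,000,000 rows split into nbatch = 3
// batches of sizes {33,554,432, 33,554,432, 2,891,136}.
/*
static final class BatchedLongs {
  final long[][] _b; final int _batchSize;
  BatchedLongs(long n, int batchSize) {      // n > 0 items, split into arrays of <= batchSize
    _batchSize = batchSize;
    int nbatch = (int) ((n - 1) / batchSize + 1);
    int lastSize = (int) (n - (long) (nbatch - 1) * batchSize);
    _b = new long[nbatch][];
    for (int i = 0; i < nbatch - 1; i++) _b[i] = new long[batchSize];
    _b[nbatch - 1] = new long[lastSize];
  }
  long get(long i)         { return _b[(int)(i / _batchSize)][(int)(i % _batchSize)]; }
  void set(long i, long v) { _b[(int)(i / _batchSize)][(int)(i % _batchSize)] = v; }
}
*/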
_timings[2] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
// Loop over _retFirst and _retLen and populate the batched requests for each node helper
// _retFirst and _retLen are the same shape
long prevf = -1, prevl = -1;
long resultLoc=0; // sweep upwards through the final result, filling it in
long leftLoc=-1; // sweep through left table along the sorted row locations. // TODO: hop back to original order here for [] syntax.
for (int jb=0; jb<_retFirst.length; ++jb) { // jb = j batch
for (int jo=0; jo<_retFirst[jb].length; ++jo) { // jo = j offset
leftLoc++; // to save jb*_retFirst[0].length + jo;
long f = _retFirst[jb][jo]; // TODO: take _retFirst[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0) {
// left row matches to no right row
assert l == 0; // doesn't have to be 0 (could be 1 already if allLeft==true) but currently it should be, so check it
if (!_allLeft) continue;
// now insert the left row once and NA for the right columns i.e. left outer join
}
{ // new scope so 'row' can be declared in the for() loop below and registerized (otherwise 'already defined in this scope' in that scope)
// Fetch the left rows and mark the contiguous from-ranges each left row should be recycled over
// TODO: when single node, not needed
long row = _leftOrder[(int)(leftLoc / _leftBatchSize)][(int)(leftLoc % _leftBatchSize)]; // TODO could loop through batches rather than / and % wastefully
int chkIdx = _leftVec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftChunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location
perNodeLeftRows[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = row; // ask that node for global row number row
//perNodeLeftRowsFrom[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = resultLoc; // TODO: could store the batch and offset separately? If it will be used to assign into a Vec, that would have a different shape/espc, so the single location is better.
//perNodeLeftRowsRepeat[ni][(int)(pnl/batchSize)][(int)(pnl%batchSize)] = Math.max(1,l);
}
if (f==0) { resultLoc++; continue; }
assert l > 0;
if (prevf == f && prevl == l) continue; // don't re-fetch the same matching rows (cartesian). We'll repeat them locally later.
prevf = f; prevl = l;
for (int r=0; r<l; r++) {
  // ... (elided in this listing: batch up the matching right-table row numbers into
  // perNodeRightRows, mirroring the left-row bookkeeping above, and advance resultLoc) ...
}
}
}
_timings[3] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
// Allocate the frame-like result chunks for this MSB combination, again in batches of at most batchSize rows
int nbatch = (int) ((_numRowsInResult - 1) / batchSize + 1);
int lastSize = (int) (_numRowsInResult - (nbatch - 1) * batchSize);
assert nbatch >= 1;
assert lastSize > 0;
_chunkSizes = new int[nbatch];
int _numLeftCols = _leftFrame.numCols();
int _numColsInResult = _leftFrame.numCols() + _rightFrame.numCols() - _numJoinCols;
double[][][] frameLikeChunks = new double[_numColsInResult][nbatch][]; //TODO: compression via int types
for (int col=0; col<_numColsInResult; col++) {
int b;
for (b = 0; b < nbatch - 1; b++) {
frameLikeChunks[col][b] = new double[batchSize];
Arrays.fill(frameLikeChunks[col][b], Double.NaN); // NA by default to save filling with NA for nomatches when allLeft
_chunkSizes[b] = batchSize;
}
frameLikeChunks[col][b] = new double[lastSize];
Arrays.fill(frameLikeChunks[col][b], Double.NaN);
_chunkSizes[b] = lastSize;
}
_timings[4] += (System.nanoTime() - t0) / 1e9;
t0 = System.nanoTime();
/*for (int s=0; s<1; s++) {
try { Thread.sleep(1000); } catch(InterruptedException ex) { Thread.currentThread().interrupt(); }
System.gc();
}*/
RPC<GetRawRemoteRows> grrrsRiteRPC[][] = new RPC[H2O.CLOUD.size()][];
RPC<GetRawRemoteRows> grrrsLeftRPC[][] = new RPC[H2O.CLOUD.size()][];
GetRawRemoteRows grrrsLeft[][] = new GetRawRemoteRows[H2O.CLOUD.size()][];
GetRawRemoteRows grrrsRite[][] = new GetRawRemoteRows[H2O.CLOUD.size()][];
for (H2ONode node : H2O.CLOUD._memary) {
int ni = node.index();
int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
grrrsRiteRPC[ni] = new RPC[bUppRite];
grrrsLeftRPC[ni] = new RPC[bUppLeft];
grrrsRite[ni] = new GetRawRemoteRows[bUppRite];
grrrsLeft[ni] = new GetRawRemoteRows[bUppLeft];
for (int b = 0; b < bUppRite; b++) {
// Arrays.sort(perNodeRightRows[ni][b]); Simple quick test of fetching in monotonic order. Doesn't seem to help so far. TODO: try again now with a better method.
grrrsRiteRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_rightFrame, perNodeRightRows[ni][b])).call();
}
for (int b = 0; b < bUppLeft; b++) {
// Arrays.sort(perNodeLeftRows[ni][b]);
grrrsLeftRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_leftFrame, perNodeLeftRows[ni][b])).call();
}
}
for (H2ONode node : H2O.CLOUD._memary) {
// TODO: just send and wait for first batch on each node and then .get() next batch as needed.
int ni = node.index();
int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
for (int b = 0; b < bUppRite; b++) {
_timings[5] += (grrrsRite[ni][b] = grrrsRiteRPC[ni][b].get()).timeTaken;
}
int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
for (int b = 0; b < bUppLeft; b++) {
_timings[5] += (grrrsLeft[ni][b] = grrrsLeftRPC[ni][b].get()).timeTaken;
}
}
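// Editor's note: the two loops above fork every RPC with .call() before any .get(), so all
// nodes extract their rows concurrently; calling .get() right after each .call() would
// serialize the round-trips. The same fork-then-join shape in isolation (MyTask and process()
// are hypothetical):
/*
RPC<MyTask> rpcs[] = new RPC[H2O.CLOUD.size()];
for (H2ONode n : H2O.CLOUD._memary)
  rpcs[n.index()] = new RPC<>(n, new MyTask()).call();  // fork: returns immediately
for (H2ONode n : H2O.CLOUD._memary)
  process(rpcs[n.index()].get());                       // join: blocks until that task is done
*/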
_timings[6] += (System.nanoTime() - t0) / 1e9; // all this time is expected to be in [5]
t0 = System.nanoTime();
grrrsRiteRPC = null;
grrrsLeftRPC = null;
// Now loop through _retFirst and _retLen and populate
resultLoc=0; // sweep upwards through the final result, filling it in
leftLoc=-1; // sweep through left table along the sorted row locations. // TODO: hop back to original order here for [] syntax.
prevf = -1; prevl = -1;
for (int jb=0; jb<_retFirst.length; ++jb) { // jb = j batch
for (int jo=0; jo<_retFirst[jb].length; ++jo) { // jo = j offset
leftLoc++; // to save jb*_retFirst[0].length + jo;
long f = _retFirst[jb][jo]; // TODO: take _retFirst[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0 && !_allLeft) continue; // f==0 => left row matches to no right row
// else insert the left row once and NA for the right columns i.e. left outer join
// Fetch the left row and recycle it if it matches more than one row in the right table
long row = _leftOrder[(int)(leftLoc / _leftBatchSize)][(int)(leftLoc % _leftBatchSize)]; // TODO could loop through batches rather than / and % wastefully
// TODO should leftOrder and retFirst/retLen have the same batch size to make this easier?
// TODO: Can we not just loop through _leftOrder only? Why loop through jb and jo as well?
int chkIdx = _leftVec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftChunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location. TODO: batch-increment this rather than incrementing per row
int b = (int)(pnl / batchSize);
int o = (int)(pnl % batchSize);
double[][] chks = grrrsLeft[ni][b]._chk;
for (int rep = 0; rep < Math.max(l,1); rep++) {
long a = resultLoc + rep;
int whichChunk = (int) (a / batchSize); // TODO: loop into batches to save / and % for each repeat and still cater for crossing multiple batch boundaries
int offset = (int) (a % batchSize);
for (int col=0; col<_numLeftCols; col++) {
  frameLikeChunks[col][whichChunk][offset] = chks[col][o]; // write this left row into each of its Math.max(l,1) result rows
}
}
if (f==0) { resultLoc++; continue; } // no right match: the right columns stay NA (left outer join)
assert l > 0;
if (prevf == f && prevl == l) {
// ** just copy from previous batch in the result (populated by for() below). Contiguous easy in-cache copy (other than batches).
for (int r=0; r<l; r++) {
  // ... (elided in this listing: copy the right-column values for this repeated match from
  // the result rows populated on the previous iteration) ...
}
// ... (elided in this listing: the else branch that reads the fetched right rows from
// grrrsRite into frameLikeChunks, the closes of the jo/jb loops, the compression of each
// double[] batch into real Chunks and the DKV.put of the result, and the end of
// createChunksInDKV()) ...

class GetRawRemoteRows extends DTask<GetRawRemoteRows> {
double[/*col*/][] _chk; //null on the way to remote node, non-null on the way back
long[/*rows*/] _rows; //which rows to fetch from remote node, non-null on the way to remote, null on the way back
double timeTaken;
Frame _fr;
GetRawRemoteRows(Frame fr, long[] rows) {
_rows = rows;
_fr = fr;
}
@Override
public void compute2() {
assert(_rows!=null);
assert(_chk ==null);
long t0 = System.nanoTime();
_chk = new double[_fr.numCols()][_rows.length];
int cidx[] = new int[_rows.length];
int offset[] = new int[_rows.length];
Vec anyVec = _fr.anyVec();
for (int row=0; row<_rows.length; row++) {
cidx[row] = anyVec.elem2ChunkIdx(_rows[row]); // binary search of the espc array. TODO: sort input row numbers to avoid repeated binary searches
offset[row] = (int)(_rows[row] - anyVec.espc()[cidx[row]]);
}
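// Editor's sketch of the espc lookup the loop above relies on: espc[k] (element start per
// chunk) is the global row number at which chunk k begins, so a binary search finds the chunk
// and a subtraction gives the within-chunk offset. E.g. espc = {0,4,8,12}: row 9 -> chunk 2,
// offset 9-8 = 1. (Method name hypothetical; mirrors what elem2ChunkIdx does.)
/*
static int chunkIdxForRow(long[] espc, long row) {
  int i = java.util.Arrays.binarySearch(espc, row);
  return i >= 0 ? i : -i - 2;   // not found: insertion point minus one
}
*/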
Chunk c[] = new Chunk[anyVec.nChunks()];
for (int col=0; col<_fr.numCols(); col++) {
Vec v = _fr.vec(col);
for (int i=0; i<_rows.length; i++) {
  if (c[cidx[i]] == null || c[cidx[i]].vec() != v) c[cidx[i]] = v.chunkForChunkIdx(cidx[i]); // fetch each needed chunk once per column
  _chk[col][i] = c[cidx[i]].atd(offset[i]); // read the requested row as a double (NA reads as NaN)
}
}
// ... (listing truncated here; per the field comments above, compute2() goes on to null out
// _fr and _rows for the return trip, record timeTaken = (System.nanoTime() - t0) / 1e9, and
// tryComplete()) ...