hex.FrameSplitter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of h2o-core Show documentation
H2O Core
There is a newer version: 3.8.2.9
package hex;

import java.util.Arrays;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;

/**
 * Frame splitter function to divide given frame into
 * multiple partitions based on given ratios.
 *
 * The task creates ratios.length+1 output frames, each containing the
 * demanded fraction of rows from the source dataset.
 *
 * The tasks internally extract data from source chunks and create output chunks, preserving the order of parts.
 * I.e., the 1st partition contains the first P1 rows, the 2nd partition contains the following P2 rows, ...
 * 
 *
 * <h3>Assumptions and invariants</h3>
 * <ul>
 * <li>The number of demanded split parts is reasonably small, i.e., &lt;10. The task is not designed to split into many small parts.</li>
 * <li>The worker does NOT preserve the distribution of new chunks over the cloud according to the source dataset chunks.</li>
 * <li>Rows inside one output chunk are not shuffled; they are extracted deterministically in the same order as they appear in the source chunk.</li>
 * <li>Workers can enforce data transfers if they need to obtain data from remote chunks.</li>
 * </ul>
 *
 * NOTE: the implementation is data-transfer expensive and in some cases it would be beneficial to use original
 * implementation from 9af3f4e..
 */
public class FrameSplitter extends H2OCountedCompleter {
  /** Dataset to split; read-locked with {@link #jobKey} at the start of {@code compute2()}. */
  final Frame   dataset;
  /** Split ratios - the resulting number of splits is {@code ratios.length+1}. */
  final double[] ratios;
  /** Destination keys, one per output split; the constructor asserts {@code destKeys.length == ratios.length+1}. */
  final Key[]   destKeys;
  /** Optional job key; handed to {@code dataset.read_lock(...)} in {@code compute2()}. */
  final Key     jobKey;

  /** Output frames, one per split part; the array is allocated in {@code compute2()}. */
  private Frame[] splits;

  /**
   * Convenience constructor: delegates to the five-argument constructor,
   * passing {@code null} as the completer.
   *
   * @param dataset  frame to split
   * @param ratios   split ratios; {@code ratios.length+1} parts are produced
   * @param destKeys destination keys, one per output part
   * @param jobKey   optional job key
   */
  public FrameSplitter(Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) {
    this((H2OCountedCompleter) null, dataset, ratios, destKeys, jobKey);
  }
  /**
   * Creates a splitter attached to the given completer.
   *
   * The argument checks below are plain {@code assert}s, so they only run
   * when the JVM has assertions enabled.
   *
   * @param cc       completer forwarded to the superclass constructor
   * @param dataset  frame to split
   * @param ratios   split ratios; must be non-empty and hold fewer than 100 entries
   * @param destKeys destination keys; exactly {@code ratios.length+1} are expected
   * @param jobKey   optional job key
   */
  public FrameSplitter(H2OCountedCompleter cc, Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) {
    super(cc);

    // Sanity checks on the requested split layout.
    assert 0 < ratios.length : "No ratio specified!";
    assert 100 > ratios.length : "Too many frame splits demanded!";
    assert null != destKeys : "Destination keys are not specified!";
    assert ratios.length + 1 == destKeys.length : "Unexpected number of destination keys.";

    // Remember the inputs; nothing is copied.
    this.destKeys = destKeys;
    this.jobKey   = jobKey;
    this.ratios   = ratios;
    this.dataset  = dataset;
  }

  @Override public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);

    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);

    final int nsplits = templates.length;
    assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s=0; s0 && espc[0] == 0;
    assert espc[espc.length-1] == len;
    long[] partSizes = partitione(len, ratios); // Split of whole vector
    int nparts = ratios.length+1;
    long[][] r = new long[nparts][espc.length]; // espc for each partition
    long nrows = 0;
    long start = 0;
    for (int p=0,c=0; p_srcVecs
   * into output chunk.*/
  private static class FrameSplitTask extends MRTask {
    final Vec  [] _srcVecs; // source frame, given as the list of its columns
    final double[] _ratios;  // split ratios, passed to partitione(...) in setupLocal()
    final int     _partIdx; // index of the partition (split part) this task produces

    // Both fields below are transient and are computed in setupLocal().
    transient int _pcidx; // Start chunk index for this partition (index into the source espc)
    transient int _psrow; // Start row inside that start chunk for this partition

    /**
     * Creates the task that fills one partition.
     *
     * @param completer completer forwarded to the superclass constructor
     * @param srcVecs   columns of the source frame
     * @param ratios    split ratios
     * @param partIdx   index of the partition this task produces
     */
    public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, double[] ratios, int partIdx) {
      super(completer);
      this._partIdx = partIdx;
      this._ratios  = ratios;
      this._srcVecs = srcVecs;
    }
    /**
     * Locates where this partition starts in the source data: the index of the
     * first source chunk ({@code _pcidx}) and the row offset inside that chunk
     * ({@code _psrow}).
     *
     * The first source column is used as the reference for length and chunk
     * layout. Its {@code espc()} array is treated as cumulative chunk start
     * offsets, so the difference of two neighbouring entries is one chunk's
     * row count.
     */
    @Override protected void setupLocal() {
      final Vec refVec = _srcVecs[0];
      final long[] sizes = partitione(refVec.length(), _ratios);

      // Number of rows belonging to all partitions that precede this one.
      long remaining = 0;
      int part = 0;
      while (part < _partIdx) remaining += sizes[part++];

      final long[] chunkStarts = refVec.espc();
      final int lastIdx = chunkStarts.length - 1;
      // Walk over source chunks, consuming their rows, until the running
      // remainder turns negative: that chunk holds this partition's first row.
      while (_pcidx < lastIdx) {
        remaining -= chunkStarts[_pcidx + 1] - chunkStarts[_pcidx];
        if (remaining < 0) break;
        _pcidx++;
      }
      assert remaining <= 0;

      // remaining is now (start row - chunk length); add the length back.
      final long chunkLen = chunkStarts[_pcidx + 1] - chunkStarts[_pcidx];
      _psrow = (int) (remaining + chunkLen);
    }
    @Override public void map(Chunk[] cs) { // Output chunks
      int coutidx = cs[0].cidx(); // Index of output Chunk
      int cinidx = _pcidx + coutidx;
      int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting
      int nrows = cs[0]._len;
      // For each output chunk extract appropriate rows for partIdx-th part
      for (int i=0; i