
hex.FrameSplitter Maven / Gradle / Ivy

package hex;

import java.util.Arrays;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;

/**
 * Frame splitter function to divide a given frame into multiple partitions
 * based on the given ratios.
 *
 * <p>The task creates <code>ratios.length+1</code> output frames, each containing
 * the demanded fraction of rows from the source dataset.</p>
 *
 * <p>The tasks internally extract data from the source chunks and create the output
 * chunks while preserving the order of parts, i.e., the 1st partition contains the
 * first P1 rows, the 2nd partition contains the following P2 rows, and so on.</p>
 *
 * <p>Assumptions and invariants:</p>
 * <ul>
 *   <li>the number of demanded split parts is a reasonable number, i.e., &lt;10;
 *       the task is not designed to split into many small parts,</li>
 *   <li>the workers DO NOT preserve the distribution of new chunks over the cloud
 *       according to the source dataset chunks,</li>
 *   <li>rows inside one output chunk are not shuffled; they are extracted
 *       deterministically, in the same order as they appear in the source chunk,</li>
 *   <li>workers can enforce data transfers if they need to obtain data from remote
 *       chunks.</li>
 * </ul>
 *
 * <p>NOTE: the implementation is data-transfer expensive and in some cases it would
 * be beneficial to use the original implementation from 9af3f4e.</p>
 */
public class FrameSplitter extends H2OCountedCompleter {

  /** Dataset to split */
  final Frame dataset;
  /** Split ratios - the resulting number of splits is ratios.length+1 */
  final double[] ratios;
  /** Destination keys for each output frame split. */
  final Key[] destKeys;
  /** Optional job key */
  final Key jobKey;

  /** Output frames for each output split part */
  private Frame[] splits;

  public FrameSplitter(Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) {
    this(null, dataset, ratios, destKeys, jobKey);
  }
  public FrameSplitter(H2OCountedCompleter cc, Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) {
    super(cc);
    assert ratios.length > 0   : "No ratio specified!";
    assert ratios.length < 100 : "Too many frame splits demanded!";
    assert destKeys != null    : "Destination keys are not specified!";
    assert destKeys.length == ratios.length+1 : "Unexpected number of destination keys.";
    this.dataset  = dataset;
    this.ratios   = ratios;
    this.jobKey   = jobKey;
    this.destKeys = destKeys;
  }

  @Override public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      // ... (the loop body and the remaining FrameSplitter methods, including makeTemplates(),
      //      were dropped by the page extraction; the next recoverable fragment follows below)
    }
  }

  /** Computes the row layout (espc) of every split from the source espc, the total row count
   *  and the split ratios. NOTE: the original name and signature of this helper were dropped
   *  by the page extraction; this header is reconstructed from the surviving body. */
  static long[][] computeEspcPerSplit(long[] espc, long len, double[] ratios) {
    assert espc.length > 0 && espc[0] == 0;
    assert espc[espc.length-1] == len;
    long[] partSizes = partitione(len, ratios); // Split of whole vector
    int nparts = ratios.length+1;
    long[][] r = new long[nparts][espc.length]; // espc for each partition
    long nrows = 0;
    long start = 0;
    for (int p = 0, c = 0; p < nparts; p++) {
      // ... (the loop body and the tail of this method were dropped by the page extraction)
    }
    return r; // reconstructed: r holds the per-partition espc built above
  }

  /** Simple task to extract the specified part of a frame (a subset of rows) from the given
   *  <code>_srcVecs</code> into output chunks. */
  private static class FrameSplitTask extends MRTask<FrameSplitTask> {
    final Vec[]    _srcVecs; // a source frame given by list of its columns
    final double[] _ratios;  // split ratios
    final int      _partIdx; // part index

    transient int _pcidx; // Start chunk index for this partition
    transient int _psrow; // Start row in chunk for this partition

    public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, double[] ratios, int partIdx) {
      super(completer);
      _srcVecs = srcVecs;
      _ratios  = ratios;
      _partIdx = partIdx;
    }

    @Override protected void setupLocal() {
      // Precompute the first input chunk index and start row inside that chunk for this partition
      Vec anyInVec = _srcVecs[0];
      long[] partSizes = partitione(anyInVec.length(), _ratios);
      long pnrows = 0;
      for (int p = 0; p < _partIdx; p++) pnrows += partSizes[p];
      long[] espc = anyInVec.espc();
      while (_pcidx < espc.length-1 && (pnrows -= (espc[_pcidx+1]-espc[_pcidx])) >= 0) _pcidx++;
      assert pnrows <= 0;
      _psrow = (int) (pnrows + espc[_pcidx+1]-espc[_pcidx]);
    }

    @Override public void map(Chunk[] cs) { // Output chunks
      int coutidx  = cs[0].cidx();             // Index of output Chunk
      int cinidx   = _pcidx + coutidx;
      int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting
      int nrows    = cs[0]._len;
      // For each output chunk extract appropriate rows for partIdx-th part
      // for (int i = 0; i ...  -- the extraction loop and the rest of the source file were
      //                          dropped by the page extraction
    }
  }
}
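For orientation, a minimal usage sketch of the class above follows. It is an illustration only, not part of the listing: it assumes a running H2O node and a Frame fr that is already in the DKV, the destination key names ("part_075", "part_025") and the wrapper class name are made up for the example, and the job key is left null (the listing treats it as optional). With ratios = {0.75} the splitter creates ratios.length+1 = 2 output frames holding roughly 75% and 25% of the rows, in their original order.

import hex.FrameSplitter;
import water.DKV;
import water.H2O;
import water.Key;
import water.fvec.Frame;

public class FrameSplitterExample {
  /** Split fr into ~75% / ~25% parts and return both resulting frames. */
  public static Frame[] split75_25(Frame fr) {
    double[] ratios = { 0.75 };                    // 2 output frames: ~75% and ~25% of the rows
    Key[] destKeys  = { Key.make("part_075"),      // ratios.length + 1 destination keys
                        Key.make("part_025") };
    FrameSplitter fs = new FrameSplitter(fr, ratios, destKeys, null); // null = no job key
    H2O.submitTask(fs);                            // run the splitter on the H2O fork-join pool
    fs.join();                                     // block until all split parts are written
    // Each split is published under its destination key once the task completes.
    Frame[] parts = new Frame[destKeys.length];
    for (int i = 0; i < destKeys.length; i++) parts[i] = DKV.get(destKeys[i]).get();
    return parts;
  }
}

Depending on the version, the class may also expose a blocking result accessor, but that part of the listing was not recoverable here, so the sketch relies only on the constructor shown above plus core H2O calls (H2O.submitTask, join, DKV.get).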



