
hex.FrameSplitter Maven / Gradle / Ivy
package hex; import java.util.Arrays; import jsr166y.CountedCompleter; import water.*; import water.H2O.H2OCountedCompleter; import water.fvec.*; /** * Frame splitter function to divide given frame into * multiple partitions based on given ratios. * *
* into output chunk.*/ private static class FrameSplitTask extends MRTaskThe task creates
* *ratios.length+1
output frames, each containing a * demanded fraction of rows from the source dataset. The tasks internally extract data from source chunks and create output chunks while preserving the order of parts. * I.e., the 1st partition contains the first P1-rows, the 2nd partition contains the following P2-rows, ... *
* *Assumptions and invariants
**
*- the number of demanded split parts is reasonably small, i.e., <10. The task is not designed to split into many small parts.
*- the worker DOES NOT preserve the distribution of new chunks over the cloud according to the source dataset chunks.
*- rows inside one output chunk are not shuffled; they are extracted deterministically in the same order as they appear in the source chunk.
*- workers can enforce data transfers if they need to obtain data from remote chunks.
*NOTE: the implementation is expensive in terms of data transfer, and in some cases it would be beneficial to use the original * implementation from 9af3f4e.
. */ public class FrameSplitter extends H2OCountedCompleter{ /** Dataset to split */ final Frame dataset; /** Split ratios - resulting number of split is ratios.length+1 */ final double[] ratios; /** Destination keys for each output frame split. */ final Key[] destKeys; /** Optional job key */ final Key jobKey; /** Output frames for each output split part */ private Frame[] splits; public FrameSplitter(Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) { this(null, dataset, ratios,destKeys,jobKey); } public FrameSplitter(H2OCountedCompleter cc, Frame dataset, double[] ratios, Key[] destKeys, Key jobKey) { super(cc); assert ratios.length > 0 : "No ratio specified!"; assert ratios.length < 100 : "Too many frame splits demanded!"; assert destKeys!=null : "Destination keys are not specified!"; assert destKeys.length == ratios.length+1 : "Unexpected number of destination keys."; this.dataset = dataset; this.ratios = ratios; this.jobKey = jobKey; this.destKeys = destKeys; } @Override public void compute2() { // Lock all possible data dataset.read_lock(jobKey); // Create a template vector for each segment final Vec[][] templates = makeTemplates(dataset, ratios); final int nsplits = templates.length; assert nsplits == ratios.length+1 : "Unexpected number of split templates!"; // Launch number of distributed FJ for each split part final Vec[] datasetVecs = dataset.vecs(); splits = new Frame[nsplits]; for (int s=0; s 0 && espc[0] == 0; assert espc[espc.length-1] == len; long[] partSizes = partitione(len, ratios); // Split of whole vector int nparts = ratios.length+1; long[][] r = new long[nparts][espc.length]; // espc for each partition long nrows = 0; long start = 0; for (int p=0,c=0; p _srcVecs { final Vec [] _srcVecs; // a source frame given by list of its columns final double[] _ratios; // split ratios final int _partIdx; // part index transient int _pcidx; // Start chunk index for this partition transient int _psrow; // Start row in chunk for this partition 
public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, double[] ratios, int partIdx) { super(completer); _srcVecs = srcVecs; _ratios = ratios; _partIdx = partIdx; } @Override protected void setupLocal() { // Precompute the first input chunk index and start row inside that chunk for this partition Vec anyInVec = _srcVecs[0]; long[] partSizes = partitione(anyInVec.length(), _ratios); long pnrows = 0; for (int p=0; p<_partIdx; p++) pnrows += partSizes[p]; long[] espc = anyInVec.espc(); while (_pcidx < espc.length-1 && (pnrows -= (espc[_pcidx+1]-espc[_pcidx])) >= 0 ) _pcidx++; assert pnrows <= 0; _psrow = (int) (pnrows + espc[_pcidx+1]-espc[_pcidx]); } @Override public void map(Chunk[] cs) { // Output chunks int coutidx = cs[0].cidx(); // Index of output Chunk int cinidx = _pcidx + coutidx; int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting int nrows = cs[0]._len; // For each output chunk extract appropriate rows for partIdx-th part for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy