
package water;
import jsr166y.CountedCompleter;
import jsr166y.ForkJoinPool;
import water.fvec.*;
import water.util.DistributedException;
import water.util.PrettyPrint;
import water.fvec.Vec.VectorGroup;
import java.util.Arrays;
/**
* Map/Reduce style distributed computation.
*
* MRTask provides several {@code map} and {@code reduce} methods
* that can be overridden to specify a computation. Several instances of this
* class will be created to distribute the computation over F/J threads and
* machines. Non-transient fields are copied and serialized to instances
* created for map invocations. Reduce methods can store their results in
* fields. Results are serialized and reduced all the way back to the invoking
* node. When the last reduce method has been called, fields of the initial
* MRTask instance contain the computation results.
*
* Apart from the small reduced POJO returned to the calling node, MRTask can
* produce output vector(s) as a result. These will have chunks co-located
* with the input dataset; however, their number of lines will generally differ,
* so they won't be strictly compatible with the original. To produce output
* vectors, call the {@code doAll}/{@code dfork} variant that takes the required
* number of outputs and override the appropriate {@code map} overload taking
* the required number of NewChunks. MRTask will automatically close the new
* appendable Vecs, and a call to {@code outputFrame} will make a Frame with the
* newly created Vecs.
*
*
* Overview
*
* Distributed computation starts by calling {@code doAll} or one of the
* {@code dfork} variants. {@code doAll} simply calls {@code dfork} and then
* blocks for the result; the {@code dfork} variants are non-blocking. The main
* paradigm is divide-conquer-combine using ForkJoin.
*
* If {@code doAll} is called with Keys, then one {@code map} call is
* made per Key, on the Key's home node. If MRTask is invoked on a Frame (or
* equivalently a Vec[]), then one {@code map} call is made per Chunk for
* all Vecs at once, on the Chunk's home node. In both modes,
* {@code reduce} is called between each pair of calls to {@code map}.
*
* MRTask can also be called with {@code doAllNodes}, in which case only
* the {@code setupLocal} call is made once per node; neither map nor reduce
* are called.
*
* Computation is tailored primarily by overriding. The main method is the
* {@code map} call, coupled sometimes with a {@code reduce} call.
* {@code setupLocal} is called once per node before any map calls are
* made on that node (but perhaps other nodes have already started); in reverse,
* {@code closeLocal} is called after the last map call completes on a
* node (but perhaps other nodes are still computing maps).
* {@code postGlobal} is called once only after all maps, reduces and
* closeLocals, and only on the home node.
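*
* <p>A minimal usage sketch (the class and field names are illustrative, not
* part of this API): sum one numeric Vec by overriding {@code map(Chunk)} and
* {@code reduce}:</p>
* <pre>{@code
*   class SumTask extends MRTask<SumTask> {
*     double _sum;                                  // reduced result, read on the caller
*     @Override public void map(Chunk c) {
*       for( int r = 0; r < c._len; r++ ) _sum += c.atd(r);
*     }
*     @Override public void reduce(SumTask other) { _sum += other._sum; }
*   }
*   double total = new SumTask().doAll(vec)._sum;   // or dfork(vec).getResult()._sum
* }</pre>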
*/
public abstract class MRTask<T extends MRTask<T>> extends DTask<T> implements ForkJoinPool.ManagedBlocker {
/*
* Technical note to developers:
*
* There are several internal flags and counters used throughout. They are gathered in
* this note to help you reason about the execution of an MRTask.
*
* internal "top-level" fields
* ---------------------------
* - RPC _nleft, _nrite: "child" node/JVMs that are doing work
* - boolean _topLocal : "root" MRTask on a local machine
* - boolean _topGlobal : "root" MRTask on the "root" node
* - T _left, _rite : "child" MRTasks on a local machine
* - T _res : "result" MRTask (everything reduced into here)
* - int _nlo,_nhi : range of nodes to do remote work on (divide-conquer; see Diagram 2)
* - Futures _fs : _topLocal task blocks on _fs for _left and _rite to complete
*
* Diagram 1: N is for Node; T is for Task
* -------------------------------------
*     3 node cloud              Inside one of the 'N' nodes:
*          N1                            T  _topLocal**
*         /  \                          / \
*  N2 (_nleft) N3 (_nrite)     T (_left)   T (_rite)
*
* **: T is also _topGlobal if N==N1
*
* These fields get set in the SetupLocal0 call. Let's see what it does:
*
* Diagram 2:
* ----------
* dfork on N1
* - _topGlobal=true
* - _nlo=0
* - _nhi=CLOUD_SIZE
* ||
* ||
* ||
* ==> setupLocal0 on N1
* - topLocal=true
* - _fs = new Futures()
* - nmid = (_nlo + _nhi) >> 1 => split the range of nodes (divide-conquer)
* - _nleft = remote_compute(_nlo,nmid) => chooses a node in range and does new RPC().call()
* - _nrite = remote_compute(nmid,_nhi) serializing MRTask and call dinvoke on remote.
*                   /      \
*                  /        \
*                 /          \
*     dinvoke on N2            dinvoke on N3
*     setupLocal0 on N2        setupLocal0 on N3
*      - topLocal=true          - topLocal=true
*      - _fs = new Futures()    - _fs = new Futures()
*      - (continue splitting)   - (continue splitting)
*     H2O.submitTask(this) => compute2   H2O.submitTask(this) => compute2
*
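* Worked example (illustrative numbers): on a 4-node cloud the root starts with
* _nlo=0, _nhi=4; nmid=2, so it keeps node 0 for itself, hands [1,2) to its left
* RPC and [2,4) to its right RPC; node 2 splits again and hands [3,4) to node 3.
* Every node thus runs setupLocal0 exactly once, and compute2 then divides the
* node-local chunk range [_lo,_hi) with the same mid-point recursion.
*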
*/
public MRTask() { super(); }
protected MRTask(H2O.H2OCountedCompleter cmp) {super(cmp); }
protected MRTask(byte prior) { super(prior); }
/**
* This Frame instance is the handle for computation over a set of Vec instances. Recall
* that a Frame is a collection of Vec instances, so this includes any invocation of
* {@code doAll} with Frame and Vec[] instances. Top-level calls to
* {@code doAll} wrap Vec instances into a new Frame instance and set this into
* {@code _fr} during a call to {@code dfork}.
*/
public Frame _fr;
/** This {@code Key[]} instance is the handle used for computation when
* an MRTask is invoked over an array of {@code Key} instances. */
public Key[] _keys;
/** The number and type of output Vec instances produced by an MRTask. If
* null then there are no outputs, _appendables will be null, and calls to
* {@code outputFrame} will return null. */
private byte _output_types[];
/** First reserved VectorGroup key index for all output Vecs */
private int _vid;
/** New Output vectors; may be null.
* @return the set of AppendableVec instances or null if _output_types is null */
public AppendableVec[] appendables() { return _appendables; }
/** Appendables are treated separately (roll-ups computed in map/reduce
* style, can not be passed via K/V store).*/
protected AppendableVec[] _appendables;
/** Internal field to track the left & right remote nodes/JVMs to work on */
transient protected RPC<T> _nleft, _nrite;
/** Internal field to track if this is a top-level local call */
transient protected boolean _topLocal; // Top-level local call, returning results over the wire
/** Internal field to track if this is a top-level call. */
transient boolean _topGlobal = false;
/** Internal field to track the left & right sub-range of chunks to work on */
transient protected T _left, _rite; // In-progress execution tree
/** Internal field upon which all reduces occur. */
transient private T _res; // Result
/** The range of Nodes to work on remotely */
protected short _nlo, _nhi;
/** Internal field to track a range of local Chunks to work on */
transient protected int _lo, _hi;
/** We can add more things to block on - in case we want a bunch of lazy
* tasks produced by children to all end before this top-level task ends.
* Semantically, these will all complete before we return from the top-level
* task. Pragmatically, we block on a finer grained basis. */
transient protected Futures _fs; // More things to block on
/** If true, run entirely local - which will pull all the data locally. */
protected boolean _run_local;
public String profString() { return _profile != null ? _profile.toString() : "Profiling turned off"; }
MRProfile _profile;
/** Used to invoke profiling. Call as: new MRTask().profile().doAll();*/
public T profile() { _profile = new MRProfile(this); return (T)this; }
/** Get the resulting Frame from this invoked MRTask. This Frame is not
* in the DKV. AppendableVec instances are closed into Vec instances,
* which then appear in the DKV.
*
* @return null if no outputs, otherwise returns the resulting Frame from
* the MRTask. The Frame has no column names nor domains.
*/
public Frame outputFrame() { return outputFrame(null,null,null); }
/** Get the resulting Frame from this invoked MRTask. This Frame is not in
* the DKV. AppendableVec instances are closed into Vec instances, which
* then appear in the DKV.
*
* @param names The names of the columns in the resulting Frame.
* @param domains The domains of the columns in the resulting Frame.
* @return The result Frame, or null if no outputs
*/
public Frame outputFrame(String [] names, String [][] domains){ return outputFrame(null,names,domains); }
/**
* Get the resulting Frame from this invoked MRTask. If the passed in key
* is not null, then the resulting Frame will appear in the DKV. AppendableVec instances
* are closed into Vec instances, which then appear in the DKV.
*
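* <p>Typical flow (a sketch; {@code MyTask}, the output type byte and the
* column name are illustrative assumptions):</p>
* <pre>{@code
*   MyTask t = new MyTask().dfork(new byte[]{Vec.T_NUM}, fr, false).getResult();
*   Frame out = t.outputFrame(Key.make("result"), new String[]{"col0"}, null);
* }</pre>
*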
* @param key If null, then the Frame will not appear in the DKV. Otherwise, this result
* will appear in the DKV under this key.
* @param names The names of the columns in the resulting Frame.
* @param domains The domains of the columns in the resulting Frame.
* @return null if {@code _output_types} is null, otherwise returns a Frame.
*/
public Frame outputFrame(Key key, String [] names, String [][] domains){
Futures fs = new Futures();
Frame res = closeFrame(key, names, domains, fs);
if( key != null ) DKV.put(res,fs);
fs.blockForPending();
return res;
}
// the work-horse for the outputFrame calls
private Frame closeFrame(Key key, String[] names, String[][] domains, Futures fs) {
if( _output_types == null ) return null;
final int noutputs = _output_types.length;
Vec[] vecs = new Vec[noutputs];
if( _appendables==null || _appendables.length == 0) // Zero rows?
for( int i = 0; i < noutputs; i++ )
vecs[i] = _fr.anyVec().makeZero();
else {
int rowLayout = _appendables[0].compute_rowLayout();
for( int i = 0; i < noutputs; i++ ) {
_appendables[i].setDomain(domains==null ? null : domains[i]);
vecs[i] = _appendables[i].close(rowLayout,fs);
}
}
return new Frame(key,names,vecs);
}
/** Override with your map implementation. This overload is given a single
* local input Chunk. It is meant for map/reduce jobs that use a
* single column in an input Frame. All map variants are called, but only one is
* expected to be overridden. */
public void map( Chunk c ) { }
public void map( Chunk c, NewChunk nc ) { }
/** Override with your map implementation. This overload is given two
* local Chunks. All map variants are called, but only one
* is expected to be overridden. */
public void map( Chunk c0, Chunk c1 ) { }
//public void map( Chunk c0, Chunk c1, NewChunk nc) { }
//public void map( Chunk c0, Chunk c1, NewChunk nc1, NewChunk nc2 ) { }
/** Override with your map implementation. This overload is given three
* local input Chunks. All map variants are called, but only one
* is expected to be overridden. */
public void map( Chunk c0, Chunk c1, Chunk c2 ) { }
//public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc ) { }
//public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc1, NewChunk nc2 ) { }
/** Override with your map implementation. This overload is given an array
* of local input Chunks, for Frames with arbitrary column
* numbers. All map variants are called, but only one is expected to be
* overridden. */
public void map( Chunk cs[] ) { }
/** A handy method for generating a new vector based on existing vectors.
*
* Note: This method is used by Sparkling Water examples.
*
* @param cs input vectors
* @param nc output vector
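*
* <p>Sketch of an override (assumes the caller requested one output Vec via a
* {@code dfork}/{@code doAll} variant that takes output types):</p>
* <pre>{@code
*   @Override public void map(Chunk[] cs, NewChunk nc) {
*     for( int r = 0; r < cs[0]._len; r++ )
*       nc.addNum(cs[0].atd(r) + cs[1].atd(r));   // new column = col0 + col1
*   }
* }</pre>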
*/
public void map( Chunk cs[], NewChunk nc ) { }
public void map( Chunk cs[], NewChunk nc1, NewChunk nc2 ) { }
public void map( Chunk cs[], NewChunk [] ncs ) { }
/** Override with your map implementation. Used when doAll is called with
* an array of Keys, and called once-per-Key on the Key's Home node */
public void map( Key key ) { }
/** Override to combine results from 'mrt' into 'this' MRTask. Both 'this'
* and 'mrt' are guaranteed to either have map() run on them, or be the
* results of a prior reduce(). Reduce is optional if, e.g., the result is
* some output vector. */
public void reduce( T mrt ) { }
/** Override to do any remote initialization on the 1st remote instance of
* this object, for initializing node-local shared data structures. */
protected void setupLocal() {}
/** Override to do any remote cleaning on the last remote instance of
* this object, for disposing of node-local shared data structures. */
protected void closeLocal() { }
/** Compute a permissible node index on which to launch remote work. */
private int addShift( int x ) { x += _nlo; int sz = H2O.CLOUD.size(); return x < sz ? x : x-sz; }
private int subShift( int x ) { x -= _nlo; int sz = H2O.CLOUD.size(); return x < 0 ? x+sz : x; }
private short selfidx() { int idx = H2O.SELF.index(); if( idx>= 0 ) return (short)idx; assert H2O.SELF._heartbeat._client; return 0; }
// Profiling support. Time for each subpart of a single M/R task, plus any
// nested MRTasks. All numbers are CTM stamps or millisecond times.
private static class MRProfile extends Iced {
String _clz;
public MRProfile(MRTask mrt) {
_clz = mrt.getClass().toString();
_localdone = System.currentTimeMillis();
}
// See where these are set to understand their meaning. If we split the
// job, then _lstart & _rstart are the start of left & right jobs. If we
// do NOT split, then _rstart is 0 and _lstart is for the user map job(s).
long _localstart, _rpcLstart, _rpcRstart, _rpcRdone, _localdone; // Local setup, RPC network i/o times
long _mapstart, _userstart, _closestart, _mapdone; // MAP phase
long _onCstart, _reducedone, _closeLocalDone, _remoteBlkDone, _localBlkDone, _onCdone; // REDUCE phase
// If we split the job left/right, then we get a total recording of the
// last job, and the exec time & completion time of 1st job done.
long _time1st, _done1st;
int _size_rez0, _size_rez1; // i/o size in bytes during reduce
MRProfile _last;
long sumTime() { return _onCdone - (_localstart==0 ? _mapstart : _localstart); }
void gather( MRProfile p, int size_rez ) {
p._clz=null;
if( _last == null ) { _last=p; _time1st = p.sumTime(); _done1st = p._onCdone; }
else {
MRProfile first = _last._onCdone <= p._onCdone ? _last : p;
_last = _last._onCdone > p._onCdone ? _last : p;
if( first._onCdone > _done1st ) { _time1st = first.sumTime(); _done1st = first._onCdone; }
}
if( size_rez !=0 ) // Record i/o result size
if( _size_rez0 == 0 ) _size_rez0=size_rez;
else _size_rez1=size_rez;
assert _userstart !=0 || _last != null;
assert _last._onCdone >= _done1st;
}
@Override public String toString() { return print(new StringBuilder(),0).toString(); }
private StringBuilder print(StringBuilder sb, int d) {
if( d==0 ) sb.append(_clz).append("\n");
for( int i=0; i<d; i++ ) sb.append("  ");
// ... (rest of MRProfile and the intervening methods elided in this listing)
}
}
/** Invokes the map/reduce computation over the given Vec instances and
* produces {@code outputs} output Vec instances. This call is asynchronous.
* It returns 'this', on which {@code getResult} may be invoked by the caller
* to block for pending computation to complete.
*
* @param types The type of output Vec instances to create.
* @param vecs The input set of Vec instances upon which computation is performed.
* @return this
*/
public final T dfork( byte[] types, Vec... vecs) { return dfork(types,new Frame(vecs),false); }
public final T dfork(Vec... vecs){ return dfork(null,new Frame(vecs),false); }
/**
* Invokes the map/reduce computation over the given Frame instance. This call is
* asynchronous. It returns 'this', on which {@code getResult} may be invoked
* by the caller to block for pending computation to complete. This call produces no
* output Vec instances or Frame instances.
*
* @param fr Perform the computation on this Frame instance.
* @return this
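*
* <p>For example (a sketch; {@code MyTask} and {@code _sum} are illustrative):</p>
* <pre>{@code
*   MyTask t = new MyTask().dfork(fr);   // non-blocking
*   // ... do other work ...
*   double x = t.getResult()._sum;       // block until the distributed reduce finishes
* }</pre>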
*/
public final T dfork(Frame fr){ return dfork(null,fr,false); }
/** Fork the task in strictly non-blocking fashion.
* Same functionality as dfork, but does not raise priority, so the user should
* *never* block on it.
* Because it does not raise priority, these can be tail-call chained together
* for any length.
*/
public final T dfork( byte[] types, Frame fr, boolean run_local) {
_topGlobal = true;
_output_types = types;
if( types != null && types.length > 0 )
_vid = fr.anyVec().group().reserveKeys(types.length);
_fr = fr; // Record vectors to work on
_nlo = selfidx(); _nhi = (short)H2O.CLOUD.size(); // Do Whole Cloud
_run_local = run_local; // Run locally by copying data, or run globally?
setupLocal0(); // Local setup
H2O.submitTask(this); // Begin normal execution on a FJ thread
return self();
}
/** Block for and get any final results from a dfork'd MRTask.
* Note: the desired name 'get' is final in ForkJoinTask. */
public final T getResult() {
assert getCompleter()==null; // No completer allowed here; FJ never awakens threads with completers
while(!isReleasable()) {
try {
ForkJoinPool.managedBlock(this);
Throwable ex = getDException();
if (ex != null) throw ex;
} catch (InterruptedException ignore) {
// do nothing
} catch (Throwable re) {
throw (re instanceof DistributedException)?new DistributedException(re.getMessage(),re.getCause()):new DistributedException(re);
}
}
assert _topGlobal:"lost top global flag";
return self();
}
// Return true if blocking is unnecessary, which is true if the Task isDone.
public boolean isReleasable() { return isDone(); }
// Possibly blocks the current thread. Returns true if isReleasable would
// return true. Used by the FJ Pool management to spawn threads to prevent
// deadlock if otherwise all threads would block on waits.
public boolean block() throws InterruptedException {
while( !isDone() ) join();
return true;
}
/** Called once on remote at top level, probably with a subset of the cloud.
* Called internally by D/F/J. Not expected to be user-called. */
@Override public final void dinvoke(H2ONode sender) {
setupLocal0(); // Local setup
H2O.submitTask(this);
}
/*
* Set top-level fields and fire off remote work (if there is any to do) to 2 selected
* child JVM/nodes. Setup for local work: fire off any global work to cloud neighbors; do all
* chunks; call user's init.
*/
private void setupLocal0() {
if(_profile != null)
(_profile = new MRProfile(this))._localstart = System.currentTimeMillis();
// Make a blockable Futures for both internal and user work to block on.
_fs = new Futures();
_topLocal = true;
// Check for global vs local work
int selfidx = selfidx();
int nlo = subShift(selfidx);
assert nlo < _nhi;
final int nmid = (nlo+_nhi)>>>1; // Mid-point
// Run remote IF:
// - Not forced to run local (no remote jobs allowed) AND
// - - There's remote work, or Client mode (always remote work)
if( (!_run_local) && ((nlo+1 < _nhi) || H2O.ARGS.client) ) {
if(_profile!=null) _profile._rpcLstart = System.currentTimeMillis();
_nleft = remote_compute(H2O.ARGS.client ? nlo : nlo+1,nmid);
if(_profile!=null) _profile._rpcRstart = System.currentTimeMillis();
_nrite = remote_compute( nmid,_nhi);
if(_profile!=null) _profile._rpcRdone = System.currentTimeMillis();
} else {
if(_profile!=null)
_profile._rpcLstart = _profile._rpcRstart = _profile._rpcRdone = System.currentTimeMillis();
}
if( _fr != null ) { // Doing a Frame
_lo = 0; _hi = _fr.numCols()==0 ? 0 : _fr.anyVec().nChunks(); // Do All Chunks
// get the Vecs from the K/V store, to avoid racing fetches from the map calls
_fr.vecs();
} else if( _keys != null ) { // Else doing a set of Keys
_lo = 0; _hi = _keys.length; // Do All Keys
}
// Setup any user's shared local structures for both normal cluster nodes
// and any client; want this for possible reduction ONTO client
setupLocal();
if(_profile!=null) _profile._localdone = System.currentTimeMillis();
}
// Make an RPC call to some node in the middle of the given range. Add a
// pending completion to self, so that we complete when the RPC completes.
private RPC<T> remote_compute( int nlo, int nhi ) {
if( nlo < nhi ) { // have remote work
int node = addShift(nlo);
assert node != H2O.SELF.index(); // Not the same as selfidx() if this is a client
T mrt = copyAndInit();
mrt._nhi = (short) nhi;
addToPendingCount(1); // Not complete until the RPC returns
// Set self up as needing completion by this RPC: when the ACK comes back
// we'll get a wakeup.
// Note the subtle inter-play of onCompletion madness here:
// - when run on the remote, the RPCCall (NOT RPC!) is completed by the
// last map/compute2 call, signals end of the remote work, and ACK's
// back the result. i.e., last-map calls RPCCall.onCompletion.
// - when launched on the local (right here, in this next line of code)
// the completed RPC calls our self completion. i.e. the completed RPC
// calls MRTask.onCompletion
return new RPC<>(H2O.CLOUD._memary[node], mrt).addCompleter(this).call();
}
return null; // nlo >= nhi => no remote work
}
/** Called from FJ threads to do local work. The first called Task (which is
* also the last one to Complete) also reduces any global work. Called
* internally by F/J. Not expected to be user-called. */
@Override public final void compute2() {
assert _left == null && _rite == null && _res == null;
if(_profile!=null) _profile._mapstart = System.currentTimeMillis();
if( (_hi-_lo) >= 2 ) { // Multi-chunk case: just divide-and-conquer to 1 chunk
final int mid = (_lo+_hi)>>>1; // Mid-point
_left = copyAndInit();
_rite = copyAndInit();
_left._hi = mid; // Reset mid-point
_rite._lo = mid; // Also set self mid-point
addToPendingCount(1); // One fork awaiting completion
_left.fork(); // Runs in another thread/FJ instance
_rite.compute2(); // Runs in THIS F/J thread
if(_profile!=null) _profile._mapdone = System.currentTimeMillis();
return; // Not complete until the fork completes
}
// Zero or 1 chunks, and even that chunk might not be homed here
if( _fr==null ) { // No Frame, so doing Keys?
if( _keys == null || // Once-per-node mode
_hi > _lo && _keys[_lo].home() ) {
assert(_keys == null || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
if(_profile!=null) _profile._userstart = System.currentTimeMillis();
if( _keys != null ) map(_keys[_lo]);
_res = self(); // Save results since called map() at least once!
if(_profile!=null) _profile._closestart = System.currentTimeMillis();
}
} else if( _hi > _lo ) { // Frame, Single chunk?
Vec v0 = _fr.anyVec();
if( _run_local || v0.chunkKey(_lo).home() ) { // And chunk is homed here?
assert(_run_local || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
// Make decompression chunk headers for these chunks
Vec vecs[] = _fr.vecs();
Chunk bvs[] = new Chunk[vecs.length];
NewChunk [] appendableChunks = null;
for( int i=0; i<vecs.length; i++ ) {
// ... (rest of compute2 and the remainder of the class truncated in this listing)