
// water.Job Maven / Gradle / Ivy
package water;
import jsr166y.CountedCompleter;
import java.util.Arrays;
import hex.ModelBuilder;
import water.H2O.H2OCountedCompleter;
import water.exceptions.H2OKeyNotFoundArgumentException;
import water.util.Log;
import water.util.StringUtils;
/** Jobs are used to do minimal tracking of long-lifetime user actions,
* including progress-bar updates and the ability to review in progress or
* completed Jobs, and cancel currently running Jobs.
*
* Jobs are {@link Keyed}, because they need a Key to control e.g. atomic updates.
* Jobs produce a {@link Keyed} result, such as a Frame (from Parsing), or a Model.
* NOTE: the Job class is parametrized on the type of its _dest field.
*/
public class Job extends Keyed {
/** A system key for the global list of Job keys. The list is kept in the DKV
 *  under this key and is appended to whenever a Job is started. */
public static final Key LIST = Key.make(" JobList", (byte) 0, Key.BUILT_IN_KEY, false);
/** A list of field validation issues; grown by one entry per call to message(). */
public ValidationMessage[] _messages = new ValidationMessage[0];
// Count of ERROR-level messages only; WARN, INFO and HIDE messages are not counted.
// -1 ==> init not run yet, for those Jobs that have an init, like ModelBuilder.
private int _error_count = -1;
/**
 * Copies this Job's current validation messages onto the copy of the Job that
 * is stored in the DKV, so a client polling the Job can see them while it runs
 * and after it completes.
 *
 * init(expensive) is called inside a DTask, not from the http request thread,
 * which is why messages added there have to be pushed to the DKV explicitly.
 *
 * If no Job is stored under this Job's key (it has not been put into the DKV
 * yet), the atomic update throws an H2OKeyNotFoundArgumentException naming
 * that key.
 *
 * NOTE: this should only be called when no other threads are updating the job,
 * for example from init() or after the DTask is stopped and is getting cleaned up.
 */
public void updateValidationMessages() {
  // Capture the key in a local so the exception below can name the Job that
  // is missing, instead of reporting a null key.
  final Key jobKey = _key;
  // Atomically update the validation messages in the Job in the DKV.
  new TAtomic<Job>() {
    @Override public Job atomic(Job old) {
      if( old == null ) throw new H2OKeyNotFoundArgumentException(jobKey);
      old._messages = _messages;
      return old;
    }
  }.invoke(jobKey);
}
/** Number of ERROR-level validation messages recorded so far. A negative
 *  internal count (the "init not run yet" marker) is reported as zero. */
public int error_count() {
  return Math.max(_error_count, 0);
}
/** The raw internal error count, returned as-is, including a negative value
 *  when it has not been initialized. */
public int error_count_or_uninitialized() {
  return _error_count;
}
/** Records a HIDE-level validation message for the given field.
 *  @param field_name name of the field the message refers to
 *  @param message    text of the message */
public void hide(String field_name, String message) {
  message(ValidationMessage.MessageType.HIDE, field_name, message);
}
/** Records an INFO-level validation message for the given field.
 *  @param field_name name of the field the message refers to
 *  @param message    text of the message */
public void info(String field_name, String message) {
  message(ValidationMessage.MessageType.INFO, field_name, message);
}
/** Records a WARN-level validation message for the given field.
 *  @param field_name name of the field the message refers to
 *  @param message    text of the message */
public void warn(String field_name, String message) {
  message(ValidationMessage.MessageType.WARN, field_name, message);
}
public void error(String field_name, String message) { message(ValidationMessage.MessageType.ERROR, field_name, message); _error_count++; }
/** Discards every recorded validation message (of any level) and resets the
 *  error count to zero. */
public void clearValidationErrors() {
  _error_count = 0;
  _messages = new ValidationMessage[0];
}
/** Appends one validation message to the end of {@code _messages}.
 *  @param message_type severity/kind of the message
 *  @param field_name   name of the field the message refers to
 *  @param message      text of the message */
public void message(ValidationMessage.MessageType message_type, String field_name, String message) {
  final int slot = _messages.length;            // index the new entry will occupy
  _messages = Arrays.copyOf(_messages, slot + 1);
  _messages[slot] = new ValidationMessage(message_type, field_name, message);
}
/** Builds a string holding only the ERROR-level validation messages, one per
 *  line (e.g., to use in an exception throw).
 *  @return the ERROR messages, each followed by a newline; empty if there are none */
public String validationErrors() {
  final StringBuilder errors = new StringBuilder();
  for( ValidationMessage msg : _messages ) {
    if( msg.message_type != ValidationMessage.MessageType.ERROR ) continue; // skip non-errors
    errors.append(msg.toString());
    errors.append("\n");
  }
  return errors.toString();
}
/** Holder for the keys of all known Jobs; always keyed by LIST. */
private static class JobList extends Keyed {
  Key[] _jobs;

  /** Creates an empty job list. */
  JobList() {
    this(new Key[0]);
  }

  /** Creates a job list holding exactly the given keys. */
  private JobList(Key[] jobs) {
    super(LIST);
    _jobs = jobs;
  }

  /** A job list has no checksum; this always fails. */
  @Override protected long checksum_impl() {
    throw H2O.fail("Joblist checksum does not exist by definition");
  }
}
/** The list of all Jobs, past and present.
* @return The list of all Jobs, past and present */
public static Job[] jobs() {
final Value val = DKV.get(LIST);
if( val==null ) return new Job[0];
JobList jl = val.get();
Job[] jobs = new Job[jl._jobs.length];
int j=0;
for( int i=0; i _dest; // Key for result
/** Getter for the destination key {@code _dest}.
 *  @return the key this Job's result is produced into */
public final Key dest() {
  return _dest;
}
/** Getter for this Job's own key.
 *  @return the key this Job is stored under */
public final Key jobKey() {
  return _key;
}
/** User description of this Job, supplied at construction */
public String _description;
/** Job start time, taken from System.currentTimeMillis() */
public long _start_time; // Job started
/** Job end time, taken from System.currentTimeMillis(), or 0 if not ended */
public long _end_time; // Job end time, or 0 if not ended
/** Message recorded when the Job stopped: for a failure, the exception
 *  description and stack trace; null if none */
public String _exception; // Unpacked exception & stack trace
/** Possible job states. These are ORDERED; state levels can increase but
 *  never decrease. The declaration order below is significant: state changes
 *  are compared by ordinal, so do not reorder these constants. */
public enum JobState {
CREATED, // Job was created
RUNNING, // Job is running
DONE, // Job was successfully finished
CANCELLED, // Job was cancelled by user
FAILED, // Job crashed, error message/exception is available
}
/** Current state of this Job; CREATED on construction. */
public JobState _state;
/** Tells whether this job ended abnormally, i.e. was cancelled by the user or crashed.
 * @return true if the job is in state {@link JobState#CANCELLED} or {@link JobState#FAILED} */
public boolean isCancelledOrCrashed() {
  if( _state == JobState.CANCELLED ) return true;
  return _state == JobState.FAILED;
}
/** Tells whether this job is currently running.
 * @return true only if this job is in state {@link JobState#RUNNING} */
public boolean isRunning() {
  return JobState.RUNNING == _state;
}
/** Tells whether this job finished successfully.
 * @return true if the job is in state {@link JobState#DONE} */
public boolean isDone() {
  return JobState.DONE == _state;
}
/** Tells whether this job has stopped, whether successfully, by cancellation or by failure.
 * @return true if the job is DONE, CANCELLED or FAILED */
public boolean isStopped() {
  if( _state == JobState.DONE ) return true;
  return isCancelledOrCrashed();
}
/** Check if given job is running.
 * A key that no longer resolves to a Job (for example because the Job has
 * been removed) is reported as not running rather than raising an exception.
 * @param job_key job key
 * @return true if job is still running else returns false. */
public static boolean isRunning(Key job_key) {
  final Object job = job_key.get();
  // instanceof is false for null, so a missing Job falls through to "not running"
  return job instanceof Job && ((Job) job).isRunning();
}
/** Runtime of this Job in milliseconds.
 * @return 0 while the Job is only CREATED; time elapsed since the start while
 *         RUNNING; otherwise the difference between end time and start time */
public final long msec() {
  final long elapsed;
  switch( _state ) {
    case CREATED:
      elapsed = 0;
      break;
    case RUNNING:
      elapsed = System.currentTimeMillis() - _start_time;
      break;
    default:
      elapsed = _end_time - _start_time;
      break;
  }
  return elapsed;
}
/** Creates a Job stored under an explicit job key.
 * @param jobKey key this Job itself is stored under
 * @param dest   key of the result this Job will produce
 * @param desc   user description */
private Job(Key jobKey, Key dest, String desc) {
  super(jobKey);
  _state = JobState.CREATED; // Not running until start() is called
  _dest = dest;
  _description = desc;
}
/** Create a Job under a freshly made default job key.
 * @param dest Final result Key to be produced by this Job
 * @param desc String description
 */
public Job(Key dest, String desc) {
  this(defaultJobKey(), dest, desc);
}
// Makes a new key for a Job. Job Keys are pinned to this node (i.e., the node
// that invoked the computation), because the Job should almost always be
// updated locally.
private static Key defaultJobKey() {
  return Key.make((byte) 0, Key.JOB, false, H2O.SELF);
}
/** Start this task based on given top-level fork-join task representing job computation.
*
* In order, this method: optionally creates the Progress record in the DKV,
* installs a barrier task as the completer of fjtask, marks the Job RUNNING,
* stores the Job in the DKV, appends its key to the global job list, and
* finally submits fjtask for execution.
*
* @param fjtask top-level job computation task; asserted to be non-null and to have no completer yet.
* @param work Units of work to be completed; a Progress record is created only when this is non-negative.
* @param restartTimer if true the Job start time is set to the current time, otherwise it is left untouched.
* @return this job in {@link JobState#RUNNING} state
*
* @see JobState
* @see H2OCountedCompleter
*/
protected Job start(final H2OCountedCompleter fjtask, long work, boolean restartTimer) {
// A negative amount of work means "no progress tracking": no progress key is made.
// NOTE(review): this DKV write happens before the sanity assertions below run.
if (work >= 0)
DKV.put(_progressKey = createProgressKey(), new Progress(work));
assert _state == JobState.CREATED : "Trying to run job which was already run?";
assert fjtask != null : "Starting a job with null working task is not permitted!";
assert fjtask.getCompleter() == null : "Cannot have a completer; this must be a top-level task";
_fjtask = fjtask;
// Make a wrapper task that only *starts* when the fjtask completes -
// including when the fjtask completes exceptionally... thus the fjtask
// onExceptionalCompletion code runs completely before this empty task
// starts - providing a simple barrier. Threads blocking on the job will
// block on the "barrier" task, which will block until the fjtask runs the
// onCompletion or onExceptionalCompletion code.
_barrier = new H2OCountedCompleter() {
@Override public void compute2() { }
@Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
if( getCompleter() == null ) { // nobody else to handle this exception, so print it out
System.err.println("barrier onExCompletion for "+fjtask);
ex.printStackTrace();
// Record the failure on the Job itself so its state becomes FAILED
Job.this.failed(ex);
}
return true;
}
};
fjtask.setCompleter(_barrier);
if (restartTimer) _start_time = System.currentTimeMillis();
_state = JobState.RUNNING;
// Save the full state of the job
DKV.put(_key, this);
// Update job list: atomically append this Job's key, creating the list on first use.
// The key is captured in a local for use inside the anonymous task.
final Key jobkey = _key;
// NOTE(review): TAtomic is used raw here although atomic() is overridden with a
// JobList parameter - presumably a type argument is missing; confirm.
new TAtomic() {
@Override public JobList atomic(JobList old) {
if( old == null ) old = new JobList();
Key[] jobs = old._jobs;
old._jobs = Arrays.copyOf(jobs, jobs.length + 1);
old._jobs[jobs.length] = jobkey;
return old;
}
}.invoke(LIST);
// Only now hand the computation over for execution
H2O.submitTask(fjtask);
return this;
}
/** Makes the key under which start() stores this Job's Progress record.
 *  @return a newly made key */
protected Key createProgressKey() {
  return Key.make();
}
/** Whether the progress key is removed from the DKV when the Job changes to a
 *  stopped state.
 *  @return true in this base implementation */
protected boolean deleteProgressKey() {
  return true;
}
/** Blocks until this job has finished and returns its result.
*
* This call blocks (via {@link #block}) on the working task which was passed
* to the {@link #start} method, and then returns the result which is fetched
* from the DKV based on the job destination key.
*
* @return result of this job fetched from DKV by destination key.
* @see #start
* @see DKV
*/
public T get() {
block();
// After the barrier has been joined the Job must have left the RUNNING state
assert !isRunning() : "Job state should not be running, but it is " + _state;
return _dest.get();
}
/**
 * Blocks until the job completes, without returning anything, because the
 * destination object might not yet be finished.
 */
public void block() {
  assert _fjtask != null : "Cannot block on missing F/J task";
  // Join the *barrier* task rather than the fjtask itself: the barrier only
  // completes once the fjtask on*Completion code has run completely.
  _barrier.join();
}
/** Marks the job as finished and records the job end time, provided
 *  {@link #canBeDone()} allows it. Equivalent to {@code done(false)}. */
public void done() {
  done(false);
}
/**
 * Conditionally mark the job as finished and record job end time.
 * @param force If true, always mark the job as finished; if false, first ask
 *              canBeDone() whether the job may be marked as finished
 */
protected void done(boolean force) {
  if( !force && !canBeDone() ) return; // not forced and not allowed: leave state alone
  changeJobState(null, JobState.DONE);
}
/**
 * Hook that lets subclasses (e.g. ModelBuilders) veto marking the job as finished.
 * @return whether the job should be marked as finished in done() or done(false);
 *         always true in this base implementation
 */
protected boolean canBeDone() {
  return true;
}
/** Signal cancellation of this job.
 * The job is switched to state {@link JobState#CANCELLED}, which signals that
 * it was cancelled by a user; no message is recorded. */
public void cancel() {
  changeJobState(null, JobState.CANCELLED);
}
/** Signal that this job terminated because of an exception.
 * The job is switched to state {@link JobState#FAILED} and a message holding
 * the exception class, its message and its stack trace is recorded.
 * @param ex exception causing the termination of job. */
public void failed(Throwable ex) {
  final String trace = StringUtils.toString(ex);
  final String report =
      "Got exception '" + ex.getClass() + "', with msg '" + ex.getMessage() + "'\n" + trace;
  changeJobState(report, JobState.FAILED);
  //if(_fjtask != null && !_fjtask.isDone()) _fjtask.completeExceptionally(ex);
}
/** Signal cancellation of this job with an optional explanation.
 * With a non-null message the job is switched to {@link JobState#FAILED} and
 * the message is recorded; with a null message it is switched to
 * {@link JobState#CANCELLED}.
 * @param msg cancellation message explaining reason for cancellation, or null */
public void cancel(final String msg) {
  final JobState target = (msg != null) ? JobState.FAILED : JobState.CANCELLED;
  changeJobState(msg, target);
}
// Moves this Job to a stopped state (DONE, CANCELLED or FAILED - never RUNNING),
// recording the message, end time and final progress both on the copy stored
// in the DKV and on this local object, then removes the progress key.
//   msg            - message stored in _exception; may be null
//   resultingState - state to move to
private void changeJobState(final String msg, final JobState resultingState) {
assert resultingState != JobState.RUNNING;
if( _state == JobState.CANCELLED ) Log.info("Canceled job " + _key + "(" + _description + ") was cancelled again.");
if( _state == resultingState ) return; // No change if already in the requested state
// A DONE job is by definition at 100%; otherwise capture whatever progress was
// reached, because the Progress Key is no longer needed once the job stops.
final float finalProgress = resultingState==JobState.DONE ? 1.0f : progress_impl(); // One-shot set from NaN to progress, no longer need Progress Key
final long done = System.currentTimeMillis();
// Atomically record the new state on the Job stored in the DKV.
// NOTE(review): TAtomic is used raw here although atomic() is overridden with a
// Job parameter - presumably a type argument is missing; confirm.
new TAtomic() {
@Override public Job atomic(Job old) {
// NOTE(review): returning null presumably leaves the stored value unchanged - confirm.
if( old == null ) return null; // Job already removed
// States monotonically increase; states can increase but not decrease
if( resultingState.ordinal() <= old._state.ordinal() ) return null;
// Atomically capture the new state and its message, plus end time and final progress
old._exception = msg;
old._state = resultingState;
old._end_time = done;
old._finalProgress = finalProgress;
return old;
}
}.invoke(_key);
// Also immediately update a possibly cached local POJO (might be shared with
// the DKV cached job, might not).
// NOTE(review): unlike the atomic update above, this local update is not
// guarded by the ordinal check - verify that is intended.
if( this != DKV.getGet(_key) ) {
_exception = msg;
_state = resultingState;
_end_time = done;
_finalProgress = finalProgress;
}
// Remove on cancel/fail/done, only used whilst Job is Running.
// NOTE(review): _progressKey is only assigned in start() when work >= 0, so it
// may be null here - confirm DKV.remove tolerates a null key.
if (deleteProgressKey())
DKV.remove(_progressKey);
}
/** Returns a float from 0 to 1 representing progress. Polled periodically.
 * Once the job has stopped this is the final progress captured at that
 * moment; before that it is read from the Progress record (0 if there is none). */
public float progress() {
  if( isStopped() ) return _finalProgress;
  return progress_impl();
}
// Read racy progress in a non-racy way: the progress key field and the DKV
// are each read exactly once, null-checking at every step. Handles the case
// where the Job is being removed exactly when we are reading progress e.g.
// for the GUI. Returns null when there is no progress key or no stored value.
private Progress getProgress() {
  final Key key = _progressKey;       // single read of the field
  if( key == null ) return null;
  final Value stored = DKV.get(key);  // single read of the DKV
  if( stored == null ) return null;
  return (Progress) stored.get();
}
// Looks up the Progress object via the DKV and returns its progress value,
// or 0 when no Progress object is available.
private float progress_impl() {
  final Progress current = getProgress();
  if( current == null ) return 0f;
  return current.progress();
}
/** Returns the last progress message. For a stopped job this is the name of
 *  its state; otherwise it is the message held by the Progress record. */
public String progress_msg() {
  if( isStopped() ) return _state.toString();
  return progress_msg_impl();
}
// Looks up the Progress object via the DKV and returns its message, or the
// empty string when no Progress object is available.
private String progress_msg_impl() {
  final Progress current = getProgress();
  if( current == null ) return "";
  return current.progress_msg();
}
protected Key