
/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.master.scheduler;

import static java.lang.String.format;

import alluxio.Constants;
import alluxio.annotation.SuppressFBWarnings;
import alluxio.client.block.stream.BlockWorkerClient;
import alluxio.client.file.FileSystemContext;
import alluxio.collections.ConcurrentHashSet;
import alluxio.conf.Configuration;
import alluxio.conf.PropertyKey;
import alluxio.exception.runtime.AlluxioRuntimeException;
import alluxio.exception.runtime.InternalRuntimeException;
import alluxio.exception.runtime.NotFoundRuntimeException;
import alluxio.exception.runtime.ResourceExhaustedRuntimeException;
import alluxio.exception.runtime.UnavailableRuntimeException;
import alluxio.grpc.JobProgressReportFormat;
import alluxio.job.JobDescription;
import alluxio.metrics.MetricKey;
import alluxio.metrics.MetricsSystem;
import alluxio.resource.CloseableResource;
import alluxio.scheduler.job.Job;
import alluxio.scheduler.job.JobMetaStore;
import alluxio.scheduler.job.JobState;
import alluxio.scheduler.job.Task;
import alluxio.util.ThreadFactoryUtils;
import alluxio.util.ThreadUtils;
import alluxio.wire.WorkerInfo;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.MoreObjects;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import javax.annotation.concurrent.ThreadSafe;

/**
 * The Scheduler which controls jobs. It is responsible for managing active workers, updating
 * jobs, and persisting job information to the job meta store.
 * The workflow is:
 *  1. Submit a job to the scheduler.
 *  2. The scheduler will pull the task from the job and assign the task to a worker.
 *  3. The worker will execute the task and report the result to the job.
 *  4. The job will update its progress and schedule the next task if the job is not done.
 *  5. One worker would have one task running for one job description at a time.
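 *
 * <p>A minimal usage sketch (illustrative only; assumes {@code fsCtx}, {@code workerProvider}
 * and {@code jobMetaStore} are already constructed, and {@code myJob} is some
 * {@link alluxio.scheduler.job.Job} implementation):
 * <pre>{@code
 * Scheduler scheduler = new Scheduler(fsCtx, workerProvider, jobMetaStore);
 * scheduler.start();
 * scheduler.submitJob(myJob);
 * String progress = scheduler.getJobProgress(
 *     myJob.getDescription(), JobProgressReportFormat.TEXT, false);
 * scheduler.stop();
 * }</pre>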
 */
@ThreadSafe
@SuppressFBWarnings({"SE_NO_SERIALVERSIONID"})
public final class Scheduler {

  private static final Logger LOG = LoggerFactory.getLogger(Scheduler.class);
  private static final int CAPACITY = 100;
  private static final int MAX_TASK_PER_WORKER = 10;
  private static final long WORKER_UPDATE_INTERVAL = Configuration.getMs(
      PropertyKey.MASTER_WORKER_INFO_CACHE_REFRESH_TIME);
  private final long mSchedulerInitialDelay = Configuration.getMs(
      PropertyKey.MASTER_SCHEDULER_INITIAL_DELAY
  );
  private static final int EXECUTOR_SHUTDOWN_MS = 10 * Constants.SECOND_MS;
  private static AtomicReference<Scheduler> sInstance = new AtomicReference<>();
  private final Map<JobDescription, Job<?>> mExistingJobs = new ConcurrentHashMap<>();
  private final Map<Job<?>, ConcurrentHashSet<Task<?>>> mJobToRunningTasks =
      new ConcurrentHashMap<>();
  private final JobMetaStore mJobMetaStore;
  // initialized in the start method since we stop and restart the thread when gaining primacy
  private ScheduledExecutorService mSchedulerExecutor;
  private volatile boolean mRunning = false;
  private final FileSystemContext mFileSystemContext;
  private final WorkerInfoHub mWorkerInfoHub;

  /**
   * Constructor.
   *
   * @param fsCtx file system context
   * @param workerProvider worker provider
   * @param jobMetaStore job meta store
   */
  public Scheduler(FileSystemContext fsCtx, WorkerProvider workerProvider,
      JobMetaStore jobMetaStore) {
    mFileSystemContext = fsCtx;
    mJobMetaStore = jobMetaStore;
    MetricsSystem.registerCachedGaugeIfAbsent(
        MetricKey.MASTER_JOB_SCHEDULER_RUNNING_COUNT.getName(), mJobToRunningTasks::size);
    mWorkerInfoHub = new WorkerInfoHub(this, workerProvider);
    // the scheduler won't be instantiated twice
    sInstance.compareAndSet(null, this);
  }

  /**
   * Get the singleton instance of Scheduler.
   * getInstance is not expected to be called before the constructor has run.
   * @return Scheduler instance
   */
  public static @Nullable Scheduler getInstance() {
    return sInstance.get();
  }

  /**
   * Start scheduler.
   */
  public void start() {
    if (!mRunning) {
      retrieveJobs();
      mSchedulerExecutor = Executors.newSingleThreadScheduledExecutor(
          ThreadFactoryUtils.build("scheduler", false));
      mSchedulerExecutor.scheduleAtFixedRate(mWorkerInfoHub::updateWorkers, 0,
          WORKER_UPDATE_INTERVAL, TimeUnit.MILLISECONDS);
      mSchedulerExecutor.scheduleWithFixedDelay(this::processJobs, mSchedulerInitialDelay, 2000,
          TimeUnit.MILLISECONDS);
      mSchedulerExecutor.scheduleWithFixedDelay(this::cleanupStaleJob, 1, 1, TimeUnit.HOURS);
      mRunning = true;
    }
  }

  /**
   * Update workers.
   */
  public void updateWorkers() {
    mWorkerInfoHub.updateWorkers();
  }

  /*
   TODO(lucy) in the future we should remove jobs automatically, but keep all historical jobs
   in the db to help users retrieve the status of all submitted jobs.
   */

  private void retrieveJobs() {
    for (Job<?> job : mJobMetaStore.getJobs()) {
      mExistingJobs.put(job.getDescription(), job);
      if (job.isDone()) {
        mJobToRunningTasks.remove(job);
      } else {
        job.initializeJob();
        mJobToRunningTasks.put(job, new ConcurrentHashSet<>());
      }
    }
  }

  /**
   * Stop scheduler.
   */
  public void stop() {
    if (mRunning) {
      mWorkerInfoHub.mActiveWorkers.values().forEach(CloseableResource::close);
      mWorkerInfoHub.mActiveWorkers = ImmutableMap.of();
      ThreadUtils.shutdownAndAwaitTermination(mSchedulerExecutor, EXECUTOR_SHUTDOWN_MS);
      mExistingJobs.clear();
      mJobToRunningTasks.clear();
      mWorkerInfoHub.mWorkerToTaskQ.clear();
      mRunning = false;
    }
  }

  /**
   * Submit a job.
   * @param job the job
   * @return true if the job is new, false if the job has already been submitted
   * @throws ResourceExhaustedRuntimeException if the job cannot be submitted because the scheduler
   *  is at capacity
   * @throws UnavailableRuntimeException if the job cannot be submitted because the meta store is
   * not ready
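   *
   * <p>For example (illustrative; {@code loadJob} stands for any concrete {@link Job}):
   * <pre>{@code
   * if (scheduler.submitJob(loadJob)) {
   *   // first submission for this job description
   * } else {
   *   // an equivalent job already exists (a STOPPED one is restarted)
   * }
   * }</pre>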
   */
  public synchronized boolean submitJob(Job<?> job) {
    Job<?> existingJob = mExistingJobs.get(job.getDescription());
    if (existingJob != null && !existingJob.isDone()) {
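      // A STOPPED job with the same description is resumed rather than re-created.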
      mJobToRunningTasks.compute(existingJob, (k, v) -> {
        if (k.getJobState() == JobState.STOPPED) {
          k.setJobState(JobState.RUNNING, true);
          LOG.debug(format("restart existing job: %s", existingJob));
          return new ConcurrentHashSet<>();
        }
        return v;
      });
      return false;
    }

    if (mJobToRunningTasks.size() >= CAPACITY) {
      throw new ResourceExhaustedRuntimeException("Too many jobs running, please submit later.",
          true);
    }
    ConcurrentHashSet<Task<?>> result =
        mJobToRunningTasks.putIfAbsent(job, new ConcurrentHashSet<>());
    if (result != null) {
      LOG.warn("There's concurrent submit while job is still in cleaning state");
      return false;
    }
    mJobMetaStore.updateJob(job);
    mExistingJobs.put(job.getDescription(), job);
    job.initializeJob();
    LOG.info(format("start job: %s", job));
    return true;
  }

  /**
   * Stop a job.
   * @param jobDescription job identifier
   * @return true if the job is stopped, false if the job does not exist or has already finished
   */
  public boolean stopJob(JobDescription jobDescription) {
    Job<?> existingJob = mExistingJobs.get(jobDescription);
    if (existingJob != null && existingJob.isRunning()) {
      existingJob.setJobState(JobState.STOPPED, true);
      // leftover tasks in mJobToRunningTasks would be removed by scheduling thread.
      return true;
    }
    return false;
  }

  /**
   * Get the job's progress report.
   * @param jobDescription job identifier
   * @param format progress report format
   * @param verbose whether to include details on failed files and failures
   * @return the progress report
   * @throws NotFoundRuntimeException if the job does not exist
   * @throws AlluxioRuntimeException if any other Alluxio exception occurs
   */
  public String getJobProgress(
      JobDescription jobDescription,
      JobProgressReportFormat format,
      boolean verbose) {
    Job<?> job = mExistingJobs.get(jobDescription);
    if (job == null) {
      throw new NotFoundRuntimeException(format("%s cannot be found.", jobDescription));
    }
    return job.getProgress(format, verbose);
  }

  /**
   * Get the job's state.
   * @param jobDescription job identifier
   * @return the job state
   * @throws NotFoundRuntimeException if the job does not exist
   */
  public JobState getJobState(JobDescription jobDescription) {
    Job<?> job = mExistingJobs.get(jobDescription);
    if (job == null) {
      throw new NotFoundRuntimeException(format("%s cannot be found.", jobDescription));
    }
    return job.getJobState();
  }

  /**
   * Get active workers.
   * @return active workers
   */
  @VisibleForTesting
  public Set<WorkerInfo> getActiveWorkers() {
    return mWorkerInfoHub.mActiveWorkers.keySet().stream()
        .map(x -> x.mWorkerInfo).collect(Collectors.toSet());
  }

  /**
   * Removes all finished jobs outside the retention time.
   */
  @VisibleForTesting
  public void cleanupStaleJob() {
    long current = System.currentTimeMillis();
    mExistingJobs.entrySet().removeIf(job -> !job.getValue().isRunning()
        && job.getValue().getEndTime().isPresent()
        && job.getValue().getEndTime().getAsLong() <= (current - Configuration.getMs(
            PropertyKey.JOB_RETENTION_TIME)));
  }

  /**
   * Get jobs.
   *
   * @return jobs
   */
  @VisibleForTesting
  public Map<JobDescription, Job<?>> getJobs() {
    return mExistingJobs;
  }

  private void processJobs() {
    if (Thread.currentThread().isInterrupted()) {
      return;
    }
    mJobToRunningTasks.forEach((k, v) -> processJob(k));
    // kick start the head task from each worker's queue if it's not already running
    mWorkerInfoHub.kickStartTasks();
  }

  private void processJob(Job<?> job) {
    ConcurrentHashSet<Task<?>> runningTasks = mJobToRunningTasks.compute(job, (k, v) -> {
      if (!k.isRunning()) {
        return null;
      }
      return v;
    });
    // job is not running anymore
    if (runningTasks == null) {
      return;
    }

    if (!job.isHealthy()) {
      job.failJob(new InternalRuntimeException("Job failed because it's not healthy."));
      return;
    }

    try {
      List<Task<?>> tasks;
      try {
        Set<WorkerInfo> workers = mWorkerInfoHub.mActiveWorkers.keySet()
            .stream().map(x -> x.mWorkerInfo).collect(Collectors.toSet());
        tasks = (List<Task<?>>) job.getNextTasks(workers);
      } catch (AlluxioRuntimeException e) {
        LOG.warn(format("error getting next task for job %s", job), e);
        if (!e.isRetryable()) {
          job.failJob(e);
        }
        return;
      }
      // enqueue the worker task q
      for (Task<?> task : tasks) {
        boolean taskEnqueued = getWorkerInfoHub().enqueueTaskForWorker(
            task.getMyRunningWorker(), task);
        if (!taskEnqueued) {
          job.onTaskSubmitFailure(task);
        }
      }
      mJobToRunningTasks.compute(job, (k, v) -> {
        if ((v == null || v.isEmpty()) && k.isCurrentPassDone()) {
          checkAndSetJobStatus(k);
        }
        return v;
      });
    } catch (Exception e) {
      // Unknown exception. This should not happen, but if it happens we don't want to lose the
      // scheduler thread, thus catching it here. Any exception surfaced here should be properly
      // handled.
      LOG.error("Unexpected exception thrown in processJob.", e);
      job.failJob(new InternalRuntimeException(e));
    }
  }

  private static void checkAndSetJobStatus(Job<?> job) {
    if (job.needVerification()) {
      job.initiateVerification();
    } else {
      if (job.isHealthy()) {
        if (job.hasFailure()) {
          job.failJob(new InternalRuntimeException("Job partially failed."));
        } else {
          job.setJobSuccess();
        }
      } else {
        if (job.getJobState() != JobState.FAILED) {
          job.failJob(new InternalRuntimeException(
              "Job failed because it exceeded the healthy threshold."));
        }
      }
    }
  }

  /**
   * Get the workerinfo hub.
   * @return worker info hub
   */
  public WorkerInfoHub getWorkerInfoHub() {
    return mWorkerInfoHub;
  }

  /**
   * Get job meta store.
   * @return jobmetastore
   */
  public JobMetaStore getJobMetaStore() {
    return mJobMetaStore;
  }

  /**
   * Bounded priority queue impl.
   * @param <E> the element type
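   *
   * <p>A minimal behavior sketch (illustrative; as a non-static inner class it needs an
   * enclosing {@code Scheduler} instance {@code scheduler}):
   * <pre>{@code
   * BoundedPriorityBlockingQueue<Integer> q = scheduler.new BoundedPriorityBlockingQueue<>(1);
   * q.offer(1); // true
   * q.offer(2); // false, at capacity
   * q.poll();   // returns 1 and frees a slot
   * }</pre>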
   */
  @SuppressFBWarnings({"SE_BAD_FIELD_INNER_CLASS", "SE_NO_SERIALVERSIONID"})
  public class BoundedPriorityBlockingQueue<E> extends PriorityBlockingQueue<E> {

    private AtomicInteger mLen = new AtomicInteger(0);
    private final int mCapacity;

    /**
     * Constructor for a bounded priority queue with a maximum capacity.
     * @param capacity the maximum number of elements the queue may hold
     */
    public BoundedPriorityBlockingQueue(int capacity) {
      mCapacity = capacity;
    }

    @Override
    public boolean offer(E e) {
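      // Optimistically claim a slot; roll the count back if capacity was exceeded.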
      if (mLen.incrementAndGet() > mCapacity) {
        mLen.decrementAndGet();
        return false;
      }
      // this will always return true
      return super.offer(e);
    }

    @Override
    public E poll() {
      E e = super.poll();
      if (e != null) {
        mLen.decrementAndGet();
      }
      return e;
    }

    @Override
    public boolean remove(Object o) {
      boolean removed = super.remove(o);
      if (removed) {
        mLen.decrementAndGet();
      }
      return removed;
    }
  }

  /**
   * Utility class for tracking the unique identity of a worker. The
   * WorkerInfo class uses constantly changing fields such as
   * mLastContactSec in equals(), so it cannot serve as a map key.
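   *
   * <p>For example (illustrative), two snapshots of the same worker taken at different
   * times differ by WorkerInfo.equals() but are equal as identities:
   * <pre>{@code
   * WorkerInfoIdentity a = new WorkerInfoIdentity(infoAtTime0);
   * WorkerInfoIdentity b = new WorkerInfoIdentity(infoAtTime1); // newer mLastContactSec
   * boolean same = a.equals(b); // true when id and address match
   * }</pre>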
   */
  public static class WorkerInfoIdentity {
    public final WorkerInfo mWorkerInfo;

    /**
     * Constructor for WorkerInfoIdentity from WorkerInfo.
     * @param workerInfo the worker info to wrap
     */
    public WorkerInfoIdentity(WorkerInfo workerInfo) {
      mWorkerInfo = workerInfo;
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(mWorkerInfo.getId(), mWorkerInfo.getAddress());
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }
      WorkerInfoIdentity anotherO = (WorkerInfoIdentity) o;
      return mWorkerInfo.getAddress().equals(anotherO.mWorkerInfo.getAddress())
          && mWorkerInfo.getId() == anotherO.mWorkerInfo.getId();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper(this)
          .add("id", mWorkerInfo.getId())
          .add("address", mWorkerInfo.getAddress())
          .toString();
    }
  }

  /**
   * Worker information hub.
   */
  @SuppressFBWarnings(value = "NP_NULL_ON_SOME_PATH_FROM_RETURN_VALUE",
      justification = "Already performed null check")
  public class WorkerInfoHub {
    public Map<WorkerInfoIdentity, CloseableResource<BlockWorkerClient>>
        mActiveWorkers = ImmutableMap.of();
    private final WorkerProvider mWorkerProvider;

    /**
     * Constructor.
     * @param scheduler scheduler
     * @param workerProvider worker provider
     */
    public WorkerInfoHub(Scheduler scheduler, WorkerProvider workerProvider) {
      mWorkerProvider = workerProvider;
    }

    private final Map<WorkerInfoIdentity, BoundedPriorityBlockingQueue<Task<?>>> mWorkerToTaskQ
        = new ConcurrentHashMap<>();

    /**
     * Kick start tasks for each worker task queue.
     */
    public void kickStartTasks() {
      // Kick off one task for each worker
      mWorkerToTaskQ.forEach((workerInfo, tasksQ) -> {
        LOG.debug("Kick start task for worker:{}, taskQ size:{}",
            workerInfo.mWorkerInfo.getAddress().getHost(),
            tasksQ.size());
        CloseableResource<BlockWorkerClient> blkWorkerClientResource
            = mActiveWorkers.get(workerInfo);
        Task<?> task = tasksQ.peek();
        // make sure only 1 task is running at a time
        if (task == null || task.getResponseFuture() != null) {
          LOG.debug("head task is {}", (task == null) ? "NULL" : "already running");
          return;
        }
        if (blkWorkerClientResource == null) {
          LOG.warn("Didn't find corresponding BlockWorkerClient for workerInfo:{}",
              workerInfo);
          task.getJob().onWorkerUnavailable(task);
          return;
        }
        task.execute(blkWorkerClientResource.get(), workerInfo.mWorkerInfo);
        task.getResponseFuture().addListener(() -> {
          Job<?> job = task.getJob();
          try {
            job.processResponse(task); // retry on failure logic inside
            // TODO(lucy) currently processJob is only called in the single
            // threaded scheduler thread context, in future once tasks are
            // completed, they should be able to call processJob to resume
            // their own job to schedule next set of tasks to run.
          } catch (Exception e) {
            // Unknown exception. This should not happen, but if it happens we don't
            // want to lose the worker thread, thus catching it here. Any exception
            // surfaced here should be properly handled.
            LOG.error("Unexpected exception thrown in response future listener.", e);
            job.failJob(new InternalRuntimeException(e));
          } finally {
            tasksQ.remove(task);
            mJobToRunningTasks.compute(job, (k, v) -> {
              if (v == null) {
                return null;
              }
              v.remove(task);
              return v;
            });
          }
        }, mSchedulerExecutor);
      });
    }

    /**
     * Enqueue task for worker.
     * @param workerInfo the worker
     * @param task the task
     * @return whether the task is enqueued successfully
     */
    public boolean enqueueTaskForWorker(@Nullable WorkerInfo workerInfo, Task<?> task) {
      if (workerInfo == null) {
        return false;
      }
      BoundedPriorityBlockingQueue<Task<?>> workerTaskQ = mWorkerToTaskQ
          .computeIfAbsent(new WorkerInfoIdentity(workerInfo),
              k -> new BoundedPriorityBlockingQueue<>(MAX_TASK_PER_WORKER));
      if (!workerTaskQ.offer(task)) {
        LOG.debug("Exceeded maximum task per q[{}] for worker:{}",
            MAX_TASK_PER_WORKER, new WorkerInfoIdentity(workerInfo));
        return false;
      }
      ConcurrentHashSet<Task<?>> tasks = mJobToRunningTasks.computeIfAbsent(task.getJob(),
          j -> new ConcurrentHashSet<>());
      tasks.add(task);
      return true;
    }

    /**
     * @return the worker to task queue
     */
    public Map<WorkerInfoIdentity, BoundedPriorityBlockingQueue<Task<?>>> getWorkerToTaskQ() {
      return mWorkerToTaskQ;
    }

    /**
     * Refresh active workers.
     */
    @VisibleForTesting
    public void updateWorkers() {
      if (Thread.currentThread().isInterrupted()) {
        return;
      }
      Set<WorkerInfoIdentity> workerInfoIds;
      try {
        try {
          workerInfoIds = ImmutableSet.copyOf(mWorkerProvider.getWorkerInfos()).stream()
              .map(x -> new WorkerInfoIdentity(x)).collect(Collectors.toSet());
        } catch (AlluxioRuntimeException e) {
          LOG.warn("Failed to get worker info, using existing worker infos of {} workers",
              mActiveWorkers.size());
          return;
        }
        if (workerInfoIds.size() == mActiveWorkers.size()
            && workerInfoIds.containsAll(mActiveWorkers.keySet())) {
          return;
        }

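        // Membership changed: rebuild the map, reusing clients for workers that stayed
        // and opening clients for newly discovered workers.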
        ImmutableMap.Builder<WorkerInfoIdentity, CloseableResource<BlockWorkerClient>>
            updatedWorkers = ImmutableMap.builder();
        for (WorkerInfoIdentity workerInfoId : workerInfoIds) {
          try {
            if (mActiveWorkers.get(workerInfoId) != null) {
              CloseableResource<BlockWorkerClient> workerClient = Preconditions.checkNotNull(
                  mActiveWorkers.get(workerInfoId));
              updatedWorkers.put(workerInfoId, workerClient);
            } else {
              updatedWorkers.put(workerInfoId, mWorkerProvider.getWorkerClient(
                  workerInfoId.mWorkerInfo.getAddress()));
            }
          } catch (AlluxioRuntimeException e) {
            LOG.warn("Updating worker {} address failed",
                workerInfoId.mWorkerInfo.getAddress(), e);
            // skip the worker if we cannot obtain a client
          }
        }
        // Close clients connecting to lost workers
        for (Map.Entry<WorkerInfoIdentity, CloseableResource<BlockWorkerClient>> entry :
            mActiveWorkers.entrySet()) {
          WorkerInfoIdentity workerInfoId = entry.getKey();
          if (!workerInfoIds.contains(workerInfoId)) {
            CloseableResource<BlockWorkerClient> resource = entry.getValue();
            resource.close();
            LOG.debug("Closed BlockWorkerClient to lost worker {}", workerInfoId);
          }
        }
        // Build the clients to the current active worker list
        mActiveWorkers = updatedWorkers.build();
      } catch (Exception e) {
        // Unknown exception. This should not happen, but if it happens we don't want to lose the
        // scheduler thread, thus catching it here. Any exception surfaced here should be properly
        // handled.
        LOG.error("Unexpected exception thrown in updateWorkers.", e);
      }
    }
  }

  /**
   * Job/Tasks stats.
   */
  public static class SchedulerStats {
    public Map<Job<?>, List<String>> mRunningJobToTasksStat = new HashMap<>();
    public Map<Job<?>, String> mExistingJobAndProgresses = new HashMap<>();
    public Map<String, String> mWorkerQInfos = new HashMap<>();
  }

  /**
   * Print job status.
   * @return SchedulerStats
   */
  public SchedulerStats printJobsStatus() {
    SchedulerStats schedulerStats = new SchedulerStats();
    for (Map.Entry<Job<?>, ConcurrentHashSet<Task<?>>> entry : mJobToRunningTasks.entrySet()) {
      schedulerStats.mRunningJobToTasksStat.put(entry.getKey(),
          entry.getValue().stream().map(t -> t.toString()).collect(Collectors.toList()));
    }
    for (Map.Entry<JobDescription, Job<?>> entry : mExistingJobs.entrySet()) {
      schedulerStats.mExistingJobAndProgresses.put(entry.getValue(),
          entry.getValue().getProgress(JobProgressReportFormat.JSON, true));
    }
    for (Map.Entry<WorkerInfoIdentity, BoundedPriorityBlockingQueue<Task<?>>> entry :
        mWorkerInfoHub.getWorkerToTaskQ().entrySet()) {
      String tasks = String.join(",",
          entry.getValue().stream().map(x ->
                  "Job:" + x.getJob().getJobId() + ":Task:" + x.getTaskId())
              .collect(Collectors.toList()));
      schedulerStats.mWorkerQInfos.put(entry.getKey().toString(), tasks);
    }
    return schedulerStats;
  }
}