// io.mantisrx.master.JobClustersManagerActor Maven / Gradle / Ivy
// The newest version!
/*
* Copyright 2019 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.mantisrx.master;
import static akka.pattern.PatternsCS.ask;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.CLIENT_ERROR;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.CLIENT_ERROR_CONFLICT;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.CLIENT_ERROR_NOT_FOUND;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.SERVER_ERROR;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.SUCCESS;
import static io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.SUCCESS_CREATED;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.CreateJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.CreateJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DeleteJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DeleteJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.EnableJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.EnableJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobSchedInfoRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobSchedInfoResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLatestJobDiscoveryInfoRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLatestJobDiscoveryInfoResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitialize;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.KillJobRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.KillJobResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListCompletedJobsInClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListCompletedJobsInClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobClustersRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobClustersResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobIdsRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobIdsResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobsRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobsResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListWorkersRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListWorkersResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ReconcileJobCluster;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterLabelsResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterRequest;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLAResponse;
import static io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse;
import static java.util.Optional.empty;
import static java.util.Optional.ofNullable;
import akka.actor.AbstractActorWithTimers;
import akka.actor.ActorPaths;
import akka.actor.ActorRef;
import akka.actor.Props;
import akka.actor.SupervisorStrategy;
import akka.actor.Terminated;
import io.mantisrx.common.metrics.Counter;
import io.mantisrx.common.metrics.Metrics;
import io.mantisrx.common.metrics.MetricsRegistry;
import io.mantisrx.common.metrics.spectator.GaugeCallback;
import io.mantisrx.common.metrics.spectator.MetricGroupId;
import io.mantisrx.master.akka.MantisActorSupervisorStrategy;
import io.mantisrx.master.events.LifecycleEventPublisher;
import io.mantisrx.master.jobcluster.IJobClusterMetadata;
import io.mantisrx.master.jobcluster.JobClusterActor;
import io.mantisrx.master.jobcluster.job.CostsCalculator;
import io.mantisrx.master.jobcluster.job.IMantisJobMetadata;
import io.mantisrx.master.jobcluster.job.JobHelper;
import io.mantisrx.master.jobcluster.job.JobState;
import io.mantisrx.master.jobcluster.proto.BaseResponse;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerRequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactRequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterLabelsRequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLARequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest;
import io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateSchedulingInfoResponse;
import io.mantisrx.master.jobcluster.proto.JobClusterProto;
import io.mantisrx.runtime.descriptor.SchedulingInfo;
import io.mantisrx.server.core.JobCompletedReason;
import io.mantisrx.server.master.config.ConfigurationProvider;
import io.mantisrx.server.master.domain.IJobClusterDefinition;
import io.mantisrx.server.master.domain.JobClusterDefinitionImpl;
import io.mantisrx.server.master.domain.JobClusterDefinitionImpl.CompletedJob;
import io.mantisrx.server.master.domain.JobDefinition;
import io.mantisrx.server.master.domain.JobId;
import io.mantisrx.server.master.persistence.MantisJobStore;
import io.mantisrx.server.master.scheduler.MantisSchedulerFactory;
import io.mantisrx.server.master.scheduler.WorkerEvent;
import io.mantisrx.shaded.com.google.common.collect.Lists;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletionStage;
import java.util.stream.Collectors;
import lombok.Value;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import rx.Observable;
import rx.schedulers.Schedulers;
/*
Supervisor actor responsible for the creation, deletion, and listing of all Job Clusters in the system.
*/
public class JobClustersManagerActor extends AbstractActorWithTimers implements IJobClustersManager {
// Key under which the periodic ReconcileJobCluster timer is registered (see initialize).
private static final String CHECK_CLUSTERS_TIMER_KEY = "CHECK_CLUSTER_TIMER";
// Milliseconds a cluster may remain in INITIALIZING/DELETING before onReconcileJobClusters treats it as stuck.
public static final int STATE_TRANSITION_TIMEOUT_MSECS = 5000;
private final Logger logger = LoggerFactory.getLogger(JobClustersManagerActor.class);
// Period, in seconds, of the reconcile timer started in initialize.
private final long checkAgainInSecs = 30;
// Counters registered in the constructor under this actor's metric group.
private final Counter numJobClusterInitFailures;
private final Counter numJobClusterInitSuccesses;
// Message handling used after initialization has completed; built once in the constructor.
private Receive initializedBehavior;
/**
 * Creates the {@link Props} used to spawn a {@code JobClustersManagerActor}.
 * The arguments are handed to the actor's constructor in the same order, and the
 * actor is configured to use the {@code akka.actor.metered-mailbox} mailbox.
 *
 * @param jobStore                   store passed to the actor constructor
 * @param eventPublisher             lifecycle event publisher passed to the actor constructor
 * @param costsCalculator            costs calculator passed to the actor constructor
 * @param slaHeadroomForAcceptedJobs headroom value passed to the actor constructor
 * @return the configured Props
 */
public static Props props(final MantisJobStore jobStore, final LifecycleEventPublisher eventPublisher, final CostsCalculator costsCalculator, int slaHeadroomForAcceptedJobs) {
    final Props actorProps = Props.create(
        JobClustersManagerActor.class,
        jobStore,
        eventPublisher,
        costsCalculator,
        slaHeadroomForAcceptedJobs);
    return actorProps.withMailbox("akka.actor.metered-mailbox");
}
// Persistence layer; handed to JobClusterInfoManager and used in initialize to load clusters and jobs.
private final MantisJobStore jobStore;
// Handed to JobClusterInfoManager when it is created in initialize.
private final LifecycleEventPublisher eventPublisher;
// Handed to JobClusterInfoManager when it is created in initialize.
private final CostsCalculator costsCalculator;
// Taken from the JobClustersManagerInitialize message; null until initialize runs.
private MantisSchedulerFactory mantisSchedulerFactory = null;
// Created in initialize; the handlers of the initialized behavior dereference it.
JobClusterInfoManager jobClusterInfoManager;
// Stored from the constructor argument; not read in the portion of the file visible here.
private final int slaHeadroomForAcceptedJobs;
// Child actor created in initialize; list-job-cluster requests are delegated to it.
private ActorRef jobListHelperActor;
/**
 * Stores the collaborators, registers this actor's two init counters with the
 * {@link MetricsRegistry}, and pre-builds the behavior used after initialization.
 *
 * @param store                      job store used for loading and persisting clusters/jobs
 * @param eventPublisher             lifecycle event publisher
 * @param costsCalculator            costs calculator
 * @param slaHeadroomForAcceptedJobs headroom value kept on the actor
 */
public JobClustersManagerActor(final MantisJobStore store, final LifecycleEventPublisher eventPublisher, final CostsCalculator costsCalculator, int slaHeadroomForAcceptedJobs) {
    this.jobStore = store;
    this.eventPublisher = eventPublisher;
    this.costsCalculator = costsCalculator;
    this.slaHeadroomForAcceptedJobs = slaHeadroomForAcceptedJobs;

    final MetricGroupId groupId = getMetricGroupId();
    final Metrics declared = new Metrics.Builder()
        .id(groupId)
        .addCounter("numJobClusterInitFailures")
        .addCounter("numJobClusterInitSuccesses")
        .build();
    // Use the instance handed back by the registry rather than the locally built one.
    final Metrics registered = MetricsRegistry.getInstance().registerAndGet(declared);
    this.numJobClusterInitFailures = registered.getCounter("numJobClusterInitFailures");
    this.numJobClusterInitSuccesses = registered.getCounter("numJobClusterInitSuccesses");

    this.initializedBehavior = getInitializedBehavior();
}
/**
 * Returns the metric group id under which this actor's counters are registered.
 *
 * @return a new {@link MetricGroupId} named {@code "JobClustersManagerActor"}
 */
MetricGroupId getMetricGroupId() {
    final MetricGroupId groupId = new MetricGroupId("JobClustersManagerActor");
    return groupId;
}
/**
 * Builds the message-handling behavior used once this actor has been initialized.
 * Each message type below is routed to the corresponding handler method of this actor.
 * - RECONCILE JCs (periodic timer message)
 * // CLUSTER RELATED
 * - CreateJC
 * - InitializeJCResponse
 * - DeleteJC
 * - DeleteJCResponse
 * - UpdateJC
 * - UpdateSLA
 * - UpdateArtifact
 * - UpdateSchedulingInfo
 * - UpdateLabel
 * - UpdateMigrationStrat
 * - ENABLE JC
 * - DISABLE JC
 * - GET CLUSTER
 * - LIST completed jobs
 * - GET LAST SUBMITTED JOB
 * - LIST archived workers
 *
 * - LIST JCs
 * - LIST JOBS
 * - LIST JOB IDS
 * - LIST WORKERS
 *
 * // JOB RELATED
 * - SUBMIT JOB
 * - KILL JOB
 * - GET JOB
 * - GET JOB SCHED INFO
 * - GET LATEST JOB DISCOVERY INFO
 * - SCALE JOB
 * - RESUBMIT WORKER
 *
 * - WORKER EVENT
 * - TERMINATED (watched actor stopped)
 *
 * A JobClustersManagerInitialize received in this state is answered with CLIENT_ERROR;
 * any other message is logged as unexpected.
 *
 * @return the Receive used while in the initialized state
 */
private Receive getInitializedBehavior() {
// Captured by the "unexpected message" lambda below to name the current state.
String state = "initialized";
return receiveBuilder()
.match(ReconcileJobCluster.class, this::onReconcileJobClusters)
// Specific Job Cluster related messages
.match(CreateJobClusterRequest.class, this::onJobClusterCreate)
.match(JobClusterProto.InitializeJobClusterResponse.class, this::onJobClusterInitializeResponse)
.match(DeleteJobClusterRequest.class, this::onJobClusterDelete)
.match(JobClusterProto.DeleteJobClusterResponse.class, this::onJobClusterDeleteResponse)
.match(UpdateJobClusterRequest.class, this::onJobClusterUpdate)
.match(UpdateJobClusterSLARequest.class, this::onJobClusterUpdateSLA)
.match(UpdateJobClusterArtifactRequest.class, this::onJobClusterUpdateArtifact)
.match(UpdateSchedulingInfo.class, this::onJobClusterUpdateSchedulingInfo)
.match(UpdateJobClusterLabelsRequest.class, this::onJobClusterUpdateLabels)
.match(UpdateJobClusterWorkerMigrationStrategyRequest.class, this::onJobClusterUpdateWorkerMigrationConfig)
.match(EnableJobClusterRequest.class, this::onJobClusterEnable)
.match(DisableJobClusterRequest.class, this::onJobClusterDisable)
.match(GetJobClusterRequest.class, this::onJobClusterGet)
.match(ListCompletedJobsInClusterRequest.class, this::onJobListCompleted)
.match(GetLastSubmittedJobIdStreamRequest.class, this::onGetLastSubmittedJobIdSubject)
.match(ListArchivedWorkersRequest.class, this::onListArchivedWorkers)
// List Job Cluster related messages
.match(ListJobClustersRequest.class, this::onJobClustersList)
// List Jobs related messages
.match(ListJobsRequest.class, this::onJobList)
.match(ListJobIdsRequest.class, this::onJobIdList)
.match(ListWorkersRequest.class, this::onListActiveWorkers)
//delegate to job
.match(SubmitJobRequest.class, this::onJobSubmit)
.match(KillJobRequest.class, this::onJobKillRequest)
// .match(JobClusterProto.KillJobResponse.class, this::onJobKillResponse)
.match(GetJobDetailsRequest.class, this::onGetJobDetailsRequest)
.match(GetJobSchedInfoRequest.class, this::onGetJobStatusSubject)
.match(GetLatestJobDiscoveryInfoRequest.class, this::onGetLatestJobDiscoveryInfo)
.match(ScaleStageRequest.class, this::onScaleStage)
.match(ResubmitWorkerRequest.class, this::onResubmitWorker)
//delegate to worker
.match(WorkerEvent.class, this::onWorkerEvent)
.match(Terminated.class, this::onTerminated)
// Unexpected: a second initialize request is rejected with CLIENT_ERROR
.match(JobClustersManagerInitialize.class, (x) -> getSender().tell(new JobClustersManagerInitializeResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state) ), getSelf()))
.matchAny(x -> logger.warn("unexpected message {} received by Job Cluster Manager actor. In initialized state ", x))
.build();
}
/**
 * Formats the standard "unexpected message" text used in error replies and warnings.
 *
 * @param event textual form of the message that was received
 * @param state name of the state the actor was in when it arrived
 * @return the formatted description
 */
private String genUnexpectedMsg(String event, String state) {
    final String description =
        String.format("Unexpected message %s received by JobClustersManager actor in %s State", event, state);
    return description;
}
/**
 * Builds the message-handling behavior used before this actor has been initialized.
 * Only JobClustersManagerInitialize is processed (by initialize). Every other known
 * request type is either answered with a CLIENT_ERROR response carrying an
 * "unexpected message" description, or just logged as a warning when no reply is built.
 *
 * @return the Receive used while in the initializing state
 */
private Receive getInitializingBehavior() {
// Captured by the lambdas below to name the current state in error texts.
String state = "initializing";
return receiveBuilder()
// EXPECTED MESSAGES BEGIN
.match(JobClustersManagerInitialize.class, this::initialize)
// EXPECTED MESSAGES END
// UNEXPECTED MESSAGES BEGIN
.match(ReconcileJobCluster.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
.match(CreateJobClusterRequest.class, (x) -> getSender().tell(new CreateJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), x.getJobClusterDefinition().getName()), getSelf()))
.match(JobClusterProto.InitializeJobClusterResponse.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
.match(DeleteJobClusterRequest.class, (x) -> getSender().tell(new DeleteJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(JobClusterProto.DeleteJobClusterResponse.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
.match(UpdateJobClusterRequest.class, (x) -> getSender().tell(new UpdateJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(UpdateJobClusterSLARequest.class, (x) -> getSender().tell(new UpdateJobClusterSLAResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(UpdateJobClusterArtifactRequest.class, (x) -> getSender().tell(new UpdateJobClusterArtifactResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(UpdateSchedulingInfo.class, (x) -> getSender().tell(new UpdateSchedulingInfoResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(UpdateJobClusterLabelsRequest.class, (x) -> getSender().tell(new UpdateJobClusterLabelsResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(UpdateJobClusterWorkerMigrationStrategyRequest.class, (x) -> getSender().tell(new UpdateJobClusterWorkerMigrationStrategyResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(EnableJobClusterRequest.class, (x) -> getSender().tell(new EnableJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(DisableJobClusterRequest.class, (x) -> getSender().tell(new DisableJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(GetJobClusterRequest.class, (x) -> getSender().tell(new GetJobClusterResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
// NOTE(review): unlike the other request types, this one is only logged and the sender gets no reply - confirm whether an error response should be sent.
.match(ListCompletedJobsInClusterRequest.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
.match(GetLastSubmittedJobIdStreamRequest.class, (x) -> getSender().tell(new GetLastSubmittedJobIdStreamResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
.match(ListArchivedWorkersRequest.class, (x) -> getSender().tell(new ListArchivedWorkersResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), Lists.newArrayList()), getSelf()))
.match(ListJobClustersRequest.class, (x) -> getSender().tell(new ListJobClustersResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), Lists.newArrayList()), getSelf()))
.match(ListJobsRequest.class, (x) -> getSender().tell(new ListJobsResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), Lists.newArrayList()), getSelf()))
.match(ListJobIdsRequest.class, (x) -> getSender().tell(new ListJobIdsResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), Lists.newArrayList()), getSelf()))
.match(ListWorkersRequest.class, (x) -> getSender().tell(new ListWorkersResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), Lists.newArrayList()), getSelf()))
.match(SubmitJobRequest.class, (x) -> getSender().tell(new SubmitJobResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
.match(KillJobRequest.class, (x) -> getSender().tell(new KillJobResponse(x.requestId, CLIENT_ERROR, JobState.Noop, genUnexpectedMsg(x.toString(), state), x.getJobId(), x.getUser()), getSelf()))
.match(JobClusterProto.KillJobResponse.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
.match(GetJobDetailsRequest.class, (x) -> getSender().tell(new GetJobDetailsResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
.match(GetJobSchedInfoRequest.class, (x) -> getSender().tell(new GetJobSchedInfoResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
.match(GetLatestJobDiscoveryInfoRequest.class, (x) -> getSender().tell(new GetLatestJobDiscoveryInfoResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), empty()), getSelf()))
.match(ScaleStageRequest.class, (x) -> getSender().tell(new ScaleStageResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state), 0), getSelf()))
.match(ResubmitWorkerRequest.class, (x) -> getSender().tell(new ResubmitWorkerResponse(x.requestId, CLIENT_ERROR, genUnexpectedMsg(x.toString(), state)), getSelf()))
.match(WorkerEvent.class, (x) -> logger.warn(genUnexpectedMsg(x.toString(), state)))
// everything else
.matchAny(x -> logger.warn("unexpected message {} received by Job Cluster Manager actor. It needs to be initialized first ", x))
// UNEXPECTED MESSAGES END
.build();
}
/**
 * Handles {@code JobClustersManagerInitialize} while in the initializing state.
 * <p>
 * Creates and watches the job-list helper child actor, records the scheduler factory from
 * the message and creates the {@code JobClusterInfoManager}. If the message does not ask for
 * jobs to be loaded from the store, the actor switches to the initialized behavior right away
 * and replies with SUCCESS. Otherwise all job clusters and active jobs are read from the
 * store, one cluster actor is created and initialized per stored cluster (blocking until
 * every initialization has produced a result), and then:
 * <ul>
 *   <li>on completion the actor switches to the initialized behavior and replies SUCCESS;</li>
 *   <li>on error it stays in the initializing behavior and replies SERVER_ERROR.</li>
 * </ul>
 * In the load-from-store path the periodic reconcile timer is started and the asynchronous
 * load of archived jobs is kicked off afterwards, regardless of which of the two outcomes
 * occurred. Any exception thrown along the way is logged and reported as SERVER_ERROR.
 *
 * @param initMsg the initialize request; the reply is sent to its sender
 */
private void initialize(JobClustersManagerInitialize initMsg) {
    ActorRef sender = getSender();
    try {
        logger.info("In JobClustersManagerActor:initialize");
        this.jobListHelperActor = getContext().actorOf(JobListHelperActor.props(), "JobListHelperActor");
        getContext().watch(jobListHelperActor);
        mantisSchedulerFactory = initMsg.getScheduler();
        this.jobClusterInfoManager = new JobClusterInfoManager(jobStore, mantisSchedulerFactory, eventPublisher, costsCalculator);
        if (!initMsg.isLoadJobsFromStore()) {
            getContext().become(initializedBehavior);
            sender.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SUCCESS, "JobClustersManager successfully inited"), getSelf());
        } else {
            List<IJobClusterMetadata> jobClusters = jobStore.loadAllJobClusters();
            logger.info("Read {} job clusters from storage", jobClusters.size());
            List<IMantisJobMetadata> activeJobs = jobStore.loadAllActiveJobs();
            logger.info("Read {} jobs from storage", activeJobs.size());

            // Index clusters by name; a later entry with the same name replaces an earlier one.
            Map<String, IJobClusterMetadata> jobClusterMap = new HashMap<>();
            for (IJobClusterMetadata jobClusterMeta : jobClusters) {
                String clusterName = jobClusterMeta.getJobClusterDefinition().getName();
                jobClusterMap.put(clusterName, jobClusterMeta);
            }

            // group jobs by cluster
            Map<String, List<IMantisJobMetadata>> clusterToJobMap = new HashMap<>();
            for (IMantisJobMetadata jobMeta : activeJobs) {
                clusterToJobMap.computeIfAbsent(jobMeta.getClusterName(), k -> new ArrayList<>()).add(jobMeta);
            }

            // Keep 60 seconds of slack below the master init timeout when that leaves a positive budget.
            long masterInitTimeoutSecs = ConfigurationProvider.getConfig().getMasterInitTimeoutSecs();
            long timeout = masterInitTimeoutSecs > 60 ? (masterInitTimeoutSecs - 60) : masterInitTimeoutSecs;
            Duration initTimeout = Duration.ofSeconds(timeout);

            Observable.from(jobClusterMap.values())
                .filter((jobClusterMeta) -> jobClusterMeta != null && jobClusterMeta.getJobClusterDefinition() != null)
                .flatMap((jobClusterMeta) -> {
                    String clusterName = jobClusterMeta.getJobClusterDefinition().getName();
                    Optional<JobClusterInfo> jobClusterInfoO = jobClusterInfoManager.createClusterActorAndRegister(jobClusterMeta.getJobClusterDefinition());
                    if (!jobClusterInfoO.isPresent()) {
                        logger.info("skipping job cluster {} on bootstrap as actor creating failed", clusterName);
                        return Observable.empty();
                    }
                    JobClusterInfo jobClusterInfo = jobClusterInfoO.get();
                    // Copy so the request owns its own list; clusters without active jobs get an empty one.
                    List<IMantisJobMetadata> jobList =
                        new ArrayList<>(clusterToJobMap.getOrDefault(clusterName, Collections.emptyList()));
                    JobClusterProto.InitializeJobClusterRequest req = new JobClusterProto.InitializeJobClusterRequest((JobClusterDefinitionImpl) jobClusterMeta.getJobClusterDefinition(),
                        jobClusterMeta.isDisabled(), jobClusterMeta.getLastJobCount(), jobList,
                        "system", getSelf(), false);
                    return jobClusterInfoManager.initializeCluster(jobClusterInfo, req, initTimeout);
                })
                .filter(Objects::nonNull)
                .toBlocking()
                .subscribe((clusterInit) -> {
                    logger.info("JobCluster {} inited with code {}", clusterInit.jobClusterName, clusterInit.responseCode);
                    numJobClusterInitSuccesses.increment();
                }, (error) -> {
                    logger.warn("Exception initializing clusters {}", error.getMessage(), error);
                    logger.error("JobClusterManagerActor had errors during initialization NOT transitioning to initialized behavior");
                    // getContext().become(initializedBehavior);
                    sender.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SERVER_ERROR, "JobClustersManager inited with errors"), getSelf());
                }, () -> {
                    logger.info("JobClusterManagerActor transitioning to initialized behavior");
                    getContext().become(initializedBehavior);
                    sender.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SUCCESS, "JobClustersManager successfully inited"), getSelf());
                });

            getTimers().startPeriodicTimer(CHECK_CLUSTERS_TIMER_KEY, new ReconcileJobCluster(), Duration.ofSeconds(checkAgainInSecs));
            // kick off loading of archived jobs
            logger.info("Kicking off archived job load asynchronously");
            jobStore.loadAllArchivedJobsAsync();
        }
    } catch (Exception e) {
        logger.error("caught exception", e);
        sender.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SERVER_ERROR, e.getMessage()), getSelf());
    }
    logger.info("JobClustersManagerActor:initialize ends");
}
/**
 * Periodic reconciliation of job clusters that appear stuck.
 * A cluster counts as stuck when it is in the INITIALIZING or DELETING state and its last
 * state update is more than {@link #STATE_TRANSITION_TIMEOUT_MSECS} older than the
 * enforcement time carried by the message. Stuck INITIALIZING clusters get their init
 * request re-sent (and their state-update time reset to the enforcement time); stuck
 * DELETING clusters are deregistered.
 *
 * @param p the reconcile trigger carrying the enforcement time
 */
@Override
public void onReconcileJobClusters(ReconcileJobCluster p) {
    Set<JobClusterInfo> stuckClusters = this.jobClusterInfoManager.getAllJobClusterInfo().values().stream()
        .filter((jci) -> ((jci.currentState == JobClusterInfo.JobClusterState.INITIALIZING || jci.currentState == JobClusterInfo.JobClusterState.DELETING)
            && (p.timeOfEnforcement.toEpochMilli() - jci.stateUpdateTime) > STATE_TRANSITION_TIMEOUT_MSECS))
        .collect(Collectors.toSet());
    if (stuckClusters.isEmpty()) {
        return;
    }
    logger.warn("{} JobClusters stuck in initializing/deleting state ", stuckClusters.size());
    // Iterating the collected snapshot, so deregistering below does not disturb the iteration.
    for (JobClusterInfo jci : stuckClusters) {
        if (jci.currentState.equals(JobClusterInfo.JobClusterState.INITIALIZING)) {
            // retry init request
            logger.warn("Retrying init on JobCluster {} stuck in {} state since {}", jci.clusterName, jci.currentState, jci.stateUpdateTime);
            jci.stateUpdateTime = p.timeOfEnforcement.toEpochMilli();
            jci.jobClusterActor.tell(jci.initRequest, getSelf());
        } else { // in pending delete state
            logger.warn("Deregistering JobCluster {} stuck in {} state since {}", jci.clusterName, jci.currentState, jci.stateUpdateTime);
            jobClusterInfoManager.deregisterJobCluster(jci.clusterName);
        }
    }
}
/**
 * Handles a request to create a new job cluster.
 * Replies with CLIENT_ERROR_CONFLICT when a cluster of that name already exists. Otherwise a
 * cluster actor is created and registered and its initialization is started asynchronously,
 * with the original sender passed along in the init request. If no cluster info is returned
 * by the manager the sender gets CLIENT_ERROR; if an exception is thrown the sender gets
 * SERVER_ERROR with the exception message.
 *
 * @param request the create request
 */
@Override
public void onJobClusterCreate(final CreateJobClusterRequest request) {
    final String name = request.getJobClusterDefinition().getName();
    final ActorRef sender = getSender();

    if (jobClusterInfoManager.isClusterExists(name)) {
        sender.tell(new CreateJobClusterResponse(
            request.requestId, CLIENT_ERROR_CONFLICT,
            "Job Cluster " + name + " already exists",
            name), getSelf());
        return;
    }

    try {
        Optional<JobClusterInfo> jobClusterInfoO = jobClusterInfoManager.createClusterActorAndRegister(request.getJobClusterDefinition());
        if (jobClusterInfoO.isPresent()) {
            jobClusterInfoManager.initializeClusterAsync(jobClusterInfoO.get(), new JobClusterProto.InitializeJobClusterRequest(request.getJobClusterDefinition(), request.getUser(), sender));
        } else {
            sender.tell(new CreateJobClusterResponse(
                request.requestId, CLIENT_ERROR,
                "Job Cluster " + name + " could not be created due to invalid name",
                name), getSelf());
        }
    } catch (Exception e) {
        sender.tell(new CreateJobClusterResponse(
            request.requestId, SERVER_ERROR,
            "Job Cluster " + name + " could not be created due to " + e.getMessage(),
            name), getSelf());
    }
}
/**
 * Logs an initialize response coming back from a job cluster actor and hands it to the
 * {@code JobClusterInfoManager} for processing.
 *
 * @param createResp the initialize response
 */
@Override
public void onJobClusterInitializeResponse(final JobClusterProto.InitializeJobClusterResponse createResp) {
    this.logger.info("Got JobClusterInitializeResponse {}", createResp);
    this.jobClusterInfoManager.processInitializeResponse(createResp);
}
/**
 * Delegates a delete-job-cluster request to the {@code JobClusterInfoManager}.
 *
 * @param request the delete request
 */
@Override
public void onJobClusterDelete(final DeleteJobClusterRequest request) {
    this.jobClusterInfoManager.processDeleteRequest(request);
}
/**
 * Delegates a delete response coming back from a job cluster actor to the
 * {@code JobClusterInfoManager}.
 *
 * @param resp the delete response
 */
@Override
public void onJobClusterDeleteResponse(final JobClusterProto.DeleteJobClusterResponse resp) {
    this.jobClusterInfoManager.processDeleteResponse(resp);
}
/**
 * Forwards an update request to the actor of the named job cluster, or replies with
 * CLIENT_ERROR_NOT_FOUND when no such cluster is registered.
 *
 * @param request the update request
 */
@Override
public void onJobClusterUpdate(final UpdateJobClusterRequest request) {
    final String clusterName = request.getJobClusterDefinition().getName();
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(clusterName);
    ActorRef sender = getSender();
    if (jobClusterInfo.isPresent()) {
        // forward keeps the original sender, so the cluster actor replies to the caller directly
        jobClusterInfo.get().jobClusterActor.forward(request, getContext());
    } else {
        sender.tell(new UpdateJobClusterResponse(request.requestId, CLIENT_ERROR_NOT_FOUND, "JobCluster " + clusterName + " doesn't exist"), getSelf());
    }
}
/**
 * Hands a list-job-clusters request to the job-list helper actor, together with the
 * original sender and the current cluster-info map, so the helper can answer the caller.
 *
 * @param request the list request
 */
@Override
public void onJobClustersList(final ListJobClustersRequest request) {
    // Log at the level the guard checks for; the original logged at info inside a debug guard.
    if (logger.isDebugEnabled()) {
        logger.debug("In onJobClustersListRequest {}", request);
    }
    ActorRef sender = getSender();
    jobListHelperActor.tell(
        new JobListHelperActor.ListJobClusterRequestWrapper(request, sender, jobClusterInfoManager.getAllJobClusterInfo()),
        getSelf());
}
/**
 * Forwards a get-job-cluster request to the actor of the named cluster, or replies with
 * CLIENT_ERROR_NOT_FOUND and an empty result when no such cluster is registered.
 *
 * @param r the get request
 */
@Override
public void onJobClusterGet(GetJobClusterRequest r) {
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(r.getJobClusterName());
    ActorRef sender = getSender();
    if (jobClusterInfo.isPresent()) {
        // forward keeps the original sender, so the cluster actor replies to the caller directly
        jobClusterInfo.get().jobClusterActor.forward(r, getContext());
    } else {
        sender.tell(new GetJobClusterResponse(r.requestId, CLIENT_ERROR_NOT_FOUND, "No such Job cluster " + r.getJobClusterName(), empty()), getSelf());
    }
}
/**
 * Forwards a last-submitted-job-id stream request to the actor of the named cluster, or
 * replies with CLIENT_ERROR_NOT_FOUND and an empty result when no such cluster is registered.
 *
 * @param r the stream request
 */
@Override
public void onGetLastSubmittedJobIdSubject(GetLastSubmittedJobIdStreamRequest r) {
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(r.getClusterName());
    ActorRef sender = getSender();
    if (jobClusterInfo.isPresent()) {
        // forward keeps the original sender, so the cluster actor replies to the caller directly
        jobClusterInfo.get().jobClusterActor.forward(r, getContext());
    } else {
        sender.tell(new GetLastSubmittedJobIdStreamResponse(r.requestId, CLIENT_ERROR_NOT_FOUND, "No such Job cluster " + r.getClusterName(), empty()), getSelf());
    }
}
/**
 * Routes a worker event to the actor of the cluster named in the event's worker id.
 * When that cluster is not registered:
 * <ul>
 *   <li>a terminal event is logged and ignored;</li>
 *   <li>a non-terminal event causes the worker to be unscheduled and terminated via the
 *       scheduler for the job's archived definition, if such a definition is available;
 *       otherwise an error is logged.</li>
 * </ul>
 *
 * @param workerEvent the event reported for a worker
 */
@Override
public void onWorkerEvent(WorkerEvent workerEvent) {
    if (logger.isDebugEnabled()) {
        logger.debug("Entering JobClusterManagerActor:onWorkerEvent {}", workerEvent);
    }
    String clusterName = workerEvent.getWorkerId().getJobCluster();
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(clusterName);
    if (jobClusterInfo.isPresent()) {
        jobClusterInfo.get().jobClusterActor.forward(workerEvent, getContext());
        return;
    }

    if (JobHelper.isTerminalWorkerEvent(workerEvent)) {
        logger.warn("Terminal Event from Worker {} for a cluster {} that no longer exists. Ignore worker", workerEvent, workerEvent.getWorkerId().getJobCluster());
        return;
    }

    // The worker is still alive but its cluster is gone: try to shut the worker down.
    logger.warn("Event from Worker {} for a cluster {} that no longer exists. Terminate worker", workerEvent, workerEvent.getWorkerId().getJobCluster());
    Optional<String> host = JobHelper.getWorkerHostFromWorkerEvent(workerEvent);
    Optional<JobDefinition> archivedJobDefinition =
        jobClusterInfoManager.getArchivedJobDefinition(workerEvent.getWorkerId().getJobId());
    if (archivedJobDefinition.isPresent()) {
        mantisSchedulerFactory
            .forJob(archivedJobDefinition.get())
            .unscheduleAndTerminateWorker(workerEvent.getWorkerId(), host);
    } else {
        logger.error("Non-Terminal Event {} from worker {} for a cluster {} that no longer exists and the job definition not yet archived", workerEvent, workerEvent.getWorkerId(), workerEvent.getWorkerId().getJobCluster());
    }
}
/**
 * Logs a warning naming the watched actor that has terminated.
 *
 * @param terminated the termination notification
 */
private void onTerminated(final Terminated terminated) {
    final ActorRef stoppedActor = terminated.actor();
    logger.warn("onTerminated {}", stoppedActor);
}
//////////////////// JOB OPERATIONS ////////////////////////////////////////////////
/**
 * Forwards a job submission to the actor of the target cluster, or replies with
 * CLIENT_ERROR_NOT_FOUND and an empty job id when no such cluster is registered.
 *
 * @param request the submit request
 */
@Override
public void onJobSubmit(final SubmitJobRequest request) {
    logger.info("Submitting job {}", request);
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(request.getClusterName());
    ActorRef sender = getSender();
    if (jobClusterInfo.isPresent()) {
        // forward keeps the original sender, so the cluster actor replies to the caller directly
        jobClusterInfo.get().jobClusterActor.forward(request, getContext());
    } else {
        sender.tell(new SubmitJobResponse(request.requestId, CLIENT_ERROR_NOT_FOUND, "Job Cluster " + request.getClusterName() + " doesn't exist", empty()), getSelf());
    }
}
/**
 * Handles a kill-job request.
 * When the job's cluster is registered, a cluster-level kill request (reason
 * {@code JobCompletedReason.Killed}) carrying the original sender is sent to the cluster
 * actor. Otherwise the sender gets CLIENT_ERROR_NOT_FOUND with {@code JobState.Noop}.
 *
 * @param request the kill request
 */
@Override
public void onJobKillRequest(final KillJobRequest request) {
    logger.info("Killing job {}", request);
    ActorRef sender = getSender();
    JobId jobIdToKill = request.getJobId();
    Optional<JobClusterInfo> jobClusterInfo = jobClusterInfoManager.getJobClusterInfo(jobIdToKill.getCluster());
    if (jobClusterInfo.isPresent()) {
        // Sent with tell (not forward): the original sender travels inside the request instead.
        jobClusterInfo.get().jobClusterActor.tell(
            new JobClusterProto.KillJobRequest(request.getJobId(), request.getReason(),
                JobCompletedReason.Killed, request.getUser(), sender), getSelf());
    } else {
        logger.info("Job cluster {} not found", jobIdToKill.getCluster());
        sender.tell(new KillJobResponse(request.requestId, CLIENT_ERROR_NOT_FOUND, JobState.Noop, "Job cluster " + jobIdToKill.getCluster() + " doesn't exist", jobIdToKill, request.getUser()), getSelf());
    }
}
////////////////////// JOB OPERATIONS END //////////////////////////////////////////////
/**
 * Lifecycle hook: logs that the actor has started, then runs the superclass hook.
 *
 * @throws Exception if the superclass hook fails
 */
@Override
public void preStart() throws Exception {
    this.logger.info("JobClusterManager Actor started");
    super.preStart();
}
/**
 * Lifecycle hook: logs that the actor has stopped, then runs the superclass hook.
 *
 * @throws Exception if the superclass hook fails
 */
@Override
public void postStop() throws Exception {
    this.logger.info("JobClusterManager Actor stopped");
    super.postStop();
}
@Override
public void preRestart(Throwable t, Optional