All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.mantisrx.master.resourcecluster.TaskExecutorState Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2023 Netflix, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.mantisrx.master.resourcecluster;

import io.mantisrx.master.resourcecluster.ResourceClusterActor.Assigned;
import io.mantisrx.master.resourcecluster.ResourceClusterActor.AvailabilityState;
import io.mantisrx.master.resourcecluster.ResourceClusterActor.Pending;
import io.mantisrx.master.resourcecluster.ResourceClusterActor.Running;
import io.mantisrx.server.core.domain.WorkerId;
import io.mantisrx.server.master.resourcecluster.TaskExecutorHeartbeat;
import io.mantisrx.server.master.resourcecluster.TaskExecutorRegistration;
import io.mantisrx.server.master.resourcecluster.TaskExecutorReport;
import io.mantisrx.server.master.resourcecluster.TaskExecutorReport.Available;
import io.mantisrx.server.master.resourcecluster.TaskExecutorReport.Occupied;
import io.mantisrx.server.master.resourcecluster.TaskExecutorStatusChange;
import io.mantisrx.server.master.resourcecluster.TaskExecutorTaskCancelledException;
import io.mantisrx.server.master.scheduler.JobMessageRouter;
import io.mantisrx.server.master.scheduler.WorkerOnDisabledVM;
import io.mantisrx.server.worker.TaskExecutorGateway;
import java.time.Clock;
import java.time.Instant;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import javax.annotation.Nullable;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.runtime.rpc.RpcService;

@SuppressWarnings("UnusedReturnValue")
@AllArgsConstructor
@Slf4j
class TaskExecutorState {

    enum RegistrationState {
        Registered,
        Unregistered,
    }

    private RegistrationState state;
    @Nullable
    private TaskExecutorRegistration registration;

    // availabilityState being null here represents that we don't know about the actual state of the task executor
    // and are waiting for more information
    @Nullable
    private AvailabilityState availabilityState;
    private boolean disabled;

    // last interaction initiated by the task executor
    private Instant lastActivity;
    private final Clock clock;
    private final RpcService rpcService;
    private final JobMessageRouter jobMessageRouter;

    // isTaskCancelled: this state is to mark the current assigned worker has been cancelled and this executor need to
    // stop the task and re-register.
    @Nullable
    private WorkerId cancelledWorkerOnTask;

    static TaskExecutorState of(Clock clock, RpcService rpcService, JobMessageRouter jobMessageRouter) {
        return new TaskExecutorState(
            RegistrationState.Unregistered,
            null,
            null,
            false,
            clock.instant(),
            clock,
            rpcService,
            jobMessageRouter,
            null);
    }

    boolean isRegistered() {
        return state == RegistrationState.Registered;
    }

    boolean isDisconnected() {
        return !isRegistered();
    }

    boolean isDisabled() {
        return disabled;
    }

    @Nullable
    WorkerId getCancelledWorkerId() {
        return this.cancelledWorkerOnTask;
    }

    void setCancelledWorkerOnTask(WorkerId cancelledWorkerOnTask) {
        this.cancelledWorkerOnTask = cancelledWorkerOnTask;
    }

    boolean onRegistration(TaskExecutorRegistration registration) {
        if (state == RegistrationState.Registered) {
            return false;
        } else {
            this.state = RegistrationState.Registered;
            this.registration = registration;
            updateTicker();
            return true;
        }
    }

    boolean onDisconnection() {
        if (state == RegistrationState.Unregistered) {
            return false;
        } else {
            state = RegistrationState.Unregistered;
            registration = null;
            setAvailabilityState(null);
            updateTicker();
            return true;
        }
    }

    private static AvailabilityState from(TaskExecutorReport report) {
        if (report instanceof Available) {
            return AvailabilityState.pending();
        } else if (report instanceof Occupied) {
            return AvailabilityState.running(((Occupied) report).getWorkerId());
        } else {
            throw new RuntimeException(String.format("TaskExecutorReport=%s was unexpected", report));
        }
    }

    boolean onAssignment(WorkerId workerId) throws IllegalStateException {
        if (!isRegistered()) {
            throwNotRegistered(String.format("assignment to %s", workerId));
        }

        if (this.availabilityState == null) {
            throw new IllegalStateException("availability state was null when unassignmentas was issued");
        }

        return setAvailabilityState(this.availabilityState.onAssignment(workerId));
    }

    boolean onUnassignment() throws IllegalStateException {
        if (this.availabilityState == null) {
            throw new IllegalStateException("availability state was null when unassignment was issued");
        }

        return setAvailabilityState(this.availabilityState.onUnassignment());
    }

    boolean onNodeDisabled() {
        if (!this.disabled) {
            this.disabled = true;
            if (this.availabilityState instanceof Running) {
                jobMessageRouter.routeWorkerEvent(new WorkerOnDisabledVM(this.availabilityState.getWorkerId()));
            }
            return true;
        } else {
            return false;
        }
    }

    boolean onHeartbeat(TaskExecutorHeartbeat heartbeat)
        throws IllegalStateException, TaskExecutorTaskCancelledException {
        if (!isRegistered()) {
            throwNotRegistered(String.format("heartbeat %s", heartbeat));
        }

        TaskExecutorReport report = heartbeat.getTaskExecutorReport();
        if (this.cancelledWorkerOnTask != null) {
            if (report instanceof Occupied && ((Occupied) report).getWorkerId().equals(this.cancelledWorkerOnTask)) {
                log.warn("{} cancelled, request cancel on heartbeat.", this.cancelledWorkerOnTask);
                throw new TaskExecutorTaskCancelledException(
                    String.format(
                        "heartbeat from %s has cancelled task %s",
                        heartbeat.getTaskExecutorID(),
                        this.cancelledWorkerOnTask),
                    this.cancelledWorkerOnTask);
            } else {
                log.info("{} cancelled but executor is no longer occupied by it.", this.cancelledWorkerOnTask);
                this.cancelledWorkerOnTask = null;
            }
        }

        boolean result = handleStatusChange(report);
        updateTicker();
        return result;
    }

    boolean onTaskExecutorStatusChange(TaskExecutorStatusChange statusChange) {
        if (!isRegistered()) {
            throwNotRegistered(String.format("status change %s", statusChange));
        }

        boolean result = handleStatusChange(statusChange.getTaskExecutorReport());
        updateTicker();
        return result;
    }

    private boolean handleStatusChange(TaskExecutorReport report) throws IllegalStateException {
        if (availabilityState == null) {
            return setAvailabilityState(from(report));
        } else {
            return setAvailabilityState(availabilityState.onTaskExecutorStatusChange(report));
        }
    }

    private boolean setAvailabilityState(AvailabilityState newState) {
        if (this.availabilityState != newState) {
            this.availabilityState = newState;
            if (this.availabilityState instanceof Running) {
                if (isDisabled()) {
                    jobMessageRouter.routeWorkerEvent(new WorkerOnDisabledVM(newState.getWorkerId()));
                }
            }
            return true;
        } else {
            return false;
        }
    }

    @Nullable
    protected WorkerId getWorkerId() {
        if (this.availabilityState != null) {
            return this.availabilityState.getWorkerId();
        } else {
            return null;
        }
    }

    private void throwNotRegistered(String message) throws IllegalStateException {
        throw new IllegalStateException(
            String.format("Task Executor un-registered when it received %s", message));
    }

    private void updateTicker() {
        this.lastActivity = clock.instant();
    }

    boolean isAvailable() {
        return this.availabilityState instanceof Pending && !isDisabled();
    }

    boolean isRunningTask() {
        return this.availabilityState instanceof Running;
    }

    boolean isAssigned() {
        return this.availabilityState instanceof Assigned;
    }

    boolean isRunningOrAssigned(WorkerId workerId) {
        return this.getWorkerId() != null && this.getWorkerId().equals(workerId);
    }

    // Captures the last interaction from the task executor. Any interactions
    // that are caused from within the server do not cause an uptick.
    Instant getLastActivity() {
        return this.lastActivity;
    }

    TaskExecutorRegistration getRegistration() {
        return this.registration;
    }

    protected CompletableFuture getGatewayAsync() {
        if (this.registration == null || this.state == RegistrationState.Unregistered) {
            throw new IllegalStateException("TE is unregistered");
        }

        // [Note] here the gateway connection is re-created every time it's requested to avoid corrupted state that
        // can block the connection to TE.
        // To be able to store and re-use the gateway, we probably need to make a chain of callbacks so taht the TE
        // is only marked as available after this gateway connection is successfully established with proper retry
        // loops (since the TE only register once and take the ack as success on API response).
        return rpcService.connect(registration.getTaskExecutorAddress(), TaskExecutorGateway.class)
                .whenComplete((gateway, throwable) -> {
                    if (throwable != null) {
                        log.error("Failed to connect to the gateway", throwable);
                    }
                });
    }

    boolean containsAttributes(Map attributes) {
        return registration != null && registration.containsAttributes(attributes);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy