All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.netflix.genie.web.services.impl.JobLaunchServiceImpl Maven / Gradle / Ivy

The newest version!
/*
 *
 *  Copyright 2019 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.services.impl;

import brave.SpanCustomizer;
import brave.Tracer;
import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.Sets;
import com.netflix.genie.common.dto.JobStatusMessages;
import com.netflix.genie.common.internal.dtos.ArchiveStatus;
import com.netflix.genie.common.internal.dtos.JobStatus;
import com.netflix.genie.common.internal.exceptions.checked.GenieJobResolutionException;
import com.netflix.genie.common.internal.tracing.brave.BraveTracingComponents;
import com.netflix.genie.web.agent.launchers.AgentLauncher;
import com.netflix.genie.web.data.services.DataServices;
import com.netflix.genie.web.data.services.PersistenceService;
import com.netflix.genie.web.dtos.JobSubmission;
import com.netflix.genie.web.dtos.ResolvedJob;
import com.netflix.genie.web.dtos.ResourceSelectionResult;
import com.netflix.genie.web.exceptions.checked.AgentLaunchException;
import com.netflix.genie.web.exceptions.checked.IdAlreadyExistsException;
import com.netflix.genie.web.exceptions.checked.NotFoundException;
import com.netflix.genie.web.exceptions.checked.ResourceSelectionException;
import com.netflix.genie.web.selectors.AgentLauncherSelectionContext;
import com.netflix.genie.web.selectors.AgentLauncherSelector;
import com.netflix.genie.web.services.JobLaunchService;
import com.netflix.genie.web.services.JobResolverService;
import com.netflix.genie.web.util.MetricsUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.extern.slf4j.Slf4j;

import javax.annotation.Nonnull;
import javax.validation.Valid;
import java.util.Collection;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * Default implementation of the {@link JobLaunchService}.
 *
 * @author tgianos
 * @since 4.0.0
 */
@Slf4j
public class JobLaunchServiceImpl implements JobLaunchService {
    static final String BEGIN_LAUNCH_JOB_ANNOTATION = "Beginning to Launch Job";
    static final String SAVED_JOB_SUBMISSION_ANNOTATION = "Saved Job Submission";
    static final String RESOLVED_JOB_ANNOTATION = "Resolved Job";
    static final String MARKED_JOB_ACCEPTED_ANNOTATION = "Marked Job Accepted";
    static final String LAUNCHED_AGENT_ANNOTATION = "Launched Agent";
    static final String SAVED_LAUNCHER_EXT_ANNOTATION = "Saved Launcher Ext Data";
    static final String END_LAUNCH_JOB_ANNOTATION = "Completed Launching Job";

    private static final String LAUNCH_JOB_TIMER = "genie.services.jobLaunch.launchJob.timer";
    private static final String AGENT_LAUNCHER_SELECTOR_TIMER = "genie.services.jobLaunch.selectLauncher.timer";
    private static final String AVAILABLE_LAUNCHERS_TAG = "numAvailableLaunchers";
    private static final String SELECTOR_CLASS_TAG = "agentLauncherSelectorClass";
    private static final String LAUNCHER_CLASS_TAG = "agentLauncherSelectedClass";
    private static final int MAX_STATUS_UPDATE_ATTEMPTS = 5;
    private static final int INITIAL_ATTEMPT = 0;
    private static final String ACCEPTED_MESSAGE = "The job has been accepted by the system for execution";

    private final PersistenceService persistenceService;
    private final JobResolverService jobResolverService;
    private final AgentLauncherSelector agentLauncherSelector;
    private final Tracer tracer;
    private final MeterRegistry registry;

    /**
     * Constructor.
     *
     * @param dataServices          The {@link DataServices} instance to use
     * @param jobResolverService    {@link JobResolverService} implementation used to resolve job details
     * @param agentLauncherSelector {@link AgentLauncher} implementation to launch agents
     * @param tracingComponents     {@link BraveTracingComponents} instance to use to get access to instrumentation
     * @param registry              {@link MeterRegistry} metrics repository
     */
    public JobLaunchServiceImpl(
        final DataServices dataServices,
        final JobResolverService jobResolverService,
        final AgentLauncherSelector agentLauncherSelector,
        final BraveTracingComponents tracingComponents,
        final MeterRegistry registry
    ) {
        this.persistenceService = dataServices.getPersistenceService();
        this.jobResolverService = jobResolverService;
        this.agentLauncherSelector = agentLauncherSelector;
        this.tracer = tracingComponents.getTracer();
        this.registry = registry;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Nonnull
    public String launchJob(
        @Valid final JobSubmission jobSubmission
    ) throws
        AgentLaunchException,
        GenieJobResolutionException,
        IdAlreadyExistsException,
        NotFoundException {
        final long start = System.nanoTime();
        final SpanCustomizer span = this.tracer.currentSpanCustomizer();
        span.annotate(BEGIN_LAUNCH_JOB_ANNOTATION);
        final Set tags = Sets.newHashSet();
        try {
            /*
             * Steps:
             *
             * 1. Save the job information
             * 2. Attempt to resolve the job information (includes saving)
             * 3. Mark the job as accepted
             * 4. Launch the agent process given the implementation configured for this Genie instance
             * 5. If the agent launch fails mark the job failed else return
             */
            final String jobId = this.persistenceService.saveJobSubmission(jobSubmission);
            span.annotate(SAVED_JOB_SUBMISSION_ANNOTATION);

            final ResolvedJob resolvedJob;
            try {
                resolvedJob = this.jobResolverService.resolveJob(jobId);
            } catch (final Throwable t) {
                final String message;
                if (t instanceof GenieJobResolutionException) {
                    message = JobStatusMessages.FAILED_TO_RESOLVE_JOB;
                } else {
                    message = JobStatusMessages.RESOLUTION_RUNTIME_ERROR;
                }

                MetricsUtils.addFailureTagsWithException(tags, t);
                this.persistenceService.updateJobArchiveStatus(jobId, ArchiveStatus.NO_FILES);
                if (
                    this.updateJobStatus(jobId, JobStatus.RESERVED, JobStatus.FAILED, message, INITIAL_ATTEMPT)
                        != JobStatus.FAILED
                ) {
                    log.error("Updating status to failed didn't succeed");
                }
                throw t; // Caught below for metrics gathering
            }
            span.annotate(RESOLVED_JOB_ANNOTATION);

            // Job state should be RESOLVED now. Mark it ACCEPTED to avoid race condition with agent starting up
            // before we get return from launchAgent and trying to set it to CLAIMED
            try {
                final JobStatus updatedStatus = this.updateJobStatus(
                    jobId,
                    JobStatus.RESOLVED,
                    JobStatus.ACCEPTED,
                    ACCEPTED_MESSAGE,
                    INITIAL_ATTEMPT
                );
                if (updatedStatus != JobStatus.ACCEPTED) {
                    throw new AgentLaunchException("Unable to mark job accepted. Job state " + updatedStatus);
                }
            } catch (final Exception e) {
                this.persistenceService.updateJobArchiveStatus(jobId, ArchiveStatus.NO_FILES);
                // TODO: Failed to update the status to accepted. Try to set it to failed or rely on other cleanup
                //       mechanism? For now rely on janitor mechanisms
                throw e;
            }
            span.annotate(MARKED_JOB_ACCEPTED_ANNOTATION);

            // TODO: at the moment this is not populated, it's going to be a null node (not null)
            final JsonNode requestedLauncherExt = this.persistenceService.getRequestedLauncherExt(jobId);

            final Optional launcherExt;
            try {
                final AgentLauncher launcher = this.selectLauncher(jobId, jobSubmission, resolvedJob);
                tags.add(Tag.of(LAUNCHER_CLASS_TAG, launcher.getClass().getCanonicalName()));
                launcherExt = launcher.launchAgent(resolvedJob, requestedLauncherExt);
            } catch (final AgentLaunchException e) {
                this.persistenceService.updateJobArchiveStatus(jobId, ArchiveStatus.NO_FILES);
                this.updateJobStatus(jobId, JobStatus.ACCEPTED, JobStatus.FAILED, e.getMessage(), INITIAL_ATTEMPT);
                // TODO: How will we get the ID back to the user? Should we add it to an exception? We don't get
                //       We don't get the ID until after saveJobSubmission so if that fails we'd still return nothing
                //       Probably need multiple exceptions to be thrown from this API (if we go with checked)
                throw e;
            }
            span.annotate(LAUNCHED_AGENT_ANNOTATION);

            if (launcherExt.isPresent()) {
                try {
                    this.persistenceService.updateLauncherExt(jobId, launcherExt.get());
                } catch (final Exception e) {
                    // Being unable to update the launcher ext is not optimal however
                    // it's not worth returning an error to the user at this point as
                    // the agent has launched and we have all the other pieces in place
                    log.error("Unable to update the launcher ext for job {}", jobId, e);
                }
            }
            span.annotate(SAVED_LAUNCHER_EXT_ANNOTATION);

            MetricsUtils.addSuccessTags(tags);
            return jobId;
        } catch (final Throwable t) {
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw t;
        } finally {
            span.annotate(END_LAUNCH_JOB_ANNOTATION);
            this.registry
                .timer(LAUNCH_JOB_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    private AgentLauncher selectLauncher(
        final String jobId,
        final JobSubmission jobSubmission,
        final ResolvedJob resolvedJob
    ) throws AgentLaunchException {
        final Collection availableLaunchers = this.agentLauncherSelector.getAgentLaunchers();
        log.debug("Selecting agent launcher for job {} ({} available)", jobId, availableLaunchers.size());
        final AgentLauncherSelectionContext context = new AgentLauncherSelectionContext(
            jobId,
            jobSubmission.getJobRequest(),
            jobSubmission.getJobRequestMetadata(),
            resolvedJob,
            availableLaunchers
        );

        final Set tags = Sets.newHashSet();
        final long start = System.nanoTime();
        tags.add(Tag.of(AVAILABLE_LAUNCHERS_TAG, String.valueOf(availableLaunchers.size())));
        tags.add(Tag.of(SELECTOR_CLASS_TAG, this.agentLauncherSelector.getClass().getSimpleName()));

        final ResourceSelectionResult selectionResult;
        try {
            selectionResult = this.agentLauncherSelector.select(context);
            final AgentLauncher selectedLauncher = selectionResult.getSelectedResource().orElseThrow(
                () -> new ResourceSelectionException(
                    "No AgentLauncher selected: "
                        + selectionResult.getSelectionRationale().orElse("Rationale unknown")
                )
            );
            MetricsUtils.addSuccessTags(tags);
            tags.add(Tag.of(LAUNCHER_CLASS_TAG, selectedLauncher.getClass().getSimpleName()));
            log.debug("Selected launcher {} for job {}", selectedLauncher, jobId);
            return selectedLauncher;
        } catch (ResourceSelectionException e) {
            log.error("Error selecting agent launcher", e);
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw new AgentLaunchException("Failed to select an Agent Launcher", e);
        } finally {
            this.registry
                .timer(AGENT_LAUNCHER_SELECTOR_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * Helper method to update the job status ONLY IF the job is not in a final state already. This will maintain the
     * final state that may have been set elsewhere by another process (e.g. kill request).
     *
     * @param jobId                The id of the job to update status for
     * @param desiredStatus        The {@link JobStatus} that is the status the job should be in the database after
     *                             exiting this method
     * @param desiredStatusMessage The status message for the database
     * @param attemptNumber        The number of attempts that have been made to update this job status
     * @return The {@link JobStatus} in the database after the final attempt of this method
     */
    private JobStatus updateJobStatus(
        final String jobId,
        final JobStatus expectedStatus,
        final JobStatus desiredStatus,
        final String desiredStatusMessage,
        final int attemptNumber
    ) throws NotFoundException {
        final int nextAttemptNumber = attemptNumber + 1;
        final JobStatus currentStatus = this.persistenceService.updateJobStatus(
            jobId,
            expectedStatus,
            desiredStatus,
            desiredStatusMessage
        );
        if (currentStatus.isFinished()) {
            log.info(
                "Won't change job status of {} from {} to {} desired status as {} is already a final status",
                jobId,
                currentStatus,
                desiredStatus,
                currentStatus
            );
            return currentStatus;
        } else if (currentStatus == desiredStatus) {
            log.debug("Successfully updated status of {} from {} to {}", jobId, expectedStatus, desiredStatus);
            return currentStatus;
        } else {
            log.error(
                "Job {} status changed from expected {} to {}. Couldn't update to {}. Attempt {}",
                jobId,
                expectedStatus,
                currentStatus,
                desiredStatus,
                nextAttemptNumber
            );
            // Recursive call that should break out if update is successful or job is now in a final state
            // or if attempts reach the max attempts
            if (nextAttemptNumber < MAX_STATUS_UPDATE_ATTEMPTS) {
                return this.updateJobStatus(
                    jobId,
                    currentStatus,
                    desiredStatus,
                    desiredStatusMessage,
                    nextAttemptNumber
                );
            } else {
                // breakout condition, stop attempting to update DB
                log.error(
                    "Out of attempts to update job {} status to {}. Unable to complete status update",
                    jobId,
                    desiredStatus
                );
                return currentStatus;
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy