/*
 *
 *  Copyright 2018 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.services.impl;

import brave.SpanCustomizer;
import brave.Tracer;
import com.netflix.genie.common.internal.dtos.Application;
import com.netflix.genie.common.internal.dtos.Cluster;
import com.netflix.genie.common.internal.dtos.ClusterMetadata;
import com.netflix.genie.common.internal.dtos.Command;
import com.netflix.genie.common.internal.dtos.ComputeResources;
import com.netflix.genie.common.internal.dtos.Criterion;
import com.netflix.genie.common.internal.dtos.Image;
import com.netflix.genie.common.internal.dtos.JobEnvironment;
import com.netflix.genie.common.internal.dtos.JobMetadata;
import com.netflix.genie.common.internal.dtos.JobRequest;
import com.netflix.genie.common.internal.dtos.JobSpecification;
import com.netflix.genie.common.internal.dtos.JobStatus;
import com.netflix.genie.common.internal.exceptions.checked.GenieJobResolutionException;
import com.netflix.genie.common.internal.exceptions.unchecked.GenieJobResolutionRuntimeException;
import com.netflix.genie.common.internal.jobs.JobConstants;
import com.netflix.genie.common.internal.tracing.TracingConstants;
import com.netflix.genie.common.internal.tracing.brave.BraveTagAdapter;
import com.netflix.genie.common.internal.tracing.brave.BraveTracingComponents;
import com.netflix.genie.web.data.services.DataServices;
import com.netflix.genie.web.data.services.PersistenceService;
import com.netflix.genie.web.dtos.ResolvedJob;
import com.netflix.genie.web.dtos.ResourceSelectionResult;
import com.netflix.genie.web.exceptions.checked.ResourceSelectionException;
import com.netflix.genie.web.properties.JobResolutionProperties;
import com.netflix.genie.web.properties.JobsProperties;
import com.netflix.genie.web.selectors.ClusterSelectionContext;
import com.netflix.genie.web.selectors.ClusterSelector;
import com.netflix.genie.web.selectors.CommandSelectionContext;
import com.netflix.genie.web.selectors.CommandSelector;
import com.netflix.genie.web.services.JobResolverService;
import com.netflix.genie.web.util.MetricsConstants;
import com.netflix.genie.web.util.MetricsUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.ToString;
import org.apache.commons.lang3.RegExUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.aop.TargetClassAware;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.validation.annotation.Validated;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.validation.Valid;
import javax.validation.constraints.NotEmpty;
import java.io.File;
import java.net.URI;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
 * Implementation of the {@link JobResolverService} APIs.
 *
 * @author tgianos
 * @since 4.0.0
 */
@Validated
public class JobResolverServiceImpl implements JobResolverService {
    private static final Logger LOG = LoggerFactory.getLogger(JobResolverServiceImpl.class);

    //region Metric Constants
    /**
     * How long it takes to completely resolve a job given inputs.
     */
    private static final String RESOLVE_JOB_TIMER = "genie.services.jobResolver.resolve.timer";

    /**
     * How long it takes to resolve a command for a job given the supplied command criterion.
     */
    private static final String RESOLVE_COMMAND_TIMER = "genie.services.jobResolver.resolveCommand.timer";

    /**
     * How long it takes to resolve a cluster for a job given the resolved command and the request criteria.
     */
    private static final String RESOLVE_CLUSTER_TIMER = "genie.services.jobResolver.resolveCluster.timer";

    /**
     * How long it takes to resolve the applications for a given command.
     */
    private static final String RESOLVE_APPLICATIONS_TIMER = "genie.services.jobResolver.resolveApplications.timer";

    /**
     * How long it takes to resolve a cluster for a job given the resolved command and the request criteria.
     */
    private static final String GENERATE_CRITERIA_PERMUTATIONS_TIMER
        = "genie.services.jobResolver.generateClusterCriteriaPermutations.timer";

    /**
     * How many times a cluster selector is invoked.
     */
    private static final String CLUSTER_SELECTOR_COUNTER
        = "genie.services.jobResolver.resolveCluster.clusterSelector.counter";

    private static final int DEFAULT_CPU = 1;
    private static final int DEFAULT_GPU = 0;
    private static final long DEFAULT_MEMORY = 1_500L;
    private static final long DEFAULT_DISK = 10_000L;
    private static final long DEFAULT_NETWORK = 256L;
    private static final String NO_RATIONALE = "No rationale provided";
    private static final String NO_ID_FOUND = "No id found";
    private static final String VERSION_4 = "4";
    private static final Tag SAVED_TAG = Tag.of("saved", "true");
    private static final Tag NOT_SAVED_TAG = Tag.of("saved", "false");
    private static final Tag NO_CLUSTER_RESOLVED_ID = Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, "None Resolved");
    private static final Tag NO_CLUSTER_RESOLVED_NAME = Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, "None Resolved");
    private static final Tag NO_COMMAND_RESOLVED_ID = Tag.of(MetricsConstants.TagKeys.COMMAND_ID, "None Resolved");
    private static final Tag NO_COMMAND_RESOLVED_NAME = Tag.of(MetricsConstants.TagKeys.COMMAND_NAME, "None Resolved");

    private static final String ID_FIELD = "id";
    private static final String NAME_FIELD = "name";
    private static final String STATUS_FIELD = "status";
    private static final String VERSION_FIELD = "version";

    private static final String CLUSTER_SELECTOR_STATUS_SUCCESS = "success";
    private static final String CLUSTER_SELECTOR_STATUS_NO_PREFERENCE = "no preference";
    //endregion

    //region Members
    private final PersistenceService persistenceService;
    private final List<ClusterSelector> clusterSelectors;
    private final CommandSelector commandSelector;
    private final MeterRegistry registry;
    // TODO: Switch to path
    private final File defaultJobDirectory;
    private final String defaultArchiveLocation;
    private final Tracer tracer;
    private final BraveTagAdapter tagAdapter;
    private final JobResolutionProperties jobResolutionProperties;
    //endregion

    //region Public APIs

    /**
     * Constructor.
     *
     * @param dataServices            The {@link DataServices} encapsulation instance to use
     * @param clusterSelectors        The {@link ClusterSelector} implementations to use
     * @param commandSelector         The {@link CommandSelector} implementation to use
     * @param registry                The {@link MeterRegistry} metrics repository to use
     * @param jobsProperties          The properties for running a job set by the user
     * @param jobResolutionProperties The {@link JobResolutionProperties} instance
     * @param tracingComponents       The {@link BraveTracingComponents} instance to use
     */
    public JobResolverServiceImpl(
        final DataServices dataServices,
        @NotEmpty final List<ClusterSelector> clusterSelectors,
        final CommandSelector commandSelector, // TODO: For now this is a single value but maybe support List<CommandSelector>
        final MeterRegistry registry,
        final JobsProperties jobsProperties,
        final JobResolutionProperties jobResolutionProperties,
        final BraveTracingComponents tracingComponents
    ) {
        this.persistenceService = dataServices.getPersistenceService();
        this.clusterSelectors = clusterSelectors;
        this.commandSelector = commandSelector;
        this.jobResolutionProperties = jobResolutionProperties;

        final URI jobDirProperty = jobsProperties.getLocations().getJobs();
        this.defaultJobDirectory = Paths.get(jobDirProperty).toFile();
        final String archiveLocation = jobsProperties.getLocations().getArchives().toString();
        this.defaultArchiveLocation = archiveLocation.endsWith(File.separator)
            ? archiveLocation
            : archiveLocation + File.separator;

        // Metrics
        this.registry = registry;

        // tracing
        this.tracer = tracingComponents.getTracer();
        this.tagAdapter = tracingComponents.getTagAdapter();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Nonnull
    @Transactional
    public ResolvedJob resolveJob(
        final String id
    ) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
        final long start = System.nanoTime();
        final Set<Tag> tags = new HashSet<>();
        tags.add(SAVED_TAG);
        try {
            LOG.info("Received request to resolve a job with id {}", id);
            final JobStatus jobStatus = this.persistenceService.getJobStatus(id);
            if (!jobStatus.isResolvable()) {
                throw new IllegalArgumentException("Job " + id + " is already resolved: " + jobStatus);
            }

            final JobRequest jobRequest = this.persistenceService.getJobRequest(id);

            // TODO: Possible improvement to combine this query with a few others to save DB trips but for now...
            final boolean apiJob = this.persistenceService.isApiJob(id);

            final JobResolutionContext context = new JobResolutionContext(
                id,
                jobRequest,
                apiJob,
                this.tracer.currentSpanCustomizer()
            );

            final ResolvedJob resolvedJob = this.resolve(context);

            /*
             * TODO: There is currently a gap in database schema where the resolved CPU value is not persisted. This
             *       means that it requires that the returned resolvedJob object here be used within the same call. If
             *       we for some reason eventually put the job id on a queue or something and pull data back from DB
             *       it WILL NOT be accurate. I'm purposely not doing this right now as it's not critical and modifying
             *       the schema will require a prod downtime and there are likely other fields (requestedNetwork,
             *       usedNetwork, usedDisk, resolvedDisk, requestedImage, usedImage) we want to add at the
             *       same time to minimize downtimes. - TJG 2/2/21
             */
            this.persistenceService.saveResolvedJob(id, resolvedJob);
            MetricsUtils.addSuccessTags(tags);
            return resolvedJob;
        } catch (final GenieJobResolutionException e) {
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw e;
        } catch (final Throwable t) {
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw new GenieJobResolutionRuntimeException(t);
        } finally {
            this.registry
                .timer(RESOLVE_JOB_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Nonnull
    public ResolvedJob resolveJob(
        final String id,
        @Valid final JobRequest jobRequest,
        final boolean apiJob
    ) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
        final long start = System.nanoTime();
        final Set<Tag> tags = new HashSet<>();
        tags.add(NOT_SAVED_TAG);
        try {
            LOG.info(
                "Received request to resolve a job for id {} and request {}",
                id,
                jobRequest
            );

            final JobResolutionContext context = new JobResolutionContext(
                id,
                jobRequest,
                apiJob,
                this.tracer.currentSpanCustomizer()
            );

            final ResolvedJob resolvedJob = this.resolve(context);
            MetricsUtils.addSuccessTags(tags);
            return resolvedJob;
        } catch (final GenieJobResolutionException e) {
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw e;
        } catch (final Throwable t) {
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw new GenieJobResolutionRuntimeException(t);
        } finally {
            this.registry
                .timer(RESOLVE_JOB_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }
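
    /*
     * Illustrative usage sketch (not part of the upstream source): a caller that has already persisted a job
     * request would typically use the transactional overload above to resolve and save the result by id, while a
     * caller holding an in-memory JobRequest can use the second overload, which does not persist anything. The
     * variable names "jobResolverService" and "request" below are hypothetical placeholders.
     *
     *     // Resolve a previously saved job by its id; the resolved job is also persisted:
     *     final ResolvedJob resolved = jobResolverService.resolveJob("some-job-id");
     *
     *     // Resolve an in-memory request (here flagged as an API job) without saving the result:
     *     final ResolvedJob unsaved = jobResolverService.resolveJob("some-job-id", request, true);
     *
     *     // Either result carries the JobSpecification and JobEnvironment used to launch the job.
     */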
    //endregion

    //region Resolution Helpers
    private ResolvedJob resolve(
        final JobResolutionContext context
    ) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
        this.tagSpanWithJobMetadata(context);
        this.resolveCommand(context);
        this.resolveCluster(context);
        this.resolveApplications(context);
        this.resolveComputeResources(context);
        this.resolveImages(context);
        this.resolveEnvironmentVariables(context);
        this.resolveTimeout(context);
        this.resolveArchiveLocation(context);
        this.resolveJobDirectory(context);

        return context.build();
    }

    /*
     * Overall Algorithm:
     *
     * 1. Take command criterion from user job request and query database for all possible matching commands
     * 2. Take clusterCriteria from jobRequest and clusterCriteria from each command and create uber query which finds
     *    ALL clusters that match at least one of the resulting merged criterion (merged meaning combining a job and
     *    command cluster criterion)
     * 3. Iterate through commands from step 1 and evaluate job/command cluster criterion against resulting set of
     *    clusters from step 2. Filter out any commands that don't match any clusters. Save resulting cluster set for
     *    each command in map command -> Set<Cluster>
     * 4. Pass Set<Command>, jobRequest, jobId, Map<Command, Set<Cluster>> to command selector which will return single
     *    command
     * 5. Using command result pass previously computed Set<Cluster> to cluster selector
     * 6. Save results and run job
     */
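
    /*
     * Worked example (illustrative only, not from the upstream source): suppose the command criterion matches
     * commands C1 and C2, C1 declares cluster criteria [type:spark], [type:presto] and the job requested the single
     * cluster criterion [env:prod]. Step 2 merges them into the priority-ordered list [type:spark, env:prod],
     * [type:presto, env:prod] for C1 (and similarly for C2), then queries once for clusters matching any merged
     * criterion. Step 3 keeps for C1 only the clusters matching its first merged criterion that yields a non-empty
     * set. Steps 4 and 5 hand those candidates to the selectors, which pick one command and one of its clusters.
     */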
    private void resolveCommand(final JobResolutionContext context) throws GenieJobResolutionException {
        final long start = System.nanoTime();
        final Set<Tag> tags = new HashSet<>();
        try {
            final JobRequest jobRequest = context.getJobRequest();
            final Criterion criterion = jobRequest.getCriteria().getCommandCriterion();

            //region Algorithm Step 1
            final Set<Command> commands = this.persistenceService.findCommandsMatchingCriterion(criterion, true);

            // Short circuit if there are no commands
            if (commands.isEmpty()) {
                throw new GenieJobResolutionException("No command matching command criterion found");
            }
            //endregion

            //region Algorithm Step 2
            final Map<Command, List<Criterion>> commandClusterCriterions = this.generateClusterCriteriaPermutations(
                commands,
                jobRequest
            );

            final Set<Criterion> uniqueCriteria = this.flattenClusterCriteriaPermutations(commandClusterCriterions);

            final Set<Cluster> allCandidateClusters = this.persistenceService.findClustersMatchingAnyCriterion(
                uniqueCriteria,
                true
            );
            if (allCandidateClusters.isEmpty()) {
                throw new GenieJobResolutionException("No clusters available to run any candidate command on");
            }
            //endregion

            //region Algorithm Step 3
            final Map<Command, Set<Cluster>> commandClusters = this.generateCommandClustersMap(
                commandClusterCriterions,
                allCandidateClusters
            );
            // this should never really happen based on above check but just in case
            if (commandClusters.isEmpty()) {
                throw new GenieJobResolutionException("No clusters available to run any candidate command on");
            }
            // save the map for use later by cluster resolution
            context.setCommandClusters(commandClusters);
            //endregion

            //region Algorithm Step 4
            final ResourceSelectionResult<Command> result = this.commandSelector.select(
                new CommandSelectionContext(
                    context.getJobId(),
                    jobRequest,
                    context.isApiJob(),
                    commandClusters
                )
            );
            //endregion

            final Command command = result
                .getSelectedResource()
                .orElseThrow(
                    () -> new GenieJobResolutionException(
                        "Expected a command but "
                            + result.getSelectorClass().getSimpleName()
                            + " didn't select anything. Rationale: "
                            + result.getSelectionRationale().orElse(NO_RATIONALE)
                    )
                );
            LOG.debug(
                "Selected command {} for criterion {} using {} due to {}",
                command.getId(),
                criterion,
                result.getSelectorClass().getName(),
                result.getSelectionRationale().orElse(NO_RATIONALE)
            );

            MetricsUtils.addSuccessTags(tags);
            final String commandId = command.getId();
            final String commandName = command.getMetadata().getName();
            tags.add(Tag.of(MetricsConstants.TagKeys.COMMAND_ID, commandId));
            tags.add(Tag.of(MetricsConstants.TagKeys.COMMAND_NAME, commandName));
            final SpanCustomizer spanCustomizer = context.getSpanCustomizer();
            this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_COMMAND_ID_TAG, commandId);
            this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_COMMAND_NAME_TAG, commandName);
            context.setCommand(command);
        } catch (final GenieJobResolutionException e) {
            // No candidates or selector chose none
            tags.add(NO_COMMAND_RESOLVED_ID);
            tags.add(NO_COMMAND_RESOLVED_NAME);
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw e;
        } catch (final ResourceSelectionException t) {
            // Selector runtime error
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw new GenieJobResolutionRuntimeException(t);
        } finally {
            this.registry
                .timer(RESOLVE_COMMAND_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /*
     * At this point we should have resolved a command and now we can use the map command -> clusters that was
     * previously computed to invoke the cluster selectors to narrow down the candidate clusters to a single cluster
     * for use.
     */
    private void resolveCluster(final JobResolutionContext context) throws GenieJobResolutionException {
        final long start = System.nanoTime();
        final Set<Tag> tags = new HashSet<>();

        final String jobId = context.getJobId();
        try {
            final Command command = context
                .getCommand()
                .orElseThrow(
                    () -> new IllegalStateException(
                        "Command not resolved before attempting to resolve a cluster for job " + jobId
                    )
                );
            final Set<Cluster> candidateClusters = context
                .getCommandClusters()
                .orElseThrow(
                    () -> new IllegalStateException("Command to candidate cluster map not available for job " + jobId)
                )
                .get(command);
            if (candidateClusters == null || candidateClusters.isEmpty()) {
                throw new IllegalStateException(
                    "Command " + command.getId() + " had no candidate clusters for job " + jobId
                );
            }

            Cluster cluster = null;
            for (final ClusterSelector clusterSelector : this.clusterSelectors) {
                // Create subset of tags just for this selector. Copy existing tags if any.
                final Set selectorTags = new HashSet<>(tags);
                // Note: This is done before the selection because if we do it after and the selector throws
                //       exception then we don't have this tag in the metrics. Which is unfortunate since the result
                //       does return the selector
                final String clusterSelectorClass = this.getProxyObjectClassName(clusterSelector);
                selectorTags.add(Tag.of(MetricsConstants.TagKeys.CLASS_NAME, clusterSelectorClass));

                try {
                    final ResourceSelectionResult<Cluster> result = clusterSelector.select(
                        new ClusterSelectionContext(
                            jobId,
                            context.getJobRequest(),
                            context.isApiJob(),
                            command,
                            candidateClusters
                        )
                    );

                    final Optional<Cluster> selectedClusterOptional = result.getSelectedResource();
                    if (selectedClusterOptional.isPresent()) {
                        cluster = selectedClusterOptional.get();
                        LOG.debug(
                            "Successfully selected cluster {} using selector {} for job {} with rationale: {}",
                            cluster.getId(),
                            clusterSelectorClass,
                            jobId,
                            result.getSelectionRationale().orElse(NO_RATIONALE)
                        );
                        selectorTags.add(Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_SUCCESS));
                        selectorTags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, cluster.getId()));
                        selectorTags.add(
                            Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, cluster.getMetadata().getName())
                        );
                        break;
                    } else {
                        selectorTags.add(
                            Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_NO_PREFERENCE)
                        );
                        selectorTags.add(NO_CLUSTER_RESOLVED_ID);
                        selectorTags.add(NO_CLUSTER_RESOLVED_NAME);
                        LOG.debug(
                            "Selector {} returned no preference with rationale: {}",
                            clusterSelectorClass,
                            result.getSelectionRationale().orElse(NO_RATIONALE)
                        );
                    }
                } catch (final Exception e) {
                    // Swallow exception and proceed to next selector.
                    // This is a choice to provide "best-service": select a cluster as long as it matches criteria,
                    // even if one of the selectors encountered an error and cannot choose the best candidate.
                    MetricsUtils.addFailureTagsWithException(selectorTags, e);
                    LOG.warn(
                        "Cluster selector {} evaluation threw exception for job {}",
                        clusterSelectorClass,
                        jobId,
                        e
                    );
                } finally {
                    this.registry.counter(CLUSTER_SELECTOR_COUNTER, selectorTags).increment();
                }
            }

            if (cluster == null) {
                throw new GenieJobResolutionException("No cluster resolved for job " + jobId);
            }

            LOG.debug("Resolved cluster {} for job {}", cluster.getId(), jobId);

            context.setCluster(cluster);
            MetricsUtils.addSuccessTags(tags);
            final String clusterId = cluster.getId();
            final String clusterName = cluster.getMetadata().getName();
            tags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, clusterId));
            tags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, clusterName));
            final SpanCustomizer spanCustomizer = context.getSpanCustomizer();
            this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_CLUSTER_ID_TAG, clusterId);
            this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_CLUSTER_NAME_TAG, clusterName);
        } catch (final GenieJobResolutionException e) {
            tags.add(NO_CLUSTER_RESOLVED_ID);
            tags.add(NO_CLUSTER_RESOLVED_NAME);
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw e;
        } catch (final Throwable t) {
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw new GenieJobResolutionRuntimeException(t);
        } finally {
            this.registry
                .timer(RESOLVE_CLUSTER_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    private void resolveApplications(final JobResolutionContext context) throws GenieJobResolutionException {
        final long start = System.nanoTime();
        final Set<Tag> tags = new HashSet<>();
        final String id = context.getJobId();
        final JobRequest jobRequest = context.getJobRequest();
        try {
            final String commandId = context
                .getCommand()
                .orElseThrow(() -> new IllegalStateException("Command hasn't been resolved before applications"))
                .getId();
            LOG.debug("Selecting applications for job {} and command {}", id, commandId);
            // TODO: What do we do about application status? Should probably check here
            final List<Application> applications = new ArrayList<>();
            if (jobRequest.getCriteria().getApplicationIds().isEmpty()) {
                applications.addAll(this.persistenceService.getApplicationsForCommand(commandId));
            } else {
                for (final String applicationId : jobRequest.getCriteria().getApplicationIds()) {
                    applications.add(this.persistenceService.getApplication(applicationId));
                }
            }
            LOG.debug(
                "Resolved applications {} for job {}",
                applications
                    .stream()
                    .map(Application::getId)
                    .reduce((one, two) -> one + "," + two)
                    .orElse(NO_ID_FOUND),
                id
            );
            MetricsUtils.addSuccessTags(tags);
            context.setApplications(applications);
        } catch (final Throwable t) {
            MetricsUtils.addFailureTagsWithException(tags, t);
            throw new GenieJobResolutionRuntimeException(t);
        } finally {
            this.registry
                .timer(RESOLVE_APPLICATIONS_TIMER, tags)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    private void resolveEnvironmentVariables(final JobResolutionContext context) {
        final Command command = context
            .getCommand()
            .orElseThrow(
                () -> new IllegalStateException("Command not resolved before attempting to resolve env variables")
            );
        final Cluster cluster = context
            .getCluster()
            .orElseThrow(
                () -> new IllegalStateException("Cluster not resolved before attempting to resolve env variables")
            );
        final String id = context.getJobId();
        final JobRequest jobRequest = context.getJobRequest();
        final long jobMemory = context
            .getComputeResources()
            .orElseThrow(
                () -> new IllegalStateException("Job memory not resolved before attempting to resolve env variables")
            )
            .getMemoryMb()
            .orElseThrow(() -> new IllegalStateException("No memory has been resolved before attempting to resolve"));
        // N.B. variables may be evaluated in a different order than they are added to this map (due to serialization).
        // Hence variables in this set should not depend on each-other.
        final Map<String, String> envVariables = new HashMap<>();
        envVariables.put(JobConstants.GENIE_VERSION_ENV_VAR, VERSION_4);
        envVariables.put(JobConstants.GENIE_CLUSTER_ID_ENV_VAR, cluster.getId());
        envVariables.put(JobConstants.GENIE_CLUSTER_NAME_ENV_VAR, cluster.getMetadata().getName());
        envVariables.put(JobConstants.GENIE_CLUSTER_TAGS_ENV_VAR, this.tagsToString(cluster.getMetadata().getTags()));
        envVariables.put(JobConstants.GENIE_COMMAND_ID_ENV_VAR, command.getId());
        envVariables.put(JobConstants.GENIE_COMMAND_NAME_ENV_VAR, command.getMetadata().getName());
        envVariables.put(JobConstants.GENIE_COMMAND_TAGS_ENV_VAR, this.tagsToString(command.getMetadata().getTags()));
        envVariables.put(JobConstants.GENIE_JOB_ID_ENV_VAR, id);
        envVariables.put(JobConstants.GENIE_JOB_NAME_ENV_VAR, jobRequest.getMetadata().getName());
        envVariables.put(JobConstants.GENIE_JOB_MEMORY_ENV_VAR, String.valueOf(jobMemory));
        envVariables.put(JobConstants.GENIE_JOB_TAGS_ENV_VAR, this.tagsToString(jobRequest.getMetadata().getTags()));
        envVariables.put(
            JobConstants.GENIE_JOB_GROUPING_ENV_VAR,
            jobRequest.getMetadata().getGrouping().orElse("")
        );
        envVariables.put(
            JobConstants.GENIE_JOB_GROUPING_INSTANCE_ENV_VAR,
            jobRequest.getMetadata().getGroupingInstance().orElse("")
        );
        envVariables.put(
            JobConstants.GENIE_REQUESTED_COMMAND_TAGS_ENV_VAR,
            this.tagsToString(jobRequest.getCriteria().getCommandCriterion().getTags())
        );
        final List<Criterion> clusterCriteria = jobRequest.getCriteria().getClusterCriteria();
        final List<String> clusterCriteriaTags = new ArrayList<>(clusterCriteria.size());
        for (int i = 0; i < clusterCriteria.size(); i++) {
            final Criterion criterion = clusterCriteria.get(i);
            final String criteriaTagsString = this.tagsToString(criterion.getTags());
            envVariables.put(JobConstants.GENIE_REQUESTED_CLUSTER_TAGS_ENV_VAR + "_" + i, criteriaTagsString);
            clusterCriteriaTags.add("[" + criteriaTagsString + "]");
        }
        envVariables.put(
            JobConstants.GENIE_REQUESTED_CLUSTER_TAGS_ENV_VAR,
            "[" + StringUtils.join(clusterCriteriaTags, ',') + "]"
        );
        envVariables.put(JobConstants.GENIE_USER_ENV_VAR, jobRequest.getMetadata().getUser());
        envVariables.put(JobConstants.GENIE_USER_GROUP_ENV_VAR, jobRequest.getMetadata().getGroup().orElse(""));

        context.setEnvironmentVariables(Collections.unmodifiableMap(envVariables));
    }

    private void resolveTimeout(final JobResolutionContext context) {
        final JobRequest jobRequest = context.getJobRequest();
        if (jobRequest.getRequestedAgentConfig().getTimeoutRequested().isPresent()) {
            context.setTimeout(jobRequest.getRequestedAgentConfig().getTimeoutRequested().get());
        } else if (context.isApiJob()) {
            // For backwards V3 compatibility
            context.setTimeout(com.netflix.genie.common.dto.JobRequest.DEFAULT_TIMEOUT_DURATION);
        }
    }

    private void resolveComputeResources(final JobResolutionContext context) {
        final ComputeResources req = context
            .getJobRequest()
            .getRequestedJobEnvironment()
            .getRequestedComputeResources();
        final ComputeResources command = context
            .getCommand()
            .orElseThrow(() -> new IllegalStateException("Command hasn't been resolved before compute resources"))
            .getComputeResources();
        final ComputeResources defaults = this.jobResolutionProperties.getDefaultComputeResources();
        context.setComputeResources(
            new ComputeResources.Builder()
                .withCpu(this.resolveComputeResource(req::getCpu, command::getCpu, defaults::getCpu, DEFAULT_CPU))
                .withGpu(this.resolveComputeResource(req::getGpu, command::getGpu, defaults::getGpu, DEFAULT_GPU))
                .withMemoryMb(
                    this.resolveComputeResource(
                        req::getMemoryMb,
                        command::getMemoryMb,
                        defaults::getMemoryMb,
                        DEFAULT_MEMORY
                    )
                )
                .withDiskMb(
                    this.resolveComputeResource(req::getDiskMb, command::getDiskMb, defaults::getDiskMb, DEFAULT_DISK)
                )
                .withNetworkMbps(
                    this.resolveComputeResource(
                        req::getNetworkMbps,
                        command::getNetworkMbps,
                        defaults::getNetworkMbps,
                        DEFAULT_NETWORK
                    )
                )
                .build()
        );
    }

    private <T> T resolveComputeResource(
        final Supplier<Optional<T>> requestedResource,
        final Supplier<Optional<T>> commandResource,
        final Supplier<Optional<T>> configuredDefault,
        final T hardCodedDefault
    ) {
        return requestedResource
            .get()
            .orElse(
                commandResource
                    .get()
                    .orElse(
                        configuredDefault
                            .get()
                            .orElse(hardCodedDefault)
                    )
            );
    }
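
    /*
     * Illustrative precedence example (not part of the upstream source): for memory, if the request supplied
     * nothing, the resolved command declares 4096 MB and the JobResolutionProperties default is unset, then
     * resolveComputeResource(req::getMemoryMb, command::getMemoryMb, defaults::getMemoryMb, DEFAULT_MEMORY)
     * returns 4096. Only when all three suppliers are empty does the hard-coded DEFAULT_MEMORY (1,500 MB) apply,
     * i.e. the precedence is: requested value, then command value, then configured default, then constant.
     */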

    private void resolveImages(final JobResolutionContext context) {
        final Map<String, Image> requestImages = context
            .getJobRequest()
            .getRequestedJobEnvironment()
            .getRequestedImages();
        final Map<String, Image> commandImages = context
            .getCommand()
            .orElseThrow(() -> new IllegalStateException("No command resolved before trying to resolve images"))
            .getImages();
        final Map<String, Image> defaultImages = this.jobResolutionProperties.getDefaultImages();

        // Find all the image keys
        final Map<String, Image> resolvedImages = new HashMap<>(defaultImages);
        for (final Map.Entry<String, Image> entry : commandImages.entrySet()) {
            resolvedImages.merge(entry.getKey(), entry.getValue(), this::mergeImages);
        }
        for (final Map.Entry<String, Image> entry : requestImages.entrySet()) {
            resolvedImages.merge(entry.getKey(), entry.getValue(), this::mergeImages);
        }
        context.setImages(resolvedImages);
    }

    private void resolveArchiveLocation(final JobResolutionContext context) {
        // TODO: Disable ability to disable archival for all jobs during internal V4 migration.
        //       Will allow us to reach out to clients who may set this variable but still expect output after
        //       job completion due to it being served off the node after completion in V3 but now it won't.
        //       Put this back in once all use cases have been hunted down and users are sure of their expected
        //       behavior
        context.setArchiveLocation(this.defaultArchiveLocation + context.getJobId());
    }

    private void resolveJobDirectory(final JobResolutionContext context) {
        context.setJobDirectory(
            context.getJobRequest()
                .getRequestedAgentConfig()
                .getRequestedJobDirectoryLocation()
                .orElse(this.defaultJobDirectory)
        );
    }
    //endregion

    //region Additional Helpers

    /**
     * Helper method to generate all the possible viable cluster criterion permutations for the given set of commands
     * and the given job request. The resulting map will be each command to its associated priority ordered list of
     * merged cluster criteria. The priority order is generated as follows:
     * <pre>
     * for (commandClusterCriterion : command.getClusterCriteria()) {
     *     for (jobClusterCriterion : jobRequest.getClusterCriteria()) {
     *         // merge
     *     }
     * }
     * </pre>
     *
     * @param commands   The set of {@link Command}s whose cluster criteria should be evaluated
     * @param jobRequest The {@link JobRequest} whose cluster criteria should be combined with the commands
     * @return The resulting map of each command to their associated merged criterion list in priority order
     */
    private Map<Command, List<Criterion>> generateClusterCriteriaPermutations(
        final Set<Command> commands,
        final JobRequest jobRequest
    ) {
        final long start = System.nanoTime();
        try {
            final Map<Command, List<Criterion>> mapBuilder = new HashMap<>();
            for (final Command command : commands) {
                final List<Criterion> listBuilder = new ArrayList<>();
                for (final Criterion commandClusterCriterion : command.getClusterCriteria()) {
                    for (final Criterion jobClusterCriterion : jobRequest.getCriteria().getClusterCriteria()) {
                        try {
                            // Failing to merge the criteria is equivalent to a round-trip DB query that returns
                            // zero results. This is an in memory optimization which also solves the need to implement
                            // the db query as a join with a subquery.
                            listBuilder.add(this.mergeCriteria(commandClusterCriterion, jobClusterCriterion));
                        } catch (final IllegalArgumentException e) {
                            LOG.debug(
                                "Unable to merge command cluster criterion {} and job cluster criterion {}. Skipping.",
                                commandClusterCriterion,
                                jobClusterCriterion,
                                e
                            );
                        }
                    }
                }
                mapBuilder.put(command, Collections.unmodifiableList(listBuilder));
            }
            return Collections.unmodifiableMap(mapBuilder);
        } finally {
            this.registry
                .timer(GENERATE_CRITERIA_PERMUTATIONS_TIMER)
                .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    private Set<Criterion> flattenClusterCriteriaPermutations(
        final Map<Command, List<Criterion>> commandCriteriaMap
    ) {
        return commandCriteriaMap.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
    }

    /**
     * This is an in memory evaluation of the matching done against persistence.
     *
     * @param cluster   The cluster to evaluate the criterion against
     * @param criterion The criterion the cluster is being tested against
     * @return {@literal true} if the {@link Cluster} matches the {@link Criterion}
     */
    private boolean clusterMatchesCriterion(final Cluster cluster, final Criterion criterion) {
        // TODO: This runs the risk of diverging from DB query mechanism. Perhaps way to unite somewhat?
        final ClusterMetadata metadata = cluster.getMetadata();

        return criterion.getId().map(id -> cluster.getId().equals(id)).orElse(true)
            && criterion.getName().map(name -> metadata.getName().equals(name)).orElse(true)
            && criterion.getVersion().map(version -> metadata.getVersion().equals(version)).orElse(true)
            && criterion.getStatus().map(status -> metadata.getStatus().name().equals(status)).orElse(true)
            && metadata.getTags().containsAll(criterion.getTags());
    }

    private Map<Command, Set<Cluster>> generateCommandClustersMap(
        final Map<Command, List<Criterion>> commandClusterCriteria,
        final Set<Cluster> candidateClusters
    ) {
        final Map<Command, Set<Cluster>> matrixBuilder = new HashMap<>();
        for (final Map.Entry<Command, List<Criterion>> entry : commandClusterCriteria.entrySet()) {
            final Command command = entry.getKey();
            final Set<Cluster> matchedClustersBuilder = new HashSet<>();

            // Loop through the criterion in the priority order first
            for (final Criterion criterion : entry.getValue()) {
                for (final Cluster candidateCluster : candidateClusters) {
                    if (this.clusterMatchesCriterion(candidateCluster, criterion)) {
                        LOG.debug(
                            "Cluster {} matched criterion {} for command {}",
                            candidateCluster.getId(),
                            criterion,
                            command.getId()
                        );
                        matchedClustersBuilder.add(candidateCluster);
                    }
                }

                final Set<Cluster> matchedClusters = Collections.unmodifiableSet(matchedClustersBuilder);
                if (!matchedClusters.isEmpty()) {
                    // If we found some clusters the evaluation for this command is done
                    matrixBuilder.put(command, matchedClusters);
                    LOG.debug("For command {} matched clusters {}", command, matchedClusters);
                    // short circuit further criteria evaluation for this command
                    break;
                }
            }
            // If the command never matched any clusters it should be filtered out
            // of resulting map as no value would be added to the result builder
        }

        final Map<Command, Set<Cluster>> matrix = Collections.unmodifiableMap(matrixBuilder);
        LOG.debug("Complete command -> clusters matrix: {}", matrix);
        return matrix;
    }

    /**
     * Helper method for merging two criterion.
     * <p>
     * This method makes several assumptions:
     * - If any of these fields: {@literal id, name, version, status} are in both criterion their values must match
     *   or this combination of criteria can't possibly be matched so an {@link IllegalArgumentException} is thrown
     * - If only one criterion has any of these fields {@literal id, name, version, status} then that value is present
     *   in the resulting criterion
     * - Any {@literal tags} present in either criterion are merged into the super set of both sets of tags
     *
     * @param one The first {@link Criterion}
     * @param two The second {@link Criterion}
     * @return A merged {@link Criterion} that can be used to search the database
     * @throws IllegalArgumentException If the criteria can't be merged due to the described assumptions
     */
    private Criterion mergeCriteria(final Criterion one, final Criterion two) throws IllegalArgumentException {
        final Criterion.Builder builder = new Criterion.Builder();
        builder.withId(
            this.mergeCriteriaStrings(one.getId().orElse(null), two.getId().orElse(null), ID_FIELD)
        );
        builder.withName(
            this.mergeCriteriaStrings(one.getName().orElse(null), two.getName().orElse(null), NAME_FIELD)
        );
        builder.withStatus(
            this.mergeCriteriaStrings(one.getStatus().orElse(null), two.getStatus().orElse(null), STATUS_FIELD)
        );
        builder.withVersion(
            this.mergeCriteriaStrings(one.getVersion().orElse(null), two.getVersion().orElse(null), VERSION_FIELD)
        );
        final Set<String> tags = new HashSet<>(one.getTags());
        tags.addAll(two.getTags());
        builder.withTags(tags);
        return builder.build();
    }

    private String mergeCriteriaStrings(
        @Nullable final String one,
        @Nullable final String two,
        final String fieldName
    ) throws IllegalArgumentException {
        if (StringUtils.equals(one, two)) {
            // This handles null == null for us
            return one;
        } else if (one == null) {
            return two;
        } else if (two == null) {
            return one;
        } else {
            // Both have values but aren't equal
            throw new IllegalArgumentException(fieldName + "'s were both present but not equal");
        }
    }

    private Image mergeImages(final Image secondary, final Image primary) {
        return new Image.Builder()
            .withName(primary.getName().orElse(secondary.getName().orElse(null)))
            .withTag(primary.getTag().orElse(secondary.getTag().orElse(null)))
            .withArguments(primary.getArguments().isEmpty() ? secondary.getArguments() : primary.getArguments())
            .build();
    }

    /**
     * Helper to convert a set of tags into a string that is a suitable value for a shell environment variable.
     * Adds double quotes as necessary (i.e. in case of spaces, newlines), performs escaping of in-tag quotes.
     * Input tags are sorted to produce a deterministic output value.
     *
     * @param tags a set of tags or null
     * @return a CSV string
     */
    private String tagsToString(final Set<String> tags) {
        final List<String> sortedTags = new ArrayList<>(tags);
        // Sort tags for the sake of determinism (e.g., tests)
        sortedTags.sort(Comparator.naturalOrder());
        final String joinedString = StringUtils.join(sortedTags, ',');
        // Escape quotes
        return RegExUtils.replaceAll(RegExUtils.replaceAll(joinedString, "'", "\\'"), "\"", "\\\"");
    }

    private String getProxyObjectClassName(final Object possibleProxyObject) {
        final String className;
        if (possibleProxyObject instanceof TargetClassAware) {
            final Class<?> targetClass = ((TargetClassAware) possibleProxyObject).getTargetClass();
            if (targetClass != null) {
                className = targetClass.getCanonicalName();
            } else {
                className = possibleProxyObject.getClass().getCanonicalName();
            }
        } else {
            className = possibleProxyObject.getClass().getCanonicalName();
        }
        return className;
    }

    private void tagSpanWithJobMetadata(final JobResolutionContext context) {
        final SpanCustomizer spanCustomizer = this.tracer.currentSpanCustomizer();
        this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_ID_TAG, context.getJobId());
        final JobMetadata jobMetadata = context.getJobRequest().getMetadata();
        this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_NAME_TAG, jobMetadata.getName());
        this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_USER_TAG, jobMetadata.getUser());
    }
    //endregion

    //region Helper Classes

    /**
     * A helper data class for passing information around / along the resolution pipeline.
     *
     * @author tgianos
     * @since 4.0.0
     */
    @RequiredArgsConstructor
    @Getter
    @Setter
    @ToString(doNotUseGetters = true)
    static class JobResolutionContext {
        private final String jobId;
        private final JobRequest jobRequest;
        private final boolean apiJob;
        private final SpanCustomizer spanCustomizer;

        private Command command;
        private Cluster cluster;
        private List<Application> applications;

        private ComputeResources computeResources;
        private Map<String, String> environmentVariables;
        private Integer timeout;
        private String archiveLocation;
        private File jobDirectory;

        private Map<Command, Set<Cluster>> commandClusters;
        private Map<String, Image> images;

        Optional<Command> getCommand() {
            return Optional.ofNullable(this.command);
        }

        Optional<Cluster> getCluster() {
            return Optional.ofNullable(this.cluster);
        }

        Optional<List<Application>> getApplications() {
            return Optional.ofNullable(this.applications);
        }

        Optional<ComputeResources> getComputeResources() {
            return Optional.ofNullable(this.computeResources);
        }

        Optional<Map<String, String>> getEnvironmentVariables() {
            return Optional.ofNullable(this.environmentVariables);
        }

        Optional<Integer> getTimeout() {
            return Optional.ofNullable(this.timeout);
        }

        Optional<String> getArchiveLocation() {
            return Optional.ofNullable(this.archiveLocation);
        }

        Optional<File> getJobDirectory() {
            return Optional.ofNullable(this.jobDirectory);
        }

        Optional<Map<Command, Set<Cluster>>> getCommandClusters() {
            return Optional.ofNullable(this.commandClusters);
        }

        Optional<Map<String, Image>> getImages() {
            return Optional.ofNullable(this.images);
        }

        ResolvedJob build() {
            // Error checking
            if (this.command == null) {
                throw new IllegalStateException("Command was never resolved for job " + this.jobId);
            }
            if (this.cluster == null) {
                throw new IllegalStateException("Cluster was never resolved for job " + this.jobId);
            }
            if (this.applications == null) {
                throw new IllegalStateException("Applications were never resolved for job " + this.jobId);
            }
            if (this.computeResources == null) {
                throw new IllegalStateException("Compute resources were never resolved for job " + this.jobId);
            }
            if (this.images == null) {
                throw new IllegalStateException("Images were never resolved for job " + this.jobId);
            }
            if (this.environmentVariables == null) {
                throw new IllegalStateException("Environment variables were never resolved for job " + this.jobId);
            }
            if (this.archiveLocation == null) {
                throw new IllegalStateException("Archive location was never resolved for job " + this.jobId);
            }
            if (this.jobDirectory == null) {
                throw new IllegalStateException("Job directory was never resolved for job " + this.jobId);
            }

            // Note: Currently no check for timeout due to it being ok for it to be null at the moment

            final JobSpecification jobSpecification = new JobSpecification(
                this.command.getExecutable(),
                this.jobRequest.getCommandArgs(),
                new JobSpecification.ExecutionResource(this.jobId, this.jobRequest.getResources()),
                new JobSpecification.ExecutionResource(this.cluster.getId(), this.cluster.getResources()),
                new JobSpecification.ExecutionResource(this.command.getId(), this.command.getResources()),
                this.applications
                    .stream()
                    .map(
                        application -> new JobSpecification.ExecutionResource(
                            application.getId(),
                            application.getResources()
                        )
                    )
                    .collect(Collectors.toList()),
                this.environmentVariables,
                this.jobRequest.getRequestedAgentConfig().isInteractive(),
                this.jobDirectory,
                this.archiveLocation,
                this.timeout
            );

            final JobEnvironment jobEnvironment = new JobEnvironment.Builder()
                .withComputeResources(this.computeResources)
                .withEnvironmentVariables(this.environmentVariables)
                .withImages(this.images)
                .build();

            return new ResolvedJob(jobSpecification, jobEnvironment, this.jobRequest.getMetadata());
        }
    }
    //endregion
}