// com.netflix.genie.web.services.impl.JobResolverServiceImpl (Maven / Gradle / Ivy source-listing header)
/*
*
* Copyright 2018 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.netflix.genie.web.services.impl;
import brave.SpanCustomizer;
import brave.Tracer;
import com.netflix.genie.common.internal.dtos.Application;
import com.netflix.genie.common.internal.dtos.Cluster;
import com.netflix.genie.common.internal.dtos.ClusterMetadata;
import com.netflix.genie.common.internal.dtos.Command;
import com.netflix.genie.common.internal.dtos.ComputeResources;
import com.netflix.genie.common.internal.dtos.Criterion;
import com.netflix.genie.common.internal.dtos.Image;
import com.netflix.genie.common.internal.dtos.JobEnvironment;
import com.netflix.genie.common.internal.dtos.JobMetadata;
import com.netflix.genie.common.internal.dtos.JobRequest;
import com.netflix.genie.common.internal.dtos.JobSpecification;
import com.netflix.genie.common.internal.dtos.JobStatus;
import com.netflix.genie.common.internal.exceptions.checked.GenieJobResolutionException;
import com.netflix.genie.common.internal.exceptions.unchecked.GenieJobResolutionRuntimeException;
import com.netflix.genie.common.internal.jobs.JobConstants;
import com.netflix.genie.common.internal.tracing.TracingConstants;
import com.netflix.genie.common.internal.tracing.brave.BraveTagAdapter;
import com.netflix.genie.common.internal.tracing.brave.BraveTracingComponents;
import com.netflix.genie.web.data.services.DataServices;
import com.netflix.genie.web.data.services.PersistenceService;
import com.netflix.genie.web.dtos.ResolvedJob;
import com.netflix.genie.web.dtos.ResourceSelectionResult;
import com.netflix.genie.web.exceptions.checked.ResourceSelectionException;
import com.netflix.genie.web.properties.JobResolutionProperties;
import com.netflix.genie.web.properties.JobsProperties;
import com.netflix.genie.web.selectors.ClusterSelectionContext;
import com.netflix.genie.web.selectors.ClusterSelector;
import com.netflix.genie.web.selectors.CommandSelectionContext;
import com.netflix.genie.web.selectors.CommandSelector;
import com.netflix.genie.web.services.JobResolverService;
import com.netflix.genie.web.util.MetricsConstants;
import com.netflix.genie.web.util.MetricsUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.ToString;
import org.apache.commons.lang3.RegExUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.aop.TargetClassAware;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.validation.annotation.Validated;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.validation.Valid;
import javax.validation.constraints.NotEmpty;
import java.io.File;
import java.net.URI;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import java.util.stream.Collectors;
/**
* Implementation of the {@link JobResolverService} APIs.
*
* @author tgianos
* @since 4.0.0
*/
@Validated
public class JobResolverServiceImpl implements JobResolverService {
private static final Logger LOG = LoggerFactory.getLogger(JobResolverServiceImpl.class);
//region Metric Constants
/**
 * How long it takes to completely resolve a job given inputs.
 */
private static final String RESOLVE_JOB_TIMER = "genie.services.jobResolver.resolve.timer";
/**
 * How long it takes to resolve a command for a job given the supplied command criterion.
 */
private static final String RESOLVE_COMMAND_TIMER = "genie.services.jobResolver.resolveCommand.timer";
/**
 * How long it takes to resolve a cluster for a job given the resolved command and the request criteria.
 */
private static final String RESOLVE_CLUSTER_TIMER = "genie.services.jobResolver.resolveCluster.timer";
/**
 * How long it takes to resolve the applications for a given command.
 */
private static final String RESOLVE_APPLICATIONS_TIMER = "genie.services.jobResolver.resolveApplications.timer";
/**
 * How long it takes to generate the merged cluster criteria permutations for the candidate commands.
 */
private static final String GENERATE_CRITERIA_PERMUTATIONS_TIMER
= "genie.services.jobResolver.generateClusterCriteriaPermutations.timer";
/**
 * How many times a cluster selector is invoked.
 */
private static final String CLUSTER_SELECTOR_COUNTER
= "genie.services.jobResolver.resolveCluster.clusterSelector.counter";
// Hard-coded fallbacks used when neither the request, the resolved command, nor the configured
// defaults supply a value for a compute resource dimension (see resolveComputeResources)
private static final int DEFAULT_CPU = 1;
private static final int DEFAULT_GPU = 0;
private static final long DEFAULT_MEMORY = 1_500L;
private static final long DEFAULT_DISK = 10_000L;
private static final long DEFAULT_NETWORK = 256L;
// Placeholder log/exception text when a selector supplies no rationale or no resource id was found
private static final String NO_RATIONALE = "No rationale provided";
private static final String NO_ID_FOUND = "No id found";
// Value written to the GENIE_VERSION environment variable (see resolveEnvironmentVariables)
private static final String VERSION_4 = "4";
// Metric tags distinguishing the persisting resolveJob(id) path from the non-persisting overload
private static final Tag SAVED_TAG = Tag.of("saved", "true");
private static final Tag NOT_SAVED_TAG = Tag.of("saved", "false");
// Metric tags recorded when command/cluster resolution fails to select a resource
private static final Tag NO_CLUSTER_RESOLVED_ID = Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, "None Resolved");
private static final Tag NO_CLUSTER_RESOLVED_NAME = Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, "None Resolved");
private static final Tag NO_COMMAND_RESOLVED_ID = Tag.of(MetricsConstants.TagKeys.COMMAND_ID, "None Resolved");
private static final Tag NO_COMMAND_RESOLVED_NAME = Tag.of(MetricsConstants.TagKeys.COMMAND_NAME, "None Resolved");
// Criterion field names — presumably consumed by the criteria merging helper later in this file; TODO confirm
private static final String ID_FIELD = "id";
private static final String NAME_FIELD = "name";
private static final String STATUS_FIELD = "status";
private static final String VERSION_FIELD = "version";
// Status values reported via the cluster selector invocation counter metric
private static final String CLUSTER_SELECTOR_STATUS_SUCCESS = "success";
private static final String CLUSTER_SELECTOR_STATUS_NO_PREFERENCE = "no preference";
//endregion
//region Members
private final PersistenceService persistenceService;
// Ordered list of selectors; resolveCluster tries each in turn until one picks a cluster
private final List<ClusterSelector> clusterSelectors;
private final CommandSelector commandSelector;
private final MeterRegistry registry;
// TODO: Switch to path
private final File defaultJobDirectory;
// Archive root, normalized in the constructor to always end with the file separator
private final String defaultArchiveLocation;
private final Tracer tracer;
private final BraveTagAdapter tagAdapter;
private final JobResolutionProperties jobResolutionProperties;
//endregion
//region Public APIs
/**
 * Constructor.
 *
 * @param dataServices            The {@link DataServices} encapsulation instance to use
 * @param clusterSelectors        The {@link ClusterSelector} implementations to use, in invocation order
 * @param commandSelector         The {@link CommandSelector} implementation to use
 * @param registry                The {@link MeterRegistry} metrics repository to use
 * @param jobsProperties          The properties for running a job set by the user
 * @param jobResolutionProperties The {@link JobResolutionProperties} instance
 * @param tracingComponents       The {@link BraveTracingComponents} instance to use
 */
public JobResolverServiceImpl(
    final DataServices dataServices,
    @NotEmpty final List<ClusterSelector> clusterSelectors,
    final CommandSelector commandSelector, // TODO: For now this is a single value but maybe support List
    final MeterRegistry registry,
    final JobsProperties jobsProperties,
    final JobResolutionProperties jobResolutionProperties,
    final BraveTracingComponents tracingComponents
) {
    this.persistenceService = dataServices.getPersistenceService();
    this.clusterSelectors = clusterSelectors;
    this.commandSelector = commandSelector;
    this.jobResolutionProperties = jobResolutionProperties;
    final URI jobDirProperty = jobsProperties.getLocations().getJobs();
    this.defaultJobDirectory = Paths.get(jobDirProperty).toFile();
    // Normalize the archive root so a job id can be appended directly (see resolveArchiveLocation)
    final String archiveLocation = jobsProperties.getLocations().getArchives().toString();
    this.defaultArchiveLocation = archiveLocation.endsWith(File.separator)
        ? archiveLocation
        : archiveLocation + File.separator;
    // Metrics
    this.registry = registry;
    // tracing
    this.tracer = tracingComponents.getTracer();
    this.tagAdapter = tracingComponents.getTagAdapter();
}
/**
 * {@inheritDoc}
 */
@Override
@Nonnull
@Transactional
public ResolvedJob resolveJob(
    final String id
) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
    final long start = System.nanoTime();
    final Set<Tag> tags = new HashSet<>();
    tags.add(SAVED_TAG);
    try {
        LOG.info("Received request to resolve a job with id {}", id);
        final JobStatus jobStatus = this.persistenceService.getJobStatus(id);
        if (!jobStatus.isResolvable()) {
            throw new IllegalArgumentException("Job " + id + " is already resolved: " + jobStatus);
        }
        final JobRequest jobRequest = this.persistenceService.getJobRequest(id);
        // TODO: Possible improvement to combine this query with a few others to save DB trips but for now...
        final boolean apiJob = this.persistenceService.isApiJob(id);
        final JobResolutionContext context = new JobResolutionContext(
            id,
            jobRequest,
            apiJob,
            this.tracer.currentSpanCustomizer()
        );
        final ResolvedJob resolvedJob = this.resolve(context);
        /*
         * TODO: There is currently a gap in database schema where the resolved CPU value is not persisted. This
         *       means that it requires that the returned resolvedJob object here be used within the same call. If
         *       we for some reason eventually put the job id on a queue or something and pull data back from DB
         *       it WILL NOT be accurate. I'm purposely not doing this right now as it's not critical and modifying
         *       the schema will require a prod downtime and there are likely other fields (requestedNetwork,
         *       usedNetwork, usedDisk, resolvedDisk, requestedImage, usedImage) we want to add at the
         *       same time to minimize downtimes. - TJG 2/2/21
         */
        this.persistenceService.saveResolvedJob(id, resolvedJob);
        MetricsUtils.addSuccessTags(tags);
        return resolvedJob;
    } catch (final GenieJobResolutionException e) {
        MetricsUtils.addFailureTagsWithException(tags, e);
        throw e;
    } catch (final Throwable t) {
        // Wrap anything unexpected (DB failures, etc.) in the runtime resolution exception
        MetricsUtils.addFailureTagsWithException(tags, t);
        throw new GenieJobResolutionRuntimeException(t);
    } finally {
        this.registry
            .timer(RESOLVE_JOB_TIMER, tags)
            .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    }
}
/**
 * {@inheritDoc}
 */
@Override
@Nonnull
public ResolvedJob resolveJob(
    final String id,
    @Valid final JobRequest jobRequest,
    final boolean apiJob
) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
    final long start = System.nanoTime();
    final Set<Tag> tags = new HashSet<>();
    // Unlike resolveJob(String), this overload does not persist the resolution result
    tags.add(NOT_SAVED_TAG);
    try {
        LOG.info(
            "Received request to resolve a job for id {} and request {}",
            id,
            jobRequest
        );
        final JobResolutionContext context = new JobResolutionContext(
            id,
            jobRequest,
            apiJob,
            this.tracer.currentSpanCustomizer()
        );
        final ResolvedJob resolvedJob = this.resolve(context);
        MetricsUtils.addSuccessTags(tags);
        return resolvedJob;
    } catch (final GenieJobResolutionException e) {
        MetricsUtils.addFailureTagsWithException(tags, e);
        throw e;
    } catch (final Throwable t) {
        MetricsUtils.addFailureTagsWithException(tags, t);
        throw new GenieJobResolutionRuntimeException(t);
    } finally {
        this.registry
            .timer(RESOLVE_JOB_TIMER, tags)
            .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    }
}
//endregion
//region Resolution Helpers
/*
 * Runs every resolution stage in dependency order against the shared mutable context:
 * the command must resolve before the cluster, applications and compute resources
 * (each throws if it is missing), and compute resources must resolve before the
 * environment variables, which read the resolved memory value.
 */
private ResolvedJob resolve(
final JobResolutionContext context
) throws GenieJobResolutionException, GenieJobResolutionRuntimeException {
this.tagSpanWithJobMetadata(context);
this.resolveCommand(context); // must run first: every later stage reads the resolved command
this.resolveCluster(context);
this.resolveApplications(context);
this.resolveComputeResources(context);
this.resolveImages(context);
this.resolveEnvironmentVariables(context); // reads resolved memory from compute resources
this.resolveTimeout(context);
this.resolveArchiveLocation(context);
this.resolveJobDirectory(context);
return context.build();
}
/*
* Overall Algorithm:
*
* 1. Take command criterion from user job request and query database for all possible matching commands
* 2. Take clusterCriteria from jobRequest and clusterCriteria from each command and create uber query which finds
* ALL clusters that match at least one of the resulting merged criterion (merged meaning combining a job and
* command cluster criterion)
* 3. Iterate through commands from step 1 and evaluate job/command cluster criterion against resulting set of
* clusters from step 2. Filter out any commands that don't match any clusters. Save resulting cluster set for
* each command in map command -> Set
* 4. Pass set, jobRequest, jobId, map> to command selector which will return single
* command
* 5. Using command result pass previously computed Set to cluster selector
* 6. Save results and run job
*/
private void resolveCommand(final JobResolutionContext context) throws GenieJobResolutionException {
final long start = System.nanoTime();
final Set tags = new HashSet<>();
try {
final JobRequest jobRequest = context.getJobRequest();
final Criterion criterion = jobRequest.getCriteria().getCommandCriterion();
//region Algorithm Step 1
final Set commands = this.persistenceService.findCommandsMatchingCriterion(criterion, true);
// Short circuit if there are no commands
if (commands.isEmpty()) {
throw new GenieJobResolutionException("No command matching command criterion found");
}
//endregion
//region Algorithm Step 2
final Map> commandClusterCriterions = this.generateClusterCriteriaPermutations(
commands,
jobRequest
);
final Set uniqueCriteria = this.flattenClusterCriteriaPermutations(commandClusterCriterions);
final Set allCandidateClusters = this.persistenceService.findClustersMatchingAnyCriterion(
uniqueCriteria,
true
);
if (allCandidateClusters.isEmpty()) {
throw new GenieJobResolutionException("No clusters available to run any candidate command on");
}
//endregion
//region Algorithm Step 3
final Map> commandClusters = this.generateCommandClustersMap(
commandClusterCriterions,
allCandidateClusters
);
// this should never really happen based on above check but just in case
if (commandClusters.isEmpty()) {
throw new GenieJobResolutionException("No clusters available to run any candidate command on");
}
// save the map for use later by cluster resolution
context.setCommandClusters(commandClusters);
//endregion
//region Algorithm Step 4
final ResourceSelectionResult result = this.commandSelector.select(
new CommandSelectionContext(
context.getJobId(),
jobRequest,
context.isApiJob(),
commandClusters
)
);
//endregion
final Command command = result
.getSelectedResource()
.orElseThrow(
() -> new GenieJobResolutionException(
"Expected a command but "
+ result.getSelectorClass().getSimpleName()
+ " didn't select anything. Rationale: "
+ result.getSelectionRationale().orElse(NO_RATIONALE)
)
);
LOG.debug(
"Selected command {} for criterion {} using {} due to {}",
command.getId(),
criterion,
result.getSelectorClass().getName(),
result.getSelectionRationale().orElse(NO_RATIONALE)
);
MetricsUtils.addSuccessTags(tags);
final String commandId = command.getId();
final String commandName = command.getMetadata().getName();
tags.add(Tag.of(MetricsConstants.TagKeys.COMMAND_ID, commandId));
tags.add(Tag.of(MetricsConstants.TagKeys.COMMAND_NAME, commandName));
final SpanCustomizer spanCustomizer = context.getSpanCustomizer();
this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_COMMAND_ID_TAG, commandId);
this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_COMMAND_NAME_TAG, commandName);
context.setCommand(command);
} catch (final GenieJobResolutionException e) {
// No candidates or selector choose none
tags.add(NO_COMMAND_RESOLVED_ID);
tags.add(NO_COMMAND_RESOLVED_NAME);
MetricsUtils.addFailureTagsWithException(tags, e);
throw e;
} catch (final ResourceSelectionException t) {
// Selector runtime error
MetricsUtils.addFailureTagsWithException(tags, t);
throw new GenieJobResolutionRuntimeException(t);
} finally {
this.registry
.timer(RESOLVE_COMMAND_TIMER, tags)
.record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
}
}
/*
 * At this point we should have resolved a command and now we can use the map command -> clusters that was
 * previously computed to invoke the cluster selectors to narrow down the candidate clusters to a single cluster
 * for use.
 */
private void resolveCluster(final JobResolutionContext context) throws GenieJobResolutionException {
    final long start = System.nanoTime();
    final Set<Tag> tags = new HashSet<>();
    final String jobId = context.getJobId();
    try {
        final Command command = context
            .getCommand()
            .orElseThrow(
                () -> new IllegalStateException(
                    "Command not resolved before attempting to resolve a cluster for job " + jobId
                )
            );
        final Set<Cluster> candidateClusters = context
            .getCommandClusters()
            .orElseThrow(
                () -> new IllegalStateException("Command to candidate cluster map not available for job " + jobId)
            )
            .get(command);
        if (candidateClusters == null || candidateClusters.isEmpty()) {
            throw new IllegalStateException(
                "Command " + command.getId() + " had no candidate clusters for job " + jobId
            );
        }
        Cluster cluster = null;
        // Try each selector in order until one expresses a preference
        for (final ClusterSelector clusterSelector : this.clusterSelectors) {
            // Create subset of tags just for this selector. Copy existing tags if any.
            final Set<Tag> selectorTags = new HashSet<>(tags);
            // Note: This is done before the selection because if we do it after and the selector throws
            //       exception then we don't have this tag in the metrics. Which is unfortunate since the result
            //       does return the selector
            final String clusterSelectorClass = this.getProxyObjectClassName(clusterSelector);
            selectorTags.add(Tag.of(MetricsConstants.TagKeys.CLASS_NAME, clusterSelectorClass));
            try {
                final ResourceSelectionResult<Cluster> result = clusterSelector.select(
                    new ClusterSelectionContext(
                        jobId,
                        context.getJobRequest(),
                        context.isApiJob(),
                        command,
                        candidateClusters
                    )
                );
                final Optional<Cluster> selectedClusterOptional = result.getSelectedResource();
                if (selectedClusterOptional.isPresent()) {
                    cluster = selectedClusterOptional.get();
                    LOG.debug(
                        "Successfully selected cluster {} using selector {} for job {} with rationale: {}",
                        cluster.getId(),
                        clusterSelectorClass,
                        jobId,
                        result.getSelectionRationale().orElse(NO_RATIONALE)
                    );
                    selectorTags.add(Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_SUCCESS));
                    selectorTags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, cluster.getId()));
                    selectorTags.add(
                        Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, cluster.getMetadata().getName())
                    );
                    break;
                } else {
                    selectorTags.add(
                        Tag.of(MetricsConstants.TagKeys.STATUS, CLUSTER_SELECTOR_STATUS_NO_PREFERENCE)
                    );
                    selectorTags.add(NO_CLUSTER_RESOLVED_ID);
                    selectorTags.add(NO_CLUSTER_RESOLVED_NAME);
                    LOG.debug(
                        "Selector {} returned no preference with rationale: {}",
                        clusterSelectorClass,
                        result.getSelectionRationale().orElse(NO_RATIONALE)
                    );
                }
            } catch (final Exception e) {
                // Swallow exception and proceed to next selector.
                // This is a choice to provide "best-service": select a cluster as long as it matches criteria,
                // even if one of the selectors encountered an error and cannot choose the best candidate.
                MetricsUtils.addFailureTagsWithException(selectorTags, e);
                LOG.warn(
                    "Cluster selector {} evaluation threw exception for job {}",
                    clusterSelectorClass,
                    jobId,
                    e
                );
            } finally {
                this.registry.counter(CLUSTER_SELECTOR_COUNTER, selectorTags).increment();
            }
        }
        if (cluster == null) {
            throw new GenieJobResolutionException("No cluster resolved for job " + jobId);
        }
        LOG.debug("Resolved cluster {} for job {}", cluster.getId(), jobId);
        context.setCluster(cluster);
        MetricsUtils.addSuccessTags(tags);
        final String clusterId = cluster.getId();
        final String clusterName = cluster.getMetadata().getName();
        tags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_ID, clusterId));
        tags.add(Tag.of(MetricsConstants.TagKeys.CLUSTER_NAME, clusterName));
        final SpanCustomizer spanCustomizer = context.getSpanCustomizer();
        this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_CLUSTER_ID_TAG, clusterId);
        this.tagAdapter.tag(spanCustomizer, TracingConstants.JOB_CLUSTER_NAME_TAG, clusterName);
    } catch (final GenieJobResolutionException e) {
        tags.add(NO_CLUSTER_RESOLVED_ID);
        tags.add(NO_CLUSTER_RESOLVED_NAME);
        MetricsUtils.addFailureTagsWithException(tags, e);
        throw e;
    } catch (final Throwable t) {
        MetricsUtils.addFailureTagsWithException(tags, t);
        throw new GenieJobResolutionRuntimeException(t);
    } finally {
        this.registry
            .timer(RESOLVE_CLUSTER_TIMER, tags)
            .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    }
}
/**
 * Resolve the applications the job needs: the explicit application ids from the job request criteria
 * when present, otherwise the applications associated with the resolved command.
 *
 * @param context The in-progress resolution context to read from and write to
 * @throws GenieJobResolutionException declared for consistency with the other resolution stages;
 *                                     failures here surface as {@link GenieJobResolutionRuntimeException}
 */
private void resolveApplications(final JobResolutionContext context) throws GenieJobResolutionException {
    final long start = System.nanoTime();
    final Set<Tag> tags = new HashSet<>();
    final String id = context.getJobId();
    final JobRequest jobRequest = context.getJobRequest();
    try {
        final String commandId = context
            .getCommand()
            .orElseThrow(() -> new IllegalStateException("Command hasn't been resolved before applications"))
            .getId();
        LOG.debug("Selecting applications for job {} and command {}", id, commandId);
        // TODO: What do we do about application status? Should probably check here
        final List<Application> applications = new ArrayList<>();
        if (jobRequest.getCriteria().getApplicationIds().isEmpty()) {
            applications.addAll(this.persistenceService.getApplicationsForCommand(commandId));
        } else {
            for (final String applicationId : jobRequest.getCriteria().getApplicationIds()) {
                applications.add(this.persistenceService.getApplication(applicationId));
            }
        }
        LOG.debug(
            "Resolved applications {} for job {}",
            applications
                .stream()
                .map(Application::getId)
                .reduce((one, two) -> one + "," + two)
                .orElse(NO_ID_FOUND),
            id
        );
        MetricsUtils.addSuccessTags(tags);
        context.setApplications(applications);
    } catch (final Throwable t) {
        MetricsUtils.addFailureTagsWithException(tags, t);
        throw new GenieJobResolutionRuntimeException(t);
    } finally {
        this.registry
            .timer(RESOLVE_APPLICATIONS_TIMER, tags)
            .record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    }
}
/**
 * Build the immutable map of Genie environment variables injected into the job's runtime, derived from
 * the resolved cluster, command, compute resources and the original job request.
 *
 * @param context The in-progress resolution context to read from and write to
 * @throws IllegalStateException if the command, cluster or compute resources (memory) are not yet resolved
 */
private void resolveEnvironmentVariables(final JobResolutionContext context) {
    final Command command = context
        .getCommand()
        .orElseThrow(
            () -> new IllegalStateException("Command not resolved before attempting to resolve env variables")
        );
    final Cluster cluster = context
        .getCluster()
        .orElseThrow(
            () -> new IllegalStateException("Cluster not resolved before attempting to resolve env variables")
        );
    final String id = context.getJobId();
    final JobRequest jobRequest = context.getJobRequest();
    final long jobMemory = context
        .getComputeResources()
        .orElseThrow(
            () -> new IllegalStateException("Job memory not resolved before attempting to resolve env variables")
        )
        .getMemoryMb()
        .orElseThrow(() -> new IllegalStateException("No memory has been resolved before attempting to resolve"));
    // N.B. variables may be evaluated in a different order than they are added to this map (due to serialization).
    //      Hence variables in this set should not depend on each-other.
    final Map<String, String> envVariables = new HashMap<>();
    envVariables.put(JobConstants.GENIE_VERSION_ENV_VAR, VERSION_4);
    envVariables.put(JobConstants.GENIE_CLUSTER_ID_ENV_VAR, cluster.getId());
    envVariables.put(JobConstants.GENIE_CLUSTER_NAME_ENV_VAR, cluster.getMetadata().getName());
    envVariables.put(JobConstants.GENIE_CLUSTER_TAGS_ENV_VAR, this.tagsToString(cluster.getMetadata().getTags()));
    envVariables.put(JobConstants.GENIE_COMMAND_ID_ENV_VAR, command.getId());
    envVariables.put(JobConstants.GENIE_COMMAND_NAME_ENV_VAR, command.getMetadata().getName());
    envVariables.put(JobConstants.GENIE_COMMAND_TAGS_ENV_VAR, this.tagsToString(command.getMetadata().getTags()));
    envVariables.put(JobConstants.GENIE_JOB_ID_ENV_VAR, id);
    envVariables.put(JobConstants.GENIE_JOB_NAME_ENV_VAR, jobRequest.getMetadata().getName());
    envVariables.put(JobConstants.GENIE_JOB_MEMORY_ENV_VAR, String.valueOf(jobMemory));
    envVariables.put(JobConstants.GENIE_JOB_TAGS_ENV_VAR, this.tagsToString(jobRequest.getMetadata().getTags()));
    envVariables.put(
        JobConstants.GENIE_JOB_GROUPING_ENV_VAR,
        jobRequest.getMetadata().getGrouping().orElse("")
    );
    envVariables.put(
        JobConstants.GENIE_JOB_GROUPING_INSTANCE_ENV_VAR,
        jobRequest.getMetadata().getGroupingInstance().orElse("")
    );
    envVariables.put(
        JobConstants.GENIE_REQUESTED_COMMAND_TAGS_ENV_VAR,
        this.tagsToString(jobRequest.getCriteria().getCommandCriterion().getTags())
    );
    // Export both a per-criterion variable (suffixed with its priority index) and a combined list
    final List<Criterion> clusterCriteria = jobRequest.getCriteria().getClusterCriteria();
    final List<String> clusterCriteriaTags = new ArrayList<>(clusterCriteria.size());
    for (int i = 0; i < clusterCriteria.size(); i++) {
        final Criterion criterion = clusterCriteria.get(i);
        final String criteriaTagsString = this.tagsToString(criterion.getTags());
        envVariables.put(JobConstants.GENIE_REQUESTED_CLUSTER_TAGS_ENV_VAR + "_" + i, criteriaTagsString);
        clusterCriteriaTags.add("[" + criteriaTagsString + "]");
    }
    envVariables.put(
        JobConstants.GENIE_REQUESTED_CLUSTER_TAGS_ENV_VAR,
        "[" + StringUtils.join(clusterCriteriaTags, ',') + "]"
    );
    envVariables.put(JobConstants.GENIE_USER_ENV_VAR, jobRequest.getMetadata().getUser());
    envVariables.put(JobConstants.GENIE_USER_GROUP_ENV_VAR, jobRequest.getMetadata().getGroup().orElse(""));
    context.setEnvironmentVariables(Collections.unmodifiableMap(envVariables));
}
/**
 * Resolve the job timeout: use the user-requested value when present; otherwise API-submitted jobs
 * fall back to the legacy V3 default and agent jobs get no timeout at all.
 *
 * @param context The in-progress resolution context to read from and write to
 */
private void resolveTimeout(final JobResolutionContext context) {
    final JobRequest jobRequest = context.getJobRequest();
    if (!jobRequest.getRequestedAgentConfig().getTimeoutRequested().isPresent()) {
        if (context.isApiJob()) {
            // For backwards V3 compatibility
            context.setTimeout(com.netflix.genie.common.dto.JobRequest.DEFAULT_TIMEOUT_DURATION);
        }
        return;
    }
    context.setTimeout(jobRequest.getRequestedAgentConfig().getTimeoutRequested().get());
}
/**
 * Resolve the final compute resources (CPU, GPU, memory, disk, network) for the job. Each dimension is
 * resolved independently via {@code resolveComputeResource}: requested value first, then the command's
 * value, then the configured default, then the hard-coded fallback.
 *
 * @param context The in-progress resolution context to read from and write to
 * @throws IllegalStateException if the command has not been resolved yet
 */
private void resolveComputeResources(final JobResolutionContext context) {
    final ComputeResources requested = context
        .getJobRequest()
        .getRequestedJobEnvironment()
        .getRequestedComputeResources();
    final ComputeResources fromCommand = context
        .getCommand()
        .orElseThrow(() -> new IllegalStateException("Command hasn't been resolved before compute resources"))
        .getComputeResources();
    final ComputeResources configured = this.jobResolutionProperties.getDefaultComputeResources();
    context.setComputeResources(
        new ComputeResources.Builder()
            .withCpu(
                this.resolveComputeResource(requested::getCpu, fromCommand::getCpu, configured::getCpu, DEFAULT_CPU)
            )
            .withGpu(
                this.resolveComputeResource(requested::getGpu, fromCommand::getGpu, configured::getGpu, DEFAULT_GPU)
            )
            .withMemoryMb(
                this.resolveComputeResource(
                    requested::getMemoryMb,
                    fromCommand::getMemoryMb,
                    configured::getMemoryMb,
                    DEFAULT_MEMORY
                )
            )
            .withDiskMb(
                this.resolveComputeResource(
                    requested::getDiskMb,
                    fromCommand::getDiskMb,
                    configured::getDiskMb,
                    DEFAULT_DISK
                )
            )
            .withNetworkMbps(
                this.resolveComputeResource(
                    requested::getNetworkMbps,
                    fromCommand::getNetworkMbps,
                    configured::getNetworkMbps,
                    DEFAULT_NETWORK
                )
            )
            .build()
    );
}
private T resolveComputeResource(
final Supplier> requestedResource,
final Supplier> commandResource,
final Supplier> configuredDefault,
final T hardCodedDefault
) {
return requestedResource
.get()
.orElse(
commandResource
.get()
.orElse(
configuredDefault
.get()
.orElse(hardCodedDefault)
)
);
}
/**
 * Resolve the container images for the job by layering, lowest to highest precedence: the configured
 * defaults, then the images declared on the resolved command, then the images requested by the user.
 * Entries with the same key are combined via {@code mergeImages}.
 *
 * @param context The in-progress resolution context to read from and write to
 * @throws IllegalStateException if the command has not been resolved yet
 */
private void resolveImages(final JobResolutionContext context) {
    final Map<String, Image> requestImages = context
        .getJobRequest()
        .getRequestedJobEnvironment()
        .getRequestedImages();
    final Map<String, Image> commandImages = context
        .getCommand()
        .orElseThrow(() -> new IllegalStateException("No command resolved before trying to resolve images"))
        .getImages();
    final Map<String, Image> defaultImages = this.jobResolutionProperties.getDefaultImages();
    // Start from the configured defaults, then merge in the command's images, then the request's
    final Map<String, Image> resolvedImages = new HashMap<>(defaultImages);
    for (final Map.Entry<String, Image> entry : commandImages.entrySet()) {
        resolvedImages.merge(entry.getKey(), entry.getValue(), this::mergeImages);
    }
    for (final Map.Entry<String, Image> entry : requestImages.entrySet()) {
        resolvedImages.merge(entry.getKey(), entry.getValue(), this::mergeImages);
    }
    context.setImages(resolvedImages);
}
/**
 * Resolve the location where the job's outputs will be archived: currently always the server's default
 * archive root with the job id appended.
 *
 * @param context The in-progress resolution context to read from and write to
 */
private void resolveArchiveLocation(final JobResolutionContext context) {
    // TODO: Disable ability to disable archival for all jobs during internal V4 migration.
    //       Will allow us to reach out to clients who may set this variable but still expect output after
    //       job completion due to it being served off the node after completion in V3 but now it won't.
    //       Put this back in once all use cases have been hunted down and users are sure of their expected
    //       behavior
    final String archiveLocation = this.defaultArchiveLocation + context.getJobId();
    context.setArchiveLocation(archiveLocation);
}
/**
 * Resolve the directory the job will execute in: the location requested by the user when present,
 * otherwise the server-wide default.
 *
 * @param context The in-progress resolution context to read from and write to
 */
private void resolveJobDirectory(final JobResolutionContext context) {
    final File jobDirectory = context
        .getJobRequest()
        .getRequestedAgentConfig()
        .getRequestedJobDirectoryLocation()
        .orElse(this.defaultJobDirectory);
    context.setJobDirectory(jobDirectory);
}
//endregion
//region Additional Helpers
/**
* Helper method to generate all the possible viable cluster criterion permutations for the given set of commands
* and the given job request. The resulting map will be each command to its associated priority ordered list of
* merged cluster criteria. The priority order is generated as follows:
*
* for (commandClusterCriterion : command.getClusterCriteria()) {
* for (jobClusterCriterion : jobRequest.getClusterCriteria()) {
* // merge
* }
* }
*
*
* @param commands The set of {@link Command}s whose cluster criteria should be evaluated
* @param jobRequest The {@link JobRequest} whose cluster criteria should be combined with the commands
* @return The resulting map of each command to their associated merged criterion list in priority order
*/
private Map> generateClusterCriteriaPermutations(
final Set commands,
final JobRequest jobRequest
) {
final long start = System.nanoTime();
try {
final Map> mapBuilder = new HashMap<>();
for (final Command command : commands) {
final List listBuilder = new ArrayList<>();
for (final Criterion commandClusterCriterion : command.getClusterCriteria()) {
for (final Criterion jobClusterCriterion : jobRequest.getCriteria().getClusterCriteria()) {
try {
// Failing to merge the criteria is equivalent to a round-trip DB query that returns
// zero results. This is an in memory optimization which also solves the need to implement
// the db query as a join with a subquery.
listBuilder.add(this.mergeCriteria(commandClusterCriterion, jobClusterCriterion));
} catch (final IllegalArgumentException e) {
LOG.debug(
"Unable to merge command cluster criterion {} and job cluster criterion {}. Skipping.",
commandClusterCriterion,
jobClusterCriterion,
e
);
}
}
}
mapBuilder.put(command, Collections.unmodifiableList(listBuilder));
}
return Collections.unmodifiableMap(mapBuilder);
} finally {
this.registry
.timer(GENERATE_CRITERIA_PERMUTATIONS_TIMER)
.record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
}
}
private Set flattenClusterCriteriaPermutations(final Map> commandCriteriaMap) {
return commandCriteriaMap.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
}
/**
 * This is an in memory evaluation of the matching done against persistence: a cluster matches when every
 * field present on the criterion equals the cluster's corresponding value and the cluster's tags are a
 * superset of the criterion's tags. Absent criterion fields match anything.
 *
 * @param cluster   The cluster to evaluate the criterion against
 * @param criterion The criterion the cluster is being tested against
 * @return {@literal true} if the {@link Cluster} matches the {@link Criterion}
 */
private boolean clusterMatchesCriterion(final Cluster cluster, final Criterion criterion) {
    // TODO: This runs the risk of diverging from DB query mechanism. Perhaps way to unite somewhat?
    final ClusterMetadata metadata = cluster.getMetadata();
    if (!criterion.getId().map(id -> cluster.getId().equals(id)).orElse(true)) {
        return false;
    }
    if (!criterion.getName().map(name -> metadata.getName().equals(name)).orElse(true)) {
        return false;
    }
    if (!criterion.getVersion().map(version -> metadata.getVersion().equals(version)).orElse(true)) {
        return false;
    }
    if (!criterion.getStatus().map(status -> metadata.getStatus().name().equals(status)).orElse(true)) {
        return false;
    }
    return metadata.getTags().containsAll(criterion.getTags());
}
private Map> generateCommandClustersMap(
final Map> commandClusterCriteria,
final Set candidateClusters
) {
final Map> matrixBuilder = new HashMap<>();
for (final Map.Entry> entry : commandClusterCriteria.entrySet()) {
final Command command = entry.getKey();
final Set matchedClustersBuilder = new HashSet<>();
// Loop through the criterion in the priority order first
for (final Criterion criterion : entry.getValue()) {
for (final Cluster candidateCluster : candidateClusters) {
if (this.clusterMatchesCriterion(candidateCluster, criterion)) {
LOG.debug(
"Cluster {} matched criterion {} for command {}",
candidateCluster.getId(),
criterion,
command.getId()
);
matchedClustersBuilder.add(candidateCluster);
}
}
final Set matchedClusters = Collections.unmodifiableSet(matchedClustersBuilder);
if (!matchedClusters.isEmpty()) {
// If we found some clusters the evaluation for this command is done
matrixBuilder.put(command, matchedClusters);
LOG.debug("For command {} matched clusters {}", command, matchedClusters);
// short circuit further criteria evaluation for this command
break;
}
}
// If the command never matched any clusters it should be filtered out
// of resulting map as no value would be added to the result builder
}
final Map> matrix = Collections.unmodifiableMap(matrixBuilder);
LOG.debug("Complete command -> clusters matrix: {}", matrix);
return matrix;
}
/**
 * Helper method for merging two criteria.
 *
 * This method makes several assumptions:
 * - If any of these fields: {@literal id, name, version, status} are present in both criteria their values
 *   must match, otherwise this combination of criteria can't possibly be matched and an
 *   {@link IllegalArgumentException} is thrown
 * - If only one criterion has any of these fields {@literal id, name, version, status} then that value is
 *   present in the resulting criterion
 * - Any {@literal tags} present in either criterion are merged into the super set of both sets of tags
 *
 * @param one The first {@link Criterion}
 * @param two The second {@link Criterion}
 * @return A merged {@link Criterion} that can be used to search the database
 * @throws IllegalArgumentException If the criteria can't be merged due to the described assumptions
 */
private Criterion mergeCriteria(final Criterion one, final Criterion two) throws IllegalArgumentException {
    final Criterion.Builder builder = new Criterion.Builder();
    builder.withId(
        this.mergeCriteriaStrings(one.getId().orElse(null), two.getId().orElse(null), ID_FIELD)
    );
    builder.withName(
        this.mergeCriteriaStrings(one.getName().orElse(null), two.getName().orElse(null), NAME_FIELD)
    );
    builder.withStatus(
        this.mergeCriteriaStrings(one.getStatus().orElse(null), two.getStatus().orElse(null), STATUS_FIELD)
    );
    builder.withVersion(
        this.mergeCriteriaStrings(one.getVersion().orElse(null), two.getVersion().orElse(null), VERSION_FIELD)
    );
    // Union of the tags from both criteria
    final Set<String> tags = new HashSet<>(one.getTags());
    tags.addAll(two.getTags());
    builder.withTags(tags);
    return builder.build();
}
/**
 * Null-safe merge of a single optional criterion field.
 *
 * @param one       The first value, possibly {@literal null}
 * @param two       The second value, possibly {@literal null}
 * @param fieldName The name of the field, used in the exception message on conflict
 * @return Whichever value is non-null, their common value if both are present and equal, or
 *         {@literal null} when both are absent
 * @throws IllegalArgumentException If both values are present but differ
 */
private String mergeCriteriaStrings(
    @Nullable final String one,
    @Nullable final String two,
    final String fieldName
) throws IllegalArgumentException {
    if (one == null) {
        // Covers both (null, null) -> null and (null, x) -> x
        return two;
    }
    if (two == null || two.equals(one)) {
        return one;
    }
    // Both present but different: this combination can never be satisfied
    throw new IllegalArgumentException(fieldName + "'s were both present but not equal");
}
/**
 * Merge two {@link Image} definitions field by field, preferring values from {@code primary} and falling
 * back to {@code secondary} where the primary has none.
 *
 * @param secondary The lower-precedence image
 * @param primary   The higher-precedence image
 * @return A new merged {@link Image}
 */
private Image mergeImages(final Image secondary, final Image primary) {
    final Image.Builder builder = new Image.Builder();
    builder.withName(primary.getName().orElseGet(() -> secondary.getName().orElse(null)));
    builder.withTag(primary.getTag().orElseGet(() -> secondary.getTag().orElse(null)));
    // An empty argument list on the primary image does not override the secondary's arguments
    builder.withArguments(
        primary.getArguments().isEmpty() ? secondary.getArguments() : primary.getArguments()
    );
    return builder.build();
}
/**
 * Helper to convert a set of tags into a string that is a suitable value for a shell environment variable.
 * Backslash-escapes any single or double quotes contained in the tags. Input tags are sorted to produce a
 * deterministic output value.
 *
 * @param tags a set of tags or null
 * @return a CSV string (empty if {@code tags} is null or empty)
 */
private String tagsToString(@Nullable final Set<String> tags) {
    if (tags == null || tags.isEmpty()) {
        // Javadoc has always allowed null; previously this would have thrown a NullPointerException
        return "";
    }
    // Sort tags for the sake of determinism (e.g., tests)
    final List<String> sortedTags = new ArrayList<>(tags);
    sortedTags.sort(Comparator.naturalOrder());
    final String joined = String.join(",", sortedTags);
    // Escape quotes. NOTE: the previous RegExUtils.replaceAll(joined, "'", "\\'") calls were no-ops —
    // in a regex *replacement* string a backslash escapes the following character, so "\'" and "\"" just
    // re-inserted the quote itself. String.replace is literal and actually inserts the backslash.
    return joined.replace("'", "\\'").replace("\"", "\\\"");
}
private String getProxyObjectClassName(final Object possibleProxyObject) {
final String className;
if (possibleProxyObject instanceof TargetClassAware) {
final Class> targetClass = ((TargetClassAware) possibleProxyObject).getTargetClass();
if (targetClass != null) {
className = targetClass.getCanonicalName();
} else {
className = possibleProxyObject.getClass().getCanonicalName();
}
} else {
className = possibleProxyObject.getClass().getCanonicalName();
}
return className;
}
/**
 * Tag the current tracing span with identifying metadata (id, name, user) of the job being resolved.
 *
 * @param context The resolution context carrying the job id and original job request
 */
private void tagSpanWithJobMetadata(final JobResolutionContext context) {
    final SpanCustomizer span = this.tracer.currentSpanCustomizer();
    final JobMetadata jobMetadata = context.getJobRequest().getMetadata();
    this.tagAdapter.tag(span, TracingConstants.JOB_ID_TAG, context.getJobId());
    this.tagAdapter.tag(span, TracingConstants.JOB_NAME_TAG, jobMetadata.getName());
    this.tagAdapter.tag(span, TracingConstants.JOB_USER_TAG, jobMetadata.getUser());
}
//endregion
//region Helper Classes
/**
* A helper data class for passing information around / along the resolution pipeline.
*
* @author tgianos
* @since 4.0.0
*/
@RequiredArgsConstructor
@Getter
@Setter
@ToString(doNotUseGetters = true)
static class JobResolutionContext {
private final String jobId;
private final JobRequest jobRequest;
private final boolean apiJob;
private final SpanCustomizer spanCustomizer;
private Command command;
private Cluster cluster;
private List applications;
private ComputeResources computeResources;
private Map environmentVariables;
private Integer timeout;
private String archiveLocation;
private File jobDirectory;
private Map> commandClusters;
private Map images;
Optional getCommand() {
return Optional.ofNullable(this.command);
}
Optional getCluster() {
return Optional.ofNullable(this.cluster);
}
Optional> getApplications() {
return Optional.ofNullable(this.applications);
}
Optional getComputeResources() {
return Optional.ofNullable(this.computeResources);
}
Optional