All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.netflix.genie.web.agent.launchers.impl.LocalAgentLauncherImpl Maven / Gradle / Ivy

The newest version!
/*
 *
 *  Copyright 2019 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.agent.launchers.impl;

import brave.Span;
import brave.Tracer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.netflix.genie.common.internal.dtos.JobMetadata;
import com.netflix.genie.common.internal.tracing.brave.BraveTracePropagator;
import com.netflix.genie.common.internal.tracing.brave.BraveTracingComponents;
import com.netflix.genie.web.agent.launchers.AgentLauncher;
import com.netflix.genie.web.data.services.DataServices;
import com.netflix.genie.web.data.services.PersistenceService;
import com.netflix.genie.web.data.services.impl.jpa.queries.aggregates.JobInfoAggregate;
import com.netflix.genie.web.dtos.ResolvedJob;
import com.netflix.genie.web.exceptions.checked.AgentLaunchException;
import com.netflix.genie.web.introspection.GenieWebHostInfo;
import com.netflix.genie.web.introspection.GenieWebRpcInfo;
import com.netflix.genie.web.properties.LocalAgentLauncherProperties;
import com.netflix.genie.web.util.ExecutorFactory;
import com.netflix.genie.web.util.MetricsUtils;
import com.netflix.genie.web.util.UNIXUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecuteResultHandler;
import org.apache.commons.exec.ExecuteException;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.lang3.SystemUtils;
import org.springframework.boot.actuate.health.Health;

import javax.annotation.Nullable;
import javax.validation.Valid;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Implementation of {@link AgentLauncher} which launches Agent instances on the local Genie hardware.
 *
 * <p>Each launch validates the requested job memory against the configured per-job and per-host limits, builds
 * the agent command line from the configured template, and starts the agent as an asynchronous child process.
 * The class also reports host health based on the memory allocated to jobs on this host.
 *
 * @author tgianos
 * @since 4.0.0
 */
@Slf4j
public class LocalAgentLauncherImpl implements AgentLauncher {

    // Keys of the details attached to the health indicator response
    private static final String NUMBER_ACTIVE_JOBS_KEY = "numActiveJobs";
    private static final String ALLOCATED_MEMORY_KEY = "allocatedMemory";
    private static final String USED_MEMORY_KEY = "usedMemory";
    private static final String AVAILABLE_MEMORY_KEY = "availableMemory";
    private static final String AVAILABLE_MAX_JOB_CAPACITY_KEY = "availableMaxJobCapacity";
    private static final Map<String, String> INFO_UNAVAILABLE_DETAILS = Map.of(
        "jobInfoUnavailable",
        "Unable to retrieve host job information. State unknown."
    );

    // Distinct token so that an empty argument in the configured command template is never mistaken for the
    // run-as user placeholder
    private static final String RUN_USER_PLACEHOLDER = "<GENIE_USER>";
    private static final String SETS_ID = "setsid";
    // Name (key) of the system property holding the JVM temporary directory
    private static final String JAVA_IO_TMPDIR_PROPERTY = "java.io.tmpdir";
    // Static: shared by every instance of this class in the JVM
    private static final Object MEMORY_CHECK_LOCK = new Object();
    private static final String THIS_CLASS = LocalAgentLauncherImpl.class.getCanonicalName();
    private static final Tag CLASS_TAG = Tag.of(LAUNCHER_CLASS_KEY, THIS_CLASS);
    // Memory (in MB) assumed for a job whose resolved environment does not specify any
    private static final long DEFAULT_JOB_MEMORY = 1_536L;

    private final String hostname;
    private final PersistenceService persistenceService;
    private final LocalAgentLauncherProperties launcherProperties;
    private final ExecutorFactory executorFactory;
    private final MeterRegistry registry;
    private final Executor sharedExecutor;
    private final int rpcPort;
    private final LoadingCache<String, JobInfoAggregate> jobInfoCache;
    private final JsonNode launcherExt;

    // Backing values for the gauges registered in the constructor; refreshed by the job info cache loader
    private final AtomicLong numActiveJobs;
    private final AtomicLong usedMemory;

    private final Tracer tracer;
    private final BraveTracePropagator tracePropagator;

    /**
     * Constructor.
     *
     * @param hostInfo           The {@link GenieWebHostInfo} instance
     * @param rpcInfo            The {@link GenieWebRpcInfo} instance
     * @param dataServices       The {@link DataServices} encapsulation instance to use
     * @param launcherProperties The properties from the configuration that control agent behavior
     * @param executorFactory    A {@link ExecutorFactory} to create {@link org.apache.commons.exec.Executor}
     *                           instances
     * @param tracingComponents  The {@link BraveTracingComponents} instance to use
     * @param registry           Metrics repository
     */
    public LocalAgentLauncherImpl(
        final GenieWebHostInfo hostInfo,
        final GenieWebRpcInfo rpcInfo,
        final DataServices dataServices,
        final LocalAgentLauncherProperties launcherProperties,
        final ExecutorFactory executorFactory,
        final BraveTracingComponents tracingComponents,
        final MeterRegistry registry
    ) {
        this.hostname = hostInfo.getHostname();
        this.rpcPort = rpcInfo.getRpcPort();
        this.persistenceService = dataServices.getPersistenceService();
        this.launcherProperties = launcherProperties;
        this.executorFactory = executorFactory;
        this.registry = registry;
        this.sharedExecutor = this.executorFactory.newInstance(false);

        this.numActiveJobs = new AtomicLong(0L);
        this.usedMemory = new AtomicLong(0L);

        this.tracer = tracingComponents.getTracer();
        this.tracePropagator = tracingComponents.getTracePropagator();

        final Set<Tag> tags = Sets.newHashSet(
            Tag.of("launcherClass", this.getClass().getSimpleName())
        );
        // TODO: These metrics should either be renamed or tagged so that it's easier to slice and dice them
        //       Currently we have a single launcher but as more come this won't represent necessarily what the name
        //       implies. Even now there are agent jobs not launched through the API which are not captured in this
        //       metric and thus it doesn't accurately give a number of the active jobs in the system instead it gives
        //       only active jobs running locally on a given node. I'm not renaming it now (5/28/2020) since we don't
        //       yet have a firm plan in place for a) handling multiple launcher and b) if a leadership task should
        //       publish the number of running jobs and other aggregate metrics from the system. Leaving these named
        //       this way also makes it so we don't have to modify as many dashboards or auto scaling policies ATM
        this.registry.gauge("genie.jobs.active.gauge", tags, this.numActiveJobs);
        this.registry.gauge("genie.jobs.memory.used.gauge", tags, this.usedMemory);

        // Leverage a loading cache to handle the timed async fetching for us rather than creating a thread
        // on a scheduler etc. This also provides atomicity.
        // Note that this is not intended to be used for exact calculations and more for metrics and health checks
        // as the data could be somewhat stale
        this.jobInfoCache = Caffeine
            .newBuilder()
            // The refresh fails silently this will protect from stale data
            .expireAfterWrite(this.launcherProperties.getHostInfoExpireAfter())
            .refreshAfterWrite(this.launcherProperties.getHostInfoRefreshAfter())
            .initialCapacity(1)
            .build(
                host -> {
                    final JobInfoAggregate info = this.persistenceService.getHostJobInformation(host);

                    // this should always be the case but just in case
                    if (info != null) {
                        // Proactively update the metric reporting
                        this.numActiveJobs.set(info.getNumberOfActiveJobs());
                        this.usedMemory.set(info.getTotalMemoryAllocated());
                    }

                    return info;
                }
            );

        this.launcherExt = JsonNodeFactory.instance.objectNode()
            .put(LAUNCHER_CLASS_EXT_FIELD, THIS_CLASS)
            .put(SOURCE_HOST_EXT_FIELD, this.hostname);

        // Force the initial fetch so that all subsequent fetches will be non-blocking
        try {
            this.jobInfoCache.get(this.hostname);
        } catch (final Exception e) {
            // Best effort: a failure here must not prevent the launcher from being constructed
            log.error("Unable to fetch initial job information", e);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Optional<JsonNode> launchAgent(
        @Valid final ResolvedJob resolvedJob,
        @Nullable final JsonNode requestedLauncherExt
    ) throws AgentLaunchException {
        final long start = System.nanoTime();
        log.info("Received request to launch local agent to run job: {}", resolvedJob);
        final Set<Tag> tags = new HashSet<>();
        tags.add(CLASS_TAG);

        try {
            final JobMetadata jobMetadata = resolvedJob.getJobMetadata();
            final String user = jobMetadata.getUser();

            if (this.launcherProperties.isRunAsUserEnabled()) {
                final String group = jobMetadata.getGroup().orElse(null);
                try {
                    UNIXUtils.createUser(user, group, this.sharedExecutor);
                } catch (final IOException e) {
                    log.error("Failed to create user {}: {}", jobMetadata.getUser(), e.getMessage(), e);
                    throw new AgentLaunchException(e);
                }
            }

            // Check error conditions
            final long jobMemory = resolvedJob
                .getJobEnvironment()
                .getComputeResources()
                .getMemoryMb()
                .orElse(DEFAULT_JOB_MEMORY);
            final String jobId = resolvedJob.getJobSpecification().getJob().getId();

            // Job was resolved with more memory allocated than the system was configured to allow
            if (jobMemory > this.launcherProperties.getMaxJobMemory()) {
                throw new AgentLaunchException(
                    "Unable to launch job as the requested job memory ("
                        + jobMemory
                        + "MB) exceeds the maximum allowed by the configuration of the system ("
                        + this.launcherProperties.getMaxJobMemory()
                        + "MB)"
                );
            }

            final CommandLine commandLine = this.createCommandLine(
                ImmutableMap.of(
                    LocalAgentLauncherProperties.SERVER_HOST_PLACEHOLDER, this.launcherProperties.getServerHostname(),
                    LocalAgentLauncherProperties.SERVER_PORT_PLACEHOLDER, Integer.toString(this.rpcPort),
                    LocalAgentLauncherProperties.JOB_ID_PLACEHOLDER, jobId,
                    RUN_USER_PLACEHOLDER, user,
                    LocalAgentLauncherProperties.AGENT_JAR_PLACEHOLDER, this.launcherProperties.getAgentJarPath()
                )
            );

            // One at a time to ensure we don't overflow configured max
            synchronized (MEMORY_CHECK_LOCK) {
                final long usedMemoryOnHost = this.persistenceService.getUsedMemoryOnHost(this.hostname);
                final long expectedUsedMemoryOnHost = usedMemoryOnHost + jobMemory;
                if (expectedUsedMemoryOnHost > this.launcherProperties.getMaxTotalJobMemory()) {
                    throw new AgentLaunchException(
                        "Running job "
                            + jobId
                            + " with "
                            + jobMemory
                            + "MB of memory would cause there to be more memory used than the configured amount of "
                            + this.launcherProperties.getMaxTotalJobMemory()
                            + "MB. "
                            + usedMemoryOnHost
                            + "MB worth of jobs are currently running on this node."
                    );
                }
            }

            // Inherit server environment
            final Map<String, String> environment = Maps.newHashMap(System.getenv());
            // Add extra environment from configuration, if any
            environment.putAll(this.launcherProperties.getAdditionalEnvironment());
            // Add tracing context so agent continues trace
            final Span currentSpan = this.tracer.currentSpan();
            if (currentSpan != null) {
                environment.putAll(this.tracePropagator.injectForAgent(currentSpan.context()));
            }
            log.debug("Launching agent: {}, env: {}", commandLine, environment);

            // TODO: What happens if the server crashes? Does the process live on? Make sure this is totally detached
            final Executor executor = this.executorFactory.newInstance(true);

            // Null unless process output capture is enabled
            final FileOutputStream processOutput = this.openProcessOutput(jobId);
            if (processOutput != null) {
                executor.setStreamHandler(new PumpStreamHandler(processOutput));
            }

            log.info("Launching agent for job {}", jobId);

            // The handler takes ownership of the output file and closes it once the agent process terminates
            final AgentResultHandler resultHandler = new AgentResultHandler(jobId, processOutput);

            try {
                executor.execute(commandLine, environment, resultHandler);
            } catch (final IOException ioe) {
                // The handler will never be invoked for this launch, release the output file here
                resultHandler.closeProcessOutput();
                throw new AgentLaunchException(
                    "Unable to launch agent using command: " + commandLine.toString(),
                    ioe
                );
            } catch (final RuntimeException e) {
                // Same cleanup; the exception itself is reported by the outer generic handler below
                resultHandler.closeProcessOutput();
                throw e;
            }

            MetricsUtils.addSuccessTags(tags);
            return Optional.of(this.launcherExt);
        } catch (final AgentLaunchException e) {
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw e;
        } catch (final Exception e) {
            log.error("Unable to launch local agent due to {}", e.getMessage(), e);
            MetricsUtils.addFailureTagsWithException(tags, e);
            throw new AgentLaunchException("Unable to launch local agent due to unhandled error", e);
        } finally {
            this.registry.timer(LAUNCH_TIMER, tags).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Health health() {
        final JobInfoAggregate jobInfo;
        try {
            jobInfo = this.jobInfoCache.get(this.hostname);
        } catch (final Exception e) {
            log.error("Computing host info threw exception", e);
            // Could do unknown but if the persistence tier threw an exception we're likely down anyway
            return Health.down(e).build();
        }
        // This should never happen but if it does it's likely a problem deeper in system (persistence tier)
        if (jobInfo == null) {
            log.error("Unable to retrieve host info from cache");
            return Health.unknown().withDetails(INFO_UNAVAILABLE_DETAILS).build();
        }

        // Use allocated memory to make the host go OOS early enough that we don't throw as many exceptions on
        // accepted jobs during launch
        final long memoryAllocated = jobInfo.getTotalMemoryAllocated();
        final long availableMemory = this.launcherProperties.getMaxTotalJobMemory() - memoryAllocated;
        final long maxJobMemory = this.launcherProperties.getMaxJobMemory();

        // If we can fit one more max job in we're still healthy
        final Health.Builder builder = availableMemory >= maxJobMemory ? Health.up() : Health.down();

        // Number of maximum-size jobs that still fit; guarded against negative memory and division by zero
        final long availableMaxJobCapacity =
            (availableMemory >= 0 && maxJobMemory > 0) ? (availableMemory / maxJobMemory) : 0L;

        return builder
            .withDetail(NUMBER_ACTIVE_JOBS_KEY, jobInfo.getNumberOfActiveJobs())
            .withDetail(ALLOCATED_MEMORY_KEY, memoryAllocated)
            .withDetail(AVAILABLE_MEMORY_KEY, availableMemory)
            .withDetail(USED_MEMORY_KEY, jobInfo.getTotalMemoryUsed())
            .withDetail(AVAILABLE_MAX_JOB_CAPACITY_KEY, availableMaxJobCapacity)
            .build();
    }

    /**
     * Open the file that captures the agent process output, if capture is enabled in the launcher properties.
     *
     * @param jobId The id of the job the agent is launched for, used to name the file
     * @return The opened (truncated, not appended) output stream or {@code null} if capture is disabled
     * @throws AgentLaunchException If the file cannot be created or opened
     */
    @Nullable
    private FileOutputStream openProcessOutput(final String jobId) throws AgentLaunchException {
        if (!this.launcherProperties.isProcessOutputCaptureEnabled()) {
            return null;
        }

        // Look the directory up by property name; fall back to /tmp if the property is not set
        final String debugOutputPath =
            System.getProperty(JAVA_IO_TMPDIR_PROPERTY, "/tmp") + "/agent-job-" + jobId + ".txt";
        try {
            return new FileOutputStream(debugOutputPath, false);
        } catch (final FileNotFoundException e) {
            log.error("Failed to create agent process output file", e);
            throw new AgentLaunchException(e);
        }
    }

    /**
     * Build the command line used to launch the agent from the configured template.
     *
     * @param argumentValueReplacements Map of placeholder token to the value that replaces it in the template
     * @return The command line with every argument that matches a placeholder substituted
     * @throws IllegalStateException If the resulting template contains no executable at all
     */
    private CommandLine createCommandLine(
        final Map<String, String> argumentValueReplacements
    ) {
        final List<String> commandLineTemplate = Lists.newArrayList();

        // Run detached with setsid on Linux
        if (SystemUtils.IS_OS_LINUX) {
            commandLineTemplate.add(SETS_ID);
        }

        // Run as different user with sudo
        if (this.launcherProperties.isRunAsUserEnabled()) {
            commandLineTemplate.addAll(Lists.newArrayList("sudo", "-E", "-u", RUN_USER_PLACEHOLDER));
        }

        // Agent  command line to launch agent (i.e. JVM and its options)
        commandLineTemplate.addAll(this.launcherProperties.getLaunchCommandTemplate());

        if (commandLineTemplate.isEmpty()) {
            throw new IllegalStateException(
                "Unable to build agent command line: the launch command template is empty"
            );
        }

        // First element is the executable and is used as-is, the rest are (possibly substituted) arguments
        final CommandLine commandLine = new CommandLine(commandLineTemplate.get(0));

        for (int i = 1; i < commandLineTemplate.size(); i++) {
            final String argument = commandLineTemplate.get(i);
            // If the argument placeholder is a key in the map, replace it with the corresponding value.
            // Otherwise it's not a placeholder, add it as-is to the command-line.
            commandLine.addArgument(argumentValueReplacements.getOrDefault(argument, argument));
        }

        return commandLine;
    }

    /**
     * Simple {@link org.apache.commons.exec.ExecuteResultHandler} implementation that logs completion and
     * releases the file capturing the agent process output, if any.
     *
     * @author tgianos
     * @since 4.0.0
     */
    @Slf4j
    @VisibleForTesting
    static class AgentResultHandler extends DefaultExecuteResultHandler {

        private final String jobId;
        @Nullable
        private final FileOutputStream processOutput;

        /**
         * Constructor for an agent whose output is not captured.
         *
         * @param jobId The id of the job the agent this handler is attached to is running
         */
        AgentResultHandler(final String jobId) {
            this(jobId, null);
        }

        /**
         * Constructor.
         *
         * @param jobId         The id of the job the agent this handler is attached to is running
         * @param processOutput The stream capturing the agent process output which this handler closes when the
         *                      process terminates, or {@code null} if output is not captured
         */
        AgentResultHandler(final String jobId, @Nullable final FileOutputStream processOutput) {
            this.jobId = jobId;
            this.processOutput = processOutput;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void onProcessComplete(final int exitValue) {
            super.onProcessComplete(exitValue);
            log.info("Agent process for job {} completed with exit value {}", this.jobId, exitValue);
            this.closeProcessOutput();
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void onProcessFailed(final ExecuteException e) {
            super.onProcessFailed(e);
            log.error("Agent process failed for job {} due to {}", this.jobId, e.getMessage(), e);
            this.closeProcessOutput();
        }

        /**
         * Close the stream capturing the agent process output, if there is one. Failures are logged rather than
         * propagated so that cleanup never masks the outcome of the launch or of the process itself.
         */
        void closeProcessOutput() {
            if (this.processOutput == null) {
                return;
            }
            try {
                this.processOutput.close();
            } catch (final IOException e) {
                log.warn("Unable to close agent process output file for job {}", this.jobId, e);
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy