All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.netflix.genie.web.agent.services.impl.AgentRoutingServiceCuratorDiscoveryImpl Maven / Gradle / Ivy

There is a newer version: 4.3.20
Show newest version
/*
 *
 *  Copyright 2020 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.agent.services.impl;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.netflix.genie.common.internal.util.GenieHostInfo;
import com.netflix.genie.web.agent.services.AgentRoutingService;
import com.netflix.genie.web.properties.AgentRoutingServiceProperties;
import com.netflix.genie.web.util.MetricsUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.listen.Listenable;
import org.apache.curator.framework.state.ConnectionState;
import org.apache.curator.framework.state.ConnectionStateListener;
import org.apache.curator.x.discovery.ServiceDiscovery;
import org.apache.curator.x.discovery.ServiceInstance;
import org.apache.curator.x.discovery.ServiceType;
import org.apache.zookeeper.KeeperException;
import org.springframework.scheduling.TaskScheduler;

import javax.validation.constraints.NotBlank;
import java.time.Instant;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

/**
 * Implementation of {@link AgentRoutingService} that relies on Curator's Discovery extension.
 * Rather than the traditional use of this recipe (registering a service for the node itself), this class registers
 * one service instance for each locally connected agent.
 *
 * @author mprimi
 * @since 4.0.0
 */
@Slf4j
public class AgentRoutingServiceCuratorDiscoveryImpl implements AgentRoutingService {

    private static final String SERVICE_NAME = "agent_connections";
    private static final String METRICS_PREFIX = "genie.agents.connections.";
    private static final String CONNECTED_AGENTS_GAUGE_NAME = METRICS_PREFIX + "connected.gauge";
    private static final String REGISTERED_AGENTS_GAUGE_NAME = METRICS_PREFIX + "registered.gauge";
    private static final String ZOOKEEPER_SESSION_STATE_COUNTER_NAME = METRICS_PREFIX + "zookeeperSessionState.counter";
    private static final String AGENT_REGISTERED_TIMER_NAME = METRICS_PREFIX + "registered.timer";
    private static final String AGENT_UNREGISTERED_TIMER_NAME = METRICS_PREFIX + "unregistered.timer";
    private static final String AGENT_REFRESH_TIMER_NAME = METRICS_PREFIX + "refreshed.timer";
    private static final String AGENT_CONNECTED_COUNTER_NAME = METRICS_PREFIX + "connected.counter";
    private static final String AGENT_DISCONNECTED_COUNTER_NAME = METRICS_PREFIX + "disconnected.counter";
    private static final String AGENT_LOOKUP_TIMER_NAME = METRICS_PREFIX + "lookup.timer";
    private static final String ZK_CONNECTION_STATE_TAG_NAME = "connectionState";
    private static final String ROUTE_FOUND_TAG_NAME = "found";
    private static final Set EMPTY_TAG_SET = ImmutableSet.of();
    private final String localHostname;
    private final ServiceDiscovery serviceDiscovery;
    private final TaskScheduler taskScheduler;
    private final MeterRegistry registry;
    private final AgentRoutingServiceProperties properties;
    private final Set connectedAgentsSet;
    private final Map> registeredAgentsMap;
    private final PriorityBlockingQueue registrationQueue;
    private final AtomicReference registrationTaskThread;
    private final ThreadFactory threadFactory;

    /**
     * Constructor.
     * Delegates to the test-visible constructor, supplying a {@link ThreadFactory} that creates sequentially
     * numbered registration threads.
     *
     * @param genieHostInfo                    The genie local host information
     * @param serviceDiscovery                 The service discovery client
     * @param taskScheduler                    The task scheduler
     * @param listenableCuratorConnectionState The listenable curator client connection status
     * @param registry                         The metrics registry
     * @param properties                       The service properties
     */
    public AgentRoutingServiceCuratorDiscoveryImpl(
        final GenieHostInfo genieHostInfo,
        final ServiceDiscovery<Agent> serviceDiscovery,
        final TaskScheduler taskScheduler,
        final Listenable<ConnectionStateListener> listenableCuratorConnectionState,
        final MeterRegistry registry,
        final AgentRoutingServiceProperties properties
    ) {
        this(
            genieHostInfo,
            serviceDiscovery,
            taskScheduler,
            listenableCuratorConnectionState,
            registry,
            properties,
            new ThreadFactory() {
                private final AtomicLong threadCounter = new AtomicLong();

                @Override
                public Thread newThread(final Runnable r) {
                    // The simple name of an anonymous class is the empty string, so the thread is named after
                    // the enclosing service class rather than after this factory.
                    return new Thread(
                        r,
                        AgentRoutingServiceCuratorDiscoveryImpl.class.getSimpleName()
                            + "-registration-"
                            + this.threadCounter.incrementAndGet()
                    );
                }
            }
        );
    }

    /**
     * Constructor that additionally accepts the factory used to create registration threads.
     * Besides storing its collaborators, it registers the connected/registered agent gauges, subscribes to
     * Curator connection state changes and starts the first registration thread.
     *
     * @param genieHostInfo                    The genie local host information
     * @param serviceDiscovery                 The service discovery client
     * @param taskScheduler                    The task scheduler
     * @param listenableCuratorConnectionState The listenable curator client connection status
     * @param registry                         The metrics registry
     * @param properties                       The service properties
     * @param threadFactory                    The factory of registration threads
     */
    @VisibleForTesting
    AgentRoutingServiceCuratorDiscoveryImpl(
        final GenieHostInfo genieHostInfo,
        final ServiceDiscovery<Agent> serviceDiscovery,
        final TaskScheduler taskScheduler,
        final Listenable<ConnectionStateListener> listenableCuratorConnectionState,
        final MeterRegistry registry,
        final AgentRoutingServiceProperties properties,
        final ThreadFactory threadFactory
    ) {
        this.localHostname = genieHostInfo.getHostname();
        this.serviceDiscovery = serviceDiscovery;
        this.taskScheduler = taskScheduler;
        this.registry = registry;
        this.threadFactory = threadFactory;
        this.properties = properties;
        this.registeredAgentsMap = Maps.newConcurrentMap();
        this.connectedAgentsSet = Sets.newConcurrentHashSet();
        this.registrationQueue = new PriorityBlockingQueue<>();
        this.registrationTaskThread = new AtomicReference<>();

        // Create gauge metric for agents connected and registered
        registry.gauge(CONNECTED_AGENTS_GAUGE_NAME, EMPTY_TAG_SET, this.connectedAgentsSet, Set::size);
        registry.gaugeMapSize(REGISTERED_AGENTS_GAUGE_NAME, EMPTY_TAG_SET, this.registeredAgentsMap);

        // Listen for Curator session state changes
        listenableCuratorConnectionState.addListener(this::handleConnectionStateChange);

        // The curator client is passed already connected.
        // See: org.springframework.cloud.zookeeper.ZookeeperAutoConfiguration
        this.startRegistrationThread();
    }

    /**
     * Create a new registration thread, publish it as the current one, interrupt the thread it replaces (if any)
     * and finally start the new thread.
     */
    private void startRegistrationThread() {
        final Thread replacement = this.threadFactory.newThread(this::registrationTask);
        final Thread previous = this.registrationTaskThread.getAndSet(replacement);
        // Only one registration thread should be consuming the queue: stop the one being replaced
        Optional.ofNullable(previous).ifPresent(Thread::interrupt);
        replacement.start();
    }

    /**
     * Clear the reference to the current registration thread and, if there was one, interrupt it.
     */
    private void stopRegistrationThread() {
        final Thread running = this.registrationTaskThread.getAndSet(null);
        if (running == null) {
            // Nothing to stop
            return;
        }
        running.interrupt();
    }

    /**
     * Body of the registration thread: keeps consuming registration queue items and applying the corresponding
     * mutation with the Curator client until the thread is interrupted (which is how it gets stopped when the
     * client is disconnected).
     */
    private void registrationTask() {
        try {
            for (;;) {
                this.processNextRegistrationMutation();
            }
        } catch (InterruptedException ignored) {
            // Interruption is the signal to stop; already logged where it was detected
        }

        log.debug("Registration thread terminating");
    }

    /**
     * Take the next mutation off the registration queue (blocking until one is available) and apply it.
     * If the agent is still locally connected its route is registered or refreshed and a future refresh is
     * scheduled; otherwise its route is unregistered.
     *
     * @throws InterruptedException if the thread is interrupted while waiting or while applying the mutation;
     *                              in the latter case the mutation is put back on the queue first
     */
    private void processNextRegistrationMutation() throws InterruptedException {
        final RegisterMutation next;
        try {
            // Blocking
            next = this.registrationQueue.take();
        } catch (InterruptedException interrupted) {
            log.warn("Registration task interrupted", interrupted);
            throw interrupted;
        }

        try {
            final String jobId = next.getJobId();

            // The agent may have disconnected between the time the mutation was enqueued and now
            if (!this.connectedAgentsSet.contains(jobId)) {
                this.unregisterAgentConnection(jobId);
                return;
            }

            // Register or re-register agent connection
            if (next.isRefresh()) {
                this.refreshAgentConnection(jobId);
            } else {
                this.registerAgentConnection(jobId);
            }

            // Schedule a future refresh for this agent connection
            final Instant refreshTime = Instant.now().plus(this.properties.getRefreshInterval());
            this.taskScheduler.schedule(
                () -> this.registrationQueue.add(RegisterMutation.refresh(jobId)),
                refreshTime
            );
        } catch (InterruptedException interrupted) {
            log.warn("Registration task interrupted", interrupted);
            // Re-enqueue mutation that was in-progress when interrupted
            this.registrationQueue.add(next);
            throw interrupted;
        }
    }

    /**
     * Register a route for the given job: a dynamic service instance whose id is the job id and whose address is
     * the local hostname. On success the instance is remembered in the registered agents map.
     * Failures other than interruption are logged and not propagated. The attempt is always timed.
     *
     * @param jobId The id of the job whose agent is connected to this node
     * @throws InterruptedException if the thread is interrupted while registering
     */
    private void registerAgentConnection(final String jobId) throws InterruptedException {
        log.debug("Registering route for job: {}", jobId);

        final ServiceInstance<Agent> serviceInstance = new ServiceInstance<>(
            SERVICE_NAME,
            jobId,
            localHostname,
            null,
            null,
            new Agent(jobId),
            Instant.now().getEpochSecond(),
            ServiceType.DYNAMIC,
            null
        );

        Set<Tag> tags = MetricsUtils.newSuccessTagsSet();
        final long start = System.nanoTime();
        try {
            this.serviceDiscovery.registerService(serviceInstance);
            this.registeredAgentsMap.put(jobId, serviceInstance);
        } catch (InterruptedException e) {
            // Ensure interrupt is not swallowed by the generic catch
            log.debug("Interrupted while registering {}", jobId);
            tags = MetricsUtils.newFailureTagsSetForException(e);
            throw e;
        } catch (Exception e) {
            log.error("Failed to register agent executing job: {}", jobId, e);
            tags = MetricsUtils.newFailureTagsSetForException(e);
        } finally {
            this.registry.timer(
                AGENT_REGISTERED_TIMER_NAME,
                tags
            ).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * Refresh the route previously registered for the given job by updating its service instance.
     * If there is no local record of the instance, or the update fails because the node no longer exists,
     * the route is registered from scratch instead.
     * Other failures (except interruption) are logged and not propagated. The update attempt is always timed.
     *
     * @param jobId The id of the job whose agent is connected to this node
     * @throws InterruptedException if the thread is interrupted while refreshing or re-registering
     */
    private void refreshAgentConnection(final String jobId) throws InterruptedException {
        log.debug("Refreshing route for job: {}", jobId);

        final ServiceInstance<Agent> serviceInstance = this.registeredAgentsMap.get(jobId);

        if (serviceInstance == null) {
            log.warn("Instance record not found for job {}", jobId);
            this.registerAgentConnection(jobId);
            return;
        }

        Set<Tag> tags = MetricsUtils.newSuccessTagsSet();
        final long start = System.nanoTime();
        try {
            this.serviceDiscovery.updateService(serviceInstance);
        } catch (KeeperException.NoNodeException e) {
            log.warn("Failed to update registration of agent executing job id: {}", jobId);
            // Failed because expected existing node is not present. Create it.
            this.registerAgentConnection(jobId);
        } catch (InterruptedException e) {
            // Ensure interrupt is not swallowed by the generic catch
            log.debug("Interrupted while refreshing {}", jobId);
            tags = MetricsUtils.newFailureTagsSetForException(e);
            throw e;
        } catch (Exception e) {
            // Include the exception so the cause of the failure is not lost
            log.error("Failed to refresh agent executing job id: {}", jobId, e);
            tags = MetricsUtils.newFailureTagsSetForException(e);
        } finally {
            this.registry.timer(
                AGENT_REFRESH_TIMER_NAME,
                tags
            ).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * Remove the route registered for the given job, if any.
     * Nothing is done when there is no local record of a registered instance. On success the record is dropped
     * from the registered agents map. Failures other than interruption are logged and not propagated, leaving
     * the local record in place. The attempt is always timed.
     *
     * @param jobId The id of the job whose agent is no longer connected to this node
     * @throws InterruptedException if the thread is interrupted while unregistering
     */
    private void unregisterAgentConnection(final String jobId) throws InterruptedException {
        final ServiceInstance<Agent> serviceInstance = this.registeredAgentsMap.get(jobId);

        if (serviceInstance == null) {
            log.debug("Skipping unregistration, already removed");
            return;
        }

        log.debug("Unregistering route for job: {}", jobId);

        Set<Tag> tags = MetricsUtils.newSuccessTagsSet();
        final long start = System.nanoTime();
        try {
            this.serviceDiscovery.unregisterService(serviceInstance);
            this.registeredAgentsMap.remove(jobId);
        } catch (InterruptedException e) {
            // Ensure interrupt is not swallowed by the generic catch
            log.debug("Interrupted while unregistering {}", jobId);
            tags = MetricsUtils.newFailureTagsSetForException(e);
            throw e;
        } catch (Exception e) {
            // Include the exception so the cause of the failure is not lost
            log.error("Failed to unregister agent executing job id: {}", jobId, e);
            tags = MetricsUtils.newFailureTagsSetForException(e);
        } finally {
            this.registry.timer(
                AGENT_UNREGISTERED_TIMER_NAME,
                tags
            ).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * React to a change of the Curator client connection state.
     * Every transition is counted (tagged with the new state) and logged. The registration thread is (re)started
     * when the client connects or reconnects, and stopped when the connection is lost or suspended.
     *
     * @param client   The curator client (not used)
     * @param newState The state the connection transitioned to
     */
    private void handleConnectionStateChange(final CuratorFramework client, final ConnectionState newState) {
        final Tag stateTag = Tag.of(ZK_CONNECTION_STATE_TAG_NAME, newState.name());
        this.registry.counter(ZOOKEEPER_SESSION_STATE_COUNTER_NAME, Sets.newHashSet(stateTag)).increment();

        log.info("Zookeeper/Curator client: {}", newState);

        if (newState == ConnectionState.CONNECTED || newState == ConnectionState.RECONNECTED) {
            this.startRegistrationThread();
        } else if (newState == ConnectionState.LOST || newState == ConnectionState.SUSPENDED) {
            this.stopRegistrationThread();
        } else {
            log.warn("Zookeeper/Curator unhandled connection state: {}", newState);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Agents connected to this node resolve to the local hostname without a lookup. Otherwise the service
     * discovery is queried for an instance whose id is the job id, and its address is returned. The result is empty
     * if no such instance exists or if the lookup fails (the error is logged, not propagated).
     */
    @Override
    public Optional<String> getHostnameForAgentConnection(@NotBlank final String jobId) {
        log.debug("Looking up agent executing job: {}", jobId);
        if (isAgentConnectionLocal(jobId)) {
            return Optional.of(localHostname);
        }

        final long start = System.nanoTime();
        final Set<Tag> tags = Sets.newHashSet();
        String address = null;

        try {
            final ServiceInstance<Agent> instance = serviceDiscovery.queryForInstance(SERVICE_NAME, jobId);
            if (instance == null) {
                log.debug("Could not find agent connection for job {}", jobId);
            } else {
                address = instance.getAddress();
            }
            MetricsUtils.addSuccessTags(tags);
        } catch (Exception e) {
            log.error("Error looking up agent connection for job {}", jobId, e);
            address = null;
            MetricsUtils.addFailureTagsWithException(tags, e);
        } finally {
            // Time the remote lookup, tagging whether a route was found
            tags.add(Tag.of(ROUTE_FOUND_TAG_NAME, String.valueOf(address != null)));
            this.registry.timer(
                AGENT_LOOKUP_TIMER_NAME,
                tags
            ).record(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        }

        return Optional.ofNullable(address);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean isAgentConnectionLocal(@NotBlank final String jobId) {
        // Local means the job id is among the agents connected to this very node
        final boolean connectedHere = this.connectedAgentsSet.contains(jobId);
        return connectedHere;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void handleClientConnected(@NotBlank final String jobId) {
        log.debug("Adding to routing table (pending registration): {}", jobId);

        // Count the connection only if this agent was not already known to be connected
        if (this.connectedAgentsSet.add(jobId)) {
            this.registry.counter(AGENT_CONNECTED_COUNTER_NAME).increment();
        }

        // Enqueued after the set was updated, so the registration thread sees the agent as connected
        this.registrationQueue.add(RegisterMutation.update(jobId));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void handleClientDisconnected(@NotBlank final String jobId) {
        log.debug("Removing from routing table (pending un-registration): {}", jobId);

        // Count the disconnection only if this agent was actually known to be connected
        if (this.connectedAgentsSet.remove(jobId)) {
            this.registry.counter(AGENT_DISCONNECTED_COUNTER_NAME).increment();
        }

        // Enqueued after the set was updated, so the registration thread sees the agent as disconnected
        this.registrationQueue.add(RegisterMutation.update(jobId));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean isAgentConnected(final String jobId) {
        // An agent is connected (to any node) if a route to it can be resolved
        final boolean routeExists = this.getHostnameForAgentConnection(jobId).isPresent();
        return routeExists;
    }

    /**
     * Payload for typed {@link ServiceDiscovery}.
     * One instance, carrying the job id, is attached to each service instance this class registers for a
     * connected agent.
     */
    @Getter
    @EqualsAndHashCode
    public static final class Agent {
        // This field is superfluous, but added because serialization fails on empty beans
        private final String jobId;

        // Jackson deserialization requires this dummy constructor
        // It leaves the job id null.
        private Agent() {
            this.jobId = null;
        }

        /**
         * Constructor.
         *
         * @param jobId The job id
         */
        public Agent(@JsonProperty(value = "jobId", required = true) final String jobId) {
            this.jobId = jobId;
        }
    }

    /**
     * An item of the registration queue: a request to bring the registration of the agent running the given job
     * up to date. Whether that means registering, refreshing or unregistering is decided when the item is consumed.
     * The natural ordering places non-refresh mutations before refresh ones and, within each kind, older mutations
     * before newer ones.
     */
    @Getter
    @EqualsAndHashCode
    private static final class RegisterMutation implements Comparable<RegisterMutation> {
        private final String jobId;
        // True for a periodic refresh of an existing registration, false for a connect/disconnect update
        private final boolean refresh;
        // Creation time as returned by System.nanoTime(): only meaningful as a difference with another timestamp
        private final long timestamp;

        private RegisterMutation(final String jobId, final boolean refresh) {
            this.jobId = jobId;
            this.refresh = refresh;
            this.timestamp = System.nanoTime();
        }

        /**
         * Create a mutation requesting the periodic refresh of an existing registration.
         *
         * @param jobId The job id
         * @return a refresh mutation
         */
        static RegisterMutation refresh(final String jobId) {
            return new RegisterMutation(jobId, true);
        }

        /**
         * Create a mutation requesting that a registration be updated after an agent connected or disconnected.
         *
         * @param jobId The job id
         * @return a non-refresh mutation
         */
        static RegisterMutation update(final String jobId) {
            return new RegisterMutation(jobId, false);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public int compareTo(final RegisterMutation other) {
            if (this.refresh != other.refresh) {
                // Updates take precedence over refreshes
                return this.refresh ? 1 : -1;
            }

            // Same kind: oldest first. Nano-time values must be compared through their difference.
            final long timestampDifference = this.timestamp - other.timestamp;
            if (timestampDifference == 0) {
                return this.jobId.compareTo(other.jobId);
            }
            // The sign of the difference (not of this mutation's own timestamp) decides the order
            return timestampDifference > 0 ? 1 : -1;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy