All downloads are free. The search and download functionality uses the official Maven repository.

com.netflix.genie.web.tasks.leader.ArchiveStatusCleanupTask Maven / Gradle / Ivy

There is a newer version: 4.3.20
Show newest version
/*
 *
 *  Copyright 2020 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.tasks.leader;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.netflix.genie.common.internal.dtos.ArchiveStatus;
import com.netflix.genie.common.internal.dtos.JobStatus;
import com.netflix.genie.web.agent.services.AgentRoutingService;
import com.netflix.genie.web.data.services.DataServices;
import com.netflix.genie.web.data.services.PersistenceService;
import com.netflix.genie.web.exceptions.checked.NotFoundException;
import com.netflix.genie.web.properties.ArchiveStatusCleanupProperties;
import com.netflix.genie.web.tasks.GenieTaskScheduleType;
import com.netflix.genie.web.util.MetricsUtils;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import lombok.extern.slf4j.Slf4j;

import java.time.Instant;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/**
 * Leader task that finds jobs whose archival status was left in 'PENDING' state.
 * This can for example happen if the agent fails to update the server after successfully archiving.
 * The logic is summarized as:
 * If a job finished running more than N minutes ago (the configured grace period), and the agent is disconnected
 * and the archive status is PENDING, then set the archive status to UNKNOWN.
 *
 * @author mprimi
 * @since 4.0.0
 */
@Slf4j
public class ArchiveStatusCleanupTask extends LeaderTask {

    private static final String CLEAR_ARCHIVE_STATUS_COUNTER_NAME = "genie.jobs.archiveStatus.cleanup.counter";
    private static final String CLEAR_ARCHIVE_STATUS_TIMER_NAME = "genie.tasks.archiveStatusCleanup.timer";
    // The only archive status this task looks for; built once since it never changes.
    private static final Set<ArchiveStatus> PENDING_STATUS_SET = ImmutableSet.of(ArchiveStatus.PENDING);
    private final PersistenceService persistenceService;
    private final AgentRoutingService agentRoutingService;
    private final ArchiveStatusCleanupProperties properties;
    private final MeterRegistry registry;

    /**
     * Constructor.
     *
     * @param dataServices        data services, from which the persistence service is obtained
     * @param agentRoutingService agent routing service, used to check whether a job's agent is still connected
     * @param properties          task properties (grace period and check interval)
     * @param registry            metrics registry used to publish the task timer and per-job counter
     */
    public ArchiveStatusCleanupTask(
        final DataServices dataServices,
        final AgentRoutingService agentRoutingService,
        final ArchiveStatusCleanupProperties properties,
        final MeterRegistry registry
    ) {
        this.persistenceService = dataServices.getPersistenceService();
        this.agentRoutingService = agentRoutingService;
        this.properties = properties;
        this.registry = registry;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Queries for finished jobs whose archive status is still PENDING and was last updated before
     * {@code now - gracePeriod}, then attempts to clear the archive status of each of them.
     * Any exception is logged and recorded in the timer tags rather than propagated, and the duration of the
     * run is always recorded under {@value #CLEAR_ARCHIVE_STATUS_TIMER_NAME}.
     */
    @Override
    public void run() {
        final Set<Tag> tags = Sets.newHashSet();
        final long start = System.nanoTime();
        try {
            // Jobs updated more recently than this are left alone to give the agent time to report back.
            final Instant updatedThreshold = Instant.now().minus(this.properties.getGracePeriod());
            final Set<String> jobIds = this.persistenceService.getJobsWithStatusAndArchiveStatusUpdatedBefore(
                JobStatus.getFinishedStatuses(),
                PENDING_STATUS_SET,
                updatedThreshold
            );
            if (!jobIds.isEmpty()) {
                log.debug("Found {} finished jobs with PENDING archive status", jobIds.size());
                this.clearJobsArchiveStatus(jobIds);
            }
            MetricsUtils.addSuccessTags(tags);
        } catch (Exception e) {
            // Broad catch is deliberate: this is the task boundary, failures are reported via log and metrics.
            MetricsUtils.addFailureTagsWithException(tags, e);
            log.error("Archive status cleanup task failed with exception: {}", e.getMessage(), e);
        } finally {
            final long taskDuration = System.nanoTime() - start;
            this.registry.timer(CLEAR_ARCHIVE_STATUS_TIMER_NAME, tags).record(taskDuration, TimeUnit.NANOSECONDS);
        }
    }

    /**
     * For each of the given jobs, set the archive status to UNKNOWN unless the job's agent is still connected.
     * One increment of {@value #CLEAR_ARCHIVE_STATUS_COUNTER_NAME} is recorded per attempted update, tagged with
     * the outcome. A job that no longer exists is logged and counted as a failure; processing then continues
     * with the next job.
     *
     * @param jobIds the ids of the candidate jobs
     */
    private void clearJobsArchiveStatus(final Set<String> jobIds) {

        for (final String jobId : jobIds) {
            if (this.agentRoutingService.isAgentConnected(jobId)) {
                // A connected agent may still be archiving, so its status is not touched.
                log.debug("Agent for job {} is still connected and probably archiving", jobId);
            } else {
                log.warn("Marking job {} archive status to UNKNOWN", jobId);
                // Fresh tag set per job so each counter increment carries only its own outcome.
                final Set<Tag> tags = Sets.newHashSet();

                try {
                    this.persistenceService.updateJobArchiveStatus(jobId, ArchiveStatus.UNKNOWN);
                    MetricsUtils.addSuccessTags(tags);
                } catch (NotFoundException e) {
                    log.error("Tried to update a job that does not exist: {}", jobId);
                    MetricsUtils.addFailureTagsWithException(tags, e);
                } finally {
                    this.registry.counter(CLEAR_ARCHIVE_STATUS_COUNTER_NAME, tags).increment();
                }
            }
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * This task always reports {@link GenieTaskScheduleType#FIXED_RATE}.
     */
    @Override
    public GenieTaskScheduleType getScheduleType() {
        return GenieTaskScheduleType.FIXED_RATE;
    }

    /**
     * {@inheritDoc}
     * <p>
     * The rate is the configured check interval, converted to milliseconds.
     */
    @Override
    public long getFixedRate() {
        return this.properties.getCheckInterval().toMillis();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy