org.apache.flink.runtime.executiongraph.failover.RestartPipelinedRegionFailoverStrategy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.io.network.partition.PartitionException;
import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup;
import org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
import org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex;
import org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion;
import org.apache.flink.runtime.scheduler.strategy.SchedulingResultPartition;
import org.apache.flink.runtime.scheduler.strategy.SchedulingTopology;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.IterableUtils;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.function.Function;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A failover strategy that proposes to restart involved regions when a vertex fails. A region is
 * defined by this strategy as tasks that communicate via pipelined data exchange.
 */
public class RestartPipelinedRegionFailoverStrategy implements FailoverStrategy {

    /** The topology containing info about all the vertices and result partitions. */
    private final SchedulingTopology topology;

    /** The checker helps to query result partition availability. */
    private final RegionFailoverResultPartitionAvailabilityChecker
            resultPartitionAvailabilityChecker;

    /**
     * Creates a new failover strategy to restart pipelined regions that works on the given
     * topology. The result partitions are always considered to be available if no data consumption
     * error happens.
     *
     * @param topology containing info about all the vertices and result partitions
     */
    @VisibleForTesting
    public RestartPipelinedRegionFailoverStrategy(SchedulingTopology topology) {
        this(topology, resultPartitionID -> true);
    }

    /**
     * Creates a new failover strategy to restart pipelined regions that works on the given
     * topology.
     *
     * @param topology containing info about all the vertices and result partitions
     * @param resultPartitionAvailabilityChecker helps to query result partition availability
     */
    public RestartPipelinedRegionFailoverStrategy(
            SchedulingTopology topology,
            ResultPartitionAvailabilityChecker resultPartitionAvailabilityChecker) {

        this.topology = checkNotNull(topology);
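        // wrap the given checker: partitions that caused a PartitionException, as well as
        // partitions whose type is neither re-consumable nor PIPELINED_APPROXIMATE, are
        // treated as unavailable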
        this.resultPartitionAvailabilityChecker =
                new RegionFailoverResultPartitionAvailabilityChecker(
                        resultPartitionAvailabilityChecker,
                        (intermediateResultPartitionID ->
                                topology.getResultPartition(intermediateResultPartitionID)
                                        .getResultType()));
    }

    // ------------------------------------------------------------------------
    //  task failure handling
    // ------------------------------------------------------------------------

    /**
     * Returns a set of IDs corresponding to the set of vertices that should be restarted. In this
     * strategy, all task vertices in 'involved' regions are proposed to be restarted. The
     * 'involved' regions are calculated with the rules below:
     *
     * <ol>
     *   <li>The region containing the failed task is always involved.
     *   <li>If an input result partition of an involved region is not available, i.e. missing or
     *       corrupted, the region containing the partition producer task is involved.
     *   <li>If a region is involved, all of its consumer regions are involved.
     * </ol>
     *
     * @param executionVertexId ID of the failed task
     * @param cause cause of the failure
     * @return set of IDs of vertices to restart
     */
    @Override
    public Set<ExecutionVertexID> getTasksNeedingRestart(
            ExecutionVertexID executionVertexId, Throwable cause) {

        final SchedulingPipelinedRegion failedRegion =
                topology.getPipelinedRegionOfVertex(executionVertexId);
        if (failedRegion == null) {
            // TODO: show the task name in the log
            throw new IllegalStateException(
                    "Cannot find the failover region for task " + executionVertexId, cause);
        }

        // if the failure cause is a data consumption error, mark the corresponding result
        // partition as failed so that the failover process will try to recover it
        Optional<PartitionException> dataConsumptionException =
                ExceptionUtils.findThrowable(cause, PartitionException.class);
        if (dataConsumptionException.isPresent()) {
            resultPartitionAvailabilityChecker.markResultPartitionFailed(
                    dataConsumptionException.get().getPartitionId().getPartitionId());
        }

        // calculate the tasks to restart based on the regions to restart
        Set<ExecutionVertexID> tasksToRestart = new HashSet<>();
        for (SchedulingPipelinedRegion region : getRegionsToRestart(failedRegion)) {
            for (SchedulingExecutionVertex vertex : region.getVertices()) {
                // we do not need to restart tasks which are already in the initial state
                if (vertex.getState() != ExecutionState.CREATED) {
                    tasksToRestart.add(vertex.getId());
                }
            }
        }

        // the previously failed partition will be recovered by the restart; remove its failed
        // state from the checker
        if (dataConsumptionException.isPresent()) {
            resultPartitionAvailabilityChecker.removeResultPartitionFromFailedState(
                    dataConsumptionException.get().getPartitionId().getPartitionId());
        }

        return tasksToRestart;
    }

    /**
     * All 'involved' regions are proposed to be restarted. The 'involved' regions are calculated
     * with the rules below:
     *
     * <ol>
     *   <li>The region containing the failed task is always involved.
     *   <li>If an input result partition of an involved region is not available, i.e. missing or
     *       corrupted, the region containing the partition producer task is involved.
     *   <li>If a region is involved, all of its consumer regions are involved.
     * </ol>
     */
    private Set<SchedulingPipelinedRegion> getRegionsToRestart(
            SchedulingPipelinedRegion failedRegion) {
        // distinct region objects represent distinct regions, so identity-based sets suffice
        Set<SchedulingPipelinedRegion> regionsToRestart =
                Collections.newSetFromMap(new IdentityHashMap<>());
        Set<SchedulingPipelinedRegion> visitedRegions =
                Collections.newSetFromMap(new IdentityHashMap<>());

        Set<ConsumedPartitionGroup> visitedConsumedResultGroups =
                Collections.newSetFromMap(new IdentityHashMap<>());
        Set<ConsumerVertexGroup> visitedConsumerVertexGroups =
                Collections.newSetFromMap(new IdentityHashMap<>());

        // start from the failed region to visit all involved regions
        Queue<SchedulingPipelinedRegion> regionsToVisit = new ArrayDeque<>();
        visitedRegions.add(failedRegion);
        regionsToVisit.add(failedRegion);
        while (!regionsToVisit.isEmpty()) {
            SchedulingPipelinedRegion regionToRestart = regionsToVisit.poll();

            // an involved region should be restarted
            regionsToRestart.add(regionToRestart);

            // if a needed input result partition is not available, its producer region is involved
            for (IntermediateResultPartitionID consumedPartitionId :
                    getConsumedPartitionsToVisit(regionToRestart, visitedConsumedResultGroups)) {
                if (!resultPartitionAvailabilityChecker.isAvailable(consumedPartitionId)) {
                    SchedulingResultPartition consumedPartition =
                            topology.getResultPartition(consumedPartitionId);
                    SchedulingPipelinedRegion producerRegion =
                            topology.getPipelinedRegionOfVertex(
                                    consumedPartition.getProducer().getId());
                    if (!visitedRegions.contains(producerRegion)) {
                        visitedRegions.add(producerRegion);
                        regionsToVisit.add(producerRegion);
                    }
                }
            }

            // all consumer regions of an involved region should be involved
            for (ExecutionVertexID consumerVertexId :
                    getConsumerVerticesToVisit(regionToRestart, visitedConsumerVertexGroups)) {
                SchedulingPipelinedRegion consumerRegion =
                        topology.getPipelinedRegionOfVertex(consumerVertexId);
                if (!visitedRegions.contains(consumerRegion)) {
                    visitedRegions.add(consumerRegion);
                    regionsToVisit.add(consumerRegion);
                }
            }
        }

        return regionsToRestart;
    }

    private Iterable<IntermediateResultPartitionID> getConsumedPartitionsToVisit(
            SchedulingPipelinedRegion regionToRestart,
            Set<ConsumedPartitionGroup> visitedConsumedResultGroups) {

        final List<ConsumedPartitionGroup> consumedPartitionGroupsToVisit = new ArrayList<>();

        for (SchedulingExecutionVertex vertex : regionToRestart.getVertices()) {
            for (ConsumedPartitionGroup consumedPartitionGroup :
                    vertex.getConsumedPartitionGroups()) {
                if (!visitedConsumedResultGroups.contains(consumedPartitionGroup)) {
                    visitedConsumedResultGroups.add(consumedPartitionGroup);
                    consumedPartitionGroupsToVisit.add(consumedPartitionGroup);
                }
            }
        }

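        // flatten the de-duplicated groups into one iterable over the individual partition IDs
        // (each ConsumedPartitionGroup is itself an iterable of IntermediateResultPartitionIDs)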
        return IterableUtils.flatMap(consumedPartitionGroupsToVisit, Function.identity());
    }

    private Iterable<ExecutionVertexID> getConsumerVerticesToVisit(
            SchedulingPipelinedRegion regionToRestart,
            Set<ConsumerVertexGroup> visitedConsumerVertexGroups) {
        final List<ConsumerVertexGroup> consumerVertexGroupsToVisit = new ArrayList<>();

        for (SchedulingExecutionVertex vertex : regionToRestart.getVertices()) {
            for (SchedulingResultPartition producedPartition : vertex.getProducedResults()) {
                for (ConsumerVertexGroup consumerVertexGroup :
                        producedPartition.getConsumerVertexGroups()) {
                    if (!visitedConsumerVertexGroups.contains(consumerVertexGroup)) {
                        visitedConsumerVertexGroups.add(consumerVertexGroup);
                        consumerVertexGroupsToVisit.add(consumerVertexGroup);
                    }
                }
            }
        }

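        // flatten the de-duplicated groups into one iterable over the individual consumer
        // vertex IDs (each ConsumerVertexGroup is itself an iterable of ExecutionVertexIDs)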
        return IterableUtils.flatMap(consumerVertexGroupsToVisit, Function.identity());
    }

    // ------------------------------------------------------------------------
    //  testing
    // ------------------------------------------------------------------------

    /**
     * Returns the failover region that contains the given execution vertex.
     *
     * @return the failover region that contains the given execution vertex
     */
    @VisibleForTesting
    public SchedulingPipelinedRegion getFailoverRegion(ExecutionVertexID vertexID) {
        return topology.getPipelinedRegionOfVertex(vertexID);
    }

    /**
     * A stateful {@link ResultPartitionAvailabilityChecker} which additionally tracks partitions
     * that have failed and are therefore not available.
     */
    private static class RegionFailoverResultPartitionAvailabilityChecker
            implements ResultPartitionAvailabilityChecker {

        /** Result partition state checker from the shuffle master. */
        private final ResultPartitionAvailabilityChecker resultPartitionAvailabilityChecker;

        /** Records partitions which have caused a {@link PartitionException}. */
        private final HashSet<IntermediateResultPartitionID> failedPartitions;

        /** Retrieves the {@link ResultPartitionType} of an {@link IntermediateResultPartitionID}. */
        private final Function<IntermediateResultPartitionID, ResultPartitionType>
                resultPartitionTypeRetriever;

        RegionFailoverResultPartitionAvailabilityChecker(
                ResultPartitionAvailabilityChecker checker,
                Function<IntermediateResultPartitionID, ResultPartitionType>
                        resultPartitionTypeRetriever) {
            this.resultPartitionAvailabilityChecker = checkNotNull(checker);
            this.failedPartitions = new HashSet<>();
            this.resultPartitionTypeRetriever = checkNotNull(resultPartitionTypeRetriever);
        }

        @Override
        public boolean isAvailable(IntermediateResultPartitionID resultPartitionID) {
            return !failedPartitions.contains(resultPartitionID)
                    && resultPartitionAvailabilityChecker.isAvailable(resultPartitionID)
                    // a partition that the tracker reports as available and that has not been
                    // marked failed is only treated as available if it can be consumed again,
                    // i.e. it is re-consumable or of the PIPELINED_APPROXIMATE type
                    && isReconsumableOrPipelinedApproximate(resultPartitionID);
        }

        public void markResultPartitionFailed(IntermediateResultPartitionID resultPartitionID) {
            failedPartitions.add(resultPartitionID);
        }

        public void removeResultPartitionFromFailedState(
                IntermediateResultPartitionID resultPartitionID) {
            failedPartitions.remove(resultPartitionID);
        }

        private boolean isReconsumableOrPipelinedApproximate(
                IntermediateResultPartitionID resultPartitionID) {
            ResultPartitionType resultPartitionType =
                    resultPartitionTypeRetriever.apply(resultPartitionID);
            return resultPartitionType.isReconsumable()
                    || resultPartitionType == ResultPartitionType.PIPELINED_APPROXIMATE;
        }
    }

    /** The factory to instantiate {@link RestartPipelinedRegionFailoverStrategy}. */
    public static class Factory implements FailoverStrategy.Factory {

        @Override
        public FailoverStrategy create(
                final SchedulingTopology topology,
                final ResultPartitionAvailabilityChecker resultPartitionAvailabilityChecker) {

            return new RestartPipelinedRegionFailoverStrategy(
                    topology, resultPartitionAvailabilityChecker);
        }
    }
}
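
A minimal usage sketch (not part of the file above), assuming a pre-built SchedulingTopology
named `schedulingTopology` and a failure reported as `failedVertexId` / `failureCause`. Inside
Flink this wiring is performed by the scheduler itself, so the snippet is illustrative only.

    // build the strategy via its factory; the lambda is a trivial availability
    // checker that treats every tracked partition as available
    FailoverStrategy strategy =
            new RestartPipelinedRegionFailoverStrategy.Factory()
                    .create(schedulingTopology, partitionId -> true);

    // on a task failure, ask the strategy which execution vertices must be restarted
    Set<ExecutionVertexID> verticesToRestart =
            strategy.getTasksNeedingRestart(failedVertexId, failureCause);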



