// org.apache.flink.runtime.scheduler.adapter.DefaultExecutionTopology

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.scheduler.adapter;

import org.apache.flink.runtime.executiongraph.DefaultExecutionGraph;
import org.apache.flink.runtime.executiongraph.EdgeManager;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.IntermediateResultPartition;
import org.apache.flink.runtime.executiongraph.failover.flip1.SchedulingPipelinedRegionComputeUtil;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.IntermediateDataSet;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.JobEdge;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.topology.DefaultLogicalPipelinedRegion;
import org.apache.flink.runtime.jobgraph.topology.DefaultLogicalTopology;
import org.apache.flink.runtime.jobgraph.topology.LogicalEdge;
import org.apache.flink.runtime.jobgraph.topology.LogicalVertex;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.scheduler.SchedulingTopologyListener;
import org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup;
import org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
import org.apache.flink.runtime.scheduler.strategy.ResultPartitionState;
import org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex;
import org.apache.flink.runtime.scheduler.strategy.SchedulingTopology;
import org.apache.flink.util.IterableUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/** Adapter of {@link ExecutionGraph} to {@link SchedulingTopology}. */
public class DefaultExecutionTopology implements SchedulingTopology {

    private static final Logger LOG = LoggerFactory.getLogger(DefaultExecutionTopology.class);

    private final Map executionVerticesById;

    private final List executionVerticesList;

    private final Map resultPartitionsById;

    private final Map pipelinedRegionsByVertex;

    private final List pipelinedRegions;

    private final EdgeManager edgeManager;

    private final Supplier> sortedExecutionVertexIds;

    private final Map
            logicalPipelinedRegionsByJobVertexId;

    /** Listeners that will be notified whenever the scheduling topology is updated. */
    private final List schedulingTopologyListeners = new ArrayList<>();

    private DefaultExecutionTopology(
            Supplier> sortedExecutionVertexIds,
            EdgeManager edgeManager,
            Map logicalPipelinedRegionsByJobVertexId) {
        this.sortedExecutionVertexIds = checkNotNull(sortedExecutionVertexIds);
        this.edgeManager = checkNotNull(edgeManager);
        this.logicalPipelinedRegionsByJobVertexId =
                checkNotNull(logicalPipelinedRegionsByJobVertexId);

        this.executionVerticesById = new HashMap<>();
        this.executionVerticesList = new ArrayList<>();
        this.resultPartitionsById = new HashMap<>();
        this.pipelinedRegionsByVertex = new HashMap<>();
        this.pipelinedRegions = new ArrayList<>();
    }

    @Override
    public Iterable getVertices() {
        return Collections.unmodifiableList(executionVerticesList);
    }

    @Override
    public DefaultExecutionVertex getVertex(final ExecutionVertexID executionVertexId) {
        final DefaultExecutionVertex executionVertex = executionVerticesById.get(executionVertexId);
        if (executionVertex == null) {
            throw new IllegalArgumentException("can not find vertex: " + executionVertexId);
        }
        return executionVertex;
    }

    @Override
    public DefaultResultPartition getResultPartition(
            final IntermediateResultPartitionID intermediateResultPartitionId) {
        final DefaultResultPartition resultPartition =
                resultPartitionsById.get(intermediateResultPartitionId);
        if (resultPartition == null) {
            throw new IllegalArgumentException(
                    "can not find partition: " + intermediateResultPartitionId);
        }
        return resultPartition;
    }

    @Override
    public void registerSchedulingTopologyListener(SchedulingTopologyListener listener) {
        checkNotNull(listener);
        schedulingTopologyListeners.add(listener);
    }

    @Override
    public Iterable getAllPipelinedRegions() {
        checkNotNull(pipelinedRegions);

        return Collections.unmodifiableCollection(pipelinedRegions);
    }

    @Override
    public DefaultSchedulingPipelinedRegion getPipelinedRegionOfVertex(
            final ExecutionVertexID vertexId) {
        checkNotNull(pipelinedRegionsByVertex);

        final DefaultSchedulingPipelinedRegion pipelinedRegion =
                pipelinedRegionsByVertex.get(vertexId);
        if (pipelinedRegion == null) {
            throw new IllegalArgumentException("Unknown execution vertex " + vertexId);
        }
        return pipelinedRegion;
    }

    public EdgeManager getEdgeManager() {
        return edgeManager;
    }

    private static Map
            computeLogicalPipelinedRegionsByJobVertexId(final ExecutionGraph executionGraph) {
        List topologicallySortedJobVertices =
                IterableUtils.toStream(executionGraph.getVerticesTopologically())
                        .map(ExecutionJobVertex::getJobVertex)
                        .collect(Collectors.toList());

        Iterable logicalPipelinedRegions =
                DefaultLogicalTopology.fromTopologicallySortedJobVertices(
                                topologicallySortedJobVertices)
                        .getAllPipelinedRegions();

        Map logicalPipelinedRegionsByJobVertexId =
                new HashMap<>();
        for (DefaultLogicalPipelinedRegion logicalPipelinedRegion : logicalPipelinedRegions) {
            for (LogicalVertex vertex : logicalPipelinedRegion.getVertices()) {
                logicalPipelinedRegionsByJobVertexId.put(vertex.getId(), logicalPipelinedRegion);
            }
        }

        return logicalPipelinedRegionsByJobVertexId;
    }

    public void notifyExecutionGraphUpdated(
            final DefaultExecutionGraph executionGraph,
            final List newlyInitializedJobVertices) {

        checkNotNull(executionGraph, "execution graph can not be null");

        final Set newJobVertexIds =
                newlyInitializedJobVertices.stream()
                        .map(ExecutionJobVertex::getJobVertexId)
                        .collect(Collectors.toSet());

        // any PIPELINED input should be from within this new set so that existing pipelined regions
        // will not change
        newlyInitializedJobVertices.stream()
                .map(ExecutionJobVertex::getJobVertex)
                .flatMap(v -> v.getInputs().stream())
                .map(JobEdge::getSource)
                .filter(r -> r.getResultType().isPipelined())
                .map(IntermediateDataSet::getProducer)
                .map(JobVertex::getID)
                .forEach(id -> checkState(newJobVertexIds.contains(id)));

        final Iterable newExecutionVertices =
                newlyInitializedJobVertices.stream()
                        .flatMap(jobVertex -> Stream.of(jobVertex.getTaskVertices()))
                        .collect(Collectors.toList());

        generateNewExecutionVerticesAndResultPartitions(newExecutionVertices);

        generateNewPipelinedRegions(newExecutionVertices);

        ensureCoLocatedVerticesInSameRegion(pipelinedRegions, executionGraph);

        notifySchedulingTopologyUpdated(newExecutionVertices);
    }

    private void notifySchedulingTopologyUpdated(Iterable newExecutionVertices) {
        List newVertexIds =
                IterableUtils.toStream(newExecutionVertices)
                        .map(ExecutionVertex::getID)
                        .collect(Collectors.toList());
        for (SchedulingTopologyListener listener : schedulingTopologyListeners) {
            listener.notifySchedulingTopologyUpdated(this, newVertexIds);
        }
    }

    public static DefaultExecutionTopology fromExecutionGraph(
            DefaultExecutionGraph executionGraph) {
        checkNotNull(executionGraph, "execution graph can not be null");

        EdgeManager edgeManager = executionGraph.getEdgeManager();

        DefaultExecutionTopology schedulingTopology =
                new DefaultExecutionTopology(
                        () ->
                                IterableUtils.toStream(executionGraph.getAllExecutionVertices())
                                        .map(ExecutionVertex::getID)
                                        .collect(Collectors.toList()),
                        edgeManager,
                        computeLogicalPipelinedRegionsByJobVertexId(executionGraph));

        schedulingTopology.notifyExecutionGraphUpdated(
                executionGraph,
                IterableUtils.toStream(executionGraph.getVerticesTopologically())
                        .filter(ExecutionJobVertex::isInitialized)
                        .collect(Collectors.toList()));

        return schedulingTopology;
    }

    private void generateNewExecutionVerticesAndResultPartitions(
            Iterable newExecutionVertices) {
        for (ExecutionVertex vertex : newExecutionVertices) {
            List producedPartitions =
                    generateProducedSchedulingResultPartition(
                            vertex.getProducedPartitions(),
                            edgeManager::getConsumerVertexGroupForPartition);

            producedPartitions.forEach(
                    partition -> resultPartitionsById.put(partition.getId(), partition));

            DefaultExecutionVertex schedulingVertex =
                    generateSchedulingExecutionVertex(
                            vertex,
                            producedPartitions,
                            edgeManager.getConsumedPartitionGroupsForVertex(vertex.getID()),
                            resultPartitionsById::get);
            executionVerticesById.put(schedulingVertex.getId(), schedulingVertex);
        }

        executionVerticesList.clear();
        for (ExecutionVertexID vertexID : sortedExecutionVertexIds.get()) {
            executionVerticesList.add(executionVerticesById.get(vertexID));
        }
    }

    private static List generateProducedSchedulingResultPartition(
            Map
                    producedIntermediatePartitions,
            Function
                    partitionConsumerVertexGroupRetriever) {

        List producedSchedulingPartitions =
                new ArrayList<>(producedIntermediatePartitions.size());

        producedIntermediatePartitions
                .values()
                .forEach(
                        irp ->
                                producedSchedulingPartitions.add(
                                        new DefaultResultPartition(
                                                irp.getPartitionId(),
                                                irp.getIntermediateResult().getId(),
                                                irp.getResultType(),
                                                () ->
                                                        irp.isConsumable()
                                                                ? ResultPartitionState.CONSUMABLE
                                                                : ResultPartitionState.CREATED,
                                                () ->
                                                        partitionConsumerVertexGroupRetriever.apply(
                                                                irp.getPartitionId()),
                                                irp::getConsumedPartitionGroups)));

        return producedSchedulingPartitions;
    }

    private static DefaultExecutionVertex generateSchedulingExecutionVertex(
            ExecutionVertex vertex,
            List producedPartitions,
            List consumedPartitionGroups,
            Function
                    resultPartitionRetriever) {

        DefaultExecutionVertex schedulingVertex =
                new DefaultExecutionVertex(
                        vertex.getID(),
                        producedPartitions,
                        vertex::getExecutionState,
                        consumedPartitionGroups,
                        resultPartitionRetriever);

        producedPartitions.forEach(partition -> partition.setProducer(schedulingVertex));

        return schedulingVertex;
    }

    private void generateNewPipelinedRegions(Iterable newExecutionVertices) {

        final Iterable newSchedulingExecutionVertices =
                IterableUtils.toStream(newExecutionVertices)
                        .map(ExecutionVertex::getID)
                        .map(executionVerticesById::get)
                        .collect(Collectors.toList());

        Map>
                sortedExecutionVerticesInPipelinedRegion = new IdentityHashMap<>();

        for (DefaultExecutionVertex schedulingVertex : newSchedulingExecutionVertices) {
            sortedExecutionVerticesInPipelinedRegion
                    .computeIfAbsent(
                            logicalPipelinedRegionsByJobVertexId.get(
                                    schedulingVertex.getId().getJobVertexId()),
                            ignore -> new ArrayList<>())
                    .add(schedulingVertex);
        }

        long buildRegionsStartTime = System.nanoTime();

        Set> rawPipelinedRegions =
                Collections.newSetFromMap(new IdentityHashMap<>());

        // A SchedulingPipelinedRegion can be derived from just one LogicalPipelinedRegion.
        // Thus, we can traverse all LogicalPipelinedRegions and convert them into
        // SchedulingPipelinedRegions one by one. The LogicalPipelinedRegions and
        // SchedulingPipelinedRegions are both connected with inter-region blocking edges.
        for (Map.Entry> entry :
                sortedExecutionVerticesInPipelinedRegion.entrySet()) {

            DefaultLogicalPipelinedRegion logicalPipelinedRegion = entry.getKey();
            List schedulingExecutionVertices = entry.getValue();

            if (containsIntraRegionAllToAllEdge(logicalPipelinedRegion)) {
                // For edges inside one LogicalPipelinedRegion, if there is any all-to-all edge, it
                // could be under two circumstances:
                //
                // 1. Pipelined all-to-all edge:
                //     Pipelined all-to-all edge will connect all vertices pipelined. Therefore,
                // all execution vertices derived from this LogicalPipelinedRegion should be in one
                // SchedulingPipelinedRegion.
                //
                // 2. Blocking all-to-all edge:
                //     For intra-region blocking all-to-all edge, we must make sure all the vertices
                // are inside one SchedulingPipelinedRegion, so that there will be no deadlock
                // happens during scheduling. For more details about this case, please refer to
                // FLINK-17330 (https://issues.apache.org/jira/browse/FLINK-17330).
                //
                // Therefore, if a LogicalPipelinedRegion contains any intra-region all-to-all
                // edge, we just convert the entire LogicalPipelinedRegion to a sole
                // SchedulingPipelinedRegion directly.
                rawPipelinedRegions.add(new HashSet<>(schedulingExecutionVertices));
            } else {
                // If there are only pointwise edges inside the LogicalPipelinedRegion, we can use
                // SchedulingPipelinedRegionComputeUtil to compute the regions with O(N) computation
                // complexity.
                rawPipelinedRegions.addAll(
                        SchedulingPipelinedRegionComputeUtil.computePipelinedRegions(
                                schedulingExecutionVertices,
                                executionVerticesById::get,
                                resultPartitionsById::get));
            }
        }

        for (Set rawPipelinedRegion : rawPipelinedRegions) {
            //noinspection unchecked
            final DefaultSchedulingPipelinedRegion pipelinedRegion =
                    new DefaultSchedulingPipelinedRegion(
                            (Set) rawPipelinedRegion,
                            resultPartitionsById::get);
            pipelinedRegions.add(pipelinedRegion);

            for (SchedulingExecutionVertex executionVertex : rawPipelinedRegion) {
                pipelinedRegionsByVertex.put(executionVertex.getId(), pipelinedRegion);
            }
        }

        long buildRegionsDuration = (System.nanoTime() - buildRegionsStartTime) / 1_000_000;
        LOG.info(
                "Built {} new pipelined regions in {} ms, total {} pipelined regions currently.",
                rawPipelinedRegions.size(),
                buildRegionsDuration,
                pipelinedRegions.size());
    }

    /**
     * Check if the {@link DefaultLogicalPipelinedRegion} contains intra-region all-to-all edges or
     * not.
     */
    private static boolean containsIntraRegionAllToAllEdge(
            DefaultLogicalPipelinedRegion logicalPipelinedRegion) {
        for (LogicalVertex vertex : logicalPipelinedRegion.getVertices()) {
            for (LogicalEdge inputEdge : vertex.getInputs()) {
                if (inputEdge.getDistributionPattern() == DistributionPattern.ALL_TO_ALL
                        && logicalPipelinedRegion.contains(inputEdge.getProducerVertexId())) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Co-location constraints are only used for iteration head and tail. A paired head and tail
     * needs to be in the same pipelined region so that they can be restarted together.
     */
    private static void ensureCoLocatedVerticesInSameRegion(
            List pipelinedRegions,
            ExecutionGraph executionGraph) {

        final Map constraintToRegion =
                new HashMap<>();
        for (DefaultSchedulingPipelinedRegion region : pipelinedRegions) {
            for (DefaultExecutionVertex vertex : region.getVertices()) {
                final CoLocationConstraint constraint =
                        getCoLocationConstraint(vertex.getId(), executionGraph);
                if (constraint != null) {
                    final DefaultSchedulingPipelinedRegion regionOfConstraint =
                            constraintToRegion.get(constraint);
                    checkState(
                            regionOfConstraint == null || regionOfConstraint == region,
                            "co-located tasks must be in the same pipelined region");
                    constraintToRegion.putIfAbsent(constraint, region);
                }
            }
        }
    }

    private static CoLocationConstraint getCoLocationConstraint(
            ExecutionVertexID executionVertexId, ExecutionGraph executionGraph) {

        CoLocationGroup coLocationGroup =
                Objects.requireNonNull(
                                executionGraph.getJobVertex(executionVertexId.getJobVertexId()))
                        .getCoLocationGroup();
        return coLocationGroup == null
                ? null
                : coLocationGroup.getLocationConstraint(executionVertexId.getSubtaskIndex());
    }
}