/*
 * Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.impl.execution.init;

import com.hazelcast.cluster.Address;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.internal.partition.IPartitionService;
import com.hazelcast.internal.util.collection.IntHashSet;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.config.EdgeConfig;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.Edge;
import com.hazelcast.jet.core.ProcessorMetaSupplier;
import com.hazelcast.jet.core.ProcessorSupplier;
import com.hazelcast.jet.core.Vertex;
import com.hazelcast.jet.function.RunnableEx;
import com.hazelcast.jet.impl.JetServiceBackend;
import com.hazelcast.jet.impl.JobClassLoaderService;
import com.hazelcast.jet.impl.execution.init.Contexts.MetaSupplierCtx;
import com.hazelcast.jet.impl.util.FixedCapacityIntArrayList;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.impl.NodeEngine;
import com.hazelcast.spi.impl.NodeEngineImpl;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.security.auth.Subject;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.function.ToIntFunction;

import static com.hazelcast.internal.util.ConcurrencyUtil.CALLER_RUNS;
import static com.hazelcast.internal.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.internal.util.Preconditions.checkNotNull;
import static com.hazelcast.jet.config.JobConfigArguments.KEY_REQUIRED_PARTITIONS;
import static com.hazelcast.jet.impl.util.ExceptionUtil.peel;
import static com.hazelcast.jet.impl.util.PrefixedLogger.prefix;
import static com.hazelcast.jet.impl.util.PrefixedLogger.prefixedLogger;
import static com.hazelcast.jet.impl.util.Util.checkSerializable;
import static com.hazelcast.jet.impl.util.Util.doWithClassLoader;
import static com.hazelcast.jet.impl.util.Util.range;
import static com.hazelcast.jet.impl.util.Util.toList;
import static com.hazelcast.spi.impl.executionservice.ExecutionService.JOB_OFFLOADABLE_EXECUTOR;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static java.util.concurrent.CompletableFuture.runAsync;
import static java.util.stream.Collectors.toMap;

public final class ExecutionPlanBuilder {

    private ExecutionPlanBuilder() {
    }

    @SuppressWarnings({"checkstyle:ParameterNumber", "rawtypes"})
    public static CompletableFuture<Map<MemberInfo, ExecutionPlan>> createExecutionPlans(
            NodeEngineImpl nodeEngine,
            List<MemberInfo> memberInfos,
            DAG dag,
            long jobId,
            long executionId,
            JobConfig jobConfig,
            long lastSnapshotId,
            boolean isLightJob,
            Subject subject
    ) {
        final Map<MemberInfo, int[]> partitionsByMember;
        final Set<Integer> requiredPartitions = jobConfig.getArgument(KEY_REQUIRED_PARTITIONS);
        final boolean isIsolatedJob = dag.memberSelector() != null;

        if (requiredPartitions != null) {
            PartitionPruningAnalysisResult analysisResult = analyzeDagForPartitionPruning(nodeEngine, dag);
            partitionsByMember = getPartitionAssignment(
                    nodeEngine, memberInfos,
                    analysisResult.allPartitionsRequired,
                    requiredPartitions,
                    analysisResult.constantPartitionIds,
                    analysisResult.requiredAddresses);
        } else if (isIsolatedJob) {
            // For isolated jobs we can't predict the number and types (data/lite) of the selected members,
            // nor the workload type. That's the reason to use a balanced partition assignment between data
            // and lite members; in general, a fair load distribution is good enough for any kind of workload.
            // More info in the corresponding TDD and the method's Javadoc.
            partitionsByMember = getFairPartitionAssignment(nodeEngine, memberInfos);
        } else {
            partitionsByMember = getPartitionAssignment(
                    nodeEngine, memberInfos,
                    false,
                    null,
                    null,
                    null);
        }
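        // Illustrative sketch (not part of the original file): a caller opting into partition pruning
        // attaches the required partition ids to the job config before submission, which is what the
        // getArgument(KEY_REQUIRED_PARTITIONS) call above reads back. Assuming JobConfig#setArgument
        // as the setter counterpart of getArgument:
        //
        //     JobConfig config = new JobConfig();
        //     config.setArgument(KEY_REQUIRED_PARTITIONS, Set.of(12, 42));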

        final Map<Address, int[]> partitionsByAddress = partitionsByMember
                .entrySet()
                .stream()
                .collect(toMap(en -> en.getKey().getAddress(), Entry::getValue));
        final int memberCount = partitionsByAddress.size();
        final boolean isJobDistributed = memberCount > 1;

        final VerticesIdAndOrder verticesIdAndOrder = VerticesIdAndOrder.assignVertexIds(dag);
        final int defaultParallelism = nodeEngine.getConfig().getJetConfig().getCooperativeThreadCount();
        final EdgeConfig defaultEdgeConfig = nodeEngine.getConfig().getJetConfig().getDefaultEdgeConfig();

        final Map<MemberInfo, ExecutionPlan> plans = new HashMap<>();
        int memberIndex = 0;
        for (MemberInfo member : partitionsByMember.keySet()) {
            plans.put(member, new ExecutionPlan(partitionsByAddress, jobConfig, lastSnapshotId, memberIndex++,
                    memberCount, isLightJob, subject, verticesIdAndOrder.count()));
        }

        final List<Address> addresses = toList(partitionsByMember.keySet(), MemberInfo::getAddress);
        ExecutorService initOffloadExecutor = nodeEngine.getExecutionService().getExecutor(JOB_OFFLOADABLE_EXECUTOR);
        CompletableFuture[] futures = new CompletableFuture[verticesIdAndOrder.count()];
        for (VertexIdPos entry : verticesIdAndOrder) {
            final Vertex vertex = dag.getVertex(entry.vertexName);
            assert vertex != null;
            final ProcessorMetaSupplier metaSupplier = vertex.getMetaSupplier();
            final int vertexId = entry.vertexId;
            // The local parallelism determination here is effective only
            // in jobs submitted as a DAG. In jobs submitted as a pipeline
            // we already do this determination while converting the
            // pipeline to a DAG, and no vertex is left with LP=-1.
            final int localParallelism = vertex.determineLocalParallelism(defaultParallelism);
            final int totalParallelism = localParallelism * memberCount;
            final List<EdgeDef> inbound = toEdgeDefs(dag.getInboundEdges(vertex.getName()), defaultEdgeConfig,
                    e -> verticesIdAndOrder.idByName(e.getSourceName()), isJobDistributed);
            final List<EdgeDef> outbound = toEdgeDefs(dag.getOutboundEdges(vertex.getName()), defaultEdgeConfig,
                    e -> verticesIdAndOrder.idByName(e.getDestName()), isJobDistributed);
            String prefix = prefix(jobConfig.getName(), jobId, vertex.getName(), "#PMS");
            ILogger logger = prefixedLogger(nodeEngine.getLogger(metaSupplier.getClass()), prefix);

            RunnableEx action = () -> {
                JetServiceBackend jetBackend = nodeEngine.getService(JetServiceBackend.SERVICE_NAME);
                JobClassLoaderService jobClassLoaderService = jetBackend.getJobClassLoaderService();
                ClassLoader processorClassLoader = jobClassLoaderService.getClassLoader(jobId);
                try {
                    doWithClassLoader(processorClassLoader, () ->
                            metaSupplier.init(new MetaSupplierCtx(nodeEngine, jobId, executionId, jobConfig, logger,
                                    vertex.getName(), localParallelism, totalParallelism, memberCount, isLightJob,
                                    partitionsByAddress, subject, processorClassLoader)));
                } catch (Exception e) {
                    throw sneakyThrow(peel(e));
                }

                Function<? super Address, ? extends ProcessorSupplier> procSupplierFn =
                        doWithClassLoader(processorClassLoader, () -> metaSupplier.get(addresses));
                for (Entry<MemberInfo, ExecutionPlan> e : plans.entrySet()) {
                    final ProcessorSupplier processorSupplier =
                            doWithClassLoader(processorClassLoader, () -> procSupplierFn.apply(e.getKey().getAddress()));
                    if (!isLightJob) {
                        // We skip this check for light jobs - the user will get the error anyway, just possibly
                        // with less information, and we can recommend using a normal job to get more checks.
                        checkSerializable(processorSupplier, "ProcessorSupplier in vertex '" + vertex.getName() + '\'');
                    }
                    final VertexDef vertexDef = new VertexDef(vertexId, vertex.getName(), processorSupplier, localParallelism);
                    vertexDef.addInboundEdges(inbound);
                    vertexDef.addOutboundEdges(outbound);
                    e.getValue().setVertex(entry.requiredPosition, vertexDef);
                }
            };
            Executor executor = metaSupplier.initIsCooperative() ? CALLER_RUNS : initOffloadExecutor;
            futures[entry.requiredPosition] = runAsync(action, executor);
        }
        return CompletableFuture.allOf(futures)
                .thenCompose(r -> completedFuture(plans));
    }
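    // Illustrative sketch (not part of the original file): a ProcessorMetaSupplier whose init() may
    // block signals this by returning false from initIsCooperative(); createExecutionPlans() above
    // then runs the init on the JOB_OFFLOADABLE_EXECUTOR instead of the calling thread:
    //
    //     ProcessorMetaSupplier pms = new ProcessorMetaSupplier() {
    //         @Override
    //         public boolean initIsCooperative() {
    //             return false;   // init() below may block, so don't run it with CALLER_RUNS
    //         }
    //
    //         @Override
    //         public void init(@Nonnull Context context) {
    //             // e.g. fetch metadata from a remote system
    //         }
    //
    //         @Nonnull @Override
    //         public Function<? super Address, ? extends ProcessorSupplier> get(@Nonnull List<Address> addresses) {
    //             return address -> ProcessorSupplier.of(...);   // '...' stands for a processor factory
    //         }
    //     };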
    /**
     * Analyzes the DAG when the query uses partition pruning, to determine
     * which additional members and partitions are necessary for execution.
     */
    // visible for testing
    @Nonnull
    static PartitionPruningAnalysisResult analyzeDagForPartitionPruning(NodeEngine nodeEngine, DAG dag) {
        final IPartitionService partitionService = nodeEngine.getPartitionService();
        final int partitionCount = partitionService.getPartitionCount();

        // we expect only the local member to be explicitly required
        Set<Address> requiredAddresses = new HashSet<>(1);
        IntHashSet constantPartitionIds = new IntHashSet(partitionCount, -1);
        boolean allPartitionsRequired = false;

        for (Iterator<Edge> it = dag.edgeIterator(); it.hasNext(); ) {
            Edge edge = it.next();
            if (edge.getDistributedTo() != null && !edge.isDistributed()) {
                // The edge is distributed to a specific member, not to all members,
                // so that member must be included in the job.
                // Usually this will be the local member.
                requiredAddresses.add(edge.getDistributedTo());
            }
            if (edge.getRoutingPolicy() == Edge.RoutingPolicy.PARTITIONED) {
                assert edge.getPartitioner() != null : "PARTITIONED policy was used without a partitioner";
                // note that a partitioned edge can be either distributed or local
                var maybeConstantPartition = edge.getPartitioner().getConstantPartitioningKey();
                if (maybeConstantPartition != null) {
                    // allToOne or other constant-partitioning case
                    constantPartitionIds.add(partitionService.getPartitionId(maybeConstantPartition));
                } else {
                    // Partitioned edge with an arbitrary partitioning function:
                    // we are unable to determine which partition ids will be used.
                    allPartitionsRequired = true;
                }
            }
        }
        // After the analysis we can have both allPartitionsRequired and non-empty constantPartitionIds.
        // This is not a problem; allPartitionsRequired takes precedence.
        return new PartitionPruningAnalysisResult(requiredAddresses, constantPartitionIds, allPartitionsRequired);
    }

    // visible for testing
    static class PartitionPruningAnalysisResult {
        final Set<Address> requiredAddresses;
        final Set<Integer> constantPartitionIds;
        final boolean allPartitionsRequired;

        PartitionPruningAnalysisResult(Set<Address> requiredAddresses, Set<Integer> constantPartitionIds,
                                       boolean allPartitionsRequired) {
            this.requiredAddresses = requiredAddresses;
            this.constantPartitionIds = constantPartitionIds;
            this.allPartitionsRequired = allPartitionsRequired;
        }
    }
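    // Illustrative example (not part of the original file): for a DAG whose only partitioned edge uses
    // a constant key, e.g.
    //
    //     dag.edge(Edge.between(src, sink).allToOne("routing-key"));
    //
    // getConstantPartitioningKey() returns that key, so the analysis yields allPartitionsRequired == false
    // and constantPartitionIds holding the single partition id of "routing-key". An edge partitioned by an
    // arbitrary function instead forces allPartitionsRequired == true.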
    /**
     * Basic vertex data wrapper:
     * - id
     * - name
     * - position
     */
    private static final class VerticesIdAndOrder implements Iterable<VertexIdPos> {
        private final LinkedHashMap<String, Integer> vertexIdMap;
        private final HashMap<Integer, Integer> vertexPosById;

        private VerticesIdAndOrder(LinkedHashMap<String, Integer> vertexIdMap) {
            this.vertexIdMap = vertexIdMap;
            int index = 0;
            vertexPosById = new LinkedHashMap<>(vertexIdMap.size());
            for (Integer vertexId : vertexIdMap.values()) {
                vertexPosById.put(vertexId, index++);
            }
        }

        private Integer idByName(String vertexName) {
            return vertexIdMap.get(vertexName);
        }

        private static VerticesIdAndOrder assignVertexIds(DAG dag) {
            LinkedHashMap<String, Integer> vertexIdMap = new LinkedHashMap<>();
            final int[] vertexId = {0};
            dag.forEach(v -> vertexIdMap.put(v.getName(), vertexId[0]++));
            return new VerticesIdAndOrder(vertexIdMap);
        }

        private int count() {
            return vertexIdMap.size();
        }

        @Nonnull
        @Override
        public Iterator<VertexIdPos> iterator() {
            return vertexIdMap.entrySet().stream()
                    .map(e -> new VertexIdPos(e.getValue(), e.getKey(), vertexPosById.get(e.getValue())))
                    .iterator();
        }
    }

    private static final class VertexIdPos {
        private final int vertexId;
        private final String vertexName;
        /**
         * Position in the vertices list that the vertex with this id/name should occupy.
         * The {@link ExecutionPlan#getVertices()} order matters: it must be the same as the DAG
         * iteration order, otherwise some functions in further processing won't give correct results.
         */
        private final int requiredPosition;

        private VertexIdPos(int vertexId, String vertexName, int position) {
            this.vertexId = vertexId;
            this.vertexName = vertexName;
            this.requiredPosition = position;
        }
    }

    private static List<EdgeDef> toEdgeDefs(
            List<Edge> edges, EdgeConfig defaultEdgeConfig,
            ToIntFunction<Edge> oppositeVtxId, boolean isJobDistributed
    ) {
        List<EdgeDef> list = new ArrayList<>(edges.size());
        for (Edge edge : edges) {
            list.add(new EdgeDef(edge, edge.getConfig() == null ? defaultEdgeConfig : edge.getConfig(),
                    oppositeVtxId.applyAsInt(edge), isJobDistributed));
        }
        return list;
    }
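    // Illustrative example (not part of the original file): for a DAG built as src -> mid -> sink,
    // assignVertexIds() iterates in DAG order, so ids and required positions coincide:
    //
    //     vertexIdMap   = {src=0, mid=1, sink=2}
    //     vertexPosById = {0=0, 1=1, 2=2}
    //
    // ExecutionPlan.setVertex(requiredPosition, vertexDef) then reproduces this exact order on every
    // member, even though the vertices are initialized concurrently in createExecutionPlans().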
    /**
     * Assigns the partitions to their owners. Partitions whose owner isn't in
     * the {@code memberList} are assigned to one of the members in a round-robin way.
     * The additional parameters are required if partition pruning is used (dataPartitions != null).
     * Each mapped partition id array must be sorted.
     *
     * @param allPartitionsRequired if true, all partitions must be assigned to the required members
     *                              chosen to participate in the job execution. This applies if the
     *                              DAG contains at least one partitioned edge with a non-constant key.
     * @param dataPartitions set of all data partitions that must be processed by the job
     * @param routingPartitions set of transitive partitions that must be included in the job (allToOne targets)
     * @param extraRequiredMemberAddresses member addresses targeted by {@link Edge#distributeTo} in the job's DAG
     */
    @SuppressWarnings("DataFlowIssue")
    public static Map<MemberInfo, int[]> getPartitionAssignment(
            NodeEngine nodeEngine, List<MemberInfo> memberList, boolean allPartitionsRequired,
            @Nullable Set<Integer> dataPartitions, @Nullable Set<Integer> routingPartitions,
            @Nullable Set<Address> extraRequiredMemberAddresses) {
        if (allPartitionsRequired) {
            checkNotNull(dataPartitions);
        }
        IPartitionService partitionService = nodeEngine.getPartitionService();
        Map<Address, MemberInfo> membersByAddress = new HashMap<>();
        for (MemberInfo memberInfo : memberList) {
            membersByAddress.put(memberInfo.getAddress(), memberInfo);
        }

        Map<MemberInfo, FixedCapacityIntArrayList> partitionsForMember = new HashMap<>();
        int partitionCount = partitionService.getPartitionCount();
        int memberIndex = 0;
        if (dataPartitions == null) {
            // By default, partition pruning won't be applied, and for this code path
            // this is guaranteed to be the only partition assignment loop.
            for (int partitionId = 0; partitionId < partitionCount; ++partitionId) {
                Address address = partitionService.getPartitionOwnerOrWait(partitionId);
                MemberInfo member = membersByAddress.get(address);
                if (member == null) {
                    // if the partition owner isn't in the current memberList, assign to one of the
                    // other members in round-robin fashion
                    member = memberList.get(memberIndex++ % memberList.size());
                }
                partitionsForMember.computeIfAbsent(member, ignored -> new FixedCapacityIntArrayList(partitionCount))
                        .add(partitionId);
            }
        } else {
            // We want to avoid boxing the Integer partitionId over and over again,
            // at least in the basic case when partition pruning is not used,
            // and using IntStream.range() would be slower than a plain for loop.
            // Such boxing generates many allocations for simple queries because
            // even with the default partition count (271) not all ids are cached by the JVM.
            // The loop body is the same as in the other branch, but it is hard to refactor.
            for (int partitionId : dataPartitions) {
                Address address = partitionService.getPartitionOwnerOrWait(partitionId);
                MemberInfo member = membersByAddress.get(address);
                if (member == null) {
                    // if the partition owner isn't in the current memberList, assign to one of the
                    // other members in round-robin fashion
                    member = memberList.get(memberIndex++ % memberList.size());
                }
                partitionsForMember.computeIfAbsent(member, ignored -> new FixedCapacityIntArrayList(partitionCount))
                        .add(partitionId);
            }
        }

        if (dataPartitions != null) {
            extraRequiredMemberAddresses = checkNotNull(extraRequiredMemberAddresses);
            routingPartitions = checkNotNull(routingPartitions);

            // The overall algorithm for partition assignment in case of partition pruning is as follows:
            // 1. Find all members that own partitions with data required for the job (`dataPartitions`) - above.
            // 2. Add members that have explicit routing (`Edge.distributeTo`) if not yet added.
            //    The members found after this step are all members needed to execute the job ("required members").
            // 3. Assign additional partitions, which do not store data but are needed for other reasons
            //    (mainly routing), to the required members.

            // Interactive prunable queries may require the coordinator to be present.
            // If the coordinator is not yet included in the job, include it now.
            extraRequiredMemberAddresses.forEach(requiredMemberAddr -> {
                MemberInfo requiredMemberInfo = membersByAddress.get(requiredMemberAddr);
                if (requiredMemberInfo == null) {
                    // Should not happen for the local member; may happen if an outdated DAG is used
                    // which refers to a no longer present member.
                    throw new JetException("Member with address " + requiredMemberAddr + " not present in the cluster");
                }
                partitionsForMember.computeIfAbsent(requiredMemberInfo, (i) -> {
                    nodeEngine.getLogger(ExecutionPlanBuilder.class)
                            .fine("Adding required member " + requiredMemberAddr + " to partition-pruned job members");
                    // Extra members may get some partitions assigned later, especially for allPartitionsRequired
                    return new FixedCapacityIntArrayList(partitionCount);
                });
            });

            // There is a special case of partition/member pruning: when the DAG contains a distributed-partitioned
            // edge, we still want to apply member pruning, but we must redirect partitioned items to the limited
            // cluster subset. To do that, we assign all unassigned (and thus non-required) partitions to the
            // required members already selected by the main assignment loop above.
            if (allPartitionsRequired || !routingPartitions.isEmpty()) {
                Set<Integer> partitionsToAssign = allPartitionsRequired
                        ? new HashSet<>(range(0, partitionCount))
                        : new HashSet<>(routingPartitions);
                // do not assign duplicates, possible in both cases above
                partitionsToAssign.removeAll(dataPartitions);
                List<MemberInfo> requiredMembers = new ArrayList<>(partitionsForMember.keySet());
                for (int partitionId : partitionsToAssign) {
                    // Assign the remaining partitions to one of the required members in round-robin fashion;
                    // they will only be used for internal routing.
                    //
                    // The partition assignment is not balanced here, so not all members get the same number of
                    // partitions, especially with allPartitionsRequired. This is not very important when there
                    // are only a few partitions in dataPartitions, but can make some difference if there are
                    // many (e.g. half of them). It is not obvious where the extra partitions should be assigned -
                    // maybe we should prefer members that do not store data for the job, because they will be
                    // less loaded?
                    var member = requiredMembers.get(memberIndex++ % requiredMembers.size());
                    partitionsForMember.get(member).add(partitionId);
                }
            }
        }

        Map<MemberInfo, int[]> partitionAssignment = new HashMap<>();
        for (Entry<MemberInfo, FixedCapacityIntArrayList> memberWithPartitions : partitionsForMember.entrySet()) {
            int[] p = memberWithPartitions.getValue().asArray();
            if (dataPartitions != null) {
                Arrays.sort(p);
            }
            partitionAssignment.put(memberWithPartitions.getKey(), p);
        }
        return partitionAssignment;
    }
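    // Worked example (illustrative, not part of the original file): suppose partitions 0..5 are owned
    // round-robin by members A, B, C (A: 0,3; B: 1,4; C: 2,5) but memberList = [A, B]. The default loop
    // above keeps A's and B's own partitions in place and assigns C's orphaned partitions round-robin
    // via memberList.get(memberIndex++ % memberList.size()), so partition 2 goes to A and partition 5 to B.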

    /**
     * Assigns an equal number of partitions to data and lite members. The algorithm always prefers to
     * assign a partition to its actual owner (a data member), if possible. If that member is not selected
     * for the job, or its limit was exceeded, the partition is assigned to the next available member in a
     * round-robin manner.
     * <p>
     * There may be an acceptable difference between the number of partitions assigned to data members and
     * to lite members when data members prevail in the member selection.
     * <p>
     * Each mapped partition id array must be sorted.
     */
    public static Map<MemberInfo, int[]> getFairPartitionAssignment(NodeEngine nodeEngine, List<MemberInfo> memberList) {
        List<MemberInfo> liteMembers = memberList.stream().filter(MemberInfo::isLiteMember).toList();
        // If no lite members are present, we can use the default partition assignment.
        if (liteMembers.isEmpty()) {
            return getPartitionAssignment(nodeEngine, memberList, false, null, null, null);
        }

        IPartitionService partitionService = nodeEngine.getPartitionService();
        int partitionCount = partitionService.getPartitionCount();
        Map<Address, MemberInfo> membersByAddress = new HashMap<>();
        Map<MemberInfo, FixedCapacityIntArrayList> partitionsForMember = new HashMap<>();
        Set<MemberInfo> membersAbleToAcceptPartitions = new HashSet<>(memberList);
        for (MemberInfo memberInfo : memberList) {
            membersByAddress.put(memberInfo.getAddress(), memberInfo);
        }

        // ceiling of partitionCount / memberList.size(): the per-member cap on assigned partitions
        final int fairPartitionSliceSize = (partitionCount + memberList.size() - 1) / memberList.size();
        int memberIndex = 0;
        for (int partitionId = 0; partitionId < partitionCount; ++partitionId) {
            Address address = partitionService.getPartitionOwnerOrWait(partitionId);
            MemberInfo member = membersByAddress.get(address);
            while (member == null || !membersAbleToAcceptPartitions.contains(member)) {
                member = memberList.get(memberIndex++ % memberList.size());
            }
            var partitions = partitionsForMember.computeIfAbsent(member,
                    ignored -> new FixedCapacityIntArrayList(partitionCount));
            partitions.add(partitionId);
            if (partitions.size() >= fairPartitionSliceSize) {
                membersAbleToAcceptPartitions.remove(member);
            }
        }

        Map<MemberInfo, int[]> partitionAssignment = new HashMap<>();
        for (Entry<MemberInfo, FixedCapacityIntArrayList> memberWithPartitions : partitionsForMember.entrySet()) {
            int[] p = memberWithPartitions.getValue().asArray();
            partitionAssignment.put(memberWithPartitions.getKey(), p);
        }
        return partitionAssignment;
    }
}
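// Worked example (illustrative, not part of the original file): with the default 271 partitions and a
// cluster of two data members plus one lite member, getFairPartitionAssignment() computes
// fairPartitionSliceSize = ceil(271 / 3) = 91. Each member, lite ones included, accepts at most 91
// partitions; a data member keeps its own partitions until it hits that cap, after which the overflow
// spills round-robin to the members still under the cap.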




