
io.trino.sql.planner.iterative.rule.AddExchangesBelowPartialAggregationOverGroupIdRuleSet

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.sql.planner.iterative.rule;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import io.airlift.units.DataSize;
import io.trino.Session;
import io.trino.cost.PlanNodeStatsEstimate;
import io.trino.cost.SymbolStatsEstimate;
import io.trino.cost.TaskCountEstimator;
import io.trino.execution.TaskManagerConfig;
import io.trino.matching.Capture;
import io.trino.matching.Captures;
import io.trino.matching.Pattern;
import io.trino.sql.PlannerContext;
import io.trino.sql.planner.Partitioning;
import io.trino.sql.planner.PartitioningScheme;
import io.trino.sql.planner.Symbol;
import io.trino.sql.planner.iterative.Rule;
import io.trino.sql.planner.optimizations.StreamPreferredProperties;
import io.trino.sql.planner.optimizations.StreamPropertyDerivations.StreamProperties;
import io.trino.sql.planner.plan.AggregationNode;
import io.trino.sql.planner.plan.ExchangeNode;
import io.trino.sql.planner.plan.GroupIdNode;
import io.trino.sql.planner.plan.PlanNode;
import io.trino.sql.planner.plan.ProjectNode;

import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMultiset.toImmutableMultiset;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static io.trino.SystemSessionProperties.getTaskConcurrency;
import static io.trino.SystemSessionProperties.isEnableForcedExchangeBelowGroupId;
import static io.trino.SystemSessionProperties.isEnableStatsCalculator;
import static io.trino.matching.Capture.newCapture;
import static io.trino.matching.Pattern.nonEmpty;
import static io.trino.matching.Pattern.typeOf;
import static io.trino.sql.planner.SystemPartitioningHandle.FIXED_HASH_DISTRIBUTION;
import static io.trino.sql.planner.optimizations.StreamPreferredProperties.fixedParallelism;
import static io.trino.sql.planner.optimizations.StreamPropertyDerivations.deriveProperties;
import static io.trino.sql.planner.plan.ExchangeNode.Scope.LOCAL;
import static io.trino.sql.planner.plan.ExchangeNode.Scope.REMOTE;
import static io.trino.sql.planner.plan.ExchangeNode.partitionedExchange;
import static io.trino.sql.planner.plan.Patterns.Aggregation.groupingColumns;
import static io.trino.sql.planner.plan.Patterns.Aggregation.step;
import static io.trino.sql.planner.plan.Patterns.Exchange.scope;
import static io.trino.sql.planner.plan.Patterns.source;
import static java.lang.Double.isNaN;
import static java.lang.Math.min;
import static java.util.Comparator.comparing;
import static java.util.Objects.requireNonNull;

/**
 * Transforms
 * <pre>
 *   - Exchange
 *     - [ Projection ]
 *       - Partial Aggregation
 *         - GroupId
 * </pre>
 * to
 * <pre>
 *   - Exchange
 *     - [ Projection ]
 *       - Partial Aggregation
 *         - GroupId
 *           - LocalExchange
 *             - RemoteExchange
 * </pre>
 * <p>
 * Rationale: GroupId increases the number of rows (by a factor equal to the number of grouping sets) and then
 * partial aggregation reduces the number of rows. However, under certain conditions, exchanging the rows before
 * GroupId (before the multiplication) makes partial aggregation more effective, resulting in less data being
 * exchanged afterwards.
 */
public class AddExchangesBelowPartialAggregationOverGroupIdRuleSet
{
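    // Illustrative example: a query such as
    //   SELECT a, b, c, count(*) FROM t GROUP BY GROUPING SETS ((a), (a, b), (a, c))
    // plans a GroupId node that replicates every input row once per grouping set. If the partial
    // aggregation over those multiplied rows is estimated not to fit in its memory budget, the rules
    // below repartition the input on a frequently occurring symbol (here `a`) before GroupId, so each
    // driver sees fewer distinct keys and partial aggregation collapses rows more effectively.
    // (Table `t` and columns `a`, `b`, `c` are hypothetical.)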
    private static final Capture<ProjectNode> PROJECTION = newCapture();
    private static final Capture<AggregationNode> AGGREGATION = newCapture();
    private static final Capture<GroupIdNode> GROUP_ID = newCapture();
    private static final Capture<ExchangeNode> REMOTE_EXCHANGE = newCapture();

    private static final Pattern<ExchangeNode> WITH_PROJECTION =
            // If there was no exchange here, adding new exchanges could break the property derivation logic of AddExchanges and AddLocalExchanges
            typeOf(ExchangeNode.class)
                    .with(scope().equalTo(REMOTE)).capturedAs(REMOTE_EXCHANGE)
                    .with(source().matching(
                            // PushPartialAggregationThroughExchange adds a projection. However, it can be removed if RemoveRedundantIdentityProjections is run in the meantime.
                            typeOf(ProjectNode.class).capturedAs(PROJECTION)
                                    .with(source().matching(
                                            typeOf(AggregationNode.class).capturedAs(AGGREGATION)
                                                    .with(step().equalTo(AggregationNode.Step.PARTIAL))
                                                    .with(nonEmpty(groupingColumns()))
                                                    .with(source().matching(
                                                            typeOf(GroupIdNode.class).capturedAs(GROUP_ID)))))));

    private static final Pattern<ExchangeNode> WITHOUT_PROJECTION =
            // If there was no exchange here, adding new exchanges could break the property derivation logic of AddExchanges and AddLocalExchanges
            typeOf(ExchangeNode.class)
                    .with(scope().equalTo(REMOTE)).capturedAs(REMOTE_EXCHANGE)
                    .with(source().matching(
                            typeOf(AggregationNode.class).capturedAs(AGGREGATION)
                                    .with(step().equalTo(AggregationNode.Step.PARTIAL))
                                    .with(nonEmpty(groupingColumns()))
                                    .with(source().matching(
                                            typeOf(GroupIdNode.class).capturedAs(GROUP_ID)))));

    // A symbol qualifies as a hash symbol only if it occurs in at least this fraction of the grouping sets
    private static final double GROUPING_SETS_SYMBOL_REQUIRED_FREQUENCY = 0.5;
    // Required headroom between the estimated group count and the available concurrency, guarding against skew
    private static final double ANTI_SKEWNESS_MARGIN = 3;

    private final PlannerContext plannerContext;
    private final TaskCountEstimator taskCountEstimator;
    private final DataSize maxPartialAggregationMemoryUsage;

    public AddExchangesBelowPartialAggregationOverGroupIdRuleSet(
            PlannerContext plannerContext,
            TaskCountEstimator taskCountEstimator,
            TaskManagerConfig taskManagerConfig)
    {
        this.plannerContext = requireNonNull(plannerContext, "plannerContext is null");
        this.taskCountEstimator = requireNonNull(taskCountEstimator, "taskCountEstimator is null");
        this.maxPartialAggregationMemoryUsage = taskManagerConfig.getMaxPartialAggregationMemoryUsage();
    }

    public Set<Rule<?>> rules()
    {
        return ImmutableSet.of(
                belowProjectionRule(),
                belowExchangeRule());
    }

    @VisibleForTesting
    AddExchangesBelowExchangePartialAggregationGroupId belowExchangeRule()
    {
        return new AddExchangesBelowExchangePartialAggregationGroupId();
    }

    @VisibleForTesting
    AddExchangesBelowProjectionPartialAggregationGroupId belowProjectionRule()
    {
        return new AddExchangesBelowProjectionPartialAggregationGroupId();
    }

    private class AddExchangesBelowProjectionPartialAggregationGroupId
            extends BaseAddExchangesBelowExchangePartialAggregationGroupId
    {
        @Override
        public Pattern<ExchangeNode> getPattern()
        {
            return WITH_PROJECTION;
        }

        @Override
        public Result apply(ExchangeNode exchange, Captures captures, Context context)
        {
            ProjectNode project = captures.get(PROJECTION);
            AggregationNode aggregation = captures.get(AGGREGATION);
            GroupIdNode groupId = captures.get(GROUP_ID);
            ExchangeNode remoteExchange = captures.get(REMOTE_EXCHANGE);

            return transform(aggregation, groupId, remoteExchange.getPartitioningScheme().getPartitionCount(), context)
                    .map(newAggregation -> Result.ofPlanNode(
                            exchange.replaceChildren(ImmutableList.of(
                                    project.replaceChildren(ImmutableList.of(
                                            newAggregation))))))
                    .orElseGet(Result::empty);
        }
    }

    @VisibleForTesting
    class AddExchangesBelowExchangePartialAggregationGroupId
            extends BaseAddExchangesBelowExchangePartialAggregationGroupId
    {
        @Override
        public Pattern<ExchangeNode> getPattern()
        {
            return WITHOUT_PROJECTION;
        }

        @Override
        public Result apply(ExchangeNode exchange, Captures captures, Context context)
        {
            AggregationNode aggregation = captures.get(AGGREGATION);
            GroupIdNode groupId = captures.get(GROUP_ID);
            ExchangeNode remoteExchange = captures.get(REMOTE_EXCHANGE);

            return transform(aggregation, groupId, remoteExchange.getPartitioningScheme().getPartitionCount(), context)
                    .map(newAggregation -> {
                        PlanNode newExchange = exchange.replaceChildren(ImmutableList.of(newAggregation));
                        return Result.ofPlanNode(newExchange);
                    })
                    .orElseGet(Result::empty);
        }
    }

    private abstract class BaseAddExchangesBelowExchangePartialAggregationGroupId
            implements Rule<ExchangeNode>
    {
        @Override
        public boolean isEnabled(Session session)
        {
            if (!isEnableStatsCalculator(session)) {
                // Old stats calculator is not trustworthy
                return false;
            }

            return isEnableForcedExchangeBelowGroupId(session);
        }

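        /**
         * Core of both rules: attempts to insert a remote hash exchange followed by a local hash
         * exchange below the GroupId node. Returns {@code Optional.empty()} whenever the rewrite is
         * not expected to pay off: fewer than two grouping sets, grouping-set symbols not matching
         * the aggregation keys, partial aggregation already fitting in its memory budget, the stream
         * already partitioned as desired, or too few estimated groups to sustain concurrency.
         */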
        protected Optional<PlanNode> transform(AggregationNode aggregation, GroupIdNode groupId, Optional<Integer> partitionCount, Context context)
        {
            if (groupId.getGroupingSets().size() < 2) {
                return Optional.empty();
            }

            Set<Symbol> groupingKeys = aggregation.getGroupingKeys().stream()
                    .filter(symbol -> !groupId.getGroupIdSymbol().equals(symbol))
                    .collect(toImmutableSet());

            Multiset<Symbol> groupingSetHistogram = groupId.getGroupingSets().stream()
                    .flatMap(Collection::stream)
                    .collect(toImmutableMultiset());

            if (!Objects.equals(groupingSetHistogram.elementSet(), groupingKeys)) {
                // TODO handle the case when some aggregation keys are pass-through in GroupId (e.g. common in all grouping sets). However, this is never the case for ROLLUP.
                // TODO handle the case when some grouping set symbols are not used in aggregation (possible?)
                return Optional.empty();
            }

            double aggregationMemoryRequirements = estimateAggregationMemoryRequirements(groupingKeys, groupId, groupingSetHistogram, context);
            if (isNaN(aggregationMemoryRequirements) || aggregationMemoryRequirements < maxPartialAggregationMemoryUsage.toBytes()) {
                // Aggregation will be effective even without exchanges (or we have insufficient information).
                return Optional.empty();
            }

            List<Symbol> desiredHashSymbols = groupingSetHistogram.entrySet().stream()
                    // Take only frequently used symbols
                    .filter(entry -> entry.getCount() >= groupId.getGroupingSets().size() * GROUPING_SETS_SYMBOL_REQUIRED_FREQUENCY)
                    .map(Multiset.Entry::getElement)
                    // And only the symbols used in the aggregation (these are usually all symbols)
                    .peek(symbol -> verify(groupingKeys.contains(symbol)))
                    // Transform to symbols before GroupId
                    .map(groupId.getGroupingColumns()::get)
                    .collect(toImmutableList());

            // Use only the symbol with the highest cardinality (if we have statistics). This makes partial aggregation more efficient in case of
            // low correlation between symbols that are in every grouping set vs additional symbols.
            PlanNodeStatsEstimate sourceStats = context.getStatsProvider().getStats(groupId.getSource());
            desiredHashSymbols = desiredHashSymbols.stream()
                    .filter(symbol -> !isNaN(sourceStats.getSymbolStatistics(symbol).getDistinctValuesCount()))
                    .max(comparing(symbol -> sourceStats.getSymbolStatistics(symbol).getDistinctValuesCount()))
                    .map(symbol -> (List<Symbol>) ImmutableList.of(symbol))
                    .orElse(desiredHashSymbols);

            StreamPreferredProperties requiredProperties = fixedParallelism().withPartitioning(desiredHashSymbols);
            StreamProperties sourceProperties = derivePropertiesRecursively(groupId.getSource(), context);
            if (requiredProperties.isSatisfiedBy(sourceProperties)) {
                // Stream is already (locally) partitioned just as we want.
                // In fact, there might be just a LocalExchange below and no remote exchange. For now, we give up in this situation anyway. To properly support such a situation:
                //  1. the aggregation effectiveness estimation above needs to consider the (helpful) fact that the stream is already partitioned, so each operator will need less memory
                //  2. if the local exchange becomes unnecessary (after we add a remote exchange on top of it), it should be removed. What if the local exchange is somewhere further
                //     down the tree?
                return Optional.empty();
            }

            double estimatedGroups = estimatedGroupCount(desiredHashSymbols, context.getStatsProvider().getStats(groupId.getSource()));
            if (isNaN(estimatedGroups) || estimatedGroups * ANTI_SKEWNESS_MARGIN < maximalConcurrencyAfterRepartition(context)) {
                // Desired hash symbols form too few groups. Hashing over them would harm concurrency.
                // TODO instead of taking symbols with >GROUPING_SETS_SYMBOL_REQUIRED_FREQUENCY presence, we could take symbols from high freq to low until there are enough groups
                return Optional.empty();
            }

            PlanNode source = groupId.getSource();

            // Above we only checked that the data is not yet locally partitioned; it could be already globally partitioned (but not locally). TODO avoid the remote exchange in this case
            // TODO if the aggregation memory requirements are only slightly above `maxPartialAggregationMemoryUsage`, adding only a LocalExchange could be enough
            source = partitionedExchange(
                    context.getIdAllocator().getNextId(),
                    REMOTE,
                    source,
                    new PartitioningScheme(
                            Partitioning.create(FIXED_HASH_DISTRIBUTION, desiredHashSymbols),
                            source.getOutputSymbols(),
                            Optional.empty(),
                            false,
                            Optional.empty(),
                            // It's fine to reuse partitionCount since that is computed by considering all the expanding nodes and table scans in a query
                            partitionCount));

            source = partitionedExchange(
                    context.getIdAllocator().getNextId(),
                    LOCAL,
                    source,
                    new PartitioningScheme(
                            Partitioning.create(FIXED_HASH_DISTRIBUTION, desiredHashSymbols),
                            source.getOutputSymbols()));

            PlanNode newGroupId = groupId.replaceChildren(ImmutableList.of(source));
            PlanNode newAggregation = aggregation.replaceChildren(ImmutableList.of(newGroupId));

            return Optional.of(newAggregation);
        }

        private int maximalConcurrencyAfterRepartition(Context context)
        {
            return getTaskConcurrency(context.getSession()) * taskCountEstimator.estimateHashedTaskCount(context.getSession());
        }

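        // Rough memory model: for each grouping set, the partial aggregation keeps a hash table whose
        // size is approximated as (average key row width in bytes) * (estimated NDV of the set's source
        // symbols, capped at the source row count); these per-set estimates are summed below.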
        private double estimateAggregationMemoryRequirements(Set<Symbol> groupingKeys, GroupIdNode groupId, Multiset<Symbol> groupingSetHistogram, Context context)
        {
            checkArgument(Objects.equals(groupingSetHistogram.elementSet(), groupingKeys)); // Otherwise the math below would be off-topic

            PlanNodeStatsEstimate sourceStats = context.getStatsProvider().getStats(groupId.getSource());
            double keysMemoryRequirements = 0;

            for (List<Symbol> groupingSet : groupId.getGroupingSets()) {
                List<Symbol> sourceSymbols = groupingSet.stream()
                        .map(groupId.getGroupingColumns()::get)
                        .collect(toImmutableList());

                double keyWidth = sourceStats.getOutputSizeInBytes(sourceSymbols) / sourceStats.getOutputRowCount();
                double keyNdv = min(estimatedGroupCount(sourceSymbols, sourceStats), sourceStats.getOutputRowCount());

                keysMemoryRequirements += keyWidth * keyNdv;
            }

            // TODO consider also memory requirements for aggregation values
            return keysMemoryRequirements;
        }

        private double estimatedGroupCount(List<Symbol> symbols, PlanNodeStatsEstimate statsEstimate)
        {
            return symbols.stream()
                    .map(statsEstimate::getSymbolStatistics)
                    .mapToDouble(this::ndvIncludingNull)
                    // This assumes no correlation, so it is the maximum number of aggregation keys
                    .reduce(1, (a, b) -> a * b);
        }

        private double ndvIncludingNull(SymbolStatsEstimate symbolStatsEstimate)
        {
            if (symbolStatsEstimate.getNullsFraction() == 0.) {
                return symbolStatsEstimate.getDistinctValuesCount();
            }
            return symbolStatsEstimate.getDistinctValuesCount() + 1;
        }

        private StreamProperties derivePropertiesRecursively(PlanNode node, Context context)
        {
            PlanNode resolvedPlanNode = context.getLookup().resolve(node);
            List<StreamProperties> inputProperties = resolvedPlanNode.getSources().stream()
                    .map(source -> derivePropertiesRecursively(source, context))
                    .collect(toImmutableList());
            return deriveProperties(resolvedPlanNode, inputProperties, plannerContext, context.getSession());
        }
    }
}
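
// Illustrative wiring (hypothetical, not part of this file): a planner would obtain the rules via
//
//   Set<Rule<?>> rules = new AddExchangesBelowPartialAggregationOverGroupIdRuleSet(
//           plannerContext, taskCountEstimator, taskManagerConfig)
//       .rules();
//
// and register them with the iterative optimizer, yielding the two rules returned by
// belowProjectionRule() and belowExchangeRule().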




