All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.trino.cost.TopNStatsRule Maven / Gradle / Ivy

There is a newer version: 465
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.cost;

import io.trino.cost.StatsCalculator.Context;
import io.trino.matching.Pattern;
import io.trino.spi.connector.SortOrder;
import io.trino.sql.planner.Symbol;
import io.trino.sql.planner.plan.TopNNode;

import java.util.Optional;

import static io.trino.sql.planner.plan.Patterns.topN;

/**
 * Stats rule that derives the output estimate of a {@link TopNNode} from the estimate of its source.
 * <p>
 * Partial TopN nodes get a rough per-driver estimate; other TopN nodes are capped at the node's
 * count, and the nulls fraction of the first ORDER BY symbol is adjusted according to whether
 * NULLs sort first or last.
 */
public class TopNStatsRule
        extends SimpleStatsRule<TopNNode>
{
    // Assumed number of input rows handled by a single driver when estimating partial TopN output.
    private static final int ESTIMATED_PARTIAL_TOPN_INPUT_PER_DRIVER = 1_000_000;
    private static final Pattern<TopNNode> PATTERN = topN();

    public TopNStatsRule(StatsNormalizer normalizer)
    {
        super(normalizer);
    }

    @Override
    public Pattern<TopNNode> getPattern()
    {
        return PATTERN;
    }

    /**
     * Computes the stats estimate for the given TopN node.
     *
     * @param node the TopN node being estimated
     * @param context calculation context; its stats provider supplies the estimate of the node's source
     * @return the estimate for the node; every path of this implementation returns a present value
     */
    @Override
    protected Optional<PlanNodeStatsEstimate> doCalculate(TopNNode node, Context context)
    {
        PlanNodeStatsEstimate sourceStats = context.statsProvider().getStats(node.getSource());
        double rowCount = sourceStats.getOutputRowCount();

        /* CreatePartialTopN rule runs after ReorderJoins but before DetermineJoinDistributionType and DetermineSemiJoinDistributionType.
         * Therefore, it is useful to provide estimates for partial and final topN nodes.
         * If, for example, the partial topN is part of a source stage, then its overall output will depend on the number of splits that get created at runtime,
         * and that's not known in advance. We populate a rough estimate for partial topN by assuming an input of 1 million rows per driver.
         */
        if (node.getStep() == TopNNode.Step.PARTIAL) {
            double estimatedOutputRowCount = Math.max(rowCount / ESTIMATED_PARTIAL_TOPN_INPUT_PER_DRIVER, 1) * node.getCount();
            // A partial TopN can never emit more rows than it receives.
            return Optional.of(PlanNodeStatsEstimate.buildFrom(sourceStats)
                    .setOutputRowCount(Math.min(estimatedOutputRowCount, rowCount))
                    .build());
        }

        // The source already produces no more rows than the limit, so TopN does not change the stats.
        if (rowCount <= node.getCount()) {
            return Optional.of(sourceStats);
        }

        long limitCount = node.getCount();

        PlanNodeStatsEstimate resultStats = PlanNodeStatsEstimate.buildFrom(sourceStats)
                .setOutputRowCount(limitCount)
                .build();
        if (limitCount == 0) {
            // Nothing is output; also avoids dividing by zero in the nulls fraction computation below.
            return Optional.of(resultStats);
        }
        // augment null fraction estimation for first ORDER BY symbol
        Symbol firstOrderSymbol = node.getOrderingScheme().orderBy().get(0); // Assuming not empty list
        SortOrder sortOrder = node.getOrderingScheme().ordering(firstOrderSymbol);

        resultStats = resultStats.mapSymbolColumnStatistics(firstOrderSymbol, symbolStats -> {
            SymbolStatsEstimate.Builder newStats = SymbolStatsEstimate.buildFrom(symbolStats);
            newStats.setNullsFraction(estimateNullsFraction(sortOrder, rowCount, symbolStats.getNullsFraction(), limitCount));
            return newStats.build();
        });

        // TopN actually limits (or when there was no row count estimated for source)
        return Optional.of(resultStats);
    }

    /**
     * Estimates the fraction of NULLs in the first ORDER BY symbol among the {@code limitCount} output rows.
     * <p>
     * A NaN {@code rowCount} or {@code sourceNullsFraction} makes every comparison below false,
     * so the result is NaN in that case.
     *
     * @param sortOrder ordering of the first ORDER BY symbol
     * @param rowCount estimated number of source rows
     * @param sourceNullsFraction nulls fraction of the symbol in the source
     * @param limitCount number of rows kept by TopN; callers pass a non-zero value
     */
    private static double estimateNullsFraction(SortOrder sortOrder, double rowCount, double sourceNullsFraction, long limitCount)
    {
        double nullCount = rowCount * sourceNullsFraction;

        if (sortOrder.isNullsFirst()) {
            // NULLs are emitted first: they fill the output until they run out or the limit is reached.
            if (nullCount > limitCount) {
                return 1.0;
            }
            return nullCount / limitCount;
        }

        // NULLs are emitted last: they only appear once the non-null rows are exhausted.
        double nonNullCount = rowCount - nullCount;
        if (nonNullCount > limitCount) {
            return 0.0;
        }
        return (limitCount - nonNullCount) / limitCount;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy