All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.drill.exec.planner.logical.ConvertCountToDirectScanRule Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.planner.logical;

import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.plan.RelOptRuleOperand;
import org.apache.calcite.rel.core.Aggregate;
import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rel.core.Project;
import org.apache.calcite.rel.core.TableScan;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rex.RexInputRef;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.FormatPluginConfig;

import org.apache.drill.exec.physical.base.ScanStats;
import org.apache.drill.metastore.statistics.Statistic;
import org.apache.drill.exec.planner.common.CountToDirectScanUtils;
import org.apache.drill.exec.planner.common.DrillRelOptUtil;

import org.apache.drill.exec.planner.physical.PlannerSettings;
import org.apache.drill.exec.store.ColumnExplorer;
import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.drill.exec.store.dfs.FileSystemPlugin;
import org.apache.drill.exec.store.dfs.FormatSelection;
import org.apache.drill.exec.store.dfs.NamedFormatPluginConfig;
import org.apache.drill.exec.store.direct.MetadataDirectGroupScan;
import org.apache.drill.exec.store.parquet.ParquetFormatConfig;
import org.apache.drill.exec.store.parquet.ParquetReaderConfig;
import org.apache.drill.exec.store.parquet.metadata.Metadata;
import org.apache.drill.exec.store.parquet.metadata.Metadata_V4;
import org.apache.drill.exec.store.pojo.DynamicPojoRecordReader;
import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.Set;

/**
 * 

This rule is a logical planning counterpart to a corresponding ConvertCountToDirectScanPrule * physical rule *

*

* This rule will convert " select count(*) as mycount from table " * or " select count(not-nullable-expr) as mycount from table " into *

 *    Project(mycount)
 *         \
 *    DirectGroupScan ( PojoRecordReader ( rowCount ))
 *
* or " select count(column) as mycount from table " into *
 *      Project(mycount)
 *           \
 *            DirectGroupScan (PojoRecordReader (columnValueCount))
 *
* Rule can be applied if query contains multiple count expressions. * " select count(column1), count(column2), count(*) from table " *

* *

* The rule utilizes the Parquet Metadata Cache's summary information to retrieve the total row count * and the per-column null count. As such, the rule is only applicable for Parquet tables and only if the * metadata cache has been created with the summary information. *

*/ public class ConvertCountToDirectScanRule extends RelOptRule { public static final RelOptRule AGG_ON_PROJ_ON_SCAN = new ConvertCountToDirectScanRule( RelOptHelper.some(Aggregate.class, RelOptHelper.some(Project.class, RelOptHelper.any(TableScan.class))), "Agg_on_proj_on_scan"); public static final RelOptRule AGG_ON_SCAN = new ConvertCountToDirectScanRule( RelOptHelper.some(Aggregate.class, RelOptHelper.any(TableScan.class)), "Agg_on_scan"); private static final Logger logger = LoggerFactory.getLogger(ConvertCountToDirectScanRule.class); private ConvertCountToDirectScanRule(RelOptRuleOperand rule, String id) { super(rule, DrillRelFactories.LOGICAL_BUILDER, "ConvertCountToDirectScanRule:" + id); } @Override public void onMatch(RelOptRuleCall call) { final Aggregate agg = call.rel(0); final TableScan scan = call.rel(call.rels.length - 1); final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null; // Qualifying conditions for rule: // 1) There's no GroupBY key, // 2) Agg is not a DISTINCT agg // 3) Additional checks are done further below .. 
if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) { return; } DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan); if (drillTable == null) { logger.debug("Rule does not apply since an eligible drill table instance was not found."); return; } Object selection = drillTable.getSelection(); if (!(selection instanceof FormatSelection)) { logger.debug("Rule does not apply since only Parquet file format is eligible."); return; } PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class); // Rule is applicable only if the statistics for row count and null count are available from the metadata, FormatSelection formatSelection = (FormatSelection) selection; // Rule cannot be applied if the selection had wildcard since the totalrowcount cannot be read from the parent directory if (formatSelection.getSelection().hadWildcard()) { logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata."); return; } Pair status = checkMetadataForScanStats(settings, drillTable, formatSelection); if (!status.getLeft()) { logger.debug("Rule does not apply since MetadataSummary metadata was not found."); return; } Metadata_V4.MetadataSummary metadataSummary = status.getRight(); Map result = collectCounts(settings, metadataSummary, agg, scan, project); logger.trace("Calculated the following aggregate counts: {}", result); // if counts could not be determined, rule won't be applied if (result.isEmpty()) { logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata."); return; } Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot()); final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet()); final DynamicPojoRecordReader reader = new DynamicPojoRecordReader<>( CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()), Collections.singletonList(new 
ArrayList<>(result.values()))); final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount()); final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false); final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType); final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan, CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType()); call.transformTo(newProject); } private Pair checkMetadataForScanStats(PlannerSettings settings, DrillTable drillTable, FormatSelection formatSelection) { // Currently only support metadata rowcount stats for Parquet tables FormatPluginConfig formatConfig = formatSelection.getFormat(); if (!((formatConfig instanceof ParquetFormatConfig) || ((formatConfig instanceof NamedFormatPluginConfig) && ((NamedFormatPluginConfig) formatConfig).getName().equals("parquet")))) { return new ImmutablePair<>(false, null); } FileSystemPlugin plugin = (FileSystemPlugin) drillTable.getPlugin(); DrillFileSystem fs; try { fs = new DrillFileSystem(plugin.getFormatPlugin(formatSelection.getFormat()).getFsConf()); } catch (IOException e) { logger.warn("Unable to create the file system object for retrieving statistics from metadata cache file ", e); return new ImmutablePair<>(false, null); } // check if the cacheFileRoot has been set: this is needed because after directory pruning, the // cacheFileRoot could have been changed and not be the same as the original selectionRoot Path selectionRoot = formatSelection.getSelection().getCacheFileRoot() != null ? 
formatSelection.getSelection().getCacheFileRoot() : formatSelection.getSelection().getSelectionRoot(); ParquetReaderConfig parquetReaderConfig= ParquetReaderConfig.builder() .withFormatConfig((ParquetFormatConfig) formatConfig) .withOptions(settings.getOptions()) .build(); Metadata_V4.MetadataSummary metadataSummary = Metadata.getSummary(fs, selectionRoot, false, parquetReaderConfig); return metadataSummary != null ? new ImmutablePair<>(true, metadataSummary) : new ImmutablePair<>(false, null); } /** * Collects counts for each aggregation call by using the metadata summary information * Will return empty result map if was not able to determine count for at least one aggregation call. * * For each aggregate call will determine if count can be calculated. Collects counts only for COUNT function. * 1. First, we get the total row count from the metadata summary. * 2. For COUNT(*) and COUNT() and COUNT(), the count = total row count * 3. For COUNT(nullable column), count = (total row count - column's null count) * 4. Also count can not be calculated for parition columns. * 5. 
For the columns that are not present in the Summary(Non-existent columns), the count = 0 * * @param settings planner options * @param metadataSummary metadata summary containing row counts and column counts * @param agg aggregate relational expression * @param scan scan relational expression * @param project project relational expression * @return result map where key is count column name, value is count value */ private Map collectCounts(PlannerSettings settings, Metadata_V4.MetadataSummary metadataSummary, Aggregate agg, TableScan scan, Project project) { final Set implicitColumnsNames = ColumnExplorer.initImplicitFileColumns(settings.getOptions()).keySet(); final long totalRecordCount = metadataSummary.getTotalRowCount(); final LinkedHashMap result = new LinkedHashMap<>(); for (int i = 0; i < agg.getAggCallList().size(); i++) { AggregateCall aggCall = agg.getAggCallList().get(i); long cnt; // rule can be applied only for count function, return empty counts if (!"count".equalsIgnoreCase(aggCall.getAggregation().getName()) ) { return ImmutableMap.of(); } if (CountToDirectScanUtils.containsStarOrNotNullInput(aggCall, agg)) { cnt = totalRecordCount; } else if (aggCall.getArgList().size() == 1) { // count(columnName) ==> Agg ( Scan )) ==> columnValueCount int index = aggCall.getArgList().get(0); if (project != null) { // project in the middle of Agg and Scan : Only when input of AggCall is a RexInputRef in Project, we find the index of Scan's field. // For instance, // Agg - count($0) // \ // Proj - Exp={$1} // \ // Scan (col1, col2). // return count of "col2" in Scan's metadata, if found. if (!(project.getProjects().get(index) instanceof RexInputRef)) { return ImmutableMap.of(); // do not apply for all other cases. 
} index = ((RexInputRef) project.getProjects().get(index)).getIndex(); } String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase(); // for implicit column count will be the same as total record count if (implicitColumnsNames.contains(columnName)) { cnt = totalRecordCount; } else { SchemaPath simplePath = SchemaPath.getSimplePath(columnName); if (ColumnExplorer.isPartitionColumn(settings.getOptions(), simplePath)) { return ImmutableMap.of(); } Metadata_V4.ColumnTypeMetadata_v4 columnMetadata = metadataSummary.getColumnTypeInfo(new Metadata_V4.ColumnTypeMetadata_v4.Key(simplePath)); if (columnMetadata == null) { // If the column doesn't exist in the table, row count is set to 0 cnt = 0; } else if (columnMetadata.totalNullCount == Statistic.NO_COLUMN_STATS) { // if column stats is not available don't apply this rule, return empty counts return ImmutableMap.of(); } else { // count of a nullable column = (total row count - column's null count) cnt = totalRecordCount - columnMetadata.totalNullCount; } } } else { return ImmutableMap.of(); } String name = "count" + i + "$" + (aggCall.getName() == null ? aggCall.toString() : aggCall.getName()); result.put(name, cnt); } return ImmutableMap.copyOf(result); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy