/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.base;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.drill.common.expression.ErrorCollector;
import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.ExpressionStringBuilder;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.expression.ValueExpressions;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.compile.sig.ConstantExpressionIdentifier;
import org.apache.drill.exec.exception.MetadataException;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
import org.apache.drill.exec.expr.FilterBuilder;
import org.apache.drill.exec.expr.FilterPredicate;
import org.apache.drill.exec.expr.fn.FunctionImplementationRegistry;
import org.apache.drill.exec.expr.fn.FunctionLookupContext;
import org.apache.drill.exec.expr.stat.RowsMatch;
import org.apache.drill.exec.metastore.MetadataProviderManager;
import org.apache.drill.exec.metastore.analyze.FileMetadataInfoCollector;
import org.apache.drill.exec.ops.OptimizerRulesContext;
import org.apache.drill.exec.ops.UdfUtilities;
import org.apache.drill.exec.physical.impl.scan.v3.FixedReceiver;
import org.apache.drill.exec.planner.physical.PlannerSettings;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.metadata.ColumnMetadata;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.record.metadata.TupleSchema;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.ColumnExplorer;
import org.apache.drill.exec.store.dfs.DrillFileSystem;
import org.apache.drill.exec.store.dfs.FileSelection;
import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils;
import org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils;
import org.apache.drill.exec.util.ImpersonationUtil;
import org.apache.drill.metastore.metadata.BaseMetadata;
import org.apache.drill.metastore.metadata.FileMetadata;
import org.apache.drill.metastore.metadata.LocationProvider;
import org.apache.drill.metastore.metadata.Metadata;
import org.apache.drill.metastore.metadata.MetadataType;
import org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata;
import org.apache.drill.metastore.metadata.PartitionMetadata;
import org.apache.drill.metastore.metadata.SegmentMetadata;
import org.apache.drill.metastore.metadata.TableMetadata;
import org.apache.drill.metastore.metadata.TableMetadataProvider;
import org.apache.drill.metastore.metadata.TableMetadataProviderBuilder;
import org.apache.drill.metastore.statistics.ColumnStatistics;
import org.apache.drill.metastore.statistics.ColumnStatisticsKind;
import org.apache.drill.metastore.statistics.Statistic;
import org.apache.drill.metastore.statistics.TableStatisticsKind;
import org.apache.drill.metastore.util.SchemaPathUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.drill.exec.ExecConstants.SKIP_RUNTIME_ROWGROUP_PRUNING_KEY;

/**
 * Represents a table group scan with metadata usage.
 */
public abstract class AbstractGroupScanWithMetadata<P extends TableMetadataProvider>
    extends AbstractFileGroupScan {
  static final Logger logger = LoggerFactory.getLogger(AbstractGroupScanWithMetadata.class);

  protected P metadataProvider;

  // table metadata info
  protected TableMetadata tableMetadata;

  // partition metadata info: mixed partition values for all partition keys in the same list
  protected List<PartitionMetadata> partitions;

  protected Map<Path, SegmentMetadata> segments;

  protected NonInterestingColumnsMetadata nonInterestingColumnsMetadata;
  protected List<SchemaPath> partitionColumns;
  protected LogicalExpression filter;
  protected List<SchemaPath> columns;

  protected Map<Path, FileMetadata> files;

  // set of the files to be handled
  protected Set<Path> fileSet;

  // whether all files, partitions or row groups of this group scan fully match the filter
  protected boolean matchAllMetadata;

  protected boolean usedMetastore; // false by default

  // The pushed-down limit. LIMIT 0 is valid: it means to return only the schema.
  // The default is -1, which means unlimited. Note that, in "Big Data", a table
  // might have more than 2B rows, so we cannot use Integer.MAX_VALUE as the limit.
  protected int limit = -1;

  protected AbstractGroupScanWithMetadata(String userName, List<SchemaPath> columns, LogicalExpression filter) {
    super(userName);
    this.columns = columns;
    this.filter = filter;
  }

  protected AbstractGroupScanWithMetadata(AbstractGroupScanWithMetadata<P> that) {
    super(that.getUserName());
    this.columns = that.columns;
    this.filter = that.filter;
    this.matchAllMetadata = that.matchAllMetadata;
    this.metadataProvider = that.metadataProvider;
    this.tableMetadata = that.tableMetadata;
    this.partitionColumns = that.partitionColumns;
    this.partitions = that.partitions;
    this.segments = that.segments;
    this.files = that.files;
    this.usedMetastore = that.usedMetastore;
    this.nonInterestingColumnsMetadata = that.nonInterestingColumnsMetadata;
    this.fileSet = that.fileSet == null ? null : new HashSet<>(that.fileSet);
    this.limit = that.limit;
  }

  @JsonProperty("columns")
  @Override
  public List<SchemaPath> getColumns() {
    return columns;
  }

  @Override
  public Collection<Path> getFiles() {
    return fileSet;
  }

  @Override
  public boolean hasFiles() {
    return true;
  }

  @JsonProperty("limit")
  public int getLimit() {
    return limit;
  }

  @JsonIgnore
  public boolean isMatchAllMetadata() {
    return matchAllMetadata;
  }

  /**
   * Returns the column value count for the specified column. If the table does not
   * contain the column, returns 0. Used when applying the convert-to-direct-scan rule.
   *
   * @param column column schema path
   * @return column value count
   */
  @Override
  public long getColumnValueCount(SchemaPath column) {
    ColumnStatistics<?> columnStats = getTableMetadata().getColumnStatistics(column);
    ColumnStatistics<?> nonInterestingColStats = columnStats == null
        ? getNonInterestingColumnsMetadata().getColumnStatistics(column) : null;

    long tableRowCount;
    if (columnStats != null) {
      tableRowCount = TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata());
    } else if (nonInterestingColStats != null) {
      tableRowCount = TableStatisticsKind.ROW_COUNT.getValue(getNonInterestingColumnsMetadata());
      columnStats = nonInterestingColStats;
    } else if (hasNestedStatsForColumn(column, getTableMetadata())
        || hasNestedStatsForColumn(column, getNonInterestingColumnsMetadata())) {
      // When statistics for a nested field exist, this is a complex column which is present
      // in the table, but its nested fields' statistics can't be used to extract tableRowCount
      // for this column. So NO_COLUMN_STATS is returned here to avoid the problems described
      // in DRILL-7491.
      return Statistic.NO_COLUMN_STATS;
    } else {
      return 0; // the column doesn't exist in the table
    }

    Long nulls = ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStats);
    if (nulls == null || Statistic.NO_COLUMN_STATS == nulls || Statistic.NO_COLUMN_STATS == tableRowCount) {
      return Statistic.NO_COLUMN_STATS;
    } else {
      return tableRowCount - nulls;
    }
  }

  /**
   * For complex columns, stats may be present only for nested fields. For example, a column
   * path is `a`, but stats are present for `a`.`b`. So before deciding that the column is
   * absent, this case needs to be tested.
   *
   * @param column   parent column path
   * @param metadata metadata with column statistics
   * @return whether stats exist for nested fields
   */
  private boolean hasNestedStatsForColumn(SchemaPath column, Metadata metadata) {
    return metadata.getColumnsStatistics().keySet().stream()
        .anyMatch(path -> path.contains(column));
  }

  @Override
  public String getDigest() {
    return toString();
  }

  @Override
  public ScanStats getScanStats() {
    int columnCount = columns == null ? 20 : columns.size();
    double rowCount = TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata());

    ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, rowCount, 1, rowCount * columnCount);
    logger.trace("Drill parquet scan statistics: {}", scanStats);
    return scanStats;
  }

  // filter push down methods block start
  @JsonProperty("filter")
  @Override
  public LogicalExpression getFilter() {
    return filter;
  }

  @Override
  public P getMetadataProvider() {
    return metadataProvider;
  }

  public void setFilter(LogicalExpression filter) {
    this.filter = filter;
  }

  /**
   * Sets the filter, thus enabling runtime row group pruning.
   * Runtime pruning can be disabled with an option.
   *
   * @param filterExpr       the filter to be used at runtime to match against row groups' footers
   * @param optimizerContext the context for the options
   */
  public void setFilterForRuntime(LogicalExpression filterExpr, OptimizerRulesContext optimizerContext) {
    OptionManager options = optimizerContext.getPlannerSettings().getOptions();
    boolean skipRuntimePruning = options.getBoolean(SKIP_RUNTIME_ROWGROUP_PRUNING_KEY);
    // if the option is set to disable runtime pruning
    if (!skipRuntimePruning) {
      setFilter(filterExpr);
    }
  }
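  // Usage note (illustrative sketch, not part of the original source): runtime row group
  // pruning is controlled by the session/system option behind SKIP_RUNTIME_ROWGROUP_PRUNING_KEY;
  // the exact key string is an assumption here, see ExecConstants for the authoritative value:
  //
  //   ALTER SESSION SET `exec.storage.skip_runtime_rowgroup_pruning` = true;
  //
  // With the option enabled, setFilterForRuntime() above becomes a no-op and the filter is
  // applied only at planning time.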

    *
  • table level: *
    • if filter matches all the the data or prunes all the data, sets corresponding value to * {@link AbstractGroupScanWithMetadata#isMatchAllMetadata()} and returns null
  • *
  • segment level: *
    • if filter matches all the the data or prunes all the data, sets corresponding value to * {@link AbstractGroupScanWithMetadata#isMatchAllMetadata()} and returns null
    • *
    • if segment metadata was pruned, prunes underlying metadata
  • *
  • partition level: *
    • if filter matches all the the data or prunes all the data, sets corresponding value to * {@link AbstractGroupScanWithMetadata#isMatchAllMetadata()} and returns null
    • *
    • if partition metadata was pruned, prunes underlying metadata
  • *
  • file level: *
    • if filter matches all the the data or prunes all the data, sets corresponding value to * {@link AbstractGroupScanWithMetadata#isMatchAllMetadata()} and returns null
  • *
* * @param filterExpr filter expression to build * @param udfUtilities udf utilities * @param functionImplementationRegistry context to find drill function holder * @param optionManager option manager * @return group scan with applied filter expression */ @Override public AbstractGroupScanWithMetadata applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) { // Builds filter for pruning. If filter cannot be built, null should be returned. FilterPredicate filterPredicate = getFilterPredicate(filterExpr, udfUtilities, functionImplementationRegistry, optionManager, true); if (filterPredicate == null) { logger.debug("FilterPredicate cannot be built."); return null; } GroupScanWithMetadataFilterer filteredMetadata = getFilterer() .filterExpression(filterExpr) .schema(tableMetadata.getSchema()) .context(functionImplementationRegistry) .udfUtilities(udfUtilities) .getFiltered(optionManager, filterPredicate); if (isGroupScanFullyMatchesFilter(filteredMetadata)) { logger.debug("applyFilter() does not have any pruning since GroupScan fully matches filter"); matchAllMetadata = filteredMetadata.isMatchAllMetadata(); return null; } if (isAllDataPruned(filteredMetadata)) { if (getFilesMetadata().size() == 1) { // For the case when group scan has single file and it was filtered, // no need to create new group scan with the same file. return null; } logger.debug("All files have been filtered out. Add back one to get schema from scanner"); Map filesMap = getNextOrEmpty(getFilesMetadata().values()).stream() .collect(Collectors.toMap(FileMetadata::getPath, Function.identity())); Map segmentsMap = getNextOrEmpty(getSegmentsMetadata().values()).stream() .collect(Collectors.toMap(SegmentMetadata::getPath, Function.identity())); filteredMetadata.table(getTableMetadata()) .segments(segmentsMap) .partitions(getNextOrEmpty(getPartitionsMetadata())) .files(filesMap) .nonInterestingColumns(getNonInterestingColumnsMetadata()) .matching(false); } return filteredMetadata.build(); } protected boolean isAllDataPruned(GroupScanWithMetadataFilterer filteredMetadata) { return !filteredMetadata.isMatchAllMetadata() // filter returns empty result using table metadata && (filteredMetadata.getTableMetadata() == null && getTableMetadata() != null) // all partitions are pruned if segment metadata is available || filteredMetadata.getSegments().isEmpty() && !getSegmentsMetadata().isEmpty() // all segments are pruned if partition metadata is available || filteredMetadata.getPartitions().isEmpty() && !getPartitionsMetadata().isEmpty() // all files are pruned if file metadata is available || filteredMetadata.getFiles().isEmpty() && !getFilesMetadata().isEmpty(); } protected boolean isGroupScanFullyMatchesFilter(GroupScanWithMetadataFilterer filteredMetadata) { if (MapUtils.isNotEmpty(getFilesMetadata())) { return getFilesMetadata().size() == filteredMetadata.getFiles().size(); } else if (CollectionUtils.isNotEmpty(getPartitionsMetadata())) { return getPartitionsMetadata().size() == filteredMetadata.getPartitions().size(); } else if (MapUtils.isNotEmpty(getSegmentsMetadata())) { return getSegmentsMetadata().size() == filteredMetadata.getSegments().size(); } else { return getTableMetadata() != null; } } /** * Returns list with the first element of input list or empty list if input one was empty. 
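  // Illustrative sketch (hypothetical caller, not part of this file): a filter push-down
  // planner rule would typically invoke applyFilter() with a rewritten condition and keep
  // the old scan when null is returned, i.e. when no pruning happened. The variable names
  // below (oldScan, conditionExpr, functionRegistry) are assumptions:
  //
  //   AbstractGroupScanWithMetadata<?> newScan =
  //       oldScan.applyFilter(conditionExpr, udfUtilities, functionRegistry, optionManager);
  //   GroupScan result = newScan == null ? oldScan : newScan;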
  /**
   * Returns a list with the first element of the input collection,
   * or an empty list if the input was empty.
   *
   * @param inputList the source of the first element
   * @param <T>       type of values in the list
   * @return list with the first element of the input collection
   */
  protected <T> List<T> getNextOrEmpty(Collection<T> inputList) {
    return CollectionUtils.isNotEmpty(inputList)
        ? Collections.singletonList(inputList.iterator().next())
        : Collections.emptyList();
  }

  /**
   * Returns a holder for metadata values which provides an API to filter metadata
   * and build a new group scan instance using the filtered metadata.
   */
  protected abstract GroupScanWithMetadataFilterer<?> getFilterer();

  public FilterPredicate getFilterPredicate(LogicalExpression filterExpr,
      UdfUtilities udfUtilities, FunctionLookupContext functionImplementationRegistry,
      OptionManager optionManager, boolean omitUnsupportedExprs) {
    return getFilterPredicate(filterExpr, udfUtilities, functionImplementationRegistry, optionManager,
        omitUnsupportedExprs, supportsFileImplicitColumns(), getTableMetadata().getSchema());
  }

  /**
   * Returns parquet filter predicate built from specified {@code filterExpr}.
   *
   * @param filterExpr                     filter expression to build
   * @param udfUtilities                   udf utilities
   * @param functionImplementationRegistry context to find drill function holder
   * @param optionManager                  option manager
   * @param omitUnsupportedExprs           whether expressions which cannot be converted
   *                                       may be omitted from the resulting expression
   * @param supportsFileImplicitColumns    whether implicit columns are supported
   * @param schema                         schema
   * @return parquet filter predicate
   */
  public static FilterPredicate getFilterPredicate(LogicalExpression filterExpr,
      UdfUtilities udfUtilities, FunctionLookupContext functionImplementationRegistry,
      OptionManager optionManager, boolean omitUnsupportedExprs,
      boolean supportsFileImplicitColumns, TupleMetadata schema) {
    TupleMetadata types = schema.copy();

    Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(FilterEvaluatorUtils.FieldReferenceFinder.INSTANCE, null);

    // adds implicit or partition columns if they weren't added before
    if (supportsFileImplicitColumns) {
      for (SchemaPath schemaPath : schemaPathsInExpr) {
        if (isImplicitOrPartCol(schemaPath, optionManager) && SchemaPathUtils.getColumnMetadata(schemaPath, types) == null) {
          types.add(MaterializedField.create(schemaPath.getRootSegmentPath(), Types.required(TypeProtos.MinorType.VARCHAR)));
        }
      }
    }

    ErrorCollector errorCollector = new ErrorCollectorImpl();
    LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(
        filterExpr, types, errorCollector, functionImplementationRegistry);

    if (errorCollector.hasErrors()) {
      logger.error("{} error(s) encountered when materialize filter expression : {}",
          errorCollector.getErrorCount(), errorCollector.toErrorString());
      return null;
    }
    if (logger.isDebugEnabled()) {
      logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
    }

    Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
    return FilterBuilder.buildFilterPredicate(materializedFilter, constantBoundaries, udfUtilities, omitUnsupportedExprs);
  }
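  // Illustrative sketch (hypothetical values): the static getFilterPredicate() above can be
  // used outside applyFilter(), e.g. for runtime row group matching. The names filterExpr,
  // registry and schema are assumptions, not fields of this class:
  //
  //   FilterPredicate predicate = AbstractGroupScanWithMetadata.getFilterPredicate(
  //       filterExpr, udfUtilities, registry, optionManager,
  //       true /* omitUnsupportedExprs */, true /* supportsFileImplicitColumns */, schema);
  //   // a null result means the filter cannot be converted, so no pruning is possible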
  @JsonProperty
  public TupleMetadata getSchema() {
    // creates a copy of TupleMetadata from tableMetadata
    TupleMetadata tuple = new TupleSchema();
    for (ColumnMetadata md : getTableMetadata().getSchema()) {
      tuple.addColumn(md.copy());
    }
    return tuple;
  }

  // limit push down methods start
  @Override
  public boolean supportsLimitPushdown() {
    return true;
  }

  @Override
  public GroupScan applyLimit(int maxRecords) {
    // LIMIT 0 is supported by EVF to return schema only.
    maxRecords = Math.max(maxRecords, 0);

    GroupScanWithMetadataFilterer<?> prunedMetadata = getFilterer();
    if (getTableMetadata() != null) {
      long tableRowCount = TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata());
      if (tableRowCount == Statistic.NO_COLUMN_STATS || tableRowCount <= maxRecords) {
        logger.debug("limit push down does not apply, since total number of rows [{}] is less or equal to the required [{}].",
            tableRowCount, maxRecords);
        // Return the group scan with the limit pushed down
        if (this.limit != maxRecords) {
          prunedMetadata.limit(maxRecords);
          return prunedMetadata.build();
        }
        return null;
      }
    }

    // Calculate the number of files to read based on maxRecords and update
    // the number of records to read for each of those files.
    List<FileMetadata> qualifiedFiles = limitMetadata(getFilesMetadata().values(), maxRecords);

    // some files do not have a row count set, so do not prune files
    if (qualifiedFiles == null || qualifiedFiles.size() == getFilesMetadata().size()) {
      logger.debug("limit push down does not apply, since number of files was not reduced.");
      // Return the group scan with the limit pushed down
      if (this.limit != maxRecords) {
        prunedMetadata.limit(maxRecords);
        return prunedMetadata.build();
      }
      return null;
    }

    Map<Path, FileMetadata> filesMap = qualifiedFiles.stream()
        .collect(Collectors.toMap(FileMetadata::getPath, Function.identity()));

    return prunedMetadata
        .table(getTableMetadata())
        .segments(getSegmentsMetadata())
        .partitions(getPartitionsMetadata())
        .files(filesMap)
        .limit(maxRecords)
        .nonInterestingColumns(getNonInterestingColumnsMetadata())
        .matching(matchAllMetadata)
        .build();
  }

  /**
   * Removes metadata which does not belong to any of the partitions in the metadata list.
   *
   * @param metadataToPrune           map of metadata which should be pruned
   * @param filteredPartitionMetadata list of partition metadata which was pruned
   * @param <T>                       type of metadata to filter
   * @return map with metadata which belongs to pruned partitions
   */
  protected static <T extends BaseMetadata> Map<Path, T> pruneForPartitions(Map<Path, T> metadataToPrune,
      List<PartitionMetadata> filteredPartitionMetadata) {
    Map<Path, T> prunedFiles = new LinkedHashMap<>();
    if (metadataToPrune != null) {
      metadataToPrune.forEach((path, metadata) -> {
        for (PartitionMetadata filteredPartition : filteredPartitionMetadata) {
          if (filteredPartition.getLocations().contains(path)) {
            prunedFiles.put(path, metadata);
            break;
          }
        }
      });
    }
    return prunedFiles;
  }

  /**
   * Prunes the specified metadata list, keeping the minimum number of metadata instances
   * whose total row count is not less than the specified {@code maxRecords}.
   *
   * @param metadataList list of metadata to prune
   * @param maxRecords   number of rows to leave
   * @param <T>          type of metadata to prune
   * @return pruned metadata list, or null if some metadata has no row count set
   */
  protected <T extends BaseMetadata> List<T> limitMetadata(Collection<T> metadataList, int maxRecords) {
    List<T> qualifiedMetadata = new ArrayList<>();
    long currentRowCount = 0;
    for (T metadata : metadataList) {
      long rowCount = TableStatisticsKind.ROW_COUNT.getValue(metadata);
      if (rowCount == Statistic.NO_COLUMN_STATS) {
        return null;
      } else if (currentRowCount + rowCount <= maxRecords) {
        currentRowCount += rowCount;
        qualifiedMetadata.add(metadata);
        continue;
      } else if (currentRowCount < maxRecords) {
        qualifiedMetadata.add(metadata);
      }
      break;
    }
    return qualifiedMetadata;
  }
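  // Worked example (illustrative): with files having row counts [100, 80, 50] and
  // maxRecords = 150, limitMetadata() keeps the first file (100 <= 150), cannot fit the
  // second within the limit (100 + 80 > 150) but still adds it because the running total
  // (100) is below 150, then stops: two files cover the requested 150 rows and the third
  // file is pruned.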
  // limit push down methods end

  // partition pruning methods start
  @Override
  public List<SchemaPath> getPartitionColumns() {
    if (partitionColumns == null) {
      partitionColumns = metadataProvider.getPartitionColumns();
      if (partitionColumns == null) {
        partitionColumns = new ArrayList<>();
      }
    }
    return partitionColumns;
  }

  @JsonIgnore
  public TypeProtos.MajorType getTypeForColumn(SchemaPath schemaPath) {
    ColumnMetadata columnMetadata = SchemaPathUtils.getColumnMetadata(schemaPath, getTableMetadata().getSchema());
    return columnMetadata != null ? columnMetadata.majorType() : null;
  }

  @JsonIgnore
  public <T> T getPartitionValue(Path path, SchemaPath column, Class<T> clazz) {
    return getPartitionsMetadata().stream()
        .filter(partition -> partition.getColumn().equals(column) && partition.getLocations().contains(path))
        .findAny()
        .map(metadata -> clazz.cast(metadata.getColumnsStatistics().get(column).get(ColumnStatisticsKind.MAX_VALUE)))
        .orElse(null);
  }

  @JsonIgnore
  public Set<Path> getFileSet() {
    return fileSet;
  }
  // partition pruning methods end
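  // Illustrative sketch (hypothetical path and directory column): getPartitionValue() above
  // reads the value recorded for a partition column of a given file, e.g. for a Drill
  // directory column `dir0`; the path and variable names are assumptions:
  //
  //   String dir0 = groupScan.getPartitionValue(
  //       new Path("/data/table/2024/0_0_0.parquet"),
  //       SchemaPath.getSimplePath("dir0"), String.class);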
"" : ExpressionStringBuilder.toString(this.filter); } protected abstract boolean supportsFileImplicitColumns(); protected abstract List getPartitionValues(LocationProvider locationProvider); public static boolean isImplicitOrPartCol(SchemaPath schemaPath, OptionManager optionManager) { Set implicitColNames = ColumnExplorer.initImplicitFileColumns(optionManager).keySet(); return ColumnExplorer.isPartitionColumn(optionManager, schemaPath) || implicitColNames.contains(schemaPath.getRootSegmentPath()); } @JsonIgnore public Map getFilesMetadata() { if (files == null) { files = metadataProvider.getFilesMetadataMap(); } return files; } @Override public TableMetadata getTableMetadata() { if (tableMetadata == null) { tableMetadata = metadataProvider.getTableMetadata(); } return tableMetadata; } @JsonIgnore public List getPartitionsMetadata() { if (partitions == null) { partitions = metadataProvider.getPartitionsMetadata(); } return partitions; } @JsonIgnore public Map getSegmentsMetadata() { if (segments == null) { segments = metadataProvider.getSegmentsMetadataMap(); } return segments; } @Override public boolean usedMetastore() { return usedMetastore; } @JsonIgnore public NonInterestingColumnsMetadata getNonInterestingColumnsMetadata() { if (nonInterestingColumnsMetadata == null) { nonInterestingColumnsMetadata = metadataProvider.getNonInterestingColumnsMetadata(); } return nonInterestingColumnsMetadata; } /** * Returns {@link TableMetadataProviderBuilder} instance based on specified * {@link MetadataProviderManager} source. * * @param source metadata provider manager * @return {@link TableMetadataProviderBuilder} instance */ protected abstract TableMetadataProviderBuilder tableMetadataProviderBuilder(MetadataProviderManager source); /** * Returns {@link TableMetadataProviderBuilder} instance which may provide metadata * without using Drill Metastore. * * @param source metadata provider manager * @return {@link TableMetadataProviderBuilder} instance */ protected abstract TableMetadataProviderBuilder defaultTableMetadataProviderBuilder(MetadataProviderManager source); /** * Compares the last modified time of files obtained from specified selection with * the Metastore last modified time to determine whether Metastore metadata * is up-to-date. If metadata is outdated, {@link MetadataException} will be thrown. * * @param selection the source of files to check * @throws MetadataException if metadata is outdated */ protected void checkMetadataConsistency(FileSelection selection, Configuration fsConf) throws IOException { if (metadataProvider.checkMetadataVersion()) { DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.resolveUserName(getUserName()), fsConf); List fileStatuses = FileMetadataInfoCollector.getFileStatuses(selection, fileSystem); long lastModifiedTime = metadataProvider.getTableMetadata().getLastModifiedTime(); Set removedFiles = new HashSet<>(metadataProvider.getFilesMetadataMap().keySet()); Set newFiles = new HashSet<>(); boolean isChanged = false; for (FileStatus fileStatus : fileStatuses) { if (!removedFiles.remove(Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()))) { newFiles.add(fileStatus.getPath()); } if (fileStatus.getModificationTime() > lastModifiedTime) { isChanged = true; break; } } if (isChanged || !removedFiles.isEmpty() || !newFiles.isEmpty()) { throw MetadataException.of(MetadataException.MetadataExceptionType.OUTDATED_METADATA); } } } /** * This class is responsible for filtering different metadata levels. 
  /**
   * This class is responsible for filtering different metadata levels.
   */
  protected abstract static class GroupScanWithMetadataFilterer<B extends GroupScanWithMetadataFilterer<B>> {
    protected final AbstractGroupScanWithMetadata<? extends TableMetadataProvider> source;

    protected boolean matchAllMetadata;

    protected TableMetadata tableMetadata;
    protected List<PartitionMetadata> partitions = Collections.emptyList();
    protected Map<Path, SegmentMetadata> segments = Collections.emptyMap();
    protected Map<Path, FileMetadata> files = Collections.emptyMap();
    protected NonInterestingColumnsMetadata nonInterestingColumnsMetadata;

    // required for rebuilding the filter expression for the case of schema change
    protected LogicalExpression filterExpression;
    protected TupleMetadata tableSchema;
    protected UdfUtilities udfUtilities;
    protected FunctionLookupContext context;

    protected int limit = -1;

    // for the case when filtering is possible for partitions, but the file count exceeds
    // PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD, a new group scan with at least
    // the filtered partitions and the files which belong to those partitions may be returned
    protected MetadataType overflowLevel = MetadataType.NONE;

    public GroupScanWithMetadataFilterer(AbstractGroupScanWithMetadata<? extends TableMetadataProvider> source) {
      this.source = source;
    }

    /**
     * Constructs the required implementation of {@link AbstractGroupScanWithMetadata} with filtered metadata.
     *
     * @return implementation of {@link AbstractGroupScanWithMetadata} with filtered metadata
     */
    public abstract AbstractGroupScanWithMetadata<?> build();

    public B table(TableMetadata tableMetadata) {
      this.tableMetadata = tableMetadata;
      return self();
    }

    public B partitions(List<PartitionMetadata> partitions) {
      this.partitions = partitions;
      return self();
    }

    public B segments(Map<Path, SegmentMetadata> segments) {
      this.segments = segments;
      return self();
    }

    public B nonInterestingColumns(NonInterestingColumnsMetadata nonInterestingColumns) {
      this.nonInterestingColumnsMetadata = nonInterestingColumns;
      return self();
    }

    public B files(Map<Path, FileMetadata> files) {
      this.files = files;
      return self();
    }

    public B limit(int maxRecords) {
      this.limit = maxRecords;
      return self();
    }

    public B matching(boolean matchAllMetadata) {
      this.matchAllMetadata = matchAllMetadata;
      return self();
    }

    public B overflow(MetadataType overflowLevel) {
      this.overflowLevel = overflowLevel;
      return self();
    }

    public B filterExpression(LogicalExpression filterExpression) {
      this.filterExpression = filterExpression;
      return self();
    }

    public B schema(TupleMetadata tableSchema) {
      this.tableSchema = tableSchema;
      return self();
    }

    public B udfUtilities(UdfUtilities udfUtilities) {
      this.udfUtilities = udfUtilities;
      return self();
    }

    public B context(FunctionLookupContext context) {
      this.context = context;
      return self();
    }

    public boolean isMatchAllMetadata() {
      return matchAllMetadata;
    }

    public TableMetadata getTableMetadata() {
      return tableMetadata;
    }

    public List<PartitionMetadata> getPartitions() {
      return partitions;
    }

    public Map<Path, SegmentMetadata> getSegments() {
      return segments;
    }

    public Map<Path, FileMetadata> getFiles() {
      return files;
    }

    public MetadataType getOverflowLevel() {
      return overflowLevel;
    }
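    // Illustrative sketch (hypothetical subclass, not part of this file): the B type
    // parameter is the self-referential builder pattern, so chained setters on a subclass
    // keep the subclass type. A concrete filterer only needs to supply build() and self():
    //
    //   class MyFilterer extends GroupScanWithMetadataFilterer<MyFilterer> {
    //     MyFilterer(AbstractGroupScanWithMetadata<?> source) { super(source); }
    //     @Override public AbstractGroupScanWithMetadata<?> build() { /* construct scan */ }
    //     @Override protected MyFilterer self() { return this; }
    //   }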
    /**
     * Produces filtering of metadata and returns a {@link GroupScanWithMetadataFilterer}
     * to construct the resulting group scan.
     *
     * @param optionManager   option manager
     * @param filterPredicate filter expression
     * @return this instance with filtered metadata
     */
    protected B getFiltered(OptionManager optionManager, FilterPredicate filterPredicate) {
      Objects.requireNonNull(filterExpression, "filterExpression was not set");
      Objects.requireNonNull(tableSchema, "tableSchema was not set");
      Objects.requireNonNull(udfUtilities, "udfUtilities were not set");
      Objects.requireNonNull(context, "context was not set");

      Set<SchemaPath> schemaPathsInExpr =
          filterExpression.accept(FilterEvaluatorUtils.FieldReferenceFinder.INSTANCE, null);

      if (source.getTableMetadata() != null) {
        filterTableMetadata(filterPredicate, schemaPathsInExpr);
      }

      if (source.getSegmentsMetadata() != null) {
        filterSegmentMetadata(optionManager, filterPredicate, schemaPathsInExpr);
      }

      if (source.getPartitionsMetadata() != null) {
        filterPartitionMetadata(optionManager, filterPredicate, schemaPathsInExpr);
      }

      if (source.getFilesMetadata() != null) {
        filterFileMetadata(optionManager, filterPredicate, schemaPathsInExpr);
      }
      return self();
    }

    /**
     * Produces filtering of metadata at table level.
     *
     * @param filterPredicate   filter expression
     * @param schemaPathsInExpr columns used in filter expression
     */
    protected void filterTableMetadata(FilterPredicate filterPredicate, Set<SchemaPath> schemaPathsInExpr) {
      // Filters table metadata. If the resulting list is empty, a single minimum entity of metadata should be used.
      // If the table matches fully, nothing is pruned and pruning of underlying metadata is stopped.
      matchAllMetadata = true;
      List<TableMetadata> filteredTableMetadata = filterAndGetMetadata(schemaPathsInExpr,
          Collections.singletonList(source.getTableMetadata()), filterPredicate, null);
      if (CollectionUtils.isNotEmpty(filteredTableMetadata)) {
        this.tableMetadata = filteredTableMetadata.get(0);
      }
    }

    /**
     * Produces filtering of metadata at segment level.
     *
     * @param optionManager     option manager
     * @param filterPredicate   filter expression
     * @param schemaPathsInExpr columns used in filter expression
     */
    protected void filterSegmentMetadata(OptionManager optionManager,
        FilterPredicate filterPredicate, Set<SchemaPath> schemaPathsInExpr) {
      if (!matchAllMetadata) {
        if (!source.getSegmentsMetadata().isEmpty()) {
          if (source.getSegmentsMetadata().size() <= optionManager.getOption(
              PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
            matchAllMetadata = true;
            segments = filterAndGetMetadata(schemaPathsInExpr,
                source.getSegmentsMetadata().values(), filterPredicate, optionManager).stream()
                    .collect(Collectors.toMap(
                        SegmentMetadata::getPath,
                        Function.identity(),
                        (first, second) -> second));
          } else {
            overflowLevel = MetadataType.SEGMENT;
          }
        }
      } else {
        segments = source.getSegmentsMetadata();
      }
    }
    /**
     * Produces filtering of metadata at partition level.
     *
     * @param optionManager     option manager
     * @param filterPredicate   filter expression
     * @param schemaPathsInExpr columns used in filter expression
     */
    protected void filterPartitionMetadata(OptionManager optionManager,
        FilterPredicate filterPredicate, Set<SchemaPath> schemaPathsInExpr) {
      List<PartitionMetadata> prunedPartitions;
      if (!source.getSegmentsMetadata().isEmpty()
          && source.getSegmentsMetadata().size() > getSegments().size()) {
        // prunes partitions to leave only partitions which are contained by pruned segments
        prunedPartitions = pruneForSegments(source.getPartitionsMetadata(), getSegments());
      } else {
        prunedPartitions = source.getPartitionsMetadata();
      }

      if (isMatchAllMetadata()) {
        partitions = prunedPartitions;
        return;
      }

      if (!source.getPartitionsMetadata().isEmpty()) {
        if (source.getPartitionsMetadata().size() <= optionManager.getOption(
            PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
          matchAllMetadata = true;
          partitions = filterAndGetMetadata(schemaPathsInExpr, prunedPartitions, filterPredicate, optionManager);
        } else {
          overflowLevel = MetadataType.PARTITION;
        }
      }
    }

    /**
     * Produces filtering of metadata at file level.
     *
     * @param optionManager     option manager
     * @param filterPredicate   filter expression
     * @param schemaPathsInExpr columns used in filter expression
     */
    protected void filterFileMetadata(OptionManager optionManager,
        FilterPredicate filterPredicate, Set<SchemaPath> schemaPathsInExpr) {
      Map<Path, FileMetadata> prunedFiles;
      if (!source.getPartitionsMetadata().isEmpty()
          && source.getPartitionsMetadata().size() > getPartitions().size()) {
        // prunes files to leave only files which are contained by pruned partitions
        prunedFiles = pruneForPartitions(source.getFilesMetadata(), getPartitions());
      } else if (!source.getSegmentsMetadata().isEmpty()
          && source.getSegmentsMetadata().size() > getSegments().size()) {
        // prunes files to leave only files which are contained by pruned segments
        prunedFiles = pruneForSegments(source.getFilesMetadata(), getSegments());
      } else {
        prunedFiles = source.getFilesMetadata();
      }

      if (isMatchAllMetadata()) {
        files = prunedFiles;
        return;
      }

      // Stop files pruning for the case:
      //    - # of files is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
      if (prunedFiles.size() <= optionManager.getOption(
          PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        matchAllMetadata = true;
        files = filterAndGetMetadata(schemaPathsInExpr, prunedFiles.values(), filterPredicate, optionManager).stream()
            .collect(Collectors.toMap(FileMetadata::getPath, Function.identity()));
      } else {
        matchAllMetadata = false;
        files = prunedFiles;
        overflowLevel = MetadataType.FILE;
      }
    }

    /**
     * Removes metadata which does not belong to any of the segments in the metadata map.
     *
     * @param metadataToPrune         map of metadata which should be pruned
     * @param filteredSegmentMetadata map of segment metadata which was pruned
     * @param <T>                     type of metadata to filter
     * @return map with metadata which belongs to pruned segments
     */
    protected static <T> Map<Path, T> pruneForSegments(
        Map<Path, T> metadataToPrune, Map<Path, SegmentMetadata> filteredSegmentMetadata) {
      Map<Path, T> prunedFiles = new HashMap<>();
      if (metadataToPrune != null) {
        for (Map.Entry<Path, T> entry : metadataToPrune.entrySet()) {
          for (SegmentMetadata filteredSegment : filteredSegmentMetadata.values()) {
            if (filteredSegment.getLocations().contains(entry.getKey())) {
              prunedFiles.put(entry.getKey(), entry.getValue());
              break;
            }
          }
        }
      }
      return prunedFiles;
    }
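    // Worked example (illustrative, threshold value assumed): if
    // PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD is 10000 and 25000 files remain
    // after segment/partition pruning, filterFileMetadata() above skips per-file statistics
    // evaluation, keeps the coarsely pruned file map as-is and records
    // overflowLevel = MetadataType.FILE, so the scan still benefits from the pruning that
    // was already performed at the higher metadata levels.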
    /**
     * Removes partition metadata which does not belong to any of the segments in the metadata map.
     *
     * @param metadataToPrune         list of partition metadata which should be pruned
     * @param filteredSegmentMetadata map of segment metadata which was pruned
     * @return list with metadata which belongs to pruned segments
     */
    protected List<PartitionMetadata> pruneForSegments(
        List<PartitionMetadata> metadataToPrune, Map<Path, SegmentMetadata> filteredSegmentMetadata) {
      List<PartitionMetadata> prunedPartitions = new ArrayList<>();
      if (metadataToPrune != null) {
        for (PartitionMetadata partition : metadataToPrune) {
          for (SegmentMetadata segment : filteredSegmentMetadata.values()) {
            if (!Collections.disjoint(segment.getLocations(), partition.getLocations())) {
              prunedPartitions.add(partition);
              break;
            }
          }
        }
      }
      return prunedPartitions;
    }

    /**
     * Produces filtering of specified metadata using the specified filter expression and returns filtered metadata.
     *
     * @param schemaPathsInExpr columns used in filter expression
     * @param metadataList      metadata to filter
     * @param filterPredicate   filter expression
     * @param optionManager     option manager
     * @param <T>               type of metadata to filter
     * @return filtered metadata
     */
    public <T extends Metadata> List<T> filterAndGetMetadata(Set<SchemaPath> schemaPathsInExpr,
        Iterable<T> metadataList, FilterPredicate filterPredicate, OptionManager optionManager) {
      List<T> qualifiedMetadata = new ArrayList<>();

      for (T metadata : metadataList) {
        TupleMetadata schema = metadata.getSchema();
        if (schema != null && !tableSchema.isEquivalent(schema)) {
          schema = FixedReceiver.Builder.mergeSchemas(schema, tableSchema);
          filterPredicate = getFilterPredicate(filterExpression, udfUtilities,
              context, optionManager, true, true, schema);
        }
        Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = metadata.getColumnsStatistics();

        // adds partition (dir) column statistics if it may be used during filter evaluation
        columnsStatistics = getImplicitColumnStatistics(optionManager, metadata, columnsStatistics);

        if (source.getNonInterestingColumnsMetadata() != null) {
          columnsStatistics.putAll(source.getNonInterestingColumnsMetadata().getColumnsStatistics());
        }
        RowsMatch match = FilterEvaluatorUtils.matches(filterPredicate,
            columnsStatistics, TableStatisticsKind.ROW_COUNT.getValue(metadata),
            schema, schemaPathsInExpr, udfUtilities);
        if (match == RowsMatch.NONE) {
          continue; // the file does not comply with the filter => drop the file
        }
        if (matchAllMetadata) {
          matchAllMetadata = match == RowsMatch.ALL;
        }
        qualifiedMetadata.add(metadata);
      }
      if (qualifiedMetadata.isEmpty()) {
        matchAllMetadata = false;
      }
      return qualifiedMetadata;
    }

    protected <T extends Metadata> Map<SchemaPath, ColumnStatistics<?>> getImplicitColumnStatistics(
        OptionManager optionManager, T metadata, Map<SchemaPath, ColumnStatistics<?>> columnsStatistics) {
      if (metadata instanceof LocationProvider && optionManager != null) {
        LocationProvider locationProvider = (LocationProvider) metadata;
        columnsStatistics = ParquetTableMetadataUtils.addImplicitColumnsStatistics(columnsStatistics,
            source.columns, source.getPartitionValues(locationProvider), optionManager,
            locationProvider.getPath(), source.supportsFileImplicitColumns());
      }
      return columnsStatistics;
    }

    protected abstract B self();
  }
}



