/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.source.prune;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.source.ExpressionEvaluators;
import org.apache.hudi.source.ExpressionEvaluators.Evaluator;
import org.apache.hudi.source.stats.ColumnStats;
import org.apache.hudi.source.stats.PartitionStatsIndex;
import org.apache.hudi.table.format.FilePathUtils;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.hudi.util.StreamerUtil;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Tools to prune partitions.
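*
* Illustrative builder usage (a sketch only; the evaluators, keys, types and option values below are
* placeholders, not taken from this file):
* <pre>{@code
*   PartitionPruners.PartitionPruner pruner = PartitionPruners.builder()
*       .conf(conf)                                   // Flink Configuration of the source
*       .rowType(rowType)                             // table row type
*       .basePath(basePath)                           // Hudi table base path
*       .partitionEvaluators(evaluators)              // evaluators over partition columns
*       .partitionKeys(Arrays.asList("dt"))           // partition column names
*       .partitionTypes(Arrays.asList(DataTypes.STRING()))
*       .defaultParName("__HIVE_DEFAULT_PARTITION__") // name used for null partition values
*       .hivePartition(true)                          // paths look like dt=2024-01-01
*       .build();
*   // build() returns null when no pruning rule applies.
*   Collection<String> remaining = pruner == null ? partitions : pruner.filter(partitions);
* }</pre>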
*/
public class PartitionPruners {
public interface PartitionPruner extends Serializable {
/**
* Applies partition pruning to the given partition list and returns the remaining partitions.
*/
Set<String> filter(Collection<String> partitions);
}
/**
* Dynamic partition pruner for the hoodie table source whose partition list only becomes available at runtime.
*
* Note: data in new partitions created after the job starts can still be read if they match the
* filter conditions.
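*
* Evaluation sketch for a single hive-style path (illustrative values, not from this file): each partition
* value is wrapped into a single-value {@link ColumnStats} (min = max = value, or a null count of 1 for the
* default partition name) and the partition is kept only if every evaluator accepts those stats.
* <pre>{@code
*   // partitionKeys = ["dt"], pushed-down predicate: dt >= '2024-01-01'
*   // "dt=2023-12-31"                 -> ColumnStats("2023-12-31", "2023-12-31", 0) -> pruned
*   // "dt=2024-01-05"                 -> ColumnStats("2024-01-05", "2024-01-05", 0) -> kept
*   // "dt=__HIVE_DEFAULT_PARTITION__" -> ColumnStats(null, null, 1)                 -> depends on the predicate
* }</pre>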
*/
public static class DynamicPartitionPruner implements PartitionPruner {
private static final long serialVersionUID = 1L;
private final List<Evaluator> partitionEvaluator;
private final String[] partitionKeys;
private final List<DataType> partitionTypes;
private final String defaultParName;
private final boolean hivePartition;
private DynamicPartitionPruner(
List<Evaluator> partitionEvaluators,
List<String> partitionKeys,
List<DataType> partitionTypes,
String defaultParName,
boolean hivePartition) {
this.partitionEvaluator = partitionEvaluators;
this.partitionKeys = partitionKeys.toArray(new String[] {});
this.partitionTypes = partitionTypes;
this.defaultParName = defaultParName;
this.hivePartition = hivePartition;
}
public Set<String> filter(Collection<String> partitions) {
return partitions.stream().filter(this::evaluate).collect(Collectors.toSet());
}
private boolean evaluate(String partition) {
String[] partStrArray = FilePathUtils.extractPartitionKeyValues(
new org.apache.hadoop.fs.Path(partition),
hivePartition,
partitionKeys).values().toArray(new String[] {});
Map<String, ColumnStats> partStats = new LinkedHashMap<>();
for (int idx = 0; idx < partitionKeys.length; idx++) {
String partKey = partitionKeys[idx];
Object partVal = partStrArray[idx].equals(defaultParName)
? null : DataTypeUtils.resolvePartition(partStrArray[idx], partitionTypes.get(idx));
ColumnStats columnStats = new ColumnStats(partVal, partVal, partVal == null ? 1 : 0);
partStats.put(partKey, columnStats);
}
return partitionEvaluator.stream().allMatch(evaluator -> evaluator.eval(partStats));
}
}
/**
* Static partition pruner for the hoodie table source whose partition list is already known at compile time.
* After this pruner is applied, the hoodie source cannot read data from any other partition at runtime.
*
* Note: data in new partitions created after the job starts will never be read.
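*
* Sketch (partition names are placeholders): the pruner simply keeps the intersection of the incoming
* partitions with the candidate set fixed at build time.
* <pre>{@code
*   PartitionPruners.PartitionPruner pruner = PartitionPruners.builder()
*       .candidatePartitions(Arrays.asList("dt=2024-01-01", "dt=2024-01-02"))
*       .build();
*   Set<String> kept = pruner.filter(Arrays.asList("dt=2024-01-01", "dt=2024-01-03")); // -> {"dt=2024-01-01"}
* }</pre>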
*/
public static class StaticPartitionPruner implements PartitionPruner {
private static final long serialVersionUID = 1L;
private final Set<String> partitions;
private StaticPartitionPruner(Collection<String> partitions) {
this.partitions = new HashSet<>(partitions);
}
public Set<String> filter(Collection<String> partitions) {
return partitions.stream()
.filter(this.partitions::contains).collect(Collectors.toSet());
}
}
/**
* ColumnStats partition pruner for the hoodie table source, used when the partition stats index is enabled.
*
* Note: data in new partitions created after the job starts can still be read if they match the
* filter conditions.
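*
* Pruning sketch (behavior of {@code filter} below): the partition stats index computes a candidate set for
* the probe; when it cannot decide it returns {@code null} and every input partition is kept.
* <pre>{@code
*   // candidatePartitions == null -> keep all input partitions (no pruning)
*   // candidatePartitions != null -> keep the intersection of input and candidates
* }</pre>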
*/
public static class ColumnStatsPartitionPruner implements PartitionPruner {
private static final long serialVersionUID = 1L;
private final ColumnStatsProbe probe;
private final PartitionStatsIndex partitionStatsIndex;
public ColumnStatsPartitionPruner(
RowType rowType,
String basePath,
HoodieMetadataConfig metadataConfig,
ColumnStatsProbe probe) {
this.probe = probe;
this.partitionStatsIndex = new PartitionStatsIndex(basePath, rowType, metadataConfig);
}
@Override
public Set<String> filter(Collection<String> partitions) {
Set<String> candidatePartitions = partitionStatsIndex.computeCandidatePartitions(probe, new ArrayList<>(partitions));
if (candidatePartitions == null) {
return new HashSet<>(partitions);
}
return partitions.stream().filter(candidatePartitions::contains).collect(Collectors.toSet());
}
}
/**
* Chained partition pruner for the hoodie table source that combines multiple partition pruners as a logical AND.
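*
* Sketch (pruner variables are placeholders): the pruners run in order, so the result is the set of
* partitions accepted by every pruner in the chain.
* <pre>{@code
*   PartitionPruner chained = new ChainedPartitionPruner(Arrays.asList(staticPruner, dynamicPruner));
*   // equivalent to: dynamicPruner.filter(staticPruner.filter(partitions))
* }</pre>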
*/
public static class ChainedPartitionPruner implements PartitionPruner {
private static final long serialVersionUID = 1L;
private final List<PartitionPruner> pruners;
public ChainedPartitionPruner(List<PartitionPruner> pruners) {
this.pruners = pruners;
}
@Override
public Set<String> filter(Collection<String> partitions) {
for (PartitionPruner pruner: pruners) {
partitions = pruner.filter(partitions);
}
return new HashSet<>(partitions);
}
}
public static Builder builder() {
return new Builder();
}
public static class Builder {
private RowType rowType;
private String basePath;
private Configuration conf;
private ColumnStatsProbe probe;
private List<Evaluator> partitionEvaluators;
private List<String> partitionKeys;
private List<DataType> partitionTypes;
private String defaultParName;
private boolean hivePartition;
private Collection<String> candidatePartitions;
private Builder() {
}
public Builder rowType(RowType rowType) {
this.rowType = rowType;
return this;
}
public Builder basePath(String basePath) {
this.basePath = basePath;
return this;
}
public Builder conf(Configuration conf) {
this.conf = conf;
return this;
}
public Builder columnStatsProbe(ColumnStatsProbe probe) {
this.probe = probe;
return this;
}
public Builder partitionEvaluators(List<Evaluator> partitionEvaluators) {
this.partitionEvaluators = partitionEvaluators;
return this;
}
public Builder partitionKeys(List<String> partitionKeys) {
this.partitionKeys = partitionKeys;
return this;
}
public Builder partitionTypes(List<DataType> partitionTypes) {
this.partitionTypes = partitionTypes;
return this;
}
public Builder defaultParName(String defaultParName) {
this.defaultParName = defaultParName;
return this;
}
public Builder hivePartition(boolean hivePartition) {
this.hivePartition = hivePartition;
return this;
}
public Builder candidatePartitions(Collection<String> candidatePartitions) {
this.candidatePartitions = candidatePartitions;
return this;
}
public PartitionPruner build() {
PartitionPruner staticPruner = null;
if (candidatePartitions != null && !candidatePartitions.isEmpty()) {
staticPruner = new StaticPartitionPruner(candidatePartitions);
}
PartitionPruner dynamicPruner = null;
if (partitionEvaluators != null && !partitionEvaluators.isEmpty()) {
dynamicPruner = new DynamicPartitionPruner(partitionEvaluators, Objects.requireNonNull(partitionKeys),
Objects.requireNonNull(partitionTypes), Objects.requireNonNull(defaultParName),
hivePartition);
}
PartitionPruner columnStatsPruner = null;
if (probe != null
&& conf.get(FlinkOptions.READ_DATA_SKIPPING_ENABLED)
&& conf.get(FlinkOptions.METADATA_ENABLED)) {
columnStatsPruner = new ColumnStatsPartitionPruner(Objects.requireNonNull(rowType), Objects.requireNonNull(basePath),
StreamerUtil.metadataConfig(Objects.requireNonNull(conf)), probe);
}
List<PartitionPruner> partitionPruners =
Stream.of(staticPruner, dynamicPruner, columnStatsPruner)
.filter(Objects::nonNull)
.collect(Collectors.toList());
if (partitionPruners.isEmpty()) {
return null;
}
if (partitionPruners.size() < 2) {
return partitionPruners.get(0);
}
return new ChainedPartitionPruner(partitionPruners);
}
}
}