/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.source;

import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.index.bucket.BucketIdentifier;
import org.apache.hudi.source.prune.ColumnStatsProbe;
import org.apache.hudi.source.prune.PartitionPruners;
import org.apache.hudi.source.prune.PrimaryKeyPruners;
import org.apache.hudi.source.stats.FileStatsIndex;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.logical.RowType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * A file index which supports listing files efficiently through the metadata table.
 *
 * <p>It caches the partition paths to avoid redundant lookups.
 */
public class FileIndex implements Serializable {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(FileIndex.class);

  private final StoragePath path;
  private final boolean tableExists;
  private final HoodieMetadataConfig metadataConfig;
  private final org.apache.hadoop.conf.Configuration hadoopConf;
  private final PartitionPruners.PartitionPruner partitionPruner; // for partition pruning
  private final ColumnStatsProbe colStatsProbe; // for probing column stats
  private final int dataBucket; // for bucket pruning
  private List<String> partitionPaths; // cache of partition paths
  private final FileStatsIndex fileStatsIndex; // for data skipping
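
  // Note: the fields above combine three independent pruning dimensions, each of
  // which may be disabled: partition pruning (partitionPruner), data skipping based
  // on column stats (colStatsProbe + fileStatsIndex) and bucket pruning (dataBucket).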

  private FileIndex(
      StoragePath path,
      Configuration conf,
      RowType rowType,
      ColumnStatsProbe colStatsProbe,
      PartitionPruners.PartitionPruner partitionPruner,
      int dataBucket) {
    this.path = path;
    this.hadoopConf = HadoopConfigurations.getHadoopConf(conf);
    this.tableExists = StreamerUtil.tableExists(path.toString(), hadoopConf);
    this.metadataConfig = StreamerUtil.metadataConfig(conf);
    // drop the column stats probe when data skipping is not feasible,
    // e.g. when the read option or the required metadata table options are disabled
    this.colStatsProbe = isDataSkippingFeasible(conf.get(FlinkOptions.READ_DATA_SKIPPING_ENABLED)) ? colStatsProbe : null;
    this.partitionPruner = partitionPruner;
    this.dataBucket = dataBucket;
    this.fileStatsIndex = new FileStatsIndex(path.toString(), rowType, metadataConfig);
  }

  /**
   * Returns the builder.
   */
  public static Builder builder() {
    return new Builder();
  }
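
  // A minimal usage sketch. Assumptions for illustration: the fluent setter names
  // below mirror the constructor parameters and are defined on the Builder at the
  // bottom of this class; the path and configuration values are hypothetical.
  //
  //   FileIndex fileIndex = FileIndex.builder()
  //       .path(new StoragePath("/tmp/hoodie_table"))          // table base path (hypothetical)
  //       .conf(flinkConf)                                     // Flink Configuration with read options
  //       .rowType(rowType)                                    // table row type, used for column stats
  //       .partitionPruner(null)                               // no partition pruning
  //       .dataBucket(PrimaryKeyPruners.BUCKET_ID_NO_PRUNING)  // no bucket pruning
  //       .build();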

  /**
   * Returns the partition path keys and values as a list of maps, where each map item in the
   * list is a mapping of the partition key name to its actual partition value. For example,
   * say there is a file path with partition keys [key1, key2, key3]:
   *
   * <p>The returned list should be [{key1:val1, key2:val2, key3:val3}, {key1:val4, key2:val5, key3:val6}].
   *
   * @param partitionKeys  The partition key list
   * @param defaultParName The default partition name for nulls
   * @param hivePartition  Whether the partition path is in Hive style
   */
  public List<Map<String, String>> getPartitions(