org.apache.hudi.source.FileIndex (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.source;

import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.index.bucket.BucketIdentifier;
import org.apache.hudi.source.prune.DataPruner;
import org.apache.hudi.source.prune.PartitionPruners;
import org.apache.hudi.source.prune.PrimaryKeyPruners;
import org.apache.hudi.source.stats.ColumnStatsIndices;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * A file index which supports listing files efficiently through metadata table.
 *
 * <p>It caches the partition paths to avoid redundant look up.
 */
public class FileIndex {

  private static final Logger LOG = LoggerFactory.getLogger(FileIndex.class);

  private final Path path;
  private final RowType rowType;
  private final boolean tableExists;
  private final HoodieMetadataConfig metadataConfig;
  private final org.apache.hadoop.conf.Configuration hadoopConf;
  private final PartitionPruners.PartitionPruner partitionPruner; // for partition pruning
  private final DataPruner dataPruner;                            // for data skipping
  private final int dataBucket;                                   // for bucket pruning
  private List<String> partitionPaths;                            // cache of partition paths

  private FileIndex(
      Path path,
      Configuration conf,
      RowType rowType,
      DataPruner dataPruner,
      PartitionPruners.PartitionPruner partitionPruner,
      int dataBucket) {
    this.path = path;
    this.rowType = rowType;
    this.hadoopConf = HadoopConfigurations.getHadoopConf(conf);
    this.tableExists = StreamerUtil.tableExists(path.toString(), hadoopConf);
    this.metadataConfig = metadataConfig(conf);
    this.dataPruner = isDataSkippingFeasible(conf.getBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED)) ? dataPruner : null;
    this.partitionPruner = partitionPruner;
    this.dataBucket = dataBucket;
  }

  /**
   * Returns the builder.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Returns the partition path keys and values as a list of maps, each map item in the list
   * is a mapping of the partition key name to its actual partition value. For example, say
   * there is a file path with partition keys [key1, key2, key3]:
   *
   * <pre>
   *   -- file:/// ... key1=val1/key2=val2/key3=val3
   *   -- file:/// ... key1=val4/key2=val5/key3=val6
   * </pre>
   *
   * <p>The returned list should be [{key1:val1, key2:val2, key3:val3}, {key1:val4, key2:val5, key3:val6}].
   *
   * @param partitionKeys  The partition key list
   * @param defaultParName The default partition name for nulls
   * @param hivePartition  Whether the partition path is in Hive style
   */
  public List<Map<String, String>> getPartitions(
      List<String> partitionKeys,
      String defaultParName,
      boolean hivePartition) {
    if (partitionKeys.size() == 0) {
      // non partitioned table
      return Collections.emptyList();
    }
    List<String> partitionPaths = getOrBuildPartitionPaths();
    if (partitionPaths.size() == 1 && partitionPaths.get(0).isEmpty()) {
      return Collections.emptyList();
    }
    List<Map<String, String>> partitions = new ArrayList<>();
    for (String partitionPath : partitionPaths) {
      String[] paths = partitionPath.split(Path.SEPARATOR);
      Map<String, String> partitionMapping = new LinkedHashMap<>();
      if (hivePartition) {
        Arrays.stream(paths).forEach(p -> {
          String[] kv = p.split("=");
          if (kv.length == 2) {
            partitionMapping.put(kv[0], defaultParName.equals(kv[1]) ? null : kv[1]);
          }
        });
      } else {
        for (int i = 0; i < partitionKeys.size(); i++) {
          partitionMapping.put(partitionKeys.get(i), defaultParName.equals(paths[i]) ? null : paths[i]);
        }
      }
      partitions.add(partitionMapping);
    }
    return partitions;
  }

  /**
   * Returns all the file statuses under the table base path.
   */
  public FileStatus[] getFilesInPartitions() {
    if (!tableExists) {
      return new FileStatus[0];
    }
    String[] partitions = getOrBuildPartitionPaths().stream()
        .map(p -> fullPartitionPath(path, p)).toArray(String[]::new);
    FileStatus[] allFiles = FSUtils.getFilesInPartitions(
            new HoodieFlinkEngineContext(hadoopConf), metadataConfig, path.toString(), partitions)
        .values().stream()
        .flatMap(Arrays::stream)
        .toArray(FileStatus[]::new);

    if (allFiles.length == 0) {
      // returns early for empty table.
      return allFiles;
    }

    // bucket pruning
    if (this.dataBucket >= 0) {
      String bucketIdStr = BucketIdentifier.bucketIdStr(this.dataBucket);
      FileStatus[] filesAfterBucketPruning = Arrays.stream(allFiles)
          .filter(fileStatus -> fileStatus.getPath().getName().contains(bucketIdStr))
          .toArray(FileStatus[]::new);
      logPruningMsg(allFiles.length, filesAfterBucketPruning.length, "bucket pruning");
      allFiles = filesAfterBucketPruning;
    }

    // data skipping
    Set<String> candidateFiles = candidateFilesInMetadataTable(allFiles);
    if (candidateFiles == null) {
      // no need to filter by col stats or error occurs.
      return allFiles;
    }

    FileStatus[] results = Arrays.stream(allFiles).parallel()
        .filter(fileStatus -> candidateFiles.contains(fileStatus.getPath().getName()))
        .toArray(FileStatus[]::new);
    logPruningMsg(allFiles.length, results.length, "data skipping");
    return results;
  }

  /**
   * Returns the full partition path.
   *
   * @param basePath      The base path.
   * @param partitionPath The relative partition path, may be empty if the table is non-partitioned.
   * @return The full partition path string
   */
  private static String fullPartitionPath(Path basePath, String partitionPath) {
    if (partitionPath.isEmpty()) {
      return basePath.toString();
    }
    return new Path(basePath, partitionPath).toString();
  }

  /**
   * Reset the state of the file index.
   */
  @VisibleForTesting
  public void reset() {
    this.partitionPaths = null;
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * Computes a pruned list of candidate base-files' names based on the provided list of data filter
   * conditions, by leveraging the Metadata Table's Column Statistics index (hereon referred to as
   * ColStats for brevity) bearing "min", "max", "num_nulls" statistics for all columns.
   *
   * <p>NOTE: This method has to return a complete set of candidate files, since only the provided
   * candidates will ultimately be scanned as part of query execution. Hence, this method has to
   * maintain the invariant of conservatively including every base-file's name that is NOT
   * referenced in its index.
   *
   * <p>The {@code filters} must all be simple.
   *
   * @return set of pruned (data-skipped) candidate base-files' names
   */
  @Nullable
  private Set<String> candidateFilesInMetadataTable(FileStatus[] allFileStatus) {
    if (dataPruner == null) {
      return null;
    }
    try {
      String[] referencedCols = dataPruner.getReferencedCols();
      final List<RowData> colStats = ColumnStatsIndices.readColumnStatsIndex(path.toString(), metadataConfig, referencedCols);
      final Pair<List<RowData>, String[]> colStatsTable =
          ColumnStatsIndices.transposeColumnStatsIndex(colStats, referencedCols, rowType);
      List<RowData> transposedColStats = colStatsTable.getLeft();
      String[] queryCols = colStatsTable.getRight();
      if (queryCols.length == 0) {
        // the indexed columns have no intersection with the referenced columns, returns early
        return null;
      }
      RowType.RowField[] queryFields = DataTypeUtils.projectRowFields(rowType, queryCols);

      Set<String> allIndexedFileNames = transposedColStats.stream().parallel()
          .map(row -> row.getString(0).toString())
          .collect(Collectors.toSet());
      Set<String> candidateFileNames = transposedColStats.stream().parallel()
          .filter(row -> dataPruner.test(row, queryFields))
          .map(row -> row.getString(0).toString())
          .collect(Collectors.toSet());

      // NOTE: Col-Stats Index isn't guaranteed to have complete set of statistics for every
      //       base-file: since it's bound to clustering, which could occur asynchronously
      //       at arbitrary point in time, and is not likely to be touching all the base files.
      //
      //       To close that gap, we manually compute the difference b/w all indexed (by col-stats-index)
      //       files and all outstanding base-files, and make sure that all base files not
      //       represented w/in the index are included in the output of this method
      Set<String> nonIndexedFileNames = Arrays.stream(allFileStatus)
          .map(fileStatus -> fileStatus.getPath().getName()).collect(Collectors.toSet());
      nonIndexedFileNames.removeAll(allIndexedFileNames);

      candidateFileNames.addAll(nonIndexedFileNames);
      return candidateFileNames;
    } catch (Throwable throwable) {
      LOG.warn("Read column stats for data skipping error", throwable);
      return null;
    }
  }

  /**
   * Returns all the relative partition paths.
   *
   * <p>The partition paths are cached once invoked.
   */
  public List<String> getOrBuildPartitionPaths() {
    if (this.partitionPaths != null) {
      return this.partitionPaths;
    }
    List<String> allPartitionPaths = this.tableExists
        ? FSUtils.getAllPartitionPaths(new HoodieFlinkEngineContext(hadoopConf), metadataConfig, path.toString())
        : Collections.emptyList();
    if (this.partitionPruner == null) {
      this.partitionPaths = allPartitionPaths;
    } else {
      Set<String> prunedPartitionPaths = this.partitionPruner.filter(allPartitionPaths);
      this.partitionPaths = new ArrayList<>(prunedPartitionPaths);
    }
    return this.partitionPaths;
  }

  public static HoodieMetadataConfig metadataConfig(org.apache.flink.configuration.Configuration conf) {
    Properties properties = new Properties();
    // set up metadata.enabled=true in table DDL to enable metadata listing
    properties.put(HoodieMetadataConfig.ENABLE.key(), conf.getBoolean(FlinkOptions.METADATA_ENABLED));
    return HoodieMetadataConfig.newBuilder().fromProperties(properties).build();
  }

  private boolean isDataSkippingFeasible(boolean dataSkippingEnabled) {
    // NOTE: Data Skipping is only effective when it references columns that are indexed w/in
    //       the Column Stats Index (CSI). The following cases could not be effectively handled by Data Skipping:
    //          - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since
    //            CSI only contains stats for top-level columns, in this case for "struct")
    //          - Any expression not directly referencing top-level column (for ex, sub-queries, since there's
    //            nothing CSI in particular could be applied for)
    if (dataSkippingEnabled) {
      if (metadataConfig.enabled()) {
        return true;
      } else {
        LOG.warn("Data skipping requires Metadata Table to be enabled! Disable the data skipping");
      }
    }
    return false;
  }

  private void logPruningMsg(int numTotalFiles, int numLeftFiles, String action) {
    LOG.info("\n"
        + "------------------------------------------------------------\n"
        + "---------- action:        {}\n"
        + "---------- total files:   {}\n"
        + "---------- left files:    {}\n"
        + "---------- skipping rate: {}\n"
        + "------------------------------------------------------------",
        action, numTotalFiles, numLeftFiles, percentage(numTotalFiles, numLeftFiles));
  }

  private static double percentage(double total, double left) {
    return (total - left) / total;
  }

  // -------------------------------------------------------------------------
  //  Inner Class
  // -------------------------------------------------------------------------

  /**
   * Builder for {@link FileIndex}.
   */
  public static class Builder {
    private Path path;
    private Configuration conf;
    private RowType rowType;
    private DataPruner dataPruner;
    private PartitionPruners.PartitionPruner partitionPruner;
    private int dataBucket = PrimaryKeyPruners.BUCKET_ID_NO_PRUNING;

    private Builder() {
    }

    public Builder path(Path path) {
      this.path = path;
      return this;
    }

    public Builder conf(Configuration conf) {
      this.conf = conf;
      return this;
    }

    public Builder rowType(RowType rowType) {
      this.rowType = rowType;
      return this;
    }

    public Builder dataPruner(DataPruner dataPruner) {
      this.dataPruner = dataPruner;
      return this;
    }

    public Builder partitionPruner(PartitionPruners.PartitionPruner partitionPruner) {
      this.partitionPruner = partitionPruner;
      return this;
    }

    public Builder dataBucket(int dataBucket) {
      this.dataBucket = dataBucket;
      return this;
    }

    public FileIndex build() {
      return new FileIndex(Objects.requireNonNull(path), Objects.requireNonNull(conf), Objects.requireNonNull(rowType),
          dataPruner, partitionPruner, dataBucket);
    }
  }
}
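
Below is a minimal, hypothetical usage sketch of the builder API defined above. The wrapper class FileIndexUsageSketch, the table path, the configuration values, and the two-column schema are illustrative assumptions rather than part of the Hudi sources; inside Hudi this wiring is done by the Flink table source, which also supplies the partition pruner, data pruner, and data bucket when filter push-down applies.

// Hypothetical usage sketch; not part of the FileIndex sources above.
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.source.FileIndex;

public class FileIndexUsageSketch {

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, "file:///tmp/hudi_table");      // hypothetical table base path
    conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);             // list files through the metadata table
    conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, false);  // no DataPruner is supplied below

    // Hypothetical read schema: uuid STRING, age INT
    RowType rowType = RowType.of(
        new LogicalType[] {new VarCharType(VarCharType.MAX_LENGTH), new IntType()},
        new String[] {"uuid", "age"});

    FileIndex fileIndex = FileIndex.builder()
        .path(new Path(conf.getString(FlinkOptions.PATH)))
        .conf(conf)
        .rowType(rowType)
        .build();                                                     // no pruners: plain file listing

    FileStatus[] files = fileIndex.getFilesInPartitions();            // empty array if the table does not exist
    System.out.println("files under table: " + files.length);
  }
}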