/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.source.stats;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
import org.apache.hudi.client.common.HoodieFlinkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.VisibleForTesting;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.common.util.collection.Tuple3;
import org.apache.hudi.common.util.hash.ColumnIndexID;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.metadata.HoodieMetadataPayload;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadataUtil;
import org.apache.hudi.source.prune.ColumnStatsProbe;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.hudi.util.AvroToRowDataConverters;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.hudi.util.FlinkClientUtil;
import org.apache.hudi.util.RowDataProjection;
import org.apache.avro.generic.GenericRecord;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.COL_STATS_DATA_TYPE;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.COL_STATS_TARGET_POS;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.METADATA_DATA_TYPE;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_COL_NAME;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_FILE_NAME;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_MAX_VAL;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_MIN_VAL;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_NULL_CNT;
import static org.apache.hudi.source.stats.ColumnStatsSchemas.ORD_VAL_CNT;
/**
* An index support implementation that leverages the Metadata Table's Column Stats Index to prune files.
* It includes utilities that abstract away the heavy lifting of interacting with the index,
* providing convenient interfaces to read and transpose it.
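*
* <p>A minimal usage sketch (the {@code probe} and {@code allFileNames} values here are
* hypothetical placeholders, normally derived from push-down filters and the table's file listing):
* <pre>{@code
*   FileStatsIndex index = new FileStatsIndex(basePath, rowType, metadataConfig);
*   Set<String> candidates = index.computeCandidateFiles(probe, allFileNames);
*   // a null result means pruning was skipped (no probe, or the index read failed)
* }</pre>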
*/
public class FileStatsIndex implements ColumnStatsIndex {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(FileStatsIndex.class);
private final RowType rowType;
private final String basePath;
private final HoodieMetadataConfig metadataConfig;
private HoodieTableMetadata metadataTable;
public FileStatsIndex(
String basePath,
RowType rowType,
HoodieMetadataConfig metadataConfig) {
this.basePath = basePath;
this.rowType = rowType;
this.metadataConfig = metadataConfig;
}
@Override
public String getIndexPartitionName() {
return HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS;
}
public HoodieTableMetadata getMetadataTable() {
// initialize the metadata table lazily
if (this.metadataTable == null) {
this.metadataTable = HoodieTableMetadata.create(
HoodieFlinkEngineContext.DEFAULT,
new HoodieHadoopStorage(basePath, FlinkClientUtil.getHadoopConf()),
metadataConfig,
basePath);
}
return this.metadataTable;
}
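/**
* Computes the candidate file names by testing each file's column stats against the given probe.
*
* <p>Returns {@code null} when the probe is null or when reading the column stats index fails,
* i.e. when no pruning can be applied.
*/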
@Override
public Set<String> computeCandidateFiles(ColumnStatsProbe probe, List<String> allFiles) {
if (probe == null) {
return null;
}
try {
String[] targetColumns = probe.getReferencedCols();
final List<RowData> statsRows = readColumnStatsIndexByColumns(targetColumns);
return candidatesInMetadataTable(probe, statsRows, allFiles);
} catch (Throwable t) {
LOG.warn("Failed to read {} for data skipping", getIndexPartitionName(), t);
return null;
}
}
@Override
public Set<String> computeCandidatePartitions(ColumnStatsProbe probe, List<String> allPartitions) {
throw new UnsupportedOperationException("This method is not supported by " + this.getClass().getSimpleName());
}
/**
* Computes a pruned list of candidate names based on the provided data filter conditions,
* by leveraging the Metadata Table's Column Statistics Index (hereon referred to as ColStats for brevity),
* which bears "min", "max" and "num_nulls" statistics for all indexed columns.
*
* NOTE: This method has to return the complete set of candidates, since only the provided candidates
* will ultimately be scanned as part of query execution. Hence, this method has to maintain the
* invariant of conservatively including every candidate name that is NOT referenced in the index.
*
* <p>The {@code filters} must all be simple.
*
* @param probe The column stats probe built from push-down filters.
* @param indexRows The raw column stats records.
* @param oriCandidates The original candidates to be pruned.
*
* @return set of pruned (data-skipped) candidate names
*/
protected Set<String> candidatesInMetadataTable(
@Nullable ColumnStatsProbe probe,
List<RowData> indexRows,
List<String> oriCandidates) {
if (probe == null) {
return null;
}
String[] referencedCols = probe.getReferencedCols();
final Pair<List<RowData>, String[]> colStatsTable =
transposeColumnStatsIndex(indexRows, referencedCols);
List<RowData> transposedColStats = colStatsTable.getLeft();
String[] queryCols = colStatsTable.getRight();
if (queryCols.length == 0) {
// the indexed columns have no intersection with the referenced columns, return early
return null;
}
RowType.RowField[] queryFields = DataTypeUtils.projectRowFields(rowType, queryCols);
Set<String> allIndexedFiles = transposedColStats.stream().parallel()
.map(row -> row.getString(0).toString())
.collect(Collectors.toSet());
Set<String> candidateFiles = transposedColStats.stream().parallel()
.filter(row -> probe.test(row, queryFields))
.map(row -> row.getString(0).toString())
.collect(Collectors.toSet());
// NOTE: Col-Stats Index isn't guaranteed to have complete set of statistics for every
// base-file: since it's bound to clustering, which could occur asynchronously
// at arbitrary point in time, and is not likely to be touching all the base files.
//
// To close that gap, we manually compute the difference b/w all indexed (by col-stats-index)
// files and all outstanding base-files, and make sure that all base files not
// represented w/in the index are included in the output of this method
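// NOTE: the removal below mutates the caller-provided candidate list in place to compute
// the set of files that are not covered by the index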
oriCandidates.removeAll(allIndexedFiles);
candidateFiles.addAll(oriCandidates);
return candidateFiles;
}
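/**
* Projects the nested column-stats struct out of the full metadata-table records,
* producing flat rows in the layout described by {@code COL_STATS_DATA_TYPE}.
*/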
private static List<RowData> projectNestedColStatsColumns(List<RowData> rows) {
int pos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos();
RowDataProjection projection = RowDataProjection.instanceV2((RowType) COL_STATS_DATA_TYPE.getLogicalType(), COL_STATS_TARGET_POS);
return rows.stream().parallel()
.map(row -> {
RowData columnStatsField = row.getRow(pos, 9);
return projection.project(columnStatsField);
}).collect(Collectors.toList());
}
/**
* Transposes and converts the raw table format of the Column Stats Index representation,
* where each row/record corresponds to an individual (column, file) pair, into a table format
* where each row corresponds to a single file, with the statistics for the individual columns
* collated w/in such row:
* <p>
* Metadata Table Column Stats Index format:
*
* <pre>
*  +---------------------------+------------+------------+------------+-------------+
*  | fileName                  | columnName |  minValue  |  maxValue  |  num_nulls  |
*  +---------------------------+------------+------------+------------+-------------+
*  | one_base_file.parquet     |          A |          1 |         10 |           0 |
*  | another_base_file.parquet |          A |        -10 |          0 |           5 |
*  +---------------------------+------------+------------+------------+-------------+
* </pre>
* <p>
* Returned table format:
*
* <pre>
*  +---------------------------+------------+------------+-------------+
*  | file                      | A_minValue | A_maxValue | A_nullCount |
*  +---------------------------+------------+------------+-------------+
*  | one_base_file.parquet     |          1 |         10 |           0 |
*  | another_base_file.parquet |        -10 |          0 |           5 |
*  +---------------------------+------------+------------+-------------+
* </pre>
* <p>
* NOTE: The Column Stats Index might potentially contain statistics for many columns (if not all),
* while the query at hand might only be referencing a handful of them. As such, we collect all the
* column references from the filtering expressions, and only transpose records corresponding to the
* columns referenced in those expressions.
*
* @param colStats RowData list bearing raw Column Stats Index table
* @param queryColumns target columns to be included into the final table
* @return reshaped table according to the format outlined above
*/
@VisibleForTesting
public Pair<List<RowData>, String[]> transposeColumnStatsIndex(List<RowData> colStats, String[] queryColumns) {
Map<String, LogicalType> tableFieldTypeMap = rowType.getFields().stream()
.collect(Collectors.toMap(RowType.RowField::getName, RowType.RowField::getType));
// NOTE: We have to collect list of indexed columns to make sure we properly align the rows
// w/in the transposed dataset: since some files might not have all the columns indexed
// either due to the Column Stats Index config changes, schema evolution, etc. we have
// to make sure that all the rows w/in transposed data-frame are properly padded (with null
// values) for such file-column combinations
Set<String> indexedColumns = colStats.stream().map(row -> row.getString(ORD_COL_NAME)
.toString()).collect(Collectors.toSet());
// NOTE: We're sorting the columns to make sure final index schema matches layout
// of the transposed table
TreeSet<String> sortedTargetColumns = Arrays.stream(queryColumns).sorted()
.filter(indexedColumns::contains)
.collect(Collectors.toCollection(TreeSet::new));
final Map<LogicalType, AvroToRowDataConverters.AvroToRowDataConverter> converters = new ConcurrentHashMap<>();
Map<StringData, List<RowData>> fileNameToRows = colStats.stream().parallel()
.filter(row -> sortedTargetColumns.contains(row.getString(ORD_COL_NAME).toString()))
.map(row -> {
if (row.isNullAt(ORD_MIN_VAL) && row.isNullAt(ORD_MAX_VAL)) {
// Corresponding row could be null in either of the 2 cases
// - Column contains only null values (in that case both min/max have to be nulls)
// - This is a stubbed Column Stats record (used as a tombstone)
return row;
} else {
String colName = row.getString(ORD_COL_NAME).toString();
LogicalType colType = tableFieldTypeMap.get(colName);
return unpackMinMaxVal(row, colType, converters);
}
}).collect(Collectors.groupingBy(rowData -> rowData.getString(ORD_FILE_NAME)));
return Pair.of(foldRowsByFiles(sortedTargetColumns, fileNameToRows), sortedTargetColumns.toArray(new String[0]));
}
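/**
* Folds the per-(file, column) stats rows into a single row per file, aligning the columns
* with the sorted target columns and padding missing column stats with nulls.
*/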
private static List<RowData> foldRowsByFiles(
TreeSet<String> sortedTargetColumns,
Map<StringData, List<RowData>> fileNameToRows) {
return fileNameToRows.values().stream().parallel().map(rows -> {
// the rows list is always non-empty (otherwise its file name would not appear as a grouping key)
StringData fileName = rows.get(0).getString(ORD_FILE_NAME);
long valueCount = rows.get(0).getLong(ORD_VAL_CNT);
// To properly align individual rows (corresponding to a file) w/in the transposed projection, we need
// to align existing column-stats for individual file with the list of expected ones for the
// whole transposed projection (a superset of all files)
Map<String, RowData> columnRowsMap = rows.stream()
.collect(Collectors.toMap(row -> row.getString(ORD_COL_NAME).toString(), row -> row));
SortedMap<String, RowData> alignedColumnRowsMap = new TreeMap<>();
sortedTargetColumns.forEach(col -> alignedColumnRowsMap.put(col, columnRowsMap.get(col)));
List<Tuple3<Object, Object, Object>> columnStats = alignedColumnRowsMap.values().stream().map(row -> {
if (row == null) {
// NOTE: Since we're assuming missing column to essentially contain exclusively
// null values, we set null-count to be equal to value-count (this behavior is
// consistent with reading non-existent columns from Parquet)
return Tuple3.of(null, null, valueCount);
} else {
GenericRowData gr = (GenericRowData) row;
return Tuple3.of(gr.getField(ORD_MIN_VAL), gr.getField(ORD_MAX_VAL), gr.getField(ORD_NULL_CNT));
}
}).collect(Collectors.toList());
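// layout of the folded row: | fileName | valueCount | col1_min | col1_max | col1_nullCount | col2_min | ... |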
GenericRowData foldedRow = new GenericRowData(2 + 3 * columnStats.size());
foldedRow.setField(0, fileName);
foldedRow.setField(1, valueCount);
for (int i = 0; i < columnStats.size(); i++) {
Tuple3<Object, Object, Object> stats = columnStats.get(i);
int startPos = 2 + 3 * i;
foldedRow.setField(startPos, stats.f0);
foldedRow.setField(startPos + 1, stats.f1);
foldedRow.setField(startPos + 2, stats.f2);
}
return foldedRow;
}).collect(Collectors.toList());
}
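/**
* Replaces the nested min/max wrapper structs of a raw column-stats row with their unpacked
* values, converted to the Flink internal representation of the corresponding table column type.
*/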
private static RowData unpackMinMaxVal(
RowData row,
LogicalType colType,
Map<LogicalType, AvroToRowDataConverters.AvroToRowDataConverter> converters) {
RowData minValueStruct = row.getRow(ORD_MIN_VAL, 1);
RowData maxValueStruct = row.getRow(ORD_MAX_VAL, 1);
checkState(minValueStruct != null && maxValueStruct != null,
"Invalid Column Stats record: either both min/max have to be null, or both have to be non-null");
Object minValue = tryUnpackNonNullVal(minValueStruct, colType, converters);
Object maxValue = tryUnpackNonNullVal(maxValueStruct, colType, converters);
// the column schema:
// |- file_name: string
// |- min_val: row
// |- max_val: row
// |- null_cnt: long
// |- val_cnt: long
// |- column_name: string
GenericRowData unpackedRow = new GenericRowData(row.getArity());
unpackedRow.setField(0, row.getString(0));
unpackedRow.setField(1, minValue);
unpackedRow.setField(2, maxValue);
unpackedRow.setField(3, row.getLong(3));
unpackedRow.setField(4, row.getLong(4));
unpackedRow.setField(5, row.getString(5));
return unpackedRow;
}
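/**
* Scans the fields of the Avro-decoded min/max wrapper struct and returns the first non-null
* value, converted into the Flink internal representation of {@code colType}; returns null
* when every field is null.
*/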
private static Object tryUnpackNonNullVal(
RowData rowData,
LogicalType colType,
Map<LogicalType, AvroToRowDataConverters.AvroToRowDataConverter> converters) {
for (int i = 0; i < rowData.getArity(); i++) {
// row data converted from avro is definitely generic.
Object nested = ((GenericRowData) rowData).getField(i);
if (nested != null) {
return doUnpack(nested, colType, converters);
}
}
return null;
}
private static Object doUnpack(
Object rawVal,
LogicalType logicalType,
Map<LogicalType, AvroToRowDataConverters.AvroToRowDataConverter> converters) {
AvroToRowDataConverters.AvroToRowDataConverter converter =
converters.computeIfAbsent(logicalType, k -> AvroToRowDataConverters.createConverter(logicalType, true));
return converter.convert(rawVal);
}
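/**
* Reads the Column Stats Index records for the given target columns from the metadata table
* and converts them into flat column-stats {@link RowData} rows.
*
* @param targetColumns the columns referenced by the push-down filters; must be non-empty
*/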
@VisibleForTesting
public List<RowData> readColumnStatsIndexByColumns(String[] targetColumns) {
// NOTE: If specific columns have been provided, we can considerably trim down the amount of data fetched
// by only fetching Column Stats Index records pertaining to the requested columns.
// Otherwise, we would have to fall back to reading the whole Column Stats Index.
ValidationUtils.checkArgument(targetColumns.length > 0,
"Column stats is only valid when push down filters have referenced columns");
// Read the Metadata Table's column stats into Flink's RowData list by
// - Fetching the records by key-prefixes (encoded column names)
// - Deserializing fetched records into [[RowData]]s
// TODO encoding should be done internally w/in HoodieBackedTableMetadata
List<String> encodedTargetColumnNames = Arrays.stream(targetColumns)
.map(colName -> new ColumnIndexID(colName).asBase64EncodedString()).collect(Collectors.toList());
HoodieData<HoodieRecord<HoodieMetadataPayload>> records =
getMetadataTable().getRecordsByKeyPrefixes(encodedTargetColumnNames, getIndexPartitionName(), false);
org.apache.hudi.util.AvroToRowDataConverters.AvroToRowDataConverter converter =
AvroToRowDataConverters.createRowConverter((RowType) METADATA_DATA_TYPE.getLogicalType());
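// deserialize each metadata record's Avro payload and convert it into Flink's internal RowData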
List<RowData> rows = records.collectAsList().stream().parallel().map(record -> {
// schema and props are ignored for generating the metadata record from the payload;
// instead, the underlying file system, bloom filter, or column stats metadata (part of the payload) is used directly
GenericRecord genericRecord;
try {
genericRecord = (GenericRecord) record.getData().getInsertValue(null, null).orElse(null);
} catch (IOException e) {
throw new HoodieException("Exception while getting insert value from metadata payload", e);
}
return (RowData) converter.convert(genericRecord);
}
).collect(Collectors.toList());
return projectNestedColStatsColumns(rows);
}
}