/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table;

import org.apache.hudi.adapter.DataStreamScanProviderAdapter;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.configuration.OptionsInference;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.sink.utils.Pipelines;
import org.apache.hudi.source.ExpressionEvaluators;
import org.apache.hudi.source.ExpressionPredicates;
import org.apache.hudi.source.ExpressionPredicates.Predicate;
import org.apache.hudi.source.FileIndex;
import org.apache.hudi.source.IncrementalInputSplits;
import org.apache.hudi.source.StreamReadMonitoringFunction;
import org.apache.hudi.source.StreamReadOperator;
import org.apache.hudi.source.prune.ColumnStatsProbe;
import org.apache.hudi.source.prune.PartitionPruners;
import org.apache.hudi.source.prune.PrimaryKeyPruners;
import org.apache.hudi.source.rebalance.partitioner.StreamReadAppendPartitioner;
import org.apache.hudi.source.rebalance.partitioner.StreamReadBucketIndexPartitioner;
import org.apache.hudi.source.rebalance.selector.StreamReadAppendKeySelector;
import org.apache.hudi.source.rebalance.selector.StreamReadBucketIndexKeySelector;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
import org.apache.hudi.storage.hadoop.HadoopStorageConfiguration;
import org.apache.hudi.table.format.FilePathUtils;
import org.apache.hudi.table.format.InternalSchemaManager;
import org.apache.hudi.table.format.cdc.CdcInputFormat;
import org.apache.hudi.table.format.cow.CopyOnWriteInputFormat;
import org.apache.hudi.table.format.mor.MergeOnReadInputFormat;
import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;
import org.apache.hudi.table.format.mor.MergeOnReadTableState;
import org.apache.hudi.table.lookup.HoodieLookupFunction;
import org.apache.hudi.table.lookup.HoodieLookupTableReader;
import org.apache.hudi.util.AvroSchemaConverter;
import org.apache.hudi.util.ChangelogModes;
import org.apache.hudi.util.ExpressionUtils;
import org.apache.hudi.util.InputFormats;
import org.apache.hudi.util.SerializableSchema;
import org.apache.hudi.util.StreamerUtil;

import org.apache.avro.Schema;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction;
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.connector.source.LookupTableSource;
import org.apache.flink.table.connector.source.ScanTableSource;
import org.apache.flink.table.connector.source.TableFunctionProvider;
import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown;
import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown;
import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.expressions.ResolvedExpression;
import org.apache.flink.table.runtime.types.TypeInfoDataTypeConverter;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.Serializable;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.StringJoiner;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.apache.hudi.configuration.FlinkOptions.LOOKUP_JOIN_CACHE_TTL;
import static org.apache.hudi.configuration.HadoopConfigurations.getParquetConf;
import static org.apache.hudi.util.ExpressionUtils.filterSimpleCallExpression;
import static org.apache.hudi.util.ExpressionUtils.splitExprByPartitionCall;

/**
 * Hoodie batch table source that always reads the latest snapshot of the underlying table.
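 *
 * <p>The source supports projection push down, limit push down and filter push down, and can
 * also serve as a lookup table for temporal joins (see {@link LookupTableSource}).
 *
 * <p>An illustrative Flink SQL sketch that would create this source (the path and option values
 * below are examples, not defaults):
 * <pre>{@code
 * CREATE TABLE hudi_tbl (
 *   uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
 *   name VARCHAR(10),
 *   ts TIMESTAMP(3),
 *   `partition` VARCHAR(20)
 * ) PARTITIONED BY (`partition`) WITH (
 *   'connector' = 'hudi',
 *   'path' = 'file:///tmp/hudi_tbl',        -- example path
 *   'table.type' = 'MERGE_ON_READ',
 *   'read.streaming.enabled' = 'true'       -- enables the unbounded stream read
 * );
 *
 * SELECT * FROM hudi_tbl;
 * }</pre>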
 */
public class HoodieTableSource implements
    ScanTableSource,
    SupportsProjectionPushDown,
    SupportsLimitPushDown,
    SupportsFilterPushDown,
    LookupTableSource,
    Serializable {
  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(HoodieTableSource.class);

  private static final long NO_LIMIT_CONSTANT = -1;

  private final StorageConfiguration<org.apache.hadoop.conf.Configuration> hadoopConf;
  private final HoodieTableMetaClient metaClient;
  private final long maxCompactionMemoryInBytes;

  private final SerializableSchema schema;
  private final RowType tableRowType;
  private final StoragePath path;
  private final List<String> partitionKeys;
  private final String defaultPartName;
  private final Configuration conf;
  private final InternalSchemaManager internalSchemaManager;

  private int[] requiredPos;
  private long limit;
  private List<Predicate> predicates;
  private ColumnStatsProbe columnStatsProbe;
  private PartitionPruners.PartitionPruner partitionPruner;
  private int dataBucket;
  private transient FileIndex fileIndex;

  public HoodieTableSource(
      SerializableSchema schema,
      StoragePath path,
      List<String> partitionKeys,
      String defaultPartName,
      Configuration conf) {
    this(schema, path, partitionKeys, defaultPartName, conf, null, null, null, PrimaryKeyPruners.BUCKET_ID_NO_PRUNING, null, null, null, null);
  }

  public HoodieTableSource(
      SerializableSchema schema,
      StoragePath path,
      List<String> partitionKeys,
      String defaultPartName,
      Configuration conf,
      @Nullable List<Predicate> predicates,
      @Nullable ColumnStatsProbe columnStatsProbe,
      @Nullable PartitionPruners.PartitionPruner partitionPruner,
      int dataBucket,
      @Nullable int[] requiredPos,
      @Nullable Long limit,
      @Nullable HoodieTableMetaClient metaClient,
      @Nullable InternalSchemaManager internalSchemaManager) {
    this.schema = schema;
    this.tableRowType = (RowType) this.schema.toSourceRowDataType().notNull().getLogicalType();
    this.path = path;
    this.partitionKeys = partitionKeys;
    this.defaultPartName = defaultPartName;
    this.conf = conf;
    this.predicates = Optional.ofNullable(predicates).orElse(Collections.emptyList());
    this.columnStatsProbe = columnStatsProbe;
    this.partitionPruner = partitionPruner;
    this.dataBucket = dataBucket;
    this.requiredPos = Optional.ofNullable(requiredPos).orElseGet(() -> IntStream.range(0, this.tableRowType.getFieldCount()).toArray());
    this.limit = Optional.ofNullable(limit).orElse(NO_LIMIT_CONSTANT);
    this.hadoopConf = new HadoopStorageConfiguration(HadoopConfigurations.getHadoopConf(conf));
    this.metaClient = Optional.ofNullable(metaClient).orElseGet(() -> StreamerUtil.metaClientForReader(conf, this.hadoopConf.unwrap()));
    this.maxCompactionMemoryInBytes = StreamerUtil.getMaxCompactionMemoryInBytes(conf);
    this.internalSchemaManager = Optional.ofNullable(internalSchemaManager).orElseGet(() -> InternalSchemaManager.get(this.conf, this.metaClient));
  }

  @Override
  public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
    return new DataStreamScanProviderAdapter() {

      @Override
      public boolean isBounded() {
        return !conf.getBoolean(FlinkOptions.READ_AS_STREAMING);
      }

      @Override
      public DataStream<RowData> produceDataStream(StreamExecutionEnvironment execEnv) {
        @SuppressWarnings("unchecked")
        TypeInformation<RowData> typeInfo =
            (TypeInformation<RowData>) TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType());
        OptionsInference.setupSourceTasks(conf, execEnv.getParallelism());
        if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
          StreamReadMonitoringFunction monitoringFunction = new StreamReadMonitoringFunction(
              conf, FilePathUtils.toFlinkPath(path), tableRowType, maxCompactionMemoryInBytes, partitionPruner);
          InputFormat<RowData, ?> inputFormat = getInputFormat(true);
          OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat);
          SingleOutputStreamOperator<MergeOnReadInputSplit> monitorOperatorStream = execEnv.addSource(monitoringFunction, getSourceOperatorName("split_monitor"))
              .uid(Pipelines.opUID("split_monitor", conf))
              .setParallelism(1)
              .setMaxParallelism(1);

          DataStream<MergeOnReadInputSplit> sourceWithKey = addFileDistributionStrategy(monitorOperatorStream);

          SingleOutputStreamOperator<RowData> streamReadSource = sourceWithKey
              .transform("split_reader", typeInfo, factory)
              .uid(Pipelines.opUID("split_reader", conf))
              .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
          return new DataStreamSource<>(streamReadSource);
        } else {
          InputFormatSourceFunction<RowData> func = new InputFormatSourceFunction<>(getInputFormat(), typeInfo);
          DataStreamSource<RowData> source = execEnv.addSource(func, asSummaryString(), typeInfo);
          return source.name(getSourceOperatorName("bounded_source")).setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
        }
      }
    };
  }

  /**
   * Specifies the file distribution strategy based on the upstream writing mechanism,
   * to prevent hotspot issues during stream reading.
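   *
   * <ul>
   *   <li>MOR table with bucket index upsert: splits are custom partitioned with
   *       {@link StreamReadBucketIndexPartitioner} and {@link StreamReadBucketIndexKeySelector};</li>
   *   <li>append write mode: splits are custom partitioned with
   *       {@link StreamReadAppendPartitioner} and {@link StreamReadAppendKeySelector};</li>
   *   <li>otherwise: splits are keyed by file id, so splits with the same file id are always
   *       consumed by the same read task.</li>
   * </ul>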
   */
  private DataStream<MergeOnReadInputSplit> addFileDistributionStrategy(SingleOutputStreamOperator<MergeOnReadInputSplit> source) {
    if (OptionsResolver.isMorWithBucketIndexUpsert(conf)) {
      return source.partitionCustom(new StreamReadBucketIndexPartitioner(conf.getInteger(FlinkOptions.READ_TASKS)), new StreamReadBucketIndexKeySelector());
    } else if (OptionsResolver.isAppendMode(conf)) {
      return source.partitionCustom(new StreamReadAppendPartitioner(conf.getInteger(FlinkOptions.READ_TASKS)), new StreamReadAppendKeySelector());
    } else {
      return source.keyBy(MergeOnReadInputSplit::getFileId);
    }
  }

  @Override
  public ChangelogMode getChangelogMode() {
    // when read as streaming and changelog mode is enabled, emit as FULL mode;
    // when read as incremental and cdc is enabled, emit as FULL mode;
    // when all the changes are compacted or read as batch, emit as INSERT mode.
    return OptionsResolver.emitChangelog(conf) ? ChangelogModes.FULL : ChangelogMode.insertOnly();
  }

  @Override
  public DynamicTableSource copy() {
    return new HoodieTableSource(schema, path, partitionKeys, defaultPartName,
        conf, predicates, columnStatsProbe, partitionPruner, dataBucket, requiredPos, limit, metaClient, internalSchemaManager);
  }

  @Override
  public String asSummaryString() {
    return "HudiTableSource";
  }

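  /**
   * Applies the pushed down filters: the simple call expressions are split into data filters and
   * partition filters, which then drive the read predicates, the column stats probe, the partition
   * pruner and the bucket id pruning hint. The partition filters are reported as accepted, while
   * every original filter is kept in the remaining list so that Flink still applies the filters
   * on top of the source output.
   */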
  @Override
  public Result applyFilters(List<ResolvedExpression> filters) {
    List<ResolvedExpression> simpleFilters = filterSimpleCallExpression(filters);
    Tuple2<List<ResolvedExpression>, List<ResolvedExpression>> splitFilters = splitExprByPartitionCall(simpleFilters, this.partitionKeys, this.tableRowType);
    this.predicates = ExpressionPredicates.fromExpression(splitFilters.f0);
    this.columnStatsProbe = ColumnStatsProbe.newInstance(splitFilters.f0);
    this.partitionPruner = createPartitionPruner(splitFilters.f1, columnStatsProbe);
    this.dataBucket = getDataBucket(splitFilters.f0);
    // refuse all the filters now
    return SupportsFilterPushDown.Result.of(new ArrayList<>(splitFilters.f1), new ArrayList<>(filters));
  }

  @Override
  public boolean supportsNestedProjection() {
    return false;
  }

  @Override
  public void applyProjection(int[][] projections) {
    // nested projection is not supported.
    this.requiredPos = Arrays.stream(projections).mapToInt(array -> array[0]).toArray();
  }

  @Override
  public void applyLimit(long limit) {
    this.limit = limit;
  }

  @Override
  public LookupRuntimeProvider getLookupRuntimeProvider(LookupContext context) {
    Duration duration = conf.get(LOOKUP_JOIN_CACHE_TTL);
    return TableFunctionProvider.of(
        new HoodieLookupFunction(
            new HoodieLookupTableReader(this::getBatchInputFormat, conf),
            (RowType) getProducedDataType().notNull().getLogicalType(),
            getLookupKeys(context.getKeys()),
            duration,
            conf
        ));
  }

  private DataType getProducedDataType() {
    String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]);
    DataType[] schemaTypes = this.schema.getColumnDataTypes().toArray(new DataType[0]);

    return DataTypes.ROW(Arrays.stream(this.requiredPos)
            .mapToObj(i -> DataTypes.FIELD(schemaFieldNames[i], schemaTypes[i]))
            .toArray(DataTypes.Field[]::new))
        .bridgedTo(RowData.class);
  }

  private String getSourceOperatorName(String operatorName) {
    String[] schemaFieldNames = this.schema.getColumnNames().toArray(new String[0]);
    List<String> fields = Arrays.stream(this.requiredPos)
        .mapToObj(i -> schemaFieldNames[i])
        .collect(Collectors.toList());
    StringBuilder sb = new StringBuilder();
    sb.append(operatorName)
        .append("(")
        .append("table=").append(Collections.singletonList(conf.getString(FlinkOptions.TABLE_NAME)))
        .append(", ")
        .append("fields=").append(fields)
        .append(")");
    return sb.toString();
  }

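  /**
   * Creates the partition pruner from the partition filters and the column stats probe, or
   * returns null when the table is not partitioned or there is nothing to prune with.
   */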
  @Nullable
  private PartitionPruners.PartitionPruner createPartitionPruner(List<ResolvedExpression> partitionFilters, ColumnStatsProbe columnStatsProbe) {
    if (!isPartitioned() || partitionFilters.isEmpty() && columnStatsProbe == null) {
      return null;
    }
    StringJoiner joiner = new StringJoiner(" and ");
    partitionFilters.forEach(f -> joiner.add(f.asSummaryString()));
    LOG.info("Partition pruner for hoodie source, condition is:\n" + joiner);
    List<ExpressionEvaluators.Evaluator> evaluators = ExpressionEvaluators.fromExpression(partitionFilters);
    List<DataType> partitionTypes = this.partitionKeys.stream().map(name ->
            this.schema.getColumn(name).orElseThrow(() -> new HoodieValidationException("Field " + name + " does not exist")))
        .map(SerializableSchema.Column::getDataType)
        .collect(Collectors.toList());
    String defaultParName = conf.get(FlinkOptions.PARTITION_DEFAULT_NAME);
    boolean hivePartition = conf.get(FlinkOptions.HIVE_STYLE_PARTITIONING);

    return PartitionPruners.builder()
        .basePath(path.toString())
        .rowType(tableRowType)
        .conf(conf)
        .columnStatsProbe(columnStatsProbe)
        .partitionEvaluators(evaluators)
        .partitionKeys(partitionKeys)
        .partitionTypes(partitionTypes)
        .defaultParName(defaultParName)
        .hivePartition(hivePartition)
        .build();
  }

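  /**
   * Resolves the bucket id used for primary-key based file pruning. Pruning only takes effect for
   * bucket index tables when every index key field is constrained by an equality filter against a
   * literal; otherwise {@link PrimaryKeyPruners#BUCKET_ID_NO_PRUNING} is returned.
   */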
  private int getDataBucket(List<ResolvedExpression> dataFilters) {
    if (!OptionsResolver.isBucketIndexType(conf) || dataFilters.isEmpty()) {
      return PrimaryKeyPruners.BUCKET_ID_NO_PRUNING;
    }
    Set<String> indexKeyFields = Arrays.stream(OptionsResolver.getIndexKeys(conf)).collect(Collectors.toSet());
    List<ResolvedExpression> indexKeyFilters = dataFilters.stream().filter(expr -> ExpressionUtils.isEqualsLitExpr(expr, indexKeyFields)).collect(Collectors.toList());
    if (!ExpressionUtils.isFilteringByAllFields(indexKeyFilters, indexKeyFields)) {
      return PrimaryKeyPruners.BUCKET_ID_NO_PRUNING;
    }
    return PrimaryKeyPruners.getBucketId(indexKeyFilters, conf);
  }

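  /**
   * Builds the bounded input splits for MERGE_ON_READ snapshot reading: one
   * {@link MergeOnReadInputSplit} per latest merged file slice on or before the latest instant,
   * within the (possibly pruned) partitions resolved by the file index.
   */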
  private List<MergeOnReadInputSplit> buildInputSplits() {
    FileIndex fileIndex = getOrBuildFileIndex();
    List<String> relPartitionPaths = fileIndex.getOrBuildPartitionPaths();
    if (relPartitionPaths.isEmpty()) {
      return Collections.emptyList();
    }
    List<StoragePathInfo> pathInfoList = fileIndex.getFilesInPartitions();
    if (pathInfoList.isEmpty()) {
      throw new HoodieException("No files found for reading in user provided path.");
    }

    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
        // file-slice after pending compaction-requested instant-time is also considered valid
        metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(),
        pathInfoList);
    if (!fsView.getLastInstant().isPresent()) {
      return Collections.emptyList();
    }
    String latestCommit = fsView.getLastInstant().get().requestedTime();
    final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
    final AtomicInteger cnt = new AtomicInteger(0);
    // generates one input split for each file group
    return relPartitionPaths.stream()
        .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit)
            .map(fileSlice -> {
              String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
              Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                  .sorted(HoodieLogFile.getLogFileComparator())
                  .map(logFile -> logFile.getPath().toString())
                  .collect(Collectors.toList()));
              return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit,
                  metaClient.getBasePath().toString(), maxCompactionMemoryInBytes, mergeType, null, fileSlice.getFileId());
            }).collect(Collectors.toList()))
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
  }

  public InputFormat<RowData, ?> getInputFormat() {
    return getInputFormat(false);
  }

  @VisibleForTesting
  public InputFormat<RowData, ?> getInputFormat(boolean isStreaming) {
    return isStreaming ? getStreamInputFormat() : getBatchInputFormat();
  }

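  /**
   * Creates the input format for bounded reading, dispatching on the query type: snapshot reading
   * merges base and log files for MERGE_ON_READ tables and reads base files only for COPY_ON_WRITE
   * tables, read-optimized reading always scans base files only, and incremental reading resolves
   * the splits from the requested commit range, optionally as CDC.
   */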
  private InputFormat<RowData, ?> getBatchInputFormat() {
    final Schema tableAvroSchema = getTableAvroSchema();
    final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
    final RowType rowType = (RowType) rowDataType.getLogicalType();
    final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType();

    final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE);
    switch (queryType) {
      case FlinkOptions.QUERY_TYPE_SNAPSHOT:
        final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE));
        switch (tableType) {
          case MERGE_ON_READ:
            final List<MergeOnReadInputSplit> inputSplits = buildInputSplits();
            if (inputSplits.isEmpty()) {
              // When there are no input splits, just return an empty source.
              LOG.warn("No input splits were generated for MERGE_ON_READ input format, returning an empty source instead");
              return InputFormats.EMPTY_INPUT_FORMAT;
            }
            return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema,
                rowDataType, inputSplits, false);
          case COPY_ON_WRITE:
            return baseFileOnlyInputFormat();
          default:
            throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE));
        }
      case FlinkOptions.QUERY_TYPE_READ_OPTIMIZED:
        return baseFileOnlyInputFormat();
      case FlinkOptions.QUERY_TYPE_INCREMENTAL:
        IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
            .conf(conf)
            .path(FilePathUtils.toFlinkPath(path))
            .rowType(this.tableRowType)
            .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
            .partitionPruner(partitionPruner)
            .build();
        final boolean cdcEnabled = this.conf.getBoolean(FlinkOptions.CDC_ENABLED);
        final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, cdcEnabled);
        if (result.isEmpty()) {
          // When there are no input splits, just return an empty source.
          LOG.warn("No input splits were generated for incremental read, returning an empty source instead");
          return InputFormats.EMPTY_INPUT_FORMAT;
        } else if (cdcEnabled) {
          return cdcInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, result.getInputSplits());
        } else {
          return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema,
              rowDataType, result.getInputSplits(), false);
        }
      default:
        String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s', '%s'] are supported now", queryType,
            FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED, FlinkOptions.QUERY_TYPE_INCREMENTAL);
        throw new HoodieException(errMsg);
    }
  }

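  /**
   * Creates the input format for streaming reading. The format is built with an empty split list;
   * the actual splits are discovered at runtime by {@link StreamReadMonitoringFunction} and fed to
   * the reader operator. When the table (or its data) does not exist yet, the schema falls back to
   * the one inferred from the DDL.
   */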
  private InputFormat<RowData, ?> getStreamInputFormat() {
    // if table does not exist or table data does not exist, use schema from the DDL
    Schema tableAvroSchema = (this.metaClient == null || !tableDataExists()) ? inferSchemaFromDdl() : getTableAvroSchema();
    final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
    final RowType rowType = (RowType) rowDataType.getLogicalType();
    final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType();

    final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE);
    switch (queryType) {
      case FlinkOptions.QUERY_TYPE_SNAPSHOT:
      case FlinkOptions.QUERY_TYPE_INCREMENTAL:
        final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE));
        boolean emitDelete = tableType == HoodieTableType.MERGE_ON_READ;
        if (this.conf.getBoolean(FlinkOptions.CDC_ENABLED)) {
          return cdcInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, Collections.emptyList());
        } else {
          return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema,
              rowDataType, Collections.emptyList(), emitDelete);
        }
      default:
        String errMsg = String.format("Invalid query type : '%s', options ['%s', '%s'] are supported now", queryType,
            FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_INCREMENTAL);
        throw new HoodieException(errMsg);
    }
  }

  /**
   * Returns whether the hoodie table data exists.
   */
  private boolean tableDataExists() {
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData();
    return instantAndCommitMetadata.isPresent();
  }

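  /**
   * Creates the CDC flavor of the merge-on-read input format; delete records are handled by the
   * change log iterator itself, so {@code emitDelete} is explicitly switched off here.
   */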
  private MergeOnReadInputFormat cdcInputFormat(
      RowType rowType,
      RowType requiredRowType,
      Schema tableAvroSchema,
      DataType rowDataType,
      List<MergeOnReadInputSplit> inputSplits) {
    final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(
        rowType,
        requiredRowType,
        tableAvroSchema.toString(),
        AvroSchemaConverter.convertToSchema(requiredRowType).toString(),
        inputSplits,
        conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","));
    return CdcInputFormat.builder()
        .config(this.conf)
        .tableState(hoodieTableState)
        // use the explicit fields' data type because the AvroSchemaConverter
        // is not very stable.
        .fieldTypes(rowDataType.getChildren())
        .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME))
        .predicates(this.predicates)
        .limit(this.limit)
        .emitDelete(false) // the change logs iterator can handle the DELETE records
        .build();
  }

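  /**
   * Creates the merge-on-read input format over the given splits; {@code emitDelete} controls
   * whether delete records are emitted downstream (enabled for streaming read of MERGE_ON_READ
   * tables, disabled for bounded reads).
   */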
  private MergeOnReadInputFormat mergeOnReadInputFormat(
      RowType rowType,
      RowType requiredRowType,
      Schema tableAvroSchema,
      DataType rowDataType,
      List<MergeOnReadInputSplit> inputSplits,
      boolean emitDelete) {
    final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(
        rowType,
        requiredRowType,
        tableAvroSchema.toString(),
        AvroSchemaConverter.convertToSchema(requiredRowType).toString(),
        inputSplits,
        conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","));
    return MergeOnReadInputFormat.builder()
        .config(this.conf)
        .tableState(hoodieTableState)
        // use the explicit fields' data type because the AvroSchemaConverter
        // is not very stable.
        .fieldTypes(rowDataType.getChildren())
        .defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME))
        .predicates(this.predicates)
        .limit(this.limit)
        .emitDelete(emitDelete)
        .internalSchemaManager(internalSchemaManager)
        .build();
  }

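  /**
   * Creates the input format that scans the latest base files only, used for COPY_ON_WRITE
   * snapshot reading and for the read-optimized query type.
   */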
  private InputFormat<RowData, ?> baseFileOnlyInputFormat() {
    final List<StoragePathInfo> pathInfoList = getReadFiles();
    if (pathInfoList.isEmpty()) {
      return InputFormats.EMPTY_INPUT_FORMAT;
    }

    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
        metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants(), pathInfoList);
    Path[] paths = fsView.getLatestBaseFiles()
        .map(HoodieBaseFile::getPathInfo)
        .map(e -> new Path(e.getPath().toUri())).toArray(Path[]::new);

    if (paths.length == 0) {
      return InputFormats.EMPTY_INPUT_FORMAT;
    }

    return new CopyOnWriteInputFormat(
        FilePathUtils.toFlinkPaths(paths),
        this.schema.getColumnNames().toArray(new String[0]),
        this.schema.getColumnDataTypes().toArray(new DataType[0]),
        this.requiredPos,
        this.conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME),
        this.conf.getString(FlinkOptions.PARTITION_PATH_FIELD),
        this.conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING),
        this.predicates,
        this.limit == NO_LIMIT_CONSTANT ? Long.MAX_VALUE : this.limit, // ParquetInputFormat always uses the limit value
        getParquetConf(this.conf, this.hadoopConf.unwrap()),
        this.conf.getBoolean(FlinkOptions.READ_UTC_TIMEZONE),
        this.internalSchemaManager
    );
  }

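  /**
   * Derives the table Avro schema from the DDL row type and appends the Hudi metadata fields;
   * used as a fallback when there is no committed data to resolve the schema from.
   */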
  private Schema inferSchemaFromDdl() {
    Schema schema = AvroSchemaConverter.convertToSchema(this.tableRowType);
    return HoodieAvroUtils.addMetadataFields(schema, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
  }

  private FileIndex getOrBuildFileIndex() {
    if (this.fileIndex == null) {
      this.fileIndex = FileIndex.builder()
          .path(this.path)
          .conf(this.conf)
          .rowType(this.tableRowType)
          .columnStatsProbe(this.columnStatsProbe)
          .partitionPruner(this.partitionPruner)
          .dataBucket(this.dataBucket)
          .build();
    }
    return this.fileIndex;
  }

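  /**
   * Converts the lookup join key paths into top-level field indices; nested lookup keys are not
   * supported.
   */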
  private int[] getLookupKeys(int[][] keys) {
    int[] keyIndices = new int[keys.length];
    int i = 0;
    for (int[] key : keys) {
      if (key.length > 1) {
        throw new UnsupportedOperationException(
            "Hoodie lookup can not support nested key now.");
      }
      keyIndices[i] = key[0];
      i++;
    }
    return keyIndices;
  }

  private boolean isPartitioned() {
    return !this.partitionKeys.isEmpty() && this.partitionKeys.stream().noneMatch(String::isEmpty);
  }

  @VisibleForTesting
  public Schema getTableAvroSchema() {
    try {
      TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
      return schemaResolver.getTableAvroSchema();
    } catch (Throwable e) {
      // table exists but has no written data
      LOG.warn("Get table avro schema error, use schema from the DDL instead", e);
      return inferSchemaFromDdl();
    }
  }

  @VisibleForTesting
  public HoodieTableMetaClient getMetaClient() {
    return this.metaClient;
  }

  @VisibleForTesting
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Reset the state of the table source.
   */
  @VisibleForTesting
  public void reset() {
    this.metaClient.reloadActiveTimeline();
    this.fileIndex = null;
  }

  /**
   * Get the reader paths with partition path expanded.
   */
  @VisibleForTesting
  public List<StoragePathInfo> getReadFiles() {
    List<String> relPartitionPaths = getReadPartitions();
    if (relPartitionPaths.isEmpty()) {
      return Collections.emptyList();
    }
    return fileIndex.getFilesInPartitions();
  }

  @VisibleForTesting
  public List<String> getReadPartitions() {
    FileIndex fileIndex = getOrBuildFileIndex();
    return fileIndex.getOrBuildPartitionPaths();
  }

  @VisibleForTesting
  public List<Predicate> getPredicates() {
    return predicates;
  }

  @VisibleForTesting
  public ColumnStatsProbe getColumnStatsProbe() {
    return columnStatsProbe;
  }

  @VisibleForTesting
  public int getDataBucket() {
    return dataBucket;
  }
}