/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table;

import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.configuration.OptionsResolver;
import org.apache.hudi.exception.HoodieValidationException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.NonpartitionedAvroKeyGenerator;
import org.apache.hudi.keygen.TimestampBasedAvroKeyGenerator;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.util.AvroSchemaConverter;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.hudi.util.SerializableSchema;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.table.api.ValidationException;
import org.apache.flink.table.api.constraints.UniqueConstraint;
import org.apache.flink.table.catalog.CatalogTable;
import org.apache.flink.table.catalog.ObjectIdentifier;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.connector.sink.DynamicTableSink;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.factories.DynamicTableSinkFactory;
import org.apache.flink.table.factories.DynamicTableSourceFactory;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.LogicalType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import static org.apache.flink.table.api.config.ExecutionConfigOptions.TABLE_EXEC_SORT_ASYNC_MERGE_ENABLED;
import static org.apache.flink.table.api.config.ExecutionConfigOptions.TABLE_EXEC_SORT_MAX_NUM_FILE_HANDLES;
import static org.apache.flink.table.api.config.ExecutionConfigOptions.TABLE_EXEC_SPILL_COMPRESSION_BLOCK_SIZE;
import static org.apache.flink.table.api.config.ExecutionConfigOptions.TABLE_EXEC_SPILL_COMPRESSION_ENABLED;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.INPUT_TIME_UNIT;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.TIMESTAMP_INPUT_DATE_FORMAT;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.TIMESTAMP_OUTPUT_DATE_FORMAT;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT;
import static org.apache.hudi.common.config.TimestampKeyGeneratorConfig.TIMESTAMP_TYPE_FIELD;
import static org.apache.hudi.common.util.ValidationUtils.checkArgument;

/**
 * Hoodie data source/sink factory.
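 *
 * <p>A table declared with {@code 'connector' = 'hudi'} resolves to this factory. A minimal,
 * illustrative DDL (the table name, path and option values are placeholders, not defaults):
 * <pre>{@code
 *   CREATE TABLE hudi_table (
 *     uuid VARCHAR(20) PRIMARY KEY NOT ENFORCED,
 *     name VARCHAR(10),
 *     ts TIMESTAMP(3),
 *     `partition` VARCHAR(20)
 *   ) PARTITIONED BY (`partition`) WITH (
 *     'connector' = 'hudi',
 *     'path' = '/path/to/table',
 *     'table.type' = 'MERGE_ON_READ'
 *   );
 * }</pre>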
 */
public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTableSinkFactory {
  private static final Logger LOG = LoggerFactory.getLogger(HoodieTableFactory.class);

  public static final String FACTORY_ID = "hudi";

  @Override
  public DynamicTableSource createDynamicTableSource(Context context) {
    Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions());
    StoragePath path = new StoragePath(conf.getOptional(FlinkOptions.PATH).orElseThrow(() ->
        new ValidationException("Option [path] should not be empty.")));
    setupTableOptions(conf.getString(FlinkOptions.PATH), conf);
    ResolvedSchema schema = context.getCatalogTable().getResolvedSchema();
    setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema);
    return new HoodieTableSource(
        SerializableSchema.create(schema),
        path,
        context.getCatalogTable().getPartitionKeys(),
        conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME),
        conf);
  }

  @Override
  public DynamicTableSink createDynamicTableSink(Context context) {
    Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions());
    checkArgument(!StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.PATH)),
        "Option [path] should not be empty.");
    setupTableOptions(conf.getString(FlinkOptions.PATH), conf);
    ResolvedSchema schema = context.getCatalogTable().getResolvedSchema();
    sanityCheck(conf, schema);
    setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema);
    setupSortOptions(conf, context.getConfiguration());
    return new HoodieTableSink(conf, schema);
  }

  /**
   * Supplement the table config options if not specified.
   */
  private void setupTableOptions(String basePath, Configuration conf) {
    StreamerUtil.getTableConfig(basePath, HadoopConfigurations.getHadoopConf(conf))
        .ifPresent(tableConfig -> {
          if (tableConfig.contains(HoodieTableConfig.RECORDKEY_FIELDS)
              && !conf.contains(FlinkOptions.RECORD_KEY_FIELD)) {
            conf.setString(FlinkOptions.RECORD_KEY_FIELD, tableConfig.getString(HoodieTableConfig.RECORDKEY_FIELDS));
          }
          if (tableConfig.contains(HoodieTableConfig.PRECOMBINE_FIELD)
              && !conf.contains(FlinkOptions.PRECOMBINE_FIELD)) {
            conf.setString(FlinkOptions.PRECOMBINE_FIELD, tableConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD));
          }
          if (tableConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)
              && !conf.contains(FlinkOptions.HIVE_STYLE_PARTITIONING)) {
            conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, tableConfig.getBoolean(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE));
          }
          if (tableConfig.contains(HoodieTableConfig.TYPE) && conf.contains(FlinkOptions.TABLE_TYPE)) {
            if (!tableConfig.getString(HoodieTableConfig.TYPE).equals(conf.get(FlinkOptions.TABLE_TYPE))) {
              LOG.warn(
                  String.format("Table type conflict: %s in %s vs %s in table options. Resetting the table type to match hoodie.properties.",
                      tableConfig.getString(HoodieTableConfig.TYPE), HoodieTableConfig.HOODIE_PROPERTIES_FILE,
                      conf.get(FlinkOptions.TABLE_TYPE)));
              conf.setString(FlinkOptions.TABLE_TYPE, tableConfig.getString(HoodieTableConfig.TYPE));
            }
          }
          if (tableConfig.contains(HoodieTableConfig.TYPE)
              && !conf.contains(FlinkOptions.TABLE_TYPE)) {
            conf.setString(FlinkOptions.TABLE_TYPE, tableConfig.getString(HoodieTableConfig.TYPE));
          }
          if (tableConfig.contains(HoodieTableConfig.PAYLOAD_CLASS_NAME)
              && !conf.contains(FlinkOptions.PAYLOAD_CLASS_NAME)) {
            conf.setString(FlinkOptions.PAYLOAD_CLASS_NAME, tableConfig.getString(HoodieTableConfig.PAYLOAD_CLASS_NAME));
          }
        });
  }

  @Override
  public String factoryIdentifier() {
    return FACTORY_ID;
  }

  @Override
  public Set<ConfigOption<?>> requiredOptions() {
    return Collections.singleton(FlinkOptions.PATH);
  }

  @Override
  public Set<ConfigOption<?>> optionalOptions() {
    return FlinkOptions.optionalOptions();
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * The sanity check.
   *
   * @param conf   The table options
   * @param schema The table schema
   */
  private void sanityCheck(Configuration conf, ResolvedSchema schema) {
    checkTableType(conf);
    checkIndexType(conf);

    if (!OptionsResolver.isAppendMode(conf)) {
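      // record key validation only matters for upsert-style writes; append mode needs no key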
      checkRecordKey(conf, schema);
    }
    StreamerUtil.checkPreCombineKey(conf, schema.getColumnNames());
  }

  /**
   * Validate the index type.
   */
  private void checkIndexType(Configuration conf) {
    String indexType = conf.get(FlinkOptions.INDEX_TYPE);
    if (!StringUtils.isNullOrEmpty(indexType)) {
      HoodieIndexConfig.INDEX_TYPE.checkValues(indexType);
    }
  }

  /**
   * Validate the table type.
   */
  private void checkTableType(Configuration conf) {
    String tableType = conf.get(FlinkOptions.TABLE_TYPE);
    if (StringUtils.nonEmpty(tableType)) {
      try {
        HoodieTableType.valueOf(tableType);
      } catch (IllegalArgumentException e) {
        throw new HoodieValidationException("Invalid table type: " + tableType + ". Table type should be either "
                + HoodieTableType.MERGE_ON_READ + " or " + HoodieTableType.COPY_ON_WRITE + ".");
      }
    }
  }

  /**
   * Validate the record key.
   */
  private void checkRecordKey(Configuration conf, ResolvedSchema schema) {
    List<String> fields = schema.getColumnNames();
    if (!schema.getPrimaryKey().isPresent()) {
      String[] recordKeys = conf.get(FlinkOptions.RECORD_KEY_FIELD).split(",");
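      // if the option was left at its default value, that default record key field must actually
      // exist in the schema; otherwise an explicit PRIMARY KEY or record key option is required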
      if (recordKeys.length == 1
          && FlinkOptions.RECORD_KEY_FIELD.defaultValue().equals(recordKeys[0])
          && !fields.contains(recordKeys[0])) {
        throw new HoodieValidationException("Primary key definition is required, the default primary key field "
            + "'" + FlinkOptions.RECORD_KEY_FIELD.defaultValue() + "' does not exist in the table schema, "
            + "use either PRIMARY KEY syntax or option '" + FlinkOptions.RECORD_KEY_FIELD.key() + "' to specify.");
      }

      Arrays.stream(recordKeys)
          .filter(field -> !fields.contains(field))
          .findAny()
          .ifPresent(f -> {
            throw new HoodieValidationException("Field '" + f + "' specified in option "
                + "'" + FlinkOptions.RECORD_KEY_FIELD.key() + "' does not exist in the table schema.");
          });
    }
  }

  /**
   * Sets up the config options based on the table definition, e.g. the table name and primary key.
   *
   * @param conf      The configuration to set up
   * @param tablePath The table path
   * @param table     The catalog table
   * @param schema    The physical schema
   */
  private static void setupConfOptions(
      Configuration conf,
      ObjectIdentifier tablePath,
      CatalogTable table,
      ResolvedSchema schema) {
    // table name
    conf.setString(FlinkOptions.TABLE_NAME.key(), tablePath.getObjectName());
    // database name
    conf.setString(FlinkOptions.DATABASE_NAME.key(), tablePath.getDatabaseName());
    // hoodie key about options
    setupHoodieKeyOptions(conf, table);
    // compaction options
    setupCompactionOptions(conf);
    // hive options
    setupHiveOptions(conf, tablePath);
    // read options
    setupReadOptions(conf);
    // write options
    setupWriteOptions(conf);
    // infer avro schema from physical DDL schema
    inferAvroSchema(conf, schema.toPhysicalRowDataType().notNull().getLogicalType());
  }

  /**
   * Sets up the hoodie key options (e.g. record key and partition key) from the table definition.
   */
  private static void setupHoodieKeyOptions(Configuration conf, CatalogTable table) {
    List<String> pkColumns = table.getSchema().getPrimaryKey()
        .map(UniqueConstraint::getColumns).orElse(Collections.emptyList());
    if (pkColumns.size() > 0) {
      // the PRIMARY KEY syntax always has higher priority than option FlinkOptions#RECORD_KEY_FIELD
      String recordKey = String.join(",", pkColumns);
      conf.setString(FlinkOptions.RECORD_KEY_FIELD, recordKey);
    }
    List<String> partitionKeys = table.getPartitionKeys();
    if (partitionKeys.size() > 0) {
      // the PARTITIONED BY syntax always has higher priority than option FlinkOptions#PARTITION_PATH_FIELD
      conf.setString(FlinkOptions.PARTITION_PATH_FIELD, String.join(",", partitionKeys));
    }
    // set index key for bucket index if not defined
    if (conf.getString(FlinkOptions.INDEX_TYPE).equals(HoodieIndex.IndexType.BUCKET.name())) {
      if (conf.getString(FlinkOptions.INDEX_KEY_FIELD).isEmpty()) {
        conf.setString(FlinkOptions.INDEX_KEY_FIELD, conf.getString(FlinkOptions.RECORD_KEY_FIELD));
      } else {
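        // a user-specified bucket index key must be a subset of the record key fields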
        Set<String> recordKeySet =
            Arrays.stream(conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",")).collect(Collectors.toSet());
        Set<String> indexKeySet =
            Arrays.stream(conf.getString(FlinkOptions.INDEX_KEY_FIELD).split(",")).collect(Collectors.toSet());
        if (!recordKeySet.containsAll(indexKeySet)) {
          throw new HoodieValidationException(
              FlinkOptions.INDEX_KEY_FIELD + " should be a subset of or equal to the recordKey fields");
        }
      }
    }

    // tweak the key gen class if possible
    final String[] partitions = conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",");
    final String[] pks = conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(",");
    if (partitions.length == 1) {
      final String partitionField = partitions[0];
      if (partitionField.isEmpty()) {
        conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName());
        LOG.info("Table option [{}] is reset to {} because this is a non-partitioned table",
            FlinkOptions.KEYGEN_CLASS_NAME.key(), NonpartitionedAvroKeyGenerator.class.getName());
        return;
      }
      DataType partitionFieldType = table.getSchema().getFieldDataType(partitionField)
          .orElseThrow(() -> new HoodieValidationException("Field " + partitionField + " does not exist"));
      if (pks.length <= 1 && DataTypeUtils.isDatetimeType(partitionFieldType)) {
        // timestamp based key gen only supports simple primary key
        setupTimestampKeygenOptions(conf, partitionFieldType);
        return;
      }
    }
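    // validate that the configured key generator can handle composite record/partition keys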
    boolean complexHoodieKey = pks.length > 1 || partitions.length > 1;
    StreamerUtil.checkKeygenGenerator(complexHoodieKey, conf);
  }

  /**
   * Sets up the keygen options when the partition path is datetime type.
   *
   * <p>The UTC timezone is used as default.
   */
  public static void setupTimestampKeygenOptions(Configuration conf, DataType fieldType) {
    if (conf.contains(FlinkOptions.KEYGEN_CLASS_NAME)) {
      // the keygen clazz has been set up explicitly, skipping
      return;
    }
    conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, TimestampBasedAvroKeyGenerator.class.getName());
    LOG.info("Table option [{}] is reset to {} because datetime partitioning turns on",
        FlinkOptions.KEYGEN_CLASS_NAME.key(), TimestampBasedAvroKeyGenerator.class.getName());
    if (DataTypeUtils.isTimestampType(fieldType)) {
      int precision = DataTypeUtils.precision(fieldType.getLogicalType());
      if (precision == 0) {
        // seconds
        conf.setString(TIMESTAMP_TYPE_FIELD.key(),
            TimestampBasedAvroKeyGenerator.TimestampType.UNIX_TIMESTAMP.name());
      } else if (precision == 3) {
        // milliseconds
        conf.setString(TIMESTAMP_TYPE_FIELD.key(),
            TimestampBasedAvroKeyGenerator.TimestampType.EPOCHMILLISECONDS.name());
      }
      String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_HOUR);
      conf.setString(TIMESTAMP_OUTPUT_DATE_FORMAT.key(), outputPartitionFormat);
    } else {
      conf.setString(TIMESTAMP_TYPE_FIELD.key(), TimestampBasedAvroKeyGenerator.TimestampType.SCALAR.name());
      conf.setString(INPUT_TIME_UNIT.key(), TimeUnit.DAYS.toString());
      String outputPartitionFormat = conf.getOptional(FlinkOptions.PARTITION_FORMAT).orElse(FlinkOptions.PARTITION_FORMAT_DAY);
      conf.setString(TIMESTAMP_OUTPUT_DATE_FORMAT.key(), outputPartitionFormat);
      // the option is actually useless, it only works for validation
      conf.setString(TIMESTAMP_INPUT_DATE_FORMAT.key(), FlinkOptions.PARTITION_FORMAT_DAY);
    }
    conf.setString(TIMESTAMP_OUTPUT_TIMEZONE_FORMAT.key(), "UTC");
  }

  /**
   * Sets up the compaction options from the table definition.
   */
  private static void setupCompactionOptions(Configuration conf) {
    int commitsToRetain = conf.getInteger(FlinkOptions.CLEAN_RETAIN_COMMITS);
    int minCommitsToKeep = conf.getInteger(FlinkOptions.ARCHIVE_MIN_COMMITS);
    if (commitsToRetain >= minCommitsToKeep) {
      LOG.info("Table option [{}] is reset to {} to be greater than {}={},\n"
              + "to avoid risk of missing data from few instants in incremental pull",
          FlinkOptions.ARCHIVE_MIN_COMMITS.key(), commitsToRetain + 10,
          FlinkOptions.CLEAN_RETAIN_COMMITS.key(), commitsToRetain);
      conf.setInteger(FlinkOptions.ARCHIVE_MIN_COMMITS, commitsToRetain + 10);
      conf.setInteger(FlinkOptions.ARCHIVE_MAX_COMMITS, commitsToRetain + 20);
    }
  }

  /**
   * Sets up the hive options from the table definition.
   */
  private static void setupHiveOptions(Configuration conf, ObjectIdentifier tablePath) {
    if (!conf.contains(FlinkOptions.HIVE_SYNC_DB)) {
      conf.setString(FlinkOptions.HIVE_SYNC_DB, tablePath.getDatabaseName());
    }
    if (!conf.contains(FlinkOptions.HIVE_SYNC_TABLE)) {
      conf.setString(FlinkOptions.HIVE_SYNC_TABLE, tablePath.getObjectName());
    }
  }

  /**
   * Sets up the read options from the table definition.
   */
  private static void setupReadOptions(Configuration conf) {
    if (OptionsResolver.isIncrementalQuery(conf)) {
      conf.setString(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_INCREMENTAL);
    }
  }

  /**
   * Sets up the write options from the table definition.
   */
  private static void setupWriteOptions(Configuration conf) {
    if (FlinkOptions.isDefaultValueDefined(conf, FlinkOptions.OPERATION)
        && OptionsResolver.isCowTable(conf)) {
      conf.setBoolean(FlinkOptions.PRE_COMBINE, true);
    }
  }

  /**
   * Sets up the table exec sort options.
   */
  private void setupSortOptions(Configuration conf, ReadableConfig contextConfig) {
    if (contextConfig.getOptional(TABLE_EXEC_SORT_MAX_NUM_FILE_HANDLES).isPresent()) {
      conf.set(TABLE_EXEC_SORT_MAX_NUM_FILE_HANDLES, contextConfig.get(TABLE_EXEC_SORT_MAX_NUM_FILE_HANDLES));
    }
    if (contextConfig.getOptional(TABLE_EXEC_SPILL_COMPRESSION_ENABLED).isPresent()) {
      conf.set(TABLE_EXEC_SPILL_COMPRESSION_ENABLED, contextConfig.get(TABLE_EXEC_SPILL_COMPRESSION_ENABLED));
    }
    if (contextConfig.getOptional(TABLE_EXEC_SPILL_COMPRESSION_BLOCK_SIZE).isPresent()) {
      conf.set(TABLE_EXEC_SPILL_COMPRESSION_BLOCK_SIZE, contextConfig.get(TABLE_EXEC_SPILL_COMPRESSION_BLOCK_SIZE));
    }
    if (contextConfig.getOptional(TABLE_EXEC_SORT_ASYNC_MERGE_ENABLED).isPresent()) {
      conf.set(TABLE_EXEC_SORT_ASYNC_MERGE_ENABLED, contextConfig.get(TABLE_EXEC_SORT_ASYNC_MERGE_ENABLED));
    }
  }

  /**
   * Infers the deserialization Avro schema from the table schema (e.g. the DDL)
   * if both options {@link FlinkOptions#SOURCE_AVRO_SCHEMA_PATH} and
   * {@link FlinkOptions#SOURCE_AVRO_SCHEMA} are not specified.
   *
   * @param conf    The configuration
   * @param rowType The specified table row type
   */
  private static void inferAvroSchema(Configuration conf, LogicalType rowType) {
    if (!conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH).isPresent()
        && !conf.getOptional(FlinkOptions.SOURCE_AVRO_SCHEMA).isPresent()) {
      String inferredSchema = AvroSchemaConverter.convertToSchema(rowType,
          AvroSchemaUtils.getAvroRecordQualifiedName(conf.getString(FlinkOptions.TABLE_NAME))).toString();
      conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA, inferredSchema);
    }
  }
}




