org.apache.hudi.utilities.config.KafkaSourceConfig
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.utilities.config;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import javax.annotation.concurrent.Immutable;
import static org.apache.hudi.common.util.ConfigUtils.DELTA_STREAMER_CONFIG_PREFIX;
import static org.apache.hudi.common.util.ConfigUtils.STREAMER_CONFIG_PREFIX;
/**
* Kafka Source Configs
*/
@Immutable
@ConfigClassProperty(name = "Kafka Source Configs",
groupName = ConfigGroups.Names.HUDI_STREAMER,
subGroupName = ConfigGroups.SubGroupNames.DELTA_STREAMER_SOURCE,
description = "Configurations controlling the behavior of Kafka source in Hudi Streamer.")
public class KafkaSourceConfig extends HoodieConfig {
public static final String KAFKA_CHECKPOINT_TYPE_STRING = "string";
public static final String KAFKA_CHECKPOINT_TYPE_TIMESTAMP = "timestamp";
public static final String KAFKA_CHECKPOINT_TYPE_SINGLE_OFFSET = "single_offset";
private static final String PREFIX = STREAMER_CONFIG_PREFIX + "source.kafka.";
private static final String OLD_PREFIX = DELTA_STREAMER_CONFIG_PREFIX + "source.kafka.";
public static final ConfigProperty<String> KAFKA_CHECKPOINT_TYPE = ConfigProperty
.key(PREFIX + "checkpoint.type")
.defaultValue(KAFKA_CHECKPOINT_TYPE_STRING)
.withAlternatives(OLD_PREFIX + "checkpoint.type")
.markAdvanced()
.withDocumentation("Kafka checkpoint type. Value must be one of the following: "
+ KAFKA_CHECKPOINT_TYPE_STRING + ", " + KAFKA_CHECKPOINT_TYPE_TIMESTAMP + ", " + KAFKA_CHECKPOINT_TYPE_SINGLE_OFFSET
+ ". Default type is " + KAFKA_CHECKPOINT_TYPE_STRING + ". "
+ "For type " + KAFKA_CHECKPOINT_TYPE_STRING + ", checkpoint should be provided as: topicName,0:offset0,1:offset1,2:offset2. "
+ "For type " + KAFKA_CHECKPOINT_TYPE_TIMESTAMP + ", checkpoint should be provided as long value of desired timestamp. "
+ "For type " + KAFKA_CHECKPOINT_TYPE_SINGLE_OFFSET + ", we assume that topic consists of a single partition, "
+ "so checkpoint should be provided as long value of desired offset.");
public static final ConfigProperty<String> KAFKA_AVRO_VALUE_DESERIALIZER_CLASS = ConfigProperty
.key(PREFIX + "value.deserializer.class")
.defaultValue("io.confluent.kafka.serializers.KafkaAvroDeserializer")
.withAlternatives(OLD_PREFIX + "value.deserializer.class")
.markAdvanced()
.sinceVersion("0.9.0")
.withDocumentation("This class is used by kafka client to deserialize the records.");
public static final ConfigProperty<String> KAFKA_VALUE_DESERIALIZER_SCHEMA = ConfigProperty
.key(PREFIX + "value.deserializer.schema")
.noDefaultValue()
.withAlternatives(OLD_PREFIX + "value.deserializer.schema")
.markAdvanced()
.withDocumentation("Schema to deserialize the records.");
@Deprecated
public static final ConfigProperty<Long> KAFKA_FETCH_PARTITION_TIME_OUT = ConfigProperty
.key(PREFIX + "fetch_partition.time.out")
.defaultValue(300 * 1000L)
.withAlternatives(OLD_PREFIX + "fetch_partition.time.out")
.markAdvanced()
.withDocumentation("Time out for fetching partitions. 5min by default");
public static final ConfigProperty<Boolean> ENABLE_KAFKA_COMMIT_OFFSET = ConfigProperty
.key(PREFIX + "enable.commit.offset")
.defaultValue(false)
.withAlternatives(OLD_PREFIX + "enable.commit.offset")
.markAdvanced()
.withDocumentation("Automatically submits offset to kafka.");
public static final ConfigProperty<Boolean> ENABLE_FAIL_ON_DATA_LOSS = ConfigProperty
.key(PREFIX + "enable.failOnDataLoss")
.defaultValue(false)
.withAlternatives(OLD_PREFIX + "enable.failOnDataLoss")
.markAdvanced()
.withDocumentation("Fail when checkpoint goes out of bounds instead of seeking to earliest offsets.");
public static final ConfigProperty<Long> MAX_EVENTS_FROM_KAFKA_SOURCE = ConfigProperty
.key(STREAMER_CONFIG_PREFIX + "kafka.source.maxEvents")
.defaultValue(5000000L)
.withAlternatives(DELTA_STREAMER_CONFIG_PREFIX + "kafka.source.maxEvents")
.markAdvanced()
.withDocumentation("Maximum number of records obtained in each batch.");
// the documentation is inspired by the minPartition definition of kafka structured streaming
public static final ConfigProperty<Long> KAFKA_SOURCE_MIN_PARTITIONS = ConfigProperty
.key(PREFIX + "minPartitions")
.defaultValue(0L)
.withAlternatives(OLD_PREFIX + "minPartitions")
.markAdvanced()
.sinceVersion("0.14.0")
.withDocumentation("Desired minimum number of partitions to read from Kafka. "
+ "By default, Hudi has a 1-1 mapping of topicPartitions to Hudi partitions consuming from Kafka. "
+ "If set this option to a value greater than topicPartitions, "
+ "Hudi will divvy up large Kafka partitions to smaller pieces. "
+ "Please note that this configuration is like a hint: the number of input tasks will be approximately minPartitions. "
+ "It can be less or more depending on rounding errors or Kafka partitions that didn't receive any new data.");
public static final ConfigProperty<String> KAFKA_TOPIC_NAME = ConfigProperty
.key(PREFIX + "topic")
.noDefaultValue()
.withAlternatives(OLD_PREFIX + "topic")
.withDocumentation("Kafka topic name.");
// "auto.offset.reset" is kafka native config param. Do not change the config param name.
public static final ConfigProperty<KafkaResetOffsetStrategies> KAFKA_AUTO_OFFSET_RESET = ConfigProperty
.key("auto.offset.reset")
.defaultValue(KafkaResetOffsetStrategies.LATEST)
.markAdvanced()
.withDocumentation("Kafka consumer strategy for reading data.");
public static final ConfigProperty<String> KAFKA_PROTO_VALUE_DESERIALIZER_CLASS = ConfigProperty
.key(PREFIX + "proto.value.deserializer.class")
.defaultValue(ByteArrayDeserializer.class.getName())
.sinceVersion("0.15.0")
.withDocumentation("Kafka Proto Payload Deserializer Class");
public static final ConfigProperty<Long> INITIAL_RETRY_INTERVAL_MS = ConfigProperty
.key(PREFIX + "retry.initial_interval_ms")
.defaultValue(100L)
.markAdvanced()
.sinceVersion("1.1.0")
.withDocumentation("Amount of time (in ms) to wait, before retry to do operations on KafkaConsumer.");
public static final ConfigProperty<Long> MAX_RETRY_INTERVAL_MS = ConfigProperty
.key(PREFIX + "retry.max_interval_ms")
.defaultValue(2000L)
.markAdvanced()
.sinceVersion("1.1.0")
.withDocumentation("Maximum amount of time (in ms), to wait for next retry.");
public static final ConfigProperty<Integer> MAX_RETRY_COUNT = ConfigProperty
.key(PREFIX + "retry.max_count")
.defaultValue(4)
.markAdvanced()
.sinceVersion("1.1.0")
.withDocumentation("Maximum number of retry actions to perform, with exponential backoff.");
public static final ConfigProperty<String> RETRY_EXCEPTIONS = ConfigProperty
.key(PREFIX + "retry.exceptions")
.defaultValue("")
.markAdvanced()
.sinceVersion("1.1.0")
.withDocumentation("The class name of the Exception that needs to be retried, separated by commas. "
+ "Default is empty which means retry all the IOException and RuntimeException from KafkaConsumer");
/**
* Kafka reset offset strategies.
*/
public enum KafkaResetOffsetStrategies {
LATEST, EARLIEST, GROUP
}
}
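A minimal usage sketch (assumptions: the wrapper class below is hypothetical and the values are illustrative; TypedProperties and the ConfigProperty key()/defaultValue() accessors come from hudi-common; this snippet is not part of the source above):

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.config.KafkaSourceConfig;

public class KafkaSourceConfigExample {
  public static void main(String[] args) {
    // Build streamer properties using the keys defined by KafkaSourceConfig.
    TypedProperties props = new TypedProperties();
    props.setProperty(KafkaSourceConfig.KAFKA_TOPIC_NAME.key(), "impressions"); // hypothetical topic
    props.setProperty(KafkaSourceConfig.KAFKA_CHECKPOINT_TYPE.key(),
        KafkaSourceConfig.KAFKA_CHECKPOINT_TYPE_STRING);
    // Read a value back, falling back to the documented default when it is unset.
    long maxEvents = Long.parseLong(props.getProperty(
        KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.key(),
        String.valueOf(KafkaSourceConfig.MAX_EVENTS_FROM_KAFKA_SOURCE.defaultValue())));
    System.out.println("maxEvents = " + maxEvents);
  }
}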