All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.kafka010.KafkaSourceProvider.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.kafka010

import java.{util => ju}
import java.util.{Locale, UUID}

import scala.jdk.CollectionConverters._

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer}

import org.apache.spark.internal.{Logging, LogKeys, MDC}
import org.apache.spark.kafka010.KafkaConfigUpdater
import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
import org.apache.spark.sql.connector.metric.{CustomMetric, CustomSumMetric}
import org.apache.spark.sql.connector.read.{Batch, Scan, ScanBuilder}
import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream}
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, SupportsTruncate, Write, WriteBuilder}
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.internal.connector.{SimpleTableProvider, SupportsStreamingUpdateAsAppend}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.ArrayImplicits._

/**
 * The provider class for all Kafka readers and writers. It is designed such that it throws
 * IllegalArgumentException when the Kafka Dataset is created, so that it can catch
 * missing options even before the query is started.
 */
private[kafka010] class KafkaSourceProvider extends DataSourceRegister
    with StreamSourceProvider
    with StreamSinkProvider
    with RelationProvider
    with CreatableRelationProvider
    with SimpleTableProvider
    with Logging {
  import KafkaSourceProvider._

  override def shortName(): String = "kafka"

  /**
   * Returns the name and schema of the source. In addition, it also verifies whether the options
   * are correct and sufficient to create the [[KafkaSource]] when the query is started.
   */
  override def sourceSchema(
      sqlContext: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    val caseInsensitiveParameters = CaseInsensitiveMap(parameters)
    validateStreamOptions(caseInsensitiveParameters)
    require(schema.isEmpty, "Kafka source has a fixed schema and cannot be set with a custom one")
    val includeHeaders = caseInsensitiveParameters.getOrElse(INCLUDE_HEADERS, "false").toBoolean
    (shortName(), KafkaRecordToRowConverter.kafkaSchema(includeHeaders))
  }

  override def createSource(
      sqlContext: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    val caseInsensitiveParameters = CaseInsensitiveMap(parameters)
    validateStreamOptions(caseInsensitiveParameters)
    // Each running query should use its own group id. Otherwise, the query may be only assigned
    // partial data since Kafka will assign partitions to multiple consumers having the same group
    // id. Hence, we should generate a unique id for each query.
    val uniqueGroupId = streamingUniqueGroupId(caseInsensitiveParameters, metadataPath)

    val specifiedKafkaParams = convertToSpecifiedParams(caseInsensitiveParameters)

    val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
      caseInsensitiveParameters, STARTING_TIMESTAMP_OPTION_KEY,
      STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY,
      LatestOffsetRangeLimit)

    val kafkaOffsetReader = KafkaOffsetReader.build(
      strategy(caseInsensitiveParameters),
      kafkaParamsForDriver(specifiedKafkaParams),
      caseInsensitiveParameters,
      driverGroupIdPrefix = s"$uniqueGroupId-driver")

    new KafkaSource(
      sqlContext,
      kafkaOffsetReader,
      kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId),
      caseInsensitiveParameters,
      metadataPath,
      startingStreamOffsets,
      failOnDataLoss(caseInsensitiveParameters))
  }

  override def getTable(options: CaseInsensitiveStringMap): KafkaTable = {
    val includeHeaders = options.getBoolean(INCLUDE_HEADERS, false)
    new KafkaTable(includeHeaders)
  }

  /**
   * Returns a new base relation with the given parameters.
   *
   * @note The parameters' keywords are case insensitive and this insensitivity is enforced
   *       by the Map that is passed to the function.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val caseInsensitiveParameters = CaseInsensitiveMap(parameters)
    validateBatchOptions(caseInsensitiveParameters)
    val specifiedKafkaParams = convertToSpecifiedParams(caseInsensitiveParameters)

    val startingRelationOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
      caseInsensitiveParameters, STARTING_TIMESTAMP_OPTION_KEY,
      STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY,
      EarliestOffsetRangeLimit)
    assert(startingRelationOffsets != LatestOffsetRangeLimit)

    val endingRelationOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
      caseInsensitiveParameters, ENDING_TIMESTAMP_OPTION_KEY,
      ENDING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, ENDING_OFFSETS_OPTION_KEY,
      LatestOffsetRangeLimit)
    assert(endingRelationOffsets != EarliestOffsetRangeLimit)

    val includeHeaders = caseInsensitiveParameters.getOrElse(INCLUDE_HEADERS, "false").toBoolean

    new KafkaRelation(
      sqlContext,
      strategy(caseInsensitiveParameters),
      sourceOptions = caseInsensitiveParameters,
      specifiedKafkaParams = specifiedKafkaParams,
      failOnDataLoss = failOnDataLoss(caseInsensitiveParameters),
      includeHeaders = includeHeaders,
      startingOffsets = startingRelationOffsets,
      endingOffsets = endingRelationOffsets)
  }

  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    val caseInsensitiveParameters = CaseInsensitiveMap(parameters)
    val defaultTopic = caseInsensitiveParameters.get(TOPIC_OPTION_KEY).map(_.trim)
    val specifiedKafkaParams = kafkaParamsForProducer(caseInsensitiveParameters)
    new KafkaSink(specifiedKafkaParams, defaultTopic)
  }

  override def createRelation(
      outerSQLContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    mode match {
      case SaveMode.Overwrite | SaveMode.Ignore =>
        throw new AnalysisException(
          errorClass = "_LEGACY_ERROR_TEMP_3081",
          messageParameters = Map(
            "mode" -> mode.toString,
            "append" -> SaveMode.Append.toString,
            "errorIfExists" -> SaveMode.ErrorIfExists.toString))
      case _ => // good
    }
    val caseInsensitiveParameters = CaseInsensitiveMap(parameters)
    val topic = caseInsensitiveParameters.get(TOPIC_OPTION_KEY).map(_.trim)
    val specifiedKafkaParams = kafkaParamsForProducer(caseInsensitiveParameters)
    KafkaWriter.write(data.queryExecution, specifiedKafkaParams, topic)

    /* This method is suppose to return a relation that reads the data that was written.
     * We cannot support this for Kafka. Therefore, in order to make things consistent,
     * we return an empty base relation.
     */
    new BaseRelation {
      override def sqlContext: SQLContext = unsupportedException
      override def schema: StructType = unsupportedException
      override def needConversion: Boolean = unsupportedException
      override def sizeInBytes: Long = unsupportedException
      override def unhandledFilters(filters: Array[Filter]): Array[Filter] = unsupportedException
      private def unsupportedException =
        throw new UnsupportedOperationException("BaseRelation from Kafka write " +
          "operation is not usable.")
    }
  }

  private def strategy(params: CaseInsensitiveMap[String]) = {
    val lowercaseParams = params.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) }

    lowercaseParams.find(x => STRATEGY_OPTION_KEYS.contains(x._1)).get match {
      case (ASSIGN, value) =>
        AssignStrategy(JsonUtils.partitions(value))
      case (SUBSCRIBE, value) =>
        SubscribeStrategy(value.split(",").map(_.trim()).filter(_.nonEmpty).toImmutableArraySeq)
      case (SUBSCRIBE_PATTERN, value) =>
        SubscribePatternStrategy(value.trim())
      case _ =>
        // Should never reach here as we are already matching on
        // matched strategy names
        throw new IllegalArgumentException("Unknown option")
    }
  }

  private def failOnDataLoss(params: CaseInsensitiveMap[String]) =
    params.getOrElse(FAIL_ON_DATA_LOSS_OPTION_KEY, "true").toBoolean

  private def validateGeneralOptions(params: CaseInsensitiveMap[String]): Unit = {
    // Validate source options
    val lowercaseParams = params.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) }
    val specifiedStrategies =
      lowercaseParams.filter { case (k, _) => STRATEGY_OPTION_KEYS.contains(k) }.toSeq

    if (specifiedStrategies.isEmpty) {
      throw new IllegalArgumentException(
        "One of the following options must be specified for Kafka source: "
          + STRATEGY_OPTION_KEYS.mkString(", ") + ". See the docs for more details.")
    } else if (specifiedStrategies.size > 1) {
      throw new IllegalArgumentException(
        "Only one of the following options can be specified for Kafka source: "
          + STRATEGY_OPTION_KEYS.mkString(", ") + ". See the docs for more details.")
    }

    lowercaseParams.find(x => STRATEGY_OPTION_KEYS.contains(x._1)).get match {
      case (ASSIGN, value) =>
        if (!value.trim.startsWith("{")) {
          throw new IllegalArgumentException(
            "No topicpartitions to assign as specified value for option " +
              s"'assign' is '$value'")
        }

      case (SUBSCRIBE, value) =>
        val topics = value.split(",").map(_.trim).filter(_.nonEmpty)
        if (topics.isEmpty) {
          throw new IllegalArgumentException(
            "No topics to subscribe to as specified value for option " +
              s"'subscribe' is '$value'")
        }
      case (SUBSCRIBE_PATTERN, value) =>
        val pattern = params(SUBSCRIBE_PATTERN).trim()
        if (pattern.isEmpty) {
          throw new IllegalArgumentException(
            "Pattern to subscribe is empty as specified value for option " +
              s"'subscribePattern' is '$value'")
        }
      case _ =>
        // Should never reach here as we are already matching on
        // matched strategy names
        throw new IllegalArgumentException("Unknown option")
    }

    // Validate minPartitions value if present
    if (params.contains(MIN_PARTITIONS_OPTION_KEY)) {
      val p = params(MIN_PARTITIONS_OPTION_KEY).toInt
      if (p <= 0) throw new IllegalArgumentException("minPartitions must be positive")
    }

    // Validate user-specified Kafka options

    if (params.contains(s"kafka.${ConsumerConfig.GROUP_ID_CONFIG}")) {
      logWarning(CUSTOM_GROUP_ID_ERROR_MESSAGE)
      if (params.contains(GROUP_ID_PREFIX)) {
        logWarning(log"Option groupIdPrefix will be ignored as " +
          log"option kafka.${MDC(LogKeys.CONFIG, ConsumerConfig.GROUP_ID_CONFIG)} has been set.")
      }
    }

    if (params.contains(s"kafka.${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG}")) {
      throw new IllegalArgumentException(
        s"""
           |Kafka option '${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG}' is not supported.
           |Instead set the source option '$STARTING_OFFSETS_OPTION_KEY' to 'earliest' or 'latest'
           |to specify where to start. Structured Streaming manages which offsets are consumed
           |internally, rather than relying on the kafkaConsumer to do it. This will ensure that no
           |data is missed when new topics/partitions are dynamically subscribed. Note that
           |'$STARTING_OFFSETS_OPTION_KEY' only applies when a new Streaming query is started, and
           |that resuming will always pick up from where the query left off. See the docs for more
           |details.
         """.stripMargin)
    }

    if (params.contains(s"kafka.${ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG}")) {
      throw new IllegalArgumentException(
        s"Kafka option '${ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG}' is not supported as keys "
          + "are deserialized as byte arrays with ByteArrayDeserializer. Use DataFrame operations "
          + "to explicitly deserialize the keys.")
    }

    if (params.contains(s"kafka.${ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG}"))
    {
      throw new IllegalArgumentException(
        s"Kafka option '${ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG}' is not supported as "
          + "values are deserialized as byte arrays with ByteArrayDeserializer. Use DataFrame "
          + "operations to explicitly deserialize the values.")
    }

    val otherUnsupportedConfigs = Seq(
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, // committing correctly requires new APIs in Source
      ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG) // interceptors can modify payload, so not safe

    otherUnsupportedConfigs.foreach { c =>
      if (params.contains(s"kafka.$c")) {
        throw new IllegalArgumentException(s"Kafka option '$c' is not supported")
      }
    }

    if (!params.contains(s"kafka.${ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG}")) {
      throw new IllegalArgumentException(
        s"Option 'kafka.${ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG}' must be specified for " +
          s"configuring Kafka consumer")
    }

    if (params.contains(MIN_OFFSET_PER_TRIGGER) && params.contains(MAX_OFFSET_PER_TRIGGER)) {
      val minOffsets = params.get(MIN_OFFSET_PER_TRIGGER).get.toLong
      val maxOffsets = params.get(MAX_OFFSET_PER_TRIGGER).get.toLong
      if (minOffsets > maxOffsets) {
        throw new IllegalArgumentException(s"The value of minOffsetPerTrigger($minOffsets) is " +
          s"higher than the maxOffsetsPerTrigger($maxOffsets).")
      }
    }
  }

  private def validateStreamOptions(params: CaseInsensitiveMap[String]) = {
    // Stream specific options
    params.get(ENDING_OFFSETS_OPTION_KEY).map(_ =>
      throw new IllegalArgumentException("ending offset not valid in streaming queries"))
    params.get(ENDING_OFFSETS_BY_TIMESTAMP_OPTION_KEY).map(_ =>
      throw new IllegalArgumentException("ending timestamp not valid in streaming queries"))

    validateGeneralOptions(params)
  }

  private def validateBatchOptions(params: CaseInsensitiveMap[String]) = {
    // Batch specific options
    KafkaSourceProvider.getKafkaOffsetRangeLimit(
      params, STARTING_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY,
      STARTING_OFFSETS_OPTION_KEY, EarliestOffsetRangeLimit) match {
      case EarliestOffsetRangeLimit => // good to go
      case LatestOffsetRangeLimit =>
        throw new IllegalArgumentException("starting offset can't be latest " +
          "for batch queries on Kafka")
      case SpecificOffsetRangeLimit(partitionOffsets) =>
        partitionOffsets.foreach {
          case (tp, off) if off == KafkaOffsetRangeLimit.LATEST =>
            throw new IllegalArgumentException(s"startingOffsets for $tp can't " +
              "be latest for batch queries on Kafka")
          case _ => // ignore
        }
      case _: SpecificTimestampRangeLimit => // good to go
      case _: GlobalTimestampRangeLimit => // good to go
    }

    KafkaSourceProvider.getKafkaOffsetRangeLimit(
      params, ENDING_TIMESTAMP_OPTION_KEY, ENDING_OFFSETS_BY_TIMESTAMP_OPTION_KEY,
      ENDING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) match {
      case EarliestOffsetRangeLimit =>
        throw new IllegalArgumentException("ending offset can't be earliest " +
          "for batch queries on Kafka")
      case LatestOffsetRangeLimit => // good to go
      case SpecificOffsetRangeLimit(partitionOffsets) =>
        partitionOffsets.foreach {
          case (tp, off) if off == KafkaOffsetRangeLimit.EARLIEST =>
            throw new IllegalArgumentException(s"ending offset for $tp can't be " +
              "earliest for batch queries on Kafka")
          case _ => // ignore
        }
      case _: SpecificTimestampRangeLimit => // good to go
      case _: GlobalTimestampRangeLimit => // good to go
    }

    validateGeneralOptions(params)

    // Don't want to throw an error, but at least log a warning.
    if (params.contains(MAX_OFFSET_PER_TRIGGER)) {
      logWarning("maxOffsetsPerTrigger option ignored in batch queries")
    }

    if (params.contains(MIN_OFFSET_PER_TRIGGER)) {
      logWarning("minOffsetsPerTrigger option ignored in batch queries")
    }

    if (params.contains(MAX_TRIGGER_DELAY)) {
      logWarning("maxTriggerDelay option ignored in batch queries")
    }
  }

  class KafkaTable(includeHeaders: Boolean) extends Table with SupportsRead with SupportsWrite {

    override def name(): String = "KafkaTable"

    override def schema(): StructType = KafkaRecordToRowConverter.kafkaSchema(includeHeaders)

    override def capabilities(): ju.Set[TableCapability] = {
      import TableCapability._
      // ACCEPT_ANY_SCHEMA is needed because of the following reasons:
      // * Kafka writer validates the schema instead of the SQL analyzer (the schema is fixed)
      // * Read schema differs from write schema (please see Kafka integration guide)
      ju.EnumSet.of(BATCH_READ, BATCH_WRITE, MICRO_BATCH_READ, CONTINUOUS_READ, STREAMING_WRITE,
        ACCEPT_ANY_SCHEMA)
    }

    override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
      () => new KafkaScan(options)

    override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
      new WriteBuilder with SupportsTruncate with SupportsStreamingUpdateAsAppend {
        private val options = info.options
        private val inputSchema: StructType = info.schema()
        private val topic = Option(options.get(TOPIC_OPTION_KEY)).map(_.trim)
        private val producerParams =
          kafkaParamsForProducer(CaseInsensitiveMap(options.asScala.toMap))

        override def build(): Write = KafkaWrite(topic, producerParams, inputSchema)

        override def truncate(): WriteBuilder = this
      }
    }
  }

  class KafkaScan(options: CaseInsensitiveStringMap) extends Scan {
    val includeHeaders = options.getBoolean(INCLUDE_HEADERS, false)

    override def readSchema(): StructType = {
      KafkaRecordToRowConverter.kafkaSchema(includeHeaders)
    }

    override def toBatch(): Batch = {
      val caseInsensitiveOptions = CaseInsensitiveMap(options.asScala.toMap)
      validateBatchOptions(caseInsensitiveOptions)
      val specifiedKafkaParams = convertToSpecifiedParams(caseInsensitiveOptions)

      val startingRelationOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
        caseInsensitiveOptions, STARTING_TIMESTAMP_OPTION_KEY,
        STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY,
        EarliestOffsetRangeLimit)

      val endingRelationOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
        caseInsensitiveOptions, ENDING_TIMESTAMP_OPTION_KEY,
        ENDING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, ENDING_OFFSETS_OPTION_KEY,
        LatestOffsetRangeLimit)

      new KafkaBatch(
        strategy(caseInsensitiveOptions),
        caseInsensitiveOptions,
        specifiedKafkaParams,
        failOnDataLoss(caseInsensitiveOptions),
        startingRelationOffsets,
        endingRelationOffsets,
        includeHeaders)
    }

    override def toMicroBatchStream(checkpointLocation: String): MicroBatchStream = {
      val caseInsensitiveOptions = CaseInsensitiveMap(options.asScala.toMap)
      validateStreamOptions(caseInsensitiveOptions)
      // Each running query should use its own group id. Otherwise, the query may be only assigned
      // partial data since Kafka will assign partitions to multiple consumers having the same group
      // id. Hence, we should generate a unique id for each query.
      val uniqueGroupId = streamingUniqueGroupId(caseInsensitiveOptions, checkpointLocation)

      val specifiedKafkaParams = convertToSpecifiedParams(caseInsensitiveOptions)

      val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
        caseInsensitiveOptions, STARTING_TIMESTAMP_OPTION_KEY,
        STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY,
        LatestOffsetRangeLimit)

      val kafkaOffsetReader = KafkaOffsetReader.build(
        strategy(caseInsensitiveOptions),
        kafkaParamsForDriver(specifiedKafkaParams),
        caseInsensitiveOptions,
        driverGroupIdPrefix = s"$uniqueGroupId-driver")

      new KafkaMicroBatchStream(
        kafkaOffsetReader,
        kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId),
        options,
        checkpointLocation,
        startingStreamOffsets,
        failOnDataLoss(caseInsensitiveOptions))
    }

    override def toContinuousStream(checkpointLocation: String): ContinuousStream = {
      val caseInsensitiveOptions = CaseInsensitiveMap(options.asScala.toMap)
      validateStreamOptions(caseInsensitiveOptions)
      // Each running query should use its own group id. Otherwise, the query may be only assigned
      // partial data since Kafka will assign partitions to multiple consumers having the same group
      // id. Hence, we should generate a unique id for each query.
      val uniqueGroupId = streamingUniqueGroupId(caseInsensitiveOptions, checkpointLocation)

      val specifiedKafkaParams = convertToSpecifiedParams(caseInsensitiveOptions)

      val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(
        caseInsensitiveOptions, STARTING_TIMESTAMP_OPTION_KEY,
        STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY,
        LatestOffsetRangeLimit)

      val kafkaOffsetReader = KafkaOffsetReader.build(
        strategy(caseInsensitiveOptions),
        kafkaParamsForDriver(specifiedKafkaParams),
        caseInsensitiveOptions,
        driverGroupIdPrefix = s"$uniqueGroupId-driver")

      new KafkaContinuousStream(
        kafkaOffsetReader,
        kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId),
        options,
        checkpointLocation,
        startingStreamOffsets,
        failOnDataLoss(caseInsensitiveOptions))
    }

    override def supportedCustomMetrics(): Array[CustomMetric] = {
      Array(new OffsetOutOfRangeMetric, new DataLossMetric)
    }

    override def columnarSupportMode(): Scan.ColumnarSupportMode =
      Scan.ColumnarSupportMode.UNSUPPORTED
  }
}

private[spark] class OffsetOutOfRangeMetric extends CustomSumMetric {
  override def name(): String = "offsetOutOfRange"
  override def description(): String = "estimated number of fetched offsets out of range"
}

private[spark] class DataLossMetric extends CustomSumMetric {
  override def name(): String = "dataLoss"
  override def description(): String = "number of data loss error"
}

private[kafka010] object KafkaSourceProvider extends Logging {
  private val ASSIGN = "assign"
  private val SUBSCRIBE_PATTERN = "subscribepattern"
  private val SUBSCRIBE = "subscribe"
  private val STRATEGY_OPTION_KEYS = Set(SUBSCRIBE, SUBSCRIBE_PATTERN, ASSIGN)
  private[kafka010] val STARTING_OFFSETS_OPTION_KEY = "startingoffsets"
  private[kafka010] val ENDING_OFFSETS_OPTION_KEY = "endingoffsets"
  private[kafka010] val STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY = "startingoffsetsbytimestamp"
  private[kafka010] val ENDING_OFFSETS_BY_TIMESTAMP_OPTION_KEY = "endingoffsetsbytimestamp"
  private[kafka010] val STARTING_TIMESTAMP_OPTION_KEY = "startingtimestamp"
  private[kafka010] val ENDING_TIMESTAMP_OPTION_KEY = "endingtimestamp"
  private val FAIL_ON_DATA_LOSS_OPTION_KEY = "failondataloss"
  private[kafka010] val MIN_PARTITIONS_OPTION_KEY = "minpartitions"
  private[kafka010] val MAX_OFFSET_PER_TRIGGER = "maxoffsetspertrigger"
  private[kafka010] val MIN_OFFSET_PER_TRIGGER = "minoffsetspertrigger"
  private[kafka010] val MAX_TRIGGER_DELAY = "maxtriggerdelay"
  private[kafka010] val DEFAULT_MAX_TRIGGER_DELAY = "15m"
  private[kafka010] val FETCH_OFFSET_NUM_RETRY = "fetchoffset.numretries"
  private[kafka010] val FETCH_OFFSET_RETRY_INTERVAL_MS = "fetchoffset.retryintervalms"
  private[kafka010] val CONSUMER_POLL_TIMEOUT = "kafkaconsumer.polltimeoutms"
  private[kafka010] val STARTING_OFFSETS_BY_TIMESTAMP_STRATEGY_KEY =
    "startingoffsetsbytimestampstrategy"
  private val GROUP_ID_PREFIX = "groupidprefix"
  private[kafka010] val INCLUDE_HEADERS = "includeheaders"
  // This is only for internal testing and should not be used otherwise.
  private[kafka010] val MOCK_SYSTEM_TIME = "_mockSystemTime"

  private[kafka010] object StrategyOnNoMatchStartingOffset extends Enumeration {
    val ERROR, LATEST = Value
  }

  val TOPIC_OPTION_KEY = "topic"

  val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE =
    """
      |Some data may have been lost because they are not available in Kafka any more; either the
      | data was aged out by Kafka or the topic may have been deleted before all the data in the
      | topic was processed. If you want your streaming query to fail on such cases, set the source
      | option "failOnDataLoss" to "true".
    """.stripMargin

  val CUSTOM_GROUP_ID_ERROR_MESSAGE =
    s"""Kafka option 'kafka.${ConsumerConfig.GROUP_ID_CONFIG}' has been set on this query, it is
       | not recommended to set this option. This option is unsafe to use since multiple concurrent
       | queries or sources using the same group id will interfere with each other as they are part
       | of the same consumer group. Restarted queries may also suffer interference from the
       | previous run having the same group id. The user should have only one query per group id,
       | and/or set the option 'kafka.session.timeout.ms' to be very small so that the Kafka
       | consumers from the previous query are marked dead by the Kafka group coordinator before the
       | restarted query starts running.
    """.stripMargin

  private val serClassName = classOf[ByteArraySerializer].getName
  private val deserClassName = classOf[ByteArrayDeserializer].getName

  def getKafkaOffsetRangeLimit(
      params: CaseInsensitiveMap[String],
      globalOffsetTimestampOptionKey: String,
      offsetByTimestampOptionKey: String,
      offsetOptionKey: String,
      defaultOffsets: KafkaOffsetRangeLimit): KafkaOffsetRangeLimit = {
    // The order below represents "preferences"

    val strategyOnNoMatchStartingOffset = params.get(STARTING_OFFSETS_BY_TIMESTAMP_STRATEGY_KEY)
      .map(v => StrategyOnNoMatchStartingOffset.withName(v.toUpperCase(Locale.ROOT)))
      .getOrElse(StrategyOnNoMatchStartingOffset.ERROR)

    if (params.contains(globalOffsetTimestampOptionKey)) {
      // 1. global timestamp
      val tsStr = params(globalOffsetTimestampOptionKey).trim
      try {
        val ts = tsStr.toLong
        GlobalTimestampRangeLimit(ts, strategyOnNoMatchStartingOffset)
      } catch {
        case _: NumberFormatException =>
          throw new IllegalArgumentException(s"Expected a single long value, got $tsStr")
      }
    } else if (params.contains(offsetByTimestampOptionKey)) {
      // 2. timestamp per topic partition
      val json = params(offsetByTimestampOptionKey).trim
      SpecificTimestampRangeLimit(JsonUtils.partitionTimestamps(json),
        strategyOnNoMatchStartingOffset)
    } else {
      // 3. latest/earliest/offset
      params.get(offsetOptionKey).map(_.trim) match {
        case Some(offset) if offset.toLowerCase(Locale.ROOT) == "latest" =>
          LatestOffsetRangeLimit
        case Some(offset) if offset.toLowerCase(Locale.ROOT) == "earliest" =>
          EarliestOffsetRangeLimit
        case Some(json) => SpecificOffsetRangeLimit(JsonUtils.partitionOffsets(json))
        case None => defaultOffsets
      }
    }
  }

  def kafkaParamsForDriver(specifiedKafkaParams: Map[String, String]): ju.Map[String, Object] =
    KafkaConfigUpdater("source", specifiedKafkaParams)
      .set(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, deserClassName)
      .set(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, deserClassName)

      // Set to "earliest" to avoid exceptions. However, KafkaSource will fetch the initial
      // offsets by itself instead of counting on KafkaConsumer.
      .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

      // So that consumers in the driver does not commit offsets unnecessarily
      .set(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false")

      // So that the driver does not pull too much data
      .set(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, java.lang.Integer.valueOf(1))

      // If buffer config is not set, set it to reasonable value to work around
      // buffer issues (see KAFKA-3135)
      .setIfUnset(ConsumerConfig.RECEIVE_BUFFER_CONFIG, 65536: java.lang.Integer)
      .build()

  def kafkaParamsForExecutors(
      specifiedKafkaParams: Map[String, String],
      uniqueGroupId: String): ju.Map[String, Object] =
    KafkaConfigUpdater("executor", specifiedKafkaParams)
      .set(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, deserClassName)
      .set(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, deserClassName)

      // Make sure executors do only what the driver tells them.
      .set(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "none")

      // So that consumers in executors do not mess with any existing group id
      .setIfUnset(ConsumerConfig.GROUP_ID_CONFIG, s"$uniqueGroupId-executor")

      // So that consumers in executors does not commit offsets unnecessarily
      .set(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false")

      // If buffer config is not set, set it to reasonable value to work around
      // buffer issues (see KAFKA-3135)
      .setIfUnset(ConsumerConfig.RECEIVE_BUFFER_CONFIG, 65536: java.lang.Integer)
      .build()

  /**
   * Returns a unique batch consumer group (group.id), allowing the user to set the prefix of
   * the consumer group
   */
  private[kafka010] def batchUniqueGroupId(params: CaseInsensitiveMap[String]): String = {
    val groupIdPrefix = params.getOrElse(GROUP_ID_PREFIX, "spark-kafka-relation")
    s"${groupIdPrefix}-${UUID.randomUUID}"
  }

  /**
   * Returns a unique streaming consumer group (group.id), allowing the user to set the prefix of
   * the consumer group
   */
  private def streamingUniqueGroupId(
      params: CaseInsensitiveMap[String],
      metadataPath: String): String = {
    val groupIdPrefix = params.getOrElse(GROUP_ID_PREFIX, "spark-kafka-source")
    s"${groupIdPrefix}-${UUID.randomUUID}-${metadataPath.hashCode}"
  }

  private[kafka010] def kafkaParamsForProducer(
      params: CaseInsensitiveMap[String]): ju.Map[String, Object] = {
    if (params.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) {
      throw new IllegalArgumentException(
        s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys "
          + "are serialized with ByteArraySerializer.")
    }

    if (params.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) {
      throw new IllegalArgumentException(
        s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as "
          + "value are serialized with ByteArraySerializer.")
    }

    val specifiedKafkaParams = convertToSpecifiedParams(params)

    KafkaConfigUpdater("executor", specifiedKafkaParams)
      .set(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, serClassName)
      .set(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serClassName)
      .build()
  }

  private def convertToSpecifiedParams(parameters: Map[String, String]): Map[String, String] = {
    parameters
      .keySet
      .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka."))
      .map { k => k.drop(6) -> parameters(k) }
      .toMap
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy