com.github.panhongan.util.sparkstreaming.StreamingFactory.scala

package com.github.panhongan.util.sparkstreaming

import kafka.serializer.StringDecoder
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import org.apache.spark.streaming.kafka.OffsetRange
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.HasOffsetRanges

import org.slf4j.LoggerFactory
import org.slf4j.Logger

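/**
 * Factory helpers for building Kafka direct streams with the Spark Streaming
 * Kafka 0.8 API (org.apache.spark.streaming.kafka). Consumer offsets are saved to
 * and recovered from ZooKeeper through the KafkaOffsetUtil helper, which is
 * expected to live in the same package (not shown in this file).
 */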
object StreamingFactory {

  private val logger: Logger = LoggerFactory.getLogger(StreamingFactory.getClass)

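  /**
   * Creates a direct stream for the given topics without restoring any saved offsets;
   * the starting position is controlled by the "auto.offset.reset" entry in kafka_param.
   */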
  def createDirectStreamIgnoreOffset(topic_set: Set[String],
                                     kafka_param: Map[String, String],
                                     streaming_context: StreamingContext): InputDStream[(String, String)] = {
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streaming_context, kafka_param, topic_set)
  }

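  /**
   * Creates a direct stream that resumes from the consumer offsets previously saved in
   * ZooKeeper under offset_zk_path (via KafkaOffsetUtil), after comparing them with the
   * latest write (producer) offsets of the topic:
   *   - consumer equals producer: re-read the last saved range when
   *     reuse_last_data_when_largest is true, otherwise continue from the saved end offsets;
   *   - consumer behind producer: continue from the saved end offsets;
   *   - consumer ahead of producer (invalid): revise the consumer offsets first.
   * When no saved offsets are found, the stream falls back to "auto.offset.reset".
   * Note that the fallback branch yields (key, message) pairs, while the offset-based
   * branch yields (topic, message) pairs via the message handler.
   * The offsets of every processed batch are written back to ZooKeeper via save_offset.
   */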
  def createDirectStreamByOffset(zk_list: String,
                                 offset_zk_path: String,
                                 topic: String,
                                 partition_num: Int,
                                 kafka_param: Map[String, String],
                                 streaming_context: StreamingContext,
                                 reuse_last_data_when_largest: Boolean): InputDStream[(String, String)] = {

    var start_read_offsets = Map[TopicAndPartition, Long]()

    // latest consumer (read) offsets previously saved in ZooKeeper
    val latest_read_offsets = KafkaOffsetUtil.readOffset(zk_list,
      offset_zk_path,
      kafka_param.get("group.id").get,
      topic,
      partition_num)
    if (latest_read_offsets.nonEmpty) {
      for (offset <- latest_read_offsets) {
        logger.info("start from, topic = " + offset.topic +
          ", partition = " + offset.partition +
          ", start_offset = " + offset.fromOffset +
          ", end_offset = " + offset.untilOffset)
      }

      // latest write offset
      val latest_write_offsets = KafkaOffsetUtil.getLatestWriteOffset(kafka_param.get("metadata.broker.list").get, topic)
      if (latest_write_offsets.isEmpty) {
        logger.warn("failed to get partition write offset, topic = {}", topic)
        System.exit(1)
      }

      val compare = KafkaOffsetUtil.compareConsumerAndProducerOffset(latest_read_offsets, latest_write_offsets)
      if (compare == 0) { // consumer offsets equal the latest write offsets (fully caught up)
        if (reuse_last_data_when_largest) {
          for (offset <- latest_read_offsets) {
            start_read_offsets += (TopicAndPartition(offset.topic, offset.partition) -> offset.fromOffset)
          }
        } else {
          for (offset <- latest_read_offsets) {
            start_read_offsets += (TopicAndPartition(offset.topic, offset.partition) -> offset.untilOffset)
          }
        } // end else
      } else if (compare < 0) { // consumer offsets behind the latest write offsets: resume from saved end offsets
        for (offset <- latest_read_offsets) {
          start_read_offsets += (TopicAndPartition(offset.topic, offset.partition) -> offset.untilOffset)
        }
      } else if (compare > 0) { // consumer offset > producer offset (invalid)
        logger.info("consumer offset > producer offset, will revise")

        val revised_latest_read_offsets = KafkaOffsetUtil.reviseConsumerOffset(latest_read_offsets, latest_write_offsets)
        for (offset <- revised_latest_read_offsets) {
          start_read_offsets += (TopicAndPartition(offset.topic, offset.partition) -> offset.untilOffset)
        }
      }
    } else {
     logger.info("no consumer offset, topic = " + topic + ", groupid = " + kafka_param.get("group.id").get)
    }

    // create the direct stream: resume from the recovered offsets, or fall back to
    // "auto.offset.reset" when no saved offsets were found
    val msg_handler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message)

    val kafka_stream: InputDStream[(String, String)] =
      if (start_read_offsets.isEmpty) {
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](streaming_context, kafka_param, Set(topic))
      } else {
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
          streaming_context, kafka_param, start_read_offsets, msg_handler)
      }
    
    save_offset(kafka_stream, zk_list,
      offset_zk_path,
      kafka_param.get("group.id").get)

    kafka_stream
  }

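  /**
   * Registers a foreachRDD action that writes the offset range of every processed batch
   * back to ZooKeeper under offset_zk_path for the given consumer group.
   */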
  def save_offset(kafka_stream: InputDStream[(String, String)],
                  zk_list: String,
                  offset_zk_path: String,
                  consumer_group: String): Unit = {
    kafka_stream.foreachRDD(rdd => {
      val offsetArr = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (offset <- offsetArr) {
        val ret = KafkaOffsetUtil.writeOffset(zk_list, offset_zk_path,
          consumer_group, offset)
        if (!ret) {
          logger.warn("write offset failed : group_id = " + consumer_group + ", offset = " + offset)
        } else {
          logger.info("write offset succeed : group_id = " + consumer_group + ", offset = " + offset)
        }
      }
    })
  }

}
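
A minimal usage sketch of the factory (not part of the library): it assumes a local broker and ZooKeeper, a matching spark-streaming-kafka (Kafka 0.8) dependency on the classpath, and purely illustrative topic, path, and group names.

import com.github.panhongan.util.sparkstreaming.StreamingFactory
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingFactoryExample {
  def main(args: Array[String]): Unit = {
    // local[2] is only for local testing; drop setMaster when submitting to a cluster
    val conf = new SparkConf().setAppName("streaming-factory-example").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // Hypothetical endpoints and names; adjust to your environment.
    val kafkaParams = Map(
      "metadata.broker.list" -> "localhost:9092",
      "group.id" -> "example-group",
      "auto.offset.reset" -> "smallest")

    // Resume from offsets saved in ZooKeeper, or fall back to auto.offset.reset.
    val stream = StreamingFactory.createDirectStreamByOffset(
      zk_list = "localhost:2181",
      offset_zk_path = "/consumers/offsets",
      topic = "example-topic",
      partition_num = 3,
      kafka_param = kafkaParams,
      streaming_context = ssc,
      reuse_last_data_when_largest = false)

    stream.foreachRDD(rdd => println(s"batch size = ${rdd.count()}"))

    ssc.start()
    ssc.awaitTermination()
  }
}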



