datahub.ReliableDatahubReceiver.scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.streaming.aliyun.datahub


import java.util.concurrent.{ConcurrentHashMap, ThreadPoolExecutor}

import com.aliyun.datahub.DatahubConfiguration
import com.aliyun.datahub.auth.AliyunAccount
import com.aliyun.datahub.exception.{DatahubClientException, OffsetResetedException}
import com.aliyun.datahub.model.GetCursorRequest.CursorType
import com.aliyun.datahub.model.{GetRecordsResult, GetTopicResult, OffsetContext, RecordEntry}
import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.storage.{StorageLevel, StreamBlockId}
import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver}
import org.apache.spark.util.ThreadUtils

import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.reflect.ClassTag

private[datahub] class ReliableDatahubReceiver[T: ClassTag](
    projectName: String,
    topicName: String,
    subId: String,
    accessKeyId: String,
    accessKeySecret: String,
    endpoint: String,
    shardId: String,
    func: RecordEntry => T,
    storageLevel: StorageLevel,
    sparkDatahubConf: Map[String, String])
  extends Receiver[T](storageLevel) with Logging {
  /**
    * An ArrayBuffer that tracks the offsets of records in the current block. The buffer is
    * only accessed from inside synchronized blocks, so it will not run into concurrency
    * issues.
    */
  private var offsetBuffer: ArrayBuffer[OffsetContext.Offset] = null

  /** A concurrent HashMap to store the stream block id and related offset snapshot. */
  private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, Array[OffsetContext.Offset]] = null

  /**
    * Manage the BlockGenerator in the receiver itself for better control over block storing
    * and offset committing.
    */
  private var blockGenerator: BlockGenerator = null

  /** Thread pool running the handler that receives messages from DataHub. */
  private var messageHandlerThreadPool: ThreadPoolExecutor = null

  private var client: DatahubClientOpt = null

  private var offsetCtx: OffsetContext = null
  private var topicResult: GetTopicResult = null
  private var cursor: String = null
  private val getRecordLimits =
    sparkDatahubConf.getOrElse("spark.datahub.batch.getrecord.limits", "1000").toInt
  private val getRecordSleepTime =
    sparkDatahubConf.getOrElse("spark.datahub.norecord.waittimes", "100").toLong
  private val commitRows =
    sparkDatahubConf.getOrElse("spark.datahub.commit.perrows", "100").toInt
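  // For illustration, the receiver therefore expects a configuration map along these lines
  // (the values shown are simply the defaults above):
  //   Map(
  //     "spark.datahub.batch.getrecord.limits" -> "1000", // max records per getRecords call
  //     "spark.datahub.norecord.waittimes" -> "100",      // sleep in ms when a poll returns no records
  //     "spark.datahub.commit.perrows" -> "100")          // commit the offset context every N offsets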

  override def onStart(): Unit = {
    logInfo(s"Starting Datahub Reliable Receiver for " +
      s"projectName: $projectName, " +
      s"topicName: $topicName, " +
      s"shardId: $shardId, with " +
      s"subId: $subId")
    logInfo(s"spark.datahub.batch.getrecord.limits is set to $getRecordLimits")

    offsetBuffer = new ArrayBuffer[OffsetContext.Offset]()
    // Initialize the stream block id / offset snapshot hash map.
    blockOffsetMap = new ConcurrentHashMap[StreamBlockId, Array[OffsetContext.Offset]]()

    // Initialize the block generator for storing Datahub data.
    blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler)

    // Initialize the thread pool that runs the message handler.
    messageHandlerThreadPool = ThreadUtils.newDaemonFixedThreadPool(1, "DatahubMessageHandler")

    blockGenerator.start()

    val conf = new DatahubConfiguration(new AliyunAccount(accessKeyId, accessKeySecret), endpoint)
    client = new DatahubClientOpt(conf)
    topicResult = client.getTopic(projectName, topicName)
    offsetCtx = client.initOffsetContext(projectName, topicName, subId, shardId)
    if (!offsetCtx.hasOffset) {
      val cursorResult = client.getCursor(projectName, topicName, shardId, CursorType.OLDEST)
      cursor = cursorResult.getCursor
    } else {
      try {
        cursor = client.getNextOffsetCursor(offsetCtx).getCursor
      } catch {
        case e: DatahubClientException =>
          cursor = client.getCursor(projectName, topicName, shardId, CursorType.OLDEST).getCursor
      }
    }
    logInfo(s"The initial offsetTimestamp is: ${offsetCtx.getOffset.getTimestamp} " +
      s"and cursor is: $cursor")

    messageHandlerThreadPool.submit(new MessageHandler(client))
  }

  override def onStop(): Unit = {
    if (messageHandlerThreadPool != null) {
      messageHandlerThreadPool.shutdown()
      messageHandlerThreadPool = null
      logInfo(s"Stopping Datahub Reliable Receiver for " +
        s"projectName: $projectName, " +
        s"topicName: $topicName, " +
        s"shardId: $shardId, with " +
        s"subId: $subId")
    }
  }

  private final class MessageHandler(client: DatahubClientOpt) extends Runnable {
    override def run(): Unit = {
      while (!isStopped) {
        try {
          val recordsResult = client.getRecords(projectName, topicName, shardId, cursor,
            getRecordLimits, topicResult.getRecordSchema)
          val records = recordsResult.getRecords
          if (records.isEmpty) {
            Thread.sleep(getRecordSleepTime)
          } else {
            for (record <- records) {
              storeMessageAndOffset(func(record), record.getOffset)
            }
            // Advance the cursor only after every record in the batch has been handed to
            // the block generator, so a failure mid-batch re-fetches the whole batch
            // instead of skipping the remaining records.
            cursor = recordsResult.getNextCursor
          }
        } catch {
          case _: OffsetResetedException =>
            client.updateOffsetContext(offsetCtx)
            cursor = client.getNextOffsetCursor(offsetCtx).getCursor
            logInfo(s"Restart consume shard: $shardId, reset ${offsetCtx.toObjectNode.toString}, cursor: $cursor")
          case e: SparkException =>
            reportError(e.getMessage, e)
          case exception: Exception => reportError("Error handling message", exception)
        }
      }
    }
  }

  /** Hand a converted record to the block generator together with its offset. */
  private def storeMessageAndOffset(record: T, offset: OffsetContext.Offset): Unit = {
    blockGenerator.addDataWithCallback(record, offset)
  }

  /** Remember the offset of a record that has been added to the current block. */
  private def updateOffset(offset: OffsetContext.Offset): Unit = {
    offsetBuffer.append(offset)
  }

  /** Snapshot the offsets collected for the block that was just generated. */
  private def rememberBlockOffsets(blockId: StreamBlockId): Unit = {
    val offsetSnapshot = offsetBuffer.toArray
    blockOffsetMap.put(blockId, offsetSnapshot)
    offsetBuffer.clear()
  }

  /**
    * Store the generated block into Spark, retrying up to three more times on failure. The
    * remembered offsets for the block are committed back to DataHub only after the block
    * has been stored successfully, which gives at-least-once semantics.
    */
  private def storeBlockAndCommitOffset(
      blockId: StreamBlockId,
      arrayBuffer: ArrayBuffer[_]): Unit = {
    var count = 0
    var pushed = false
    var exception: Exception = null
    while (!pushed && count <= 3) {
      try {
        val recordEntryArray = arrayBuffer.asInstanceOf[mutable.ArrayBuffer[T]]
        store(recordEntryArray)
        pushed = true
      } catch {
        case ex: Exception =>
          count += 1
          exception = ex
      }
    }
    if (pushed) {
      Option(blockOffsetMap.get(blockId)).foreach(commitOffset)
      blockOffsetMap.remove(blockId)
    } else {
      stop("Error while storing block into Spark", exception)
    }
  }

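  /**
    * Commit the offsets collected for a stored block back to DataHub. The offset context is
    * advanced through the snapshot and committed every `commitRows` offsets, plus a final
    * commit for any remainder. For example, with commitRows = 100 and 250 offsets in the
    * snapshot, the offset context is committed after the 100th, 200th and 250th offsets.
    */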
  private def commitOffset(offsetBuffer: Array[OffsetContext.Offset]): Unit = {
    var recordNum = 0L
    offsetBuffer.foreach { offset =>
      offsetCtx.setOffset(offset)
      recordNum += 1
      if (recordNum % commitRows == 0) {
        client.commitOffset(offsetCtx)
      }
    }
    if (recordNum > 0 && (recordNum % commitRows != 0)) {
      client.commitOffset(offsetCtx)
    }
  }

  /** Class to handle blocks generated by the block generator. */
  private final class GeneratedBlockHandler() extends BlockGeneratorListener {
    override def onAddData(data: Any, metadata: Any): Unit = {
      if (metadata != null) {
        val offset = metadata.asInstanceOf[OffsetContext.Offset]
        updateOffset(offset)
      }
    }

    override def onGenerateBlock(blockId: StreamBlockId): Unit = {
      rememberBlockOffsets(blockId)
    }

    override def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
      storeBlockAndCommitOffset(blockId, arrayBuffer)
    }

    override def onError(message: String, throwable: Throwable): Unit = {
      reportError(message, throwable)
    }
  }

}
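
// A minimal usage sketch, an illustration rather than part of the library's public API: the
// receiver is normally wrapped in a DStream via StreamingContext.receiverStream. Because the
// class is private[datahub], this only compiles inside the same package; the project, topic,
// subscription id, endpoint, credentials and shard id below are placeholder values.
private[datahub] object ReliableDatahubReceiverExample {
  import org.apache.spark.SparkConf
  import org.apache.spark.streaming.{Seconds, StreamingContext}

  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("DatahubReceiverExample"), Seconds(10))

    // Create an input stream backed by the reliable receiver defined above, converting each
    // DataHub record to its string representation.
    val stream = ssc.receiverStream(
      new ReliableDatahubReceiver[String](
        "myProject", "myTopic", "mySubId",
        "<accessKeyId>", "<accessKeySecret>",
        "https://dh-cn-hangzhou.aliyuncs.com",
        "0",
        (record: RecordEntry) => record.toString,
        StorageLevel.MEMORY_AND_DISK_2,
        Map("spark.datahub.batch.getrecord.limits" -> "1000")))

    stream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}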