
datahub.ReliableDatahubReceiver.scala Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.aliyun.datahub
import java.util.concurrent.{ConcurrentHashMap, ThreadPoolExecutor}
import com.aliyun.datahub.DatahubConfiguration
import com.aliyun.datahub.auth.AliyunAccount
import com.aliyun.datahub.exception.{DatahubClientException, OffsetResetedException}
import com.aliyun.datahub.model.GetCursorRequest.CursorType
import com.aliyun.datahub.model.{GetRecordsResult, GetTopicResult, OffsetContext, RecordEntry}
import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.storage.{StorageLevel, StreamBlockId}
import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver}
import org.apache.spark.util.ThreadUtils
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.reflect.ClassTag
/**
 * Receiver that reads the records of one Datahub shard and commits the consumed offsets to a
 * subscription only after the corresponding block has been stored through `store`.
 *
 * Records are fetched by a single background thread, converted with `func` and handed to a
 * receiver-owned [[BlockGenerator]]. The offsets of every generated block are remembered and
 * committed through the Datahub client once that block was pushed successfully.
 *
 * @param projectName      Datahub project to read from
 * @param topicName        Datahub topic to read from
 * @param subId            subscription id used to initialize and commit the offset context
 * @param accessKeyId      access key id of the Aliyun account
 * @param accessKeySecret  access key secret of the Aliyun account
 * @param endpoint         Datahub endpoint handed to the client configuration
 * @param shardId          the shard consumed by this receiver
 * @param func             converts every fetched [[RecordEntry]] into the stored element
 * @param storageLevel     storage level passed to the underlying [[Receiver]]
 * @param sparkDatahubConf optional tuning values; the keys read here are
 *                         `spark.datahub.batch.getrecord.limits` (default 1000),
 *                         `spark.datahub.norecord.waittimes` (default 100) and
 *                         `spark.datahub.commit.perrows` (default 100)
 */
private[datahub] class ReliableDatahubReceiver[T: ClassTag](
    projectName: String,
    topicName: String,
    subId: String,
    accessKeyId: String,
    accessKeySecret: String,
    endpoint: String,
    shardId: String,
    func: RecordEntry => T,
    storageLevel: StorageLevel,
    sparkDatahubConf: Map[String, String]) extends Receiver[T](storageLevel) with Logging {

  /**
   * Offsets of the records added since the last generated block. It is only touched from the
   * BlockGeneratorListener callbacks `onAddData` and `onGenerateBlock`.
   * NOTE(review): not synchronized here; this relies on the BlockGenerator invoking both
   * callbacks under the same lock - confirm against the Spark version in use.
   */
  private var offsetBuffer: ArrayBuffer[OffsetContext.Offset] = null

  /** Maps every generated stream block id to the snapshot of offsets it contains. */
  private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, Array[OffsetContext.Offset]] = null

  /**
   * The BlockGenerator is managed by the receiver itself so that block storing and offset
   * committing can be coordinated.
   */
  private var blockGenerator: BlockGenerator = null

  /** Thread pool running the handler that fetches records from Datahub. */
  private var messageHandlerThreadPool: ThreadPoolExecutor = null

  private var client: DatahubClientOpt = null
  private var offsetCtx: OffsetContext = null
  private var topicResult: GetTopicResult = null

  /** Cursor of the next fetch; written by `onStart` and afterwards only by the handler thread. */
  private var cursor: String = null

  /** Limit passed to every `getRecords` call. */
  private val getRecordLimits: Int =
    sparkDatahubConf.get("spark.datahub.batch.getrecord.limits").map(_.toInt).getOrElse(1000)

  /** Value passed to `Thread.sleep` when a fetch returned no records. */
  private val getRecordSleepTime: Long =
    sparkDatahubConf.get("spark.datahub.norecord.waittimes").map(_.toLong).getOrElse(100L)

  /** Number of offsets between two intermediate commits while committing a block. */
  private val commitRows: Int =
    sparkDatahubConf.get("spark.datahub.commit.perrows").map(_.toInt).getOrElse(100)

  /**
   * Creates the bookkeeping structures, the block generator and the Datahub client, determines
   * the start cursor and submits the message handler.
   */
  override def onStart(): Unit = {
    logInfo(s"Starting Datahub Reliable Receiver for " +
      s"projectName: $projectName, " +
      s"topicName: $topicName " +
      s"shardId: $shardId, with " +
      s"subId: $subId")
    logInfo(s"the spark.datahub.batch.getrecord.limits is $getRecordLimits")

    offsetBuffer = new ArrayBuffer[OffsetContext.Offset]()
    // Initialize the stream block id / offset snapshot hash map.
    blockOffsetMap = new ConcurrentHashMap[StreamBlockId, Array[OffsetContext.Offset]]()
    // Initialize the block generator for storing Datahub data.
    blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler)
    // A single thread is enough: there is exactly one handler for the one shard.
    messageHandlerThreadPool = ThreadUtils.newDaemonFixedThreadPool(1, "DatahubMessageHandler")
    blockGenerator.start()

    val conf = new DatahubConfiguration(new AliyunAccount(accessKeyId, accessKeySecret), endpoint)
    client = new DatahubClientOpt(conf)
    topicResult = client.getTopic(projectName, topicName)
    offsetCtx = client.initOffsetContext(projectName, topicName, subId, shardId)

    cursor =
      if (!offsetCtx.hasOffset) {
        oldestCursor()
      } else {
        try {
          client.getNextOffsetCursor(offsetCtx).getCursor
        } catch {
          case e: DatahubClientException =>
            // The fallback re-reads the shard from its oldest cursor, so make it visible.
            logWarning(s"Could not get the cursor after the committed offset of shard " +
              s"$shardId, falling back to the oldest cursor", e)
            oldestCursor()
        }
      }
    logInfo(s"the initial offsetTimestamp is: ${offsetCtx.getOffset.getTimestamp} and cursor is: $cursor")
    messageHandlerThreadPool.submit(new MessageHandler(client))
  }

  /**
   * Shuts the handler thread pool down and stops the block generator created in `onStart`.
   * The handler loop itself terminates once `isStopped` returns true.
   */
  override def onStop(): Unit = {
    if (messageHandlerThreadPool != null) {
      messageHandlerThreadPool.shutdown()
      messageHandlerThreadPool = null
      logInfo(s"Stop Datahub Reliable Receiver for " +
        s"projectName: $projectName, " +
        s"topicName: $topicName " +
        s"shardId: $shardId, with " +
        s"subId: $subId")
    }
    // The generator was created and started by this receiver, so it is stopped here as well.
    // The reference is kept (not nulled) so a handler iteration that is still in flight does
    // not hit a null reference.
    if (blockGenerator != null) {
      blockGenerator.stop()
    }
  }

  /** Returns the oldest cursor of the consumed shard. */
  private def oldestCursor(): String =
    client.getCursor(projectName, topicName, shardId, CursorType.OLDEST).getCursor

  /** Fetch loop: reads batches from the shard until the receiver is stopped. */
  private final class MessageHandler(client: DatahubClientOpt) extends Runnable {
    override def run(): Unit = {
      while (!isStopped) {
        try {
          val recordsResult = client.getRecords(projectName, topicName, shardId, cursor,
            getRecordLimits, topicResult.getRecordSchema)
          val records = recordsResult.getRecords
          if (records.size() == 0) {
            Thread.sleep(getRecordSleepTime)
          } else {
            for (record <- records) {
              storeMessageAndOffset(func(record), record.getOffset)
            }
            // Advance only after the whole batch was handed over: if a record fails, the
            // cursor is unchanged and the batch is fetched again instead of its remaining
            // records being skipped.
            cursor = recordsResult.getNextCursor
          }
        } catch {
          case _: OffsetResetedException =>
            recoverFromOffsetReset()
          case e: SparkException =>
            reportError(e.getMessage, e)
          case exception: Exception =>
            reportError("Error handling message", exception)
        }
      }
    }

    /**
     * Refreshes the offset context after an offset reset and repositions the cursor behind it.
     * Failures are reported instead of thrown: an exception escaping from here would end
     * `run()` and with it the only handler thread.
     */
    private def recoverFromOffsetReset(): Unit = {
      try {
        client.updateOffsetContext(offsetCtx)
        cursor = client.getNextOffsetCursor(offsetCtx).getCursor
        logInfo(s"Restart consume shard: $shardId, reset ${offsetCtx.toObjectNode.toString}, cursor: $cursor")
      } catch {
        case e: Exception =>
          reportError("Error recovering from offset reset", e)
      }
    }
  }

  /** Adds one converted record to the block generator, with its offset as callback metadata. */
  private def storeMessageAndOffset(record: T, offset: OffsetContext.Offset): Unit = {
    blockGenerator.addDataWithCallback(record, offset)
  }

  /** Records the offset of a record that was added to the block currently being built. */
  private def updateOffset(offset: OffsetContext.Offset): Unit = {
    offsetBuffer.append(offset)
  }

  /** Snapshots the buffered offsets for the given block and clears the buffer. */
  private def rememberBlockOffsets(blockId: StreamBlockId): Unit = {
    blockOffsetMap.put(blockId, offsetBuffer.toArray)
    offsetBuffer.clear()
  }

  /**
   * Stores a generated block and, on success, commits the offsets remembered for it.
   * The store is attempted until it succeeds or has failed four times; in the latter case the
   * receiver is stopped with the last exception.
   */
  private def storeBlockAndCommitOffset(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
    val blockData = arrayBuffer.asInstanceOf[mutable.ArrayBuffer[T]]
    var failures = 0
    var pushed = false
    var lastFailure: Exception = null
    while (!pushed && failures <= 3) {
      try {
        store(blockData)
        pushed = true
      } catch {
        case ex: Exception =>
          failures += 1
          lastFailure = ex
      }
    }

    if (pushed) {
      // Remove the snapshot before committing so the entry cannot stay in the map when the
      // commit throws.
      Option(blockOffsetMap.remove(blockId)).foreach(commitOffset)
    } else {
      stop("Error while storing block into Spark,", lastFailure)
    }
  }

  /**
   * Commits the given offsets in order: after every `commitRows` offsets and once more for the
   * last offset if it was not already committed. A `commitRows` of 0 results in a single commit
   * for the last offset instead of a division by zero.
   */
  private def commitOffset(offsets: Array[OffsetContext.Offset]): Unit = {
    val lastIndex = offsets.length - 1
    for (index <- offsets.indices) {
      offsetCtx.setOffset(offsets(index))
      val atCommitBoundary = commitRows != 0 && (index + 1) % commitRows == 0
      if (atCommitBoundary || index == lastIndex) {
        client.commitOffset(offsetCtx)
      }
    }
  }

  /** Class to handle blocks generated by the block generator. */
  private final class GeneratedBlockHandler() extends BlockGeneratorListener {

    /** Buffers the offset passed as metadata of an added record, if there is one. */
    override def onAddData(data: Any, metadata: Any): Unit = {
      if (metadata != null) {
        updateOffset(metadata.asInstanceOf[OffsetContext.Offset])
      }
    }

    /** Associates the offsets buffered so far with the freshly generated block. */
    override def onGenerateBlock(blockId: StreamBlockId): Unit = {
      rememberBlockOffsets(blockId)
    }

    /** Stores the block and commits its offsets. */
    override def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
      storeBlockAndCommitOffset(blockId, arrayBuffer)
    }

    /** Forwards block generator errors to the receiver's error reporting. */
    override def onError(message: String, throwable: Throwable): Unit = {
      reportError(message, throwable)
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy