org.apache.spark.eventhubs.common.RateControlUtils.scala (azure-eventhubs-databricks_2.11)

Library to connect Azure Event Hubs with Databricks (Spark Streaming and Structured Streaming).
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.eventhubs.common

import org.apache.spark.eventhubs.common.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubs.common.client.{ Client, EventHubsOffsetTypes }
import org.apache.spark.internal.Logging

private[spark] object RateControlUtils extends Logging {

  /**
   * Returns, for each partition, the highest sequence number to be received
   * in this micro-batch. The batch is clamped so that no partition advances
   * more than "eventhubs.maxRate" events past its current sequence number.
   *
   * @param currentOffsetsAndSeqNos the offset and sequence number most recently
   *                                received for each partition
   * @param highestEndpoints the latest offset and sequence number available in each partition
   * @param ehParams the EventHubs parameters: per-hub maps for DStreams,
   *                 a single flat map for Structured Streaming
   */
  // TODO: A lot of this code comes from the fact that EHSource takes Map[String, String] and
  // DStream takes Map[String, Map[String, String]]. Which is preferred? We should pick one.
  private[spark] def clamp(currentOffsetsAndSeqNos: Map[NameAndPartition, (Long, Long)],
                           highestEndpoints: List[(NameAndPartition, (Long, Long))],
                           ehParams: Map[String, _]): Map[NameAndPartition, Long] = {
    (for {
      (nameAndPartition, (_, seqNo)) <- highestEndpoints
      maxRate = ehParams.get(nameAndPartition.ehName) match {
        case Some(x) => // for DStream
          x.asInstanceOf[Map[String, String]]
            .getOrElse("eventhubs.maxRate", EventHubsUtils.DefaultMaxRate)
        case None => // for Structured Streaming
          ehParams
            .asInstanceOf[Map[String, String]]
            .getOrElse("eventhubs.maxRate", EventHubsUtils.DefaultMaxRate)
      }
      // Never read past the highest available seqNo, and never more than maxRate
      // events beyond the seqNo we most recently received.
      endSeqNo = math.min(seqNo, maxRate.toInt + currentOffsetsAndSeqNos(nameAndPartition)._2)
    } yield (nameAndPartition, endSeqNo)).toMap
  }
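
  // Worked example (hypothetical values): if a partition's current seqNo is 100,
  // maxRate is 10, and the highest available seqNo is 150, the batch ends at
  // min(150, 100 + 10) = 110. If only 5 new events exist (highest seqNo 105),
  // it ends at min(105, 110) = 105.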

  /**
   * Determines where a partition should start receiving from: the user-supplied
   * filtering offset if one was provided for the partition, otherwise the offset
   * from the previous checkpoint.
   */
  private[spark] def calculateStartOffset(
      nameAndPartition: NameAndPartition,
      filteringOffsetAndType: Map[NameAndPartition, (EventHubsOffsetType, Long)],
      startOffsetInNextBatch: Map[NameAndPartition, (Long, Long)]): (EventHubsOffsetType, Long) = {
    filteringOffsetAndType.getOrElse(
      nameAndPartition,
      (EventHubsOffsetTypes.PreviousCheckpoint, startOffsetInNextBatch(nameAndPartition)._1)
    )
  }

  /**
   * Validates any user-supplied "eventhubs.filter.enqueuetime": the filter must
   * not be later than the last enqueued time of any partition being read, since
   * no events could ever satisfy it.
   */
  private[spark] def validateFilteringParams(eventHubsClients: Map[String, Client],
                                             ehParams: Map[String, _],
                                             namesAndPartitions: List[NameAndPartition]): Unit = {
    val lastEnqueuedTimes: List[(NameAndPartition, Long)] = for {
      nAndP <- namesAndPartitions
      name = nAndP.ehName
      lastTime = eventHubsClients(name).lastEnqueuedTime(nAndP).get
    } yield nAndP -> lastTime

    val checks: List[Boolean] = for {
      (nAndP, lastTime) <- lastEnqueuedTimes
      passedInEnqueueTime = ehParams.get(nAndP.ehName) match {
        case Some(x) => // for DStream
          x.asInstanceOf[Map[String, String]]
            .getOrElse("eventhubs.filter.enqueuetime", EventHubsUtils.DefaultEnqueueTime)
        case None => // for Structured Streaming
          ehParams
            .asInstanceOf[Map[String, String]]
            .getOrElse("eventhubs.filter.enqueuetime", EventHubsUtils.DefaultEnqueueTime)
      }
    } yield lastTime >= passedInEnqueueTime.toLong

    require(!checks.contains(false),
            "You cannot pass in an enqueue time later than the last event enqueued in EventHubs.")
  }
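
  // Example (hypothetical values): if a partition's last enqueued time is
  // 1514764800 and the user passes "eventhubs.filter.enqueuetime" -> "1600000000",
  // the require above fails, because no event could satisfy that filter.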

  /**
   * Pairs each partition's starting point with its offset type, applying any
   * user-supplied filtering parameters via configureStartOffset.
   */
  private[spark] def composeFromOffsetWithFilteringParams(
      ehParams: Map[String, _],
      startOffsetsAndSeqNos: Map[NameAndPartition, (Long, Long)])
    : Map[NameAndPartition, (EventHubsOffsetType, Long)] = {
    for {
      (nameAndPartition, (offset, _)) <- startOffsetsAndSeqNos
      (offsetType, offsetStr) = configureStartOffset(
        offset.toString,
        ehParams.get(nameAndPartition.ehName) match {
          case Some(x) => x.asInstanceOf[Map[String, String]] // for DStream
          case None    => ehParams.asInstanceOf[Map[String, String]] // for Structured Streaming
        }
      )
    } yield (nameAndPartition, (offsetType, offsetStr.toLong))
  }

  /**
   * Resolves the starting position for a partition, in order of precedence:
   * a previously checkpointed offset, then "eventhubs.filter.offset", then
   * "eventhubs.filter.enqueuetime", and finally the start of the stream.
   */
  private[eventhubs] def configureStartOffset(
      previousOffset: String,
      ehParams: Map[String, String]): (EventHubsOffsetType, String) = {
    if (previousOffset != null && previousOffset != "-1") {
      (EventHubsOffsetTypes.PreviousCheckpoint, previousOffset)
    } else if (ehParams.contains("eventhubs.filter.offset")) {
      (EventHubsOffsetTypes.Offset, ehParams("eventhubs.filter.offset"))
    } else if (ehParams.contains("eventhubs.filter.enqueuetime")) {
      (EventHubsOffsetTypes.EnqueueTime, ehParams("eventhubs.filter.enqueuetime"))
    } else {
      (EventHubsOffsetTypes.None, EventHubsUtils.StartOfStream)
    }
  }
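
  // Example (hypothetical values): with previousOffset = "-1" (no checkpoint) and
  // ehParams containing "eventhubs.filter.enqueuetime" -> "1514764800", this
  // returns (EventHubsOffsetTypes.EnqueueTime, "1514764800"). With a checkpoint
  // of "4000", it returns (EventHubsOffsetTypes.PreviousCheckpoint, "4000"),
  // regardless of any filter settings.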
}
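
A minimal standalone sketch of the clamping idea above, with hypothetical types and values (Partition stands in for NameAndPartition; this is not the library's API). Each partition advances by at most maxRate sequence numbers per micro-batch, and never past the highest sequence number available:

object ClampSketch {
  // Hypothetical stand-in for NameAndPartition.
  final case class Partition(ehName: String, id: Int)

  // Same arithmetic as RateControlUtils.clamp, with a single flat maxRate for brevity.
  def clamp(current: Map[Partition, Long], // seqNo most recently received per partition
            highest: Map[Partition, Long], // highest seqNo available per partition
            maxRate: Int): Map[Partition, Long] =
    highest.map { case (p, high) => p -> math.min(high, current(p) + maxRate) }

  def main(args: Array[String]): Unit = {
    val p = Partition("sample-hub", 0)
    // 50 events available beyond seqNo 100; maxRate 10 caps the batch at seqNo 110.
    println(clamp(Map(p -> 100L), Map(p -> 150L), maxRate = 10))
  }
}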