package nl.vroste.zio.kinesis.client.dynamicconsumer

import nl.vroste.zio.kinesis.client.dynamicconsumer.DynamicConsumer.{ Checkpointer, Record }
import zio.aws.cloudwatch.CloudWatch
import zio.aws.dynamodb.DynamoDb
import zio.aws.kinesis.Kinesis
import nl.vroste.zio.kinesis.client.dynamicconsumer.fake.DynamicConsumerFake
import nl.vroste.zio.kinesis.client.serde.Deserializer
import software.amazon.awssdk.services.kinesis.model.EncryptionType
import software.amazon.kinesis.common.{ InitialPositionInStream, InitialPositionInStreamExtended }
import software.amazon.kinesis.exceptions.ShutdownException
import software.amazon.kinesis.retrieval.kpl.ExtendedSequenceNumber
import zio.stream.{ ZSink, ZStream }
import zio._

import java.time.Instant
import java.util.UUID

trait DynamicConsumer {

  /**
   * Create a ZStream of records on a Kinesis stream with substreams per shard
   *
   * Uses DynamoDB for lease coordination between different instances of consumers with the same application name and
   * for offset checkpointing.
   *
   * @param streamName
   *   Stream to consume from. Either just the stream name or the full stream ARN.
   * @param applicationName
   *   Application name for coordinating shard leases
   * @param deserializer
   *   Deserializer for record values
   * @param requestShutdown
   *   Effect that, when completed, triggers a graceful shutdown of the KCL and the streams.
   * @param initialPosition
   *   Position in stream to start at when there is no previous checkpoint for this application
   * @param leaseTableName
   *   Optional lease table name. If not specified, the `applicationName` is used as the lease table name.
   * @param metricsNamespace
   *   CloudWatch metrics namespace
   * @param workerIdentifier
   *   Identifier used for the worker in this application group. Used in logging and written to the lease table.
   * @param maxShardBufferSize
   *   The maximum number of records per shard to store in a queue before blocking the KCL record processor until
   *   records have been dequeued. Note that the stream returned from this method will have internal chunk buffers as
   *   well.
   * @param configureKcl
   *   Function for applying additional KCL Scheduler configuration
   * @tparam R
   *   ZIO environment type required by the `deserializer`
   * @tparam T
   *   Type of record values
   * @return
   *   A nested ZStream - the outer ZStream represents the collection of shards, and each inner ZStream represents an
   *   individual shard
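   *
   * A sketch of typical usage; `consumer` is a `DynamicConsumer` instance, and `mySerde` and `processRecord` are
   * hypothetical stand-ins for your deserializer and processing logic:
   * {{{
   * consumer
   *   .shardedStream("my-stream", applicationName = "my-app", deserializer = mySerde)
   *   .flatMapPar(Int.MaxValue) { case (shardId, shardStream, checkpointer) =>
   *     shardStream
   *       .tap(record => processRecord(record) *> checkpointer.stage(record))
   *       .viaFunction(checkpointer.checkpointBatched(1000, 5.minutes))
   *   }
   *   .runDrain
   * }}}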
   */
  def shardedStream[R, T](
    streamName: String,
    applicationName: String,
    deserializer: Deserializer[R, T],
    requestShutdown: UIO[Unit] = ZIO.never,
    initialPosition: InitialPositionInStreamExtended =
      InitialPositionInStreamExtended.newInitialPosition(InitialPositionInStream.TRIM_HORIZON),
    leaseTableName: Option[String] = None,
    metricsNamespace: Option[String] = None,
    workerIdentifier: String = UUID.randomUUID().toString,
    maxShardBufferSize: Int = 1024, // Prefer powers of 2
    configureKcl: SchedulerConfig => SchedulerConfig = identity
  ): ZStream[
    R,
    Throwable,
    (String, ZStream[Any, Throwable, Record[T]], Checkpointer)
  ]
}

/**
 * Offers a ZStream based interface to the Amazon Kinesis Client Library (KCL)
 *
 * Ensures proper resource shutdown and failure handling
 */
object DynamicConsumer {
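  /**
   * A single deserialized record as received from Kinesis
   *
   * @param data
   *   The deserialized record payload
   * @param subSequenceNumber
   *   Sub-sequence number, set when this record was part of a KPL-aggregated record
   * @param aggregated
   *   Whether this record was part of a KPL-aggregated record
   */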
  final case class Record[+T](
    shardId: String,
    sequenceNumber: String,
    approximateArrivalTimestamp: Instant,
    data: T,
    partitionKey: String,
    encryptionType: EncryptionType,
    subSequenceNumber: Option[Long],
    explicitHashKey: Option[String],
    aggregated: Boolean
  )

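  /**
   * Layer providing the live `DynamicConsumer` implementation, built from zio-aws `Kinesis`, `CloudWatch` and
   * `DynamoDb` clients
   *
   * A wiring sketch, assuming the `defaultAwsLayer` helper from the `nl.vroste.zio.kinesis.client` package:
   * {{{
   * val consumerLayer: ZLayer[Any, Throwable, DynamicConsumer] =
   *   defaultAwsLayer >>> DynamicConsumer.live
   * }}}
   */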
  val live: ZLayer[Kinesis with CloudWatch with DynamoDb, Nothing, DynamicConsumer] = ZLayer {
    Unsafe.unsafe { implicit unsafe =>
      for {
        kinesis    <- ZIO.service[Kinesis]
        cloudwatch <- ZIO.service[CloudWatch]
        dynamodb   <- ZIO.service[DynamoDb]
      } yield new DynamicConsumerLive(kinesis.api, cloudwatch.api, dynamodb.api, unsafe): DynamicConsumer
    }
  }

  /**
   * Implements a fake `DynamicConsumer` with fake checkpointing functionality; checkpointed records can be tracked
   * via the `refCheckpointedList` parameter.
   * @param shards
   *   A ZStream that is a fake representation of Kinesis shards. There are helper constructors to create these - see
   *   [[nl.vroste.zio.kinesis.client.dynamicconsumer.fake.DynamicConsumerFake.shardsFromIterables]] and
   *   [[nl.vroste.zio.kinesis.client.dynamicconsumer.fake.DynamicConsumerFake.shardsFromStreams]]
   * @param refCheckpointedList
   *   A Ref that will be used to store the checkpointed records
   * @return
   *   A ZLayer of the fake `DynamicConsumer` implementation
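   *
   * A test-wiring sketch, assuming `Serde.asciiString` from this library's `serde` package:
   * {{{
   * val shards = DynamicConsumerFake.shardsFromIterables(Serde.asciiString, List("msg1", "msg2"), List("msg3"))
   * Ref.make[Seq[Record[Any]]](Seq.empty).map(ref => DynamicConsumer.fake(shards, ref))
   * }}}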
   */
  def fake(
    shards: ZStream[Any, Throwable, (String, ZStream[Any, Throwable, Chunk[Byte]])],
    refCheckpointedList: Ref[Seq[Record[Any]]]
  ): ZLayer[Any, Nothing, DynamicConsumer] = ZLayer.succeed {
    new DynamicConsumerFake(shards, refCheckpointedList)
  }

  /**
   * Overloaded version of the above, but without the checkpoint tracking functionality
   * @param shards
   *   A ZStream that is a fake representation of Kinesis shards. There are helper constructors to create these - see
   *   [[nl.vroste.zio.kinesis.client.dynamicconsumer.fake.DynamicConsumerFake.shardsFromIterables]] and
   *   [[nl.vroste.zio.kinesis.client.dynamicconsumer.fake.DynamicConsumerFake.shardsFromStreams]]
   * @return
   *   A ZLayer of the fake `DynamicConsumer` implementation
   */
  def fake(
    shards: ZStream[Any, Throwable, (String, ZStream[Any, Throwable, Chunk[Byte]])]
  ): ZLayer[Any, Nothing, DynamicConsumer] = ZLayer {
    Ref.make[Seq[Record[Any]]](Seq.empty).map { refCheckpointedList =>
      new DynamicConsumerFake(shards, refCheckpointedList)
    }
  }

  // Accessor
  def shardedStream[R, T](
    streamName: String,
    applicationName: String,
    deserializer: Deserializer[R, T],
    requestShutdown: UIO[Unit] = ZIO.never,
    initialPosition: InitialPositionInStreamExtended =
      InitialPositionInStreamExtended.newInitialPosition(InitialPositionInStream.TRIM_HORIZON),
    leaseTableName: Option[String] = None,
    metricsNamespace: Option[String] = None,
    workerIdentifier: String = UUID.randomUUID().toString,
    maxShardBufferSize: Int = 1024, // Prefer powers of 2
    configureKcl: SchedulerConfig => SchedulerConfig = identity
  ): ZStream[
    DynamicConsumer with R,
    Throwable,
    (String, ZStream[Any, Throwable, Record[T]], Checkpointer)
  ] =
    ZStream.unwrap(
      ZIO
        .service[DynamicConsumer]
        .map(
          _.shardedStream(
            streamName,
            applicationName,
            deserializer,
            requestShutdown,
            initialPosition,
            leaseTableName,
            metricsNamespace,
            workerIdentifier,
            maxShardBufferSize,
            configureKcl
          )
        )
    )

  /**
   * Similar to the `shardedStream` accessor, but takes a `recordProcessor` callback function for processing records
   * and takes care of checkpointing. The other difference is that it returns a `ZIO` of `Unit` rather than a ZStream.
   *
   * @param streamName
   *   Name of the Kinesis stream
   * @param applicationName
   *   Application name for coordinating shard leases
   * @param deserializer
   *   Deserializer for record values
   * @param requestShutdown
   *   Effect that, when completed, triggers a graceful shutdown of the KCL and the streams.
   * @param initialPosition
   *   Position in stream to start at when there is no previous checkpoint for this application
   * @param leaseTableName
   *   Optional lease table name. If not specified, the `applicationName` is used as the lease table name.
   * @param metricsNamespace
   *   CloudWatch metrics namespace
   * @param workerIdentifier
   *   Identifier used for the worker in this application group. Used in logging and written to the lease table.
   * @param maxShardBufferSize
   *   The maximum number of records per shard to store in a queue before blocking the KCL record processor until
   *   records have been dequeued. Note that the stream returned from this method will have internal chunk buffers as
   *   well.
   * @param checkpointBatchSize
   *   Maximum number of records before checkpointing
   * @param checkpointDuration
   *   Maximum interval before checkpointing
   * @param recordProcessor
   *   A function for processing a `Record[T]`
   * @param configureKcl
   *   Function for applying additional KCL Scheduler configuration
   * @tparam R
   *   ZIO environment type required by the `deserializer`
   * @tparam RC
   *   ZIO environment type required by the `recordProcessor`
   * @tparam T
   *   Type of record values
   * @return
   *   A ZIO that completes with Unit when record processing is stopped via `requestShutdown` or fails when the
   *   consumer stream fails
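   *
   * A minimal sketch; `myEventSerde` is a hypothetical deserializer:
   * {{{
   * consumeWith("my-stream", applicationName = "my-app", deserializer = myEventSerde) { record =>
   *   ZIO.logInfo(s"Processing ${record.data}")
   * }
   * }}}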
   */
  def consumeWith[R, RC, T](
    streamName: String,
    applicationName: String,
    deserializer: Deserializer[R, T],
    requestShutdown: UIO[Unit] = ZIO.never,
    initialPosition: InitialPositionInStreamExtended =
      InitialPositionInStreamExtended.newInitialPosition(InitialPositionInStream.TRIM_HORIZON),
    leaseTableName: Option[String] = None,
    metricsNamespace: Option[String] = None,
    workerIdentifier: String = UUID.randomUUID().toString,
    maxShardBufferSize: Int = 1024, // Prefer powers of 2
    checkpointBatchSize: Long = 200,
    checkpointDuration: Duration = 5.minutes,
    configureKcl: SchedulerConfig => SchedulerConfig = identity
  )(
    recordProcessor: Record[T] => RIO[RC, Unit]
  ): ZIO[R with RC with DynamicConsumer, Throwable, Unit] =
    ZIO.scoped[R with RC with DynamicConsumer] {
      for {
        consumer <- ZIO.service[DynamicConsumer]
        _        <- consumer
                      .shardedStream(
                        streamName,
                        applicationName,
                        deserializer,
                        requestShutdown,
                        initialPosition,
                        leaseTableName,
                        metricsNamespace,
                        workerIdentifier,
                        maxShardBufferSize,
                        configureKcl
                      )
                      .flatMapPar(Int.MaxValue) { case (_, shardStream, checkpointer) =>
                        shardStream
                          .tap(record => recordProcessor(record) *> checkpointer.stage(record))
                          .viaFunction(
                            checkpointer
                              .checkpointBatched[RC](
                                nr = checkpointBatchSize,
                                interval = checkpointDuration
                              )
                          )
                      }
                      .runDrain
      } yield ()
    }

  /**
   * Staging area for checkpoints
   *
   * Guarantees that the last staged record is checkpointed upon stream shutdown / interruption
   */
  trait Checkpointer {

    /*
     * Helper method that returns current state - useful for debugging
     */
    private[client] def peek: UIO[Option[ExtendedSequenceNumber]]

    /**
     * Stages a record for checkpointing
     *
     * Checkpoints are not actually performed until `checkpoint` is called
     *
     * @param r
     *   Record to checkpoint
     * @return
     *   Effect that completes immediately
     */
    def stage(r: Record[_]): UIO[Unit]

    /**
     * Helper method that ensures that a checkpoint is staged when `effect` completes successfully, even when the fiber
     * is interrupted. When `effect` fails or is itself interrupted, the checkpoint is not staged.
     *
     * @param effect
     *   Effect to execute
     * @param r
     *   Record to stage a checkpoint for
     * @return
     *   Effect that completes with the result of `effect`
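     *
     * For example, with a hypothetical `processRecord` effect:
     * `checkpointer.stageOnSuccess(processRecord(record))(record)`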
     */
    def stageOnSuccess[R, E, A](effect: ZIO[R, E, A])(r: Record[_]): ZIO[R, E, A] =
      effect.onExit {
        case Exit.Success(_) => stage(r)
        case _               => ZIO.unit
      }

    /**
     * Checkpoint the last staged checkpoint
     *
     * Exceptions you should be prepared to handle:
     *   - `software.amazon.kinesis.exceptions.ShutdownException` when the lease for this shard has been lost, e.g.
     *     because another worker has stolen the lease (this can happen at any time).
     *   - `software.amazon.kinesis.exceptions.ThrottlingException`
     *
     * See also `software.amazon.kinesis.processor.RecordProcessorCheckpointer`
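     *
     * A sketch of treating a lost lease as a non-error:
     * {{{
     * checkpointer.checkpoint.catchSome { case _: ShutdownException => ZIO.unit }
     * }}}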
     */
    def checkpoint: ZIO[Any, Throwable, Unit]

    /**
     * Immediately checkpoint this record
     */
    def checkpointNow(r: Record[_]): ZIO[Any, Throwable, Unit] =
      stage(r) *> checkpoint

    /**
     * Helper method to add batch checkpointing to a shard stream
     *
     * Usage:
     * {{{
     * shardStream.viaFunction(checkpointer.checkpointBatched(1000, 1.second))
     * }}}
     *
     * @param nr
     *   Maximum number of records before checkpointing
     * @param interval
     *   Maximum interval before checkpointing
     * @return
     *   Function that results in a ZStream that emits Unit for every successful checkpoint, fails with an exception
     *   when checkpointing fails, and ends (becomes an empty stream) when the lease for this shard is lost.
     */
    def checkpointBatched[R](
      nr: Long,
      interval: Duration
    ): ZStream[R, Throwable, Any] => ZStream[R with Any, Throwable, Unit] =
      _.aggregateAsyncWithin(ZSink.foldUntil[Any, Unit]((), nr)((_, _) => ()), Schedule.fixed(interval))
        .tap(_ => checkpoint)
        .catchAll {
          case _: ShutdownException =>
            ZStream.empty
          case e                    =>
            ZStream.fail(e)
        }
  }
}