package com.sksamuel.elastic4s.streams

import akka.actor._
import com.sksamuel.elastic4s.requests.bulk.{BulkCompatibleRequest, BulkRequest, BulkResponseItem}
import com.sksamuel.elastic4s.requests.common.RefreshPolicy
import com.sksamuel.elastic4s.{ElasticClient, RequestFailure, RequestSuccess}
import org.reactivestreams.{Subscriber, Subscription}

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.duration._
import scala.language.higherKinds
import scala.util.{Failure, Success}

/** An implementation of the Reactive Streams Subscriber API. This subscriber will bulk index received elements. The
  * bulk nature means that the elasticsearch index operations are performed as a bulk call, the size of which is
  * controlled by the batchSize param.
  *
  * The received elements must be converted into an elastic4s bulk compatible definition, such as index or delete. This
  * is done by the RequestBuilder.
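  *
  * For illustration, a minimal usage sketch. The `Book` type, the `bookPublisher` and the `client.subscriber`
  * helper (from this package's `ReactiveElastic` implicits) are assumptions for the example, not part of this file:
  * {{{
  * import akka.actor.ActorSystem
  * import com.sksamuel.elastic4s.ElasticDsl._
  * import com.sksamuel.elastic4s.streams.ReactiveElastic._
  *
  * case class Book(title: String)
  *
  * implicit val system: ActorSystem = ActorSystem()
  *
  * // turn each Book into a bulk compatible index request
  * implicit val builder: RequestBuilder[Book] = (b: Book) => indexInto("books").fields("title" -> b.title)
  *
  * // client is an ElasticClient and bookPublisher is any org.reactivestreams.Publisher[Book]
  * val subscriber = client.subscriber[Book](batchSize = 100, concurrentRequests = 2)
  * bookPublisher.subscribe(subscriber)
  * }}}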
  *
  * @param client
  *   used to connect to the cluster
  * @param builder
  *   used to turn elements of T into bulk compatible requests so they can be used in the bulk indexer
  * @tparam T
  *   the type of element provided by the publisher this subscriber will subscribe with
  */
@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
class BulkIndexingSubscriber[T] private[streams] (
    client: ElasticClient,
    builder: RequestBuilder[T],
    config: SubscriberConfig[T]
)(implicit actorRefFactory: ActorRefFactory)
    extends Subscriber[T] {

  private var actor: ActorRef = _

  override def onSubscribe(sub: Subscription): Unit = {
    // rule 2.13 https://github.com/reactive-streams/reactive-streams-jvm#2.13
    // when the provided Subscription is null, onSubscribe MUST throw a java.lang.NullPointerException to the caller
    if (sub == null) throw new NullPointerException()
    if (actor == null)
      actor = actorRefFactory.actorOf(Props(new BulkActor(client, sub, builder, config)))
    else
      // rule 2.5, must cancel subscription if onSubscribe has been invoked twice
      // https://github.com/reactive-streams/reactive-streams-jvm#2.5
      sub.cancel()
  }

  override def onNext(t: T): Unit = {
    if (t == null) throw new NullPointerException("On next should not be called until onSubscribe has returned")
    actor ! t
  }

  override def onError(t: Throwable): Unit = {
    if (t == null) throw new NullPointerException()
    actor ! t
  }

  override def onComplete(): Unit =
    actor ! BulkActor.Completed

  def close(): Unit =
    actor ! PoisonPill
}

@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
object BulkActor {

  // signifies that the upstream publisher has completed (NOT that a bulk request has succeeded)
  case object Completed

  // instructs the actor to index whatever is currently buffered, even if a full batch has not accumulated
  case object ForceIndexing

  // a batch of items that were indexed successfully, paired with the original elements they were built from
  case class Result[T](items: Seq[BulkResponseItem], originals: Seq[T], requestNext: Boolean)

  // a batch of items that failed and will not be retried, paired with the original elements they were built from
  case class FailedResult[T](items: Seq[BulkResponseItem], originals: Seq[T], requestNext: Boolean)

  // ask the upstream subscription for n more elements
  case class Request(n: Int)

  // a bulk request to execute, together with its original elements and the number of attempts remaining
  case class Send[T](req: BulkRequest, originals: Seq[T], attempts: Int)

}

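/** The actor that does the actual work: it buffers incoming elements, sends them to elasticsearch as bulk requests of
  * up to batchSize, retries failed items where the config allows, and requests more elements from the upstream
  * subscription as batches are confirmed.
  */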
@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
class BulkActor[T](
    client: ElasticClient,
    subscription: Subscription,
    builder: RequestBuilder[T],
    config: SubscriberConfig[T]
) extends Actor {

  import com.sksamuel.elastic4s.ElasticDsl._
  import context.{dispatcher, system}

  private val buffer = new ArrayBuffer[T]()
  buffer.sizeHint(config.batchSize)

  private var completed = false

  // total number of documents requested from our publisher
  private var requested: Long = 0L

  // total number of documents sent to the elasticsearch cluster in bulk requests but not yet confirmed or failed
  private var sent: Long = 0L

  // total number of documents confirmed as successful
  private var confirmed: Long = 0L

  // total number of documents that failed the retry attempts and are ignored
  private var failed: Long = 0L

  // Create a scheduler if a flushInterval is provided. This scheduler will be used to force indexing, otherwise
  // we can be stuck at batchSize-1 waiting for the nth message to arrive.
  //
  // It has been suggested that we could use ReceiveTimeout here, but one reason we can't is that BulkResult messages
  // would reset the timeout period, and they should not interfere with the flush interval.
  private val flushIntervalScheduler: Option[Cancellable] = config.flushInterval.map { interval =>
    system.scheduler.scheduleWithFixedDelay(interval, interval, self, BulkActor.ForceIndexing)
  }

  // If flushAfter is specified then after each message, a scheduler is created to force indexing if no documents
  // are received within the given duration.
  private var flushAfterScheduler: Option[Cancellable] = None

  private def resetFlushAfterScheduler(): Unit = {
    flushAfterScheduler.foreach(_.cancel())
    flushAfterScheduler = config.flushAfter.map { interval =>
      system.scheduler.scheduleOnce(interval, self, BulkActor.ForceIndexing)
    }
  }

  // Requests our initial batches. We can request them all at once, and then just request a new batch
  // each time we complete one.
  override def preStart(): Unit =
    self ! BulkActor.Request(config.batchSize * config.concurrentRequests)

  def receive: PartialFunction[Any, Unit] = {
    case t: Throwable => handleError(t)

    case BulkActor.Completed =>
      // since we are completed at the publisher level, we should send all remaining documents because a complete
      // batch cannot happen now
      if (buffer.nonEmpty)
        index()
      completed = true
      shutdownIfAllConfirmed()

    case BulkActor.Request(n) =>
      subscription.request(n)
      requested = requested + n

    case msg: BulkActor.Send[T] => send(msg.req, msg.originals, msg.attempts)

    case BulkActor.ForceIndexing =>
      if (buffer.nonEmpty)
        index()

    case msg: BulkActor.Result[T] =>
      confirmed = confirmed + msg.items.size
      msg.items
        .zip(msg.originals)
        .foreach {
          case (item, original) =>
            config.listener.onAck(item, original)
        }
      checkCompleteOrRequestNext(msg.items.size, msg.requestNext)

    case msg: BulkActor.FailedResult[T] =>
      failed = failed + msg.items.size
      msg.items
        .zip(msg.originals)
        .foreach {
          case (item, original) =>
            config.listener.onFailure(item, original)
        }
      checkCompleteOrRequestNext(msg.items.size, msg.requestNext)

    // any element received from the publisher (T is erased, so this match is unchecked): buffer it and flush
    // once a full batch has accumulated
    case t: T =>
      buffer.append(t)
      if (buffer.size == config.batchSize)
        index()
      else
        resetFlushAfterScheduler()
  }

  // We need to check whether the publisher has completed, because if it has then this might be the last pending
  // confirmation, and if so we can shut down.
  private def checkCompleteOrRequestNext(n: Int, requestNext: Boolean): Unit =
    if (completed) shutdownIfAllConfirmed()
    else if (requestNext) self ! BulkActor.Request(n)

  // Stops the schedulers if they exist and invokes the final callbacks
  override def postStop(): Unit = {
    flushIntervalScheduler.foreach(_.cancel())
    flushAfterScheduler.foreach(_.cancel())
    if (failed == 0)
      config.successFn()
    config.completionFn()
  }

  private def shutdownIfAllConfirmed(): Unit =
    if (confirmed + failed == sent)
      context.stop(self)

  private def send(req: BulkRequest, originals: Seq[T], attempts: Int): Unit = {
    require(req.requests.size == originals.size, "Requests size does not match originals size")

    def filterByIndexes[S](sequence: Seq[S], indexes: Set[Int]) =
      sequence.zipWithIndex
        .filter { case (_, index) => indexes.contains(index) }
        .map { case (seqItem, _) => seqItem }

    def getOriginalForResponse(response: BulkResponseItem) = originals(response.itemId)

    // returns just the requests that failed, as a new bulk request together with their originals
    def getRetryDef(failures: Seq[BulkResponseItem]): (BulkRequest, Seq[T]) = {
      val policy = if (config.refreshAfterOp) RefreshPolicy.Immediate else RefreshPolicy.NONE

      val failureIds     = failures.map(_.itemId).toSet
      val retryOriginals = filterByIndexes(originals, failureIds)
      val failedReqs     = filterByIndexes(req.requests, failureIds)

      (BulkRequest(failedReqs).refresh(policy), retryOriginals)
    }

    val f = client.execute(req)
    f.onComplete {
      case Failure(e)                               => self ! e
      case Success(failure: RequestFailure)         => self ! new RuntimeException(failure.toString)
      case Success(RequestSuccess(_, _, _, result)) =>
        if (result.hasSuccesses)
          self ! BulkActor.Result(
            result.successes,
            result.successes.map(getOriginalForResponse),
            requestNext = !result.errors
          )
        if (result.errors) {
          val failures = if (attempts > 0) {
            val (retriable, nonRetriable) = result.failures.partition(failure =>
              config.retryFailure(failure, getOriginalForResponse(failure))
            )
            // all retriable failures need to be resent (if retries remain), but only after waiting for the
            // failureWait period to avoid flooding the cluster
            if (retriable.nonEmpty) {
              val (retryDef, originals) = getRetryDef(retriable)
              system.scheduler.scheduleOnce(config.failureWait, self, BulkActor.Send(retryDef, originals, attempts - 1))
            }
            nonRetriable
          } else result.failures
          self ! BulkActor.FailedResult(failures, failures.map(getOriginalForResponse), requestNext = attempts == 0)
        }
    }
  }

  private def handleError(t: Throwable): Unit = {
    // On error we cancel the subscription, since we cannot be sure we can handle further elements,
    // and the error may have originated outside the subscriber.
    subscription.cancel()
    config.errorFn(t)
    buffer.clear()
    context.stop(self)
  }

  private def index(): Unit = {

    def bulkDef: BulkRequest = {
      val defs   = buffer.map(t => builder.request(t)).toSeq
      val policy = if (config.refreshAfterOp) RefreshPolicy.Immediate else RefreshPolicy.NONE
      BulkRequest(defs).refresh(policy)
    }

    sent = sent + buffer.size
    self ! BulkActor.Send(bulkDef, buffer.toList, config.maxAttempts)

    buffer.clear()

    // buffer is now empty so no point keeping a scheduled flush after operation
    flushAfterScheduler.foreach(_.cancel())
    flushAfterScheduler = None
  }
}

/** An implementation of this typeclass must provide a bulk compatible request for the given instance of T. The bulk
  * compatible request will then be sent to elastic.
  *
  * A bulk compatible request can be either an index, update, or delete.
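  *
  * For illustration, a minimal sketch of a builder for a hypothetical `Book` type (the type and the index name are
  * assumptions):
  * {{{
  * import com.sksamuel.elastic4s.ElasticDsl._
  *
  * case class Book(isbn: String, title: String)
  *
  * val bookBuilder: RequestBuilder[Book] = new RequestBuilder[Book] {
  *   // index each book into the "books" index, keyed by its ISBN
  *   override def request(b: Book): BulkCompatibleRequest =
  *     indexInto("books").id(b.isbn).fields("title" -> b.title)
  * }
  * }}}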
  *
  * @tparam T
  *   the type of elements this builder supports
  */
@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
trait RequestBuilder[T] {
  def request(t: T): BulkCompatibleRequest
}

/** Notified of each acknowledged item, and of each item that failed and will not be retried.
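  *
  * For illustration, a sketch of a listener that logs outcomes for a hypothetical `Book` element type:
  * {{{
  * val bookListener: ResponseListener[Book] = new ResponseListener[Book] {
  *   override def onAck(resp: BulkResponseItem, original: Book): Unit =
  *     println(s"acked item ${resp.itemId} for ${original.title}")
  *   override def onFailure(resp: BulkResponseItem, original: Book): Unit =
  *     println(s"gave up on item ${resp.itemId} for ${original.title}")
  * }
  * }}}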
  */
@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
trait ResponseListener[-T] {
  def onAck(resp: BulkResponseItem, original: T): Unit
  def onFailure(resp: BulkResponseItem, original: T): Unit = ()
}

@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
object ResponseListener {
  val noop: ResponseListener[Any] = new ResponseListener[Any] {
    override def onAck(resp: BulkResponseItem, original: Any): Unit = ()
  }
}

/** @param listener
  *   a listener which is notified of each acknowledged batch item and of each item that permanently fails
  * @param batchSize
  *   the number of elements to group together per batch aside from the last batch
  * @param concurrentRequests
  *   the number of concurrent batch operations
  * @param refreshAfterOp
  *   if the index should be refreshed after each bulk operation
  * @param completionFn
  *   a function which is invoked when all sent requests have been acknowledged and the publisher has completed.
  *   Note: this function is always invoked, regardless of whether an error occurred.
  * @param successFn
  *   a function which is invoked only when all operations have completed successfully
  * @param errorFn
  *   a function which is invoked after there is an error
  * @param failureWait
  *   the delay before retrying failed requests. A failed request is usually elasticsearch's way of indicating
  *   backpressure, so this parameter determines how long to wait before retrying.
  * @param retryFailure
  *   a function which is invoked for every failure in a bulk request, and returns true if the failure should be retried
  *   or false if it should not.
  * @param maxAttempts
  *   the max number of times to try a request. If it fails too many times it probably isn't back pressure but an error
  *   with the document.
  * @param flushInterval
  *   used to schedule periodic bulk indexing. This can be set to avoid waiting a long time for a complete batch. It
  *   is also useful if the publisher never completes. This ensures that all elements are indexed, even if the final
  *   batch is smaller than batchSize.
  * @param flushAfter
  *   used to schedule an index if no document has been received within the given duration. Once an index is performed
  *   (either by this flush value or because docs arrived in time) the flush after schedule is reset.
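  *
  * For illustration, a sketch of a config that indexes in batches of 500 and flushes every five seconds (the `Book`
  * type and `bookListener` are assumptions):
  * {{{
  * import scala.concurrent.duration._
  *
  * val config = SubscriberConfig[Book](
  *   batchSize = 500,
  *   concurrentRequests = 2,
  *   listener = bookListener,
  *   flushInterval = Some(5.seconds),
  *   failureWait = 5.seconds
  * )
  * }}}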
  */
@deprecated("Use the elastic4-reactivestreams-akka package", "8.16.0")
case class SubscriberConfig[T](
    batchSize: Int = 100,
    concurrentRequests: Int = 5,
    refreshAfterOp: Boolean = false,
    listener: ResponseListener[T] = ResponseListener.noop,
    completionFn: () => Unit = () => (),
    successFn: () => Unit = () => (),
    errorFn: Throwable => Unit = _ => (),
    failureWait: FiniteDuration = 2.seconds,
    retryFailure: (BulkResponseItem, T) => Boolean = (_: BulkResponseItem, _: T) => true,
    maxAttempts: Int = 5,
    flushInterval: Option[FiniteDuration] = None,
    flushAfter: Option[FiniteDuration] = None
)



