All downloads are free. Search and download functionality uses the official Maven repository.

vectorpipe.sources.AugmentedDiffMicroBatchReader.scala Maven / Gradle / Ivy

The newest version!
package vectorpipe.sources

import java.net.URI
import java.util

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.DataSourceOptions
import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader}
import vectorpipe.model.AugmentedDiff

import scala.collection.JavaConverters._
import scala.compat.java8.OptionConverters._

/**
  * An [[InputPartition]] describing a set of augmented diff sequences to be read.
  *
  * The partition carries no logic of its own; it only holds the arguments needed to build an
  * [[AugmentedDiffStreamBatchReader]].
  *
  * @param baseURI   base URI handed to the partition reader
  * @param sequences sequence numbers handed to the partition reader
  * @param handler   callback handed to the partition reader unchanged
  */
case class AugmentedDiffStreamBatchTask(
    baseURI: URI,
    sequences: Seq[Int],
    handler: (Int, AugmentedDiffSource.RF) => Unit
) extends InputPartition[InternalRow] {

  /** Creates the reader for this partition from the stored base URI, sequences and handler. */
  override def createPartitionReader(): InputPartitionReader[InternalRow] = {
    val reader = AugmentedDiffStreamBatchReader(
      baseURI = baseURI,
      sequences = sequences,
      handler = handler
    )
    reader
  }
}

/**
  * A [[ReplicationStreamBatchReader]] specialised for [[AugmentedDiff]] elements.
  *
  * @param baseURI   base URI, also passed to the parent reader
  * @param sequences sequence numbers, also passed to the parent reader
  * @param handler   callback forwarded to `AugmentedDiffSource.getSequence` on every fetch
  */
case class AugmentedDiffStreamBatchReader(
    baseURI: URI,
    sequences: Seq[Int],
    handler: (Int, AugmentedDiffSource.RF) => Unit
) extends ReplicationStreamBatchReader[AugmentedDiff](baseURI, sequences) {

  /**
    * Fetches the augmented diffs for a single sequence by delegating to
    * `AugmentedDiffSource.getSequence`, supplying this reader's `handler`.
    *
    * @param baseURI  base URI to fetch from
    * @param sequence sequence number to fetch
    * @return the diffs returned by `AugmentedDiffSource.getSequence`
    */
  override def getSequence(baseURI: URI, sequence: Int): Seq[AugmentedDiff] = {
    val diffs: Seq[AugmentedDiff] = AugmentedDiffSource.getSequence(baseURI, sequence, handler)
    diffs
  }
}

/**
  * Micro-batch reader producing [[AugmentedDiff]] rows.
  *
  * Recognised options (read from `options`):
  *  - `Source.BaseURI` (required): base URI of the augmented diff source.
  *  - `Source.ErrorHandler` (optional): fully-qualified class name of an
  *    [[AugmentedDiffSourceErrorHandler]] with a no-argument constructor; defaults to
  *    `vectorpipe.sources.AugmentedDiffSourceErrorHandler`.
  *
  * @param options            data source options; also passed to the parent reader
  * @param checkpointLocation checkpoint location; passed to the parent reader
  */
case class AugmentedDiffMicroBatchReader(options: DataSourceOptions, checkpointLocation: String)
    extends ReplicationStreamMicroBatchReader[AugmentedDiff](options, checkpointLocation)
    with Logging {

  /** Current sequence as reported by `AugmentedDiffSource.getCurrentSequence` for the base URI. */
  override def getCurrentSequence: Option[Int] =
    AugmentedDiffSource.getCurrentSequence(baseURI)

  /**
    * Parses the required `Source.BaseURI` option.
    *
    * Re-evaluated on every call (the option is re-read and re-parsed each time).
    *
    * @throws RuntimeException if the option is absent
    */
  private def baseURI: URI =
    options
      .get(Source.BaseURI)
      .asScala
      .map(new URI(_))
      .getOrElse(
        throw new RuntimeException(
          s"${Source.BaseURI} is a required option for ${Source.AugmentedDiffs}"
        )
      )

  /**
    * Reflectively instantiates the configured error handler and hands it all options.
    *
    * A new handler instance is created on every call.
    *
    * @throws ClassNotFoundException if the configured class cannot be loaded
    * @throws ClassCastException     if the configured class is not an
    *                                [[AugmentedDiffSourceErrorHandler]]
    */
  private def errorHandler: AugmentedDiffSourceErrorHandler = {
    val handlerClass = options
      .get(Source.ErrorHandler)
      .asScala
      .getOrElse("vectorpipe.sources.AugmentedDiffSourceErrorHandler")

    // Class#newInstance is deprecated (Java 9+); go through the no-arg constructor instead.
    // Note: exceptions thrown by the constructor now surface wrapped in
    // java.lang.reflect.InvocationTargetException.
    // asSubclass verifies the type *before* instantiation, so a misconfigured class is never
    // constructed; it fails with ClassCastException like the cast it replaces.
    val handler: AugmentedDiffSourceErrorHandler = Class
      .forName(handlerClass)
      .asSubclass(classOf[AugmentedDiffSourceErrorHandler])
      .getDeclaredConstructor()
      .newInstance()
    handler.setOptions(options.asMap.asScala.toMap)
    handler
  }

  /**
    * Plans one [[AugmentedDiffStreamBatchTask]] per sequence in `sequenceRange` (inherited;
    * defined outside this file's visible code), each covering exactly that one sequence.
    */
  override def planInputPartitions(): util.List[InputPartition[InternalRow]] = {
    // Both values are invariant across sequences, so resolve them at most once per planning
    // call rather than once per partition. They are lazy so that an empty sequence range still
    // touches neither option, exactly as before.
    lazy val uri: URI = baseURI
    lazy val handle: (Int, AugmentedDiffSource.RF) => Unit = errorHandler.handle

    sequenceRange
      .map(seq => (AugmentedDiffStreamBatchTask(uri, Seq(seq), handle): InputPartition[InternalRow]))
      .asJava
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy