package ai.tripl.arc.extract
import java.io._
import java.net.URI
import java.util.Properties
import scala.annotation.tailrec
import scala.collection.JavaConverters._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.TaskContext
import org.elasticsearch.spark.sql._
import ai.tripl.arc.api._
import ai.tripl.arc.api.API._
import ai.tripl.arc.config._
import ai.tripl.arc.config.Error._
import ai.tripl.arc.plugins.PipelineStagePlugin
import ai.tripl.arc.util.CloudUtils
import ai.tripl.arc.util.DetailException
import ai.tripl.arc.util.EitherUtils._
import ai.tripl.arc.util.ExtractUtils
import ai.tripl.arc.util.MetadataUtils
import ai.tripl.arc.util.Utils
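
/**
 * ElasticsearchExtract is an Arc PipelineStagePlugin that reads a DataFrame from an
 * Elasticsearch resource via the elasticsearch-spark connector ("org.elasticsearch.spark.sql"),
 * optionally repartitions and persists it, and registers the result as `outputView`.
 */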
class ElasticsearchExtract extends PipelineStagePlugin {
  val version = ai.tripl.arc.elasticsearch.BuildInfo.version

  def instantiate(index: Int, config: com.typesafe.config.Config)(implicit spark: SparkSession, logger: ai.tripl.arc.util.log.logger.Logger, arcContext: ARCContext): Either[List[ai.tripl.arc.config.Error.StageError], PipelineStage] = {
    import ai.tripl.arc.config.ConfigReader._
    import ai.tripl.arc.config.ConfigUtils._
    implicit val c = config

    val expectedKeys = "type" :: "name" :: "description" :: "environments" :: "input" :: "outputView" :: "numPartitions" :: "partitionBy" :: "persist" :: "params" :: Nil
    val name = getValue[String]("name")
    val description = getOptionalValue[String]("description")
    val input = getValue[String]("input")
    val outputView = getValue[String]("outputView")
    val persist = getValue[java.lang.Boolean]("persist", default = Some(false))
    val numPartitions = getOptionalValue[Int]("numPartitions")
    val partitionBy = getValue[StringList]("partitionBy", default = Some(Nil))
    val params = readMap("params", c)
    val invalidKeys = checkValidKeys(c)(expectedKeys)

    (name, description, input, outputView, persist, numPartitions, partitionBy, invalidKeys) match {
      case (Right(name), Right(description), Right(input), Right(outputView), Right(persist), Right(numPartitions), Right(partitionBy), Right(invalidKeys)) =>
        val stage = ElasticsearchExtractStage(
          plugin=this,
          name=name,
          description=description,
          input=input,
          outputView=outputView,
          params=params,
          persist=persist,
          numPartitions=numPartitions,
          partitionBy=partitionBy
        )

        stage.stageDetail.put("input", input)
        stage.stageDetail.put("outputView", outputView)
        stage.stageDetail.put("params", params.asJava)

        Right(stage)
      case _ =>
        val allErrors: Errors = List(name, description, input, outputView, persist, numPartitions, partitionBy, invalidKeys).collect{ case Left(errs) => errs }.flatten
        val stageName = stringOrDefault(name, "unnamed stage")
        val err = StageError(index, stageName, c.origin.lineNumber, allErrors)

        Left(err :: Nil)
    }
  }
}
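
/**
 * Runtime configuration for a single ElasticsearchExtract stage.
 *
 * `input` is passed directly to the DataFrame reader's load() as the Elasticsearch resource to
 * read, and `params` are forwarded verbatim as options to the elasticsearch-spark reader.
 */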
case class ElasticsearchExtractStage(
  plugin: ElasticsearchExtract,
  name: String,
  description: Option[String],
  input: String,
  outputView: String,
  params: Map[String, String],
  persist: Boolean,
  numPartitions: Option[Int],
  partitionBy: List[String]
) extends PipelineStage {

  override def execute()(implicit spark: SparkSession, logger: ai.tripl.arc.util.log.logger.Logger, arcContext: ARCContext): Option[DataFrame] = {
    ElasticsearchExtractStage.execute(this)
  }

}
object ElasticsearchExtractStage {
  def execute(stage: ElasticsearchExtractStage)(implicit spark: SparkSession, logger: ai.tripl.arc.util.log.logger.Logger, arcContext: ARCContext): Option[DataFrame] = {
    import spark.implicits._

    // in streaming mode this extract does not read from Elasticsearch and returns an empty
    // DataFrame; otherwise read the resource via the elasticsearch-spark connector
    val df = try {
      if (arcContext.isStreaming) {
        spark.emptyDataFrame
      } else {
        spark.read.format("org.elasticsearch.spark.sql").options(stage.params).load(stage.input)
      }
    } catch {
      case e: Exception => throw new Exception(e) with DetailException {
        override val detail = stage.stageDetail
      }
    }
    // repartition to distribute rows evenly
    val repartitionedDF = stage.partitionBy match {
      case Nil => {
        stage.numPartitions match {
          case Some(numPartitions) => df.repartition(numPartitions)
          case None => df
        }
      }
      case partitionBy => {
        // create a column array for repartitioning
        val partitionCols = partitionBy.map(col => df(col))
        stage.numPartitions match {
          case Some(numPartitions) => df.repartition(numPartitions, partitionCols:_*)
          case None => df.repartition(partitionCols:_*)
        }
      }
    }
    if (arcContext.immutableViews) repartitionedDF.createTempView(stage.outputView) else repartitionedDF.createOrReplaceTempView(stage.outputView)

    if (!repartitionedDF.isStreaming) {
      stage.stageDetail.put("inputFiles", java.lang.Integer.valueOf(repartitionedDF.inputFiles.length))
      stage.stageDetail.put("outputColumns", java.lang.Integer.valueOf(repartitionedDF.schema.length))
      stage.stageDetail.put("numPartitions", java.lang.Integer.valueOf(repartitionedDF.rdd.partitions.length))

      if (stage.persist) {
        repartitionedDF.persist(arcContext.storageLevel)
        stage.stageDetail.put("records", java.lang.Long.valueOf(repartitionedDF.count))
      }
    }

    Option(repartitionedDF)
  }

}
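
// A minimal sketch of how this stage might be declared in an Arc job configuration, assuming the
// stage "type" resolves to this plugin by its class name; the "input" resource name and the
// elasticsearch-hadoop connection options under "params" ("es.nodes", "es.port") are illustrative.
// The keys mirror the expectedKeys validated in instantiate above.
//
// {
//   "type": "ElasticsearchExtract",
//   "name": "extract customer documents",
//   "environments": ["production", "test"],
//   "input": "customer",
//   "outputView": "customer_raw",
//   "persist": false,
//   "params": {
//     "es.nodes": "elasticsearch.internal",
//     "es.port": "9200"
//   }
// }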