bio.ferlab.datalake.spark3.publictables.normalized.DBNSFPRaw.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of datalake-spark3_2.12 Show documentation
Show all versions of datalake-spark3_2.12 Show documentation
Library built on top of Apache Spark to speed up data lake development.
package bio.ferlab.datalake.spark3.publictables.normalized
import bio.ferlab.datalake.commons.config.{DatasetConf, RuntimeETLContext}
import bio.ferlab.datalake.spark3.etl.v4.SimpleSingleETL
import bio.ferlab.datalake.spark3.implicits.DatasetConfImplicits.DatasetConfOperations
import mainargs.{ParserForMethods, main}
import org.apache.spark.sql.DataFrame
import java.time.LocalDateTime
/**
 * ETL that reads the `raw_dbnsfp` dataset and renames four of its columns,
 * with `normalized_dbnsfp` declared as the main destination.
 *
 * @param rc runtime context forwarded to [[SimpleSingleETL]]
 */
case class DBNSFPRaw(rc: RuntimeETLContext) extends SimpleSingleETL(rc) {

  /** Destination dataset, looked up in the configuration as `normalized_dbnsfp`. */
  override val mainDestination: DatasetConf = conf.getDataset("normalized_dbnsfp")

  /** Source dataset, looked up in the configuration as `raw_dbnsfp`. */
  val raw_dbnsfp: DatasetConf = conf.getDataset("raw_dbnsfp")

  /**
   * Reads the raw dataset and exposes it keyed by its dataset id.
   *
   * @param lastRunValue    not used by this implementation
   * @param currentRunValue not used by this implementation
   * @return a single-entry map from `raw_dbnsfp.id` to the frame that was read
   */
  override def extract(lastRunValue: LocalDateTime = minValue,
                       currentRunValue: LocalDateTime = LocalDateTime.now()): Map[String, DataFrame] = {
    val rawFrame = raw_dbnsfp.read
    Map(raw_dbnsfp.id -> rawFrame)
  }

  /**
   * Renames the source columns to the names used downstream.
   *
   * @param data            extracted frames; the entry under `raw_dbnsfp.id` is required
   * @param lastRunValue    not used by this implementation
   * @param currentRunValue not used by this implementation
   * @return the raw frame with the four columns below renamed
   */
  override def transformSingle(data: Map[String, DataFrame],
                               lastRunValue: LocalDateTime = minValue,
                               currentRunValue: LocalDateTime = LocalDateTime.now()): DataFrame = {
    // (existing name -> new name), applied in this order.
    val renames = Seq(
      "#chr" -> "chromosome",
      "position_1-based" -> "start",
      "ref" -> "reference",
      "alt" -> "alternate"
    )

    renames.foldLeft(data(raw_dbnsfp.id)) { case (frame, (oldName, newName)) =>
      frame.withColumnRenamed(oldName, newName)
    }
  }
}
/** Command-line entry point for the [[DBNSFPRaw]] ETL. */
object DBNSFPRaw {

  /**
   * Builds the ETL from the given runtime context and runs it.
   *
   * @param rc runtime context handed to the ETL
   */
  @main
  def run(rc: RuntimeETLContext): Unit = DBNSFPRaw(rc).run()

  /**
   * JVM entry point; hands the raw arguments to the mainargs parser for this object.
   *
   * @param args command-line arguments
   */
  def main(args: Array[String]): Unit = {
    ParserForMethods(this).runOrThrow(args)
  }
}