bio.ferlab.datalake.spark3.config.ReferenceConfiguration.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of datalake-spark3_2.12 Show documentation
Show all versions of datalake-spark3_2.12 Show documentation
Library built on top of Apache Spark to speed-up data lakes development..
package bio.ferlab.datalake.spark3.config
import bio.ferlab.datalake.commons.config._
import bio.ferlab.datalake.commons.file.FileSystemType.S3
import bio.ferlab.datalake.spark3.genomics.GenomicDatasets
import bio.ferlab.datalake.spark3.publictables.PublicDatasets
import pureconfig.generic.auto._
object ReferenceConfiguration extends App {
val alias = "public_database"
val prod_storage = List(
StorageConf(alias, "s3a://kf-strides-variant-parquet-prd", S3),
PublicDatasets.gnomadStorage
)
val kf_conf =
SimpleConfiguration(
DatalakeConf(
storages = prod_storage,
sources = PublicDatasets(alias, Some("variant"), Some("variant_live")).sources ++ GenomicDatasets(alias, Some("variant"), Some("variant_live")).sources,
sparkconf = Map("hive.metastore.client.factory.class" -> "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
)
)
ConfigurationWriter.writeTo("datalake-spark3/src/main/resources/reference_kf.conf", kf_conf)
ConfigurationWriter.writeTo("datalake-spark3/src/test/resources/config/reference_kf.conf", kf_conf)
}