
/*
 * Copyright 2019 The Glow Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.projectglow

import java.util.ServiceLoader

import scala.collection.JavaConverters._

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.sql.{DataFrame, SQLUtils, SparkSession}

import io.projectglow.common.Named
import io.projectglow.sql.util.BGZFCodec
import io.projectglow.sql.{GlowSQLExtensions, SqlExtensionProvider}
import io.projectglow.transformers.util.{SnakeCaseMap, StringUtils}
import io.projectglow.vcf.VCFFileFormat

/**
 * The entry point for all language-specific functionality, meaning methods that cannot be
 * expressed as SparkSQL expressions.
 *
 * We should expose as little functionality as is necessary through this object and should prefer
 * generic methods with stringly-typed arguments to reduce the language-specific maintenance
 * burden.
 */
class GlowBase {

  // Jackson mapper used to JSON-serialize non-string transformer option values
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)
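
  /**
   * Registers Glow's SQL extensions, functions, and compression codecs with a Spark session and
   * adjusts a few Spark configurations.
   *
   * Example usage (a minimal sketch; assumes an existing `SparkSession` named `spark` and a VCF
   * file at the given path):
   * {{{
   *   import io.projectglow.Glow
   *
   *   val sess = Glow.register(spark)
   *   val df = sess.read.format("vcf").load("genotypes.vcf")
   * }}}
   *
   * @param newSession if true (the default), apply the extensions to a fresh session derived
   *                   from `spark` rather than mutating the passed-in session
   * @return the session with Glow registered
   */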
  def register(spark: SparkSession, newSession: Boolean = true): SparkSession = {
    val sess = if (newSession) spark.newSession() else spark
    new GlowSQLExtensions().apply(SQLUtils.getSessionExtensions(sess))
    SqlExtensionProvider.registerFunctions(
      sess.sessionState.conf,
      sess.sessionState.functionRegistry)
    // Decrease the Parquet columnar batch size (often necessary for large cohorts)
    sess.conf.set("spark.sql.parquet.columnarReaderBatchSize", "16")
    // Add the BGZ compression codec. Note that we do not enable the enhanced GZIP codec, which
    // automatically determines whether an input file is gzipped or bgzipped, for all datasources,
    // since it confuses Spark's built-in datasources.
    sess.conf.set("io.compression.codecs", compressionCodecsWithBGZ(spark))
    // Expressions that use ExpectsGenotypeFields do not support pruning
    sess.conf.set("spark.sql.optimizer.nestedSchemaPruning.enabled", "false")
    sess
  }

  // Appends the BGZF codec to the session's existing Hadoop compression codec list, skipping
  // empty entries and avoiding duplicate registration
  private def compressionCodecsWithBGZ(spark: SparkSession): String = {
    val newCodecs = Seq(classOf[BGZFCodec].getCanonicalName)
    (spark
      .sessionState
      .newHadoopConf()
      .get("io.compression.codecs", "")
      .split(",")
      .filter(codec => codec.nonEmpty && !newCodecs.contains(codec)) ++ newCodecs).mkString(",")
  }

  /**
   * Apply a named transformation to a DataFrame of genomic data. All parameters apart from the
   * input data and its schema are provided through the case-insensitive options map.
   *
   * There are no bounds on what the transformer may do. For instance, it's legal for the
   * transformer to materialize the input DataFrame.
   *
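   * Example (a sketch; assumes `df` is a DataFrame of variant rows; `split_multiallelics` is
   * one of Glow's documented transformer names):
   * {{{
   *   import io.projectglow.Glow
   *
   *   val split = Glow.transform("split_multiallelics", df)
   * }}}
   *
   * @return The transformed DataFrame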
   */
  def transform(operationName: String, df: DataFrame, options: Map[String, Any]): DataFrame = {
    val stringValuedMap = options.mapValues {
      case s: String => s
      case v => mapper.writeValueAsString(v)
    }.map(identity) // output of mapValues is not serializable: https://github.com/scala/bug/issues/7005
    lookupTransformer(operationName) match {
      case Some(transformer) => transformer.transform(df, new SnakeCaseMap(stringValuedMap))
      case None =>
        throw new IllegalArgumentException(s"No transformer with name $operationName")
    }
  }

  /** Varargs convenience overload: options may be passed as `(key, value)` pairs. */
  def transform(operationName: String, df: DataFrame, options: (String, Any)*): DataFrame = {
    transform(operationName, df, options.toMap)
  }

  /** Java-friendly overload that accepts a `java.util.Map` of string options. */
  def transform(
      operationName: String,
      df: DataFrame,
      options: java.util.Map[String, String]): DataFrame = {
    transform(operationName, df, options.asScala.toMap)
  }

  // Reloads the ServiceLoader so transformers added to the classpath after startup are visible,
  // then matches names after snake_case normalization on both sides
  private def lookupTransformer(name: String): Option[DataFrameTransformer] = synchronized {
    transformerLoader.reload()
    transformerLoader
      .iterator()
      .asScala
      .find(n => StringUtils.toSnakeCase(n.name) == StringUtils.toSnakeCase(name))
  }

  private val transformerLoader = ServiceLoader
    .load(classOf[DataFrameTransformer])
}

object Glow extends GlowBase
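
/**
 * The interface for named DataFrame transformations invoked through `GlowBase.transform`.
 * Implementations are discovered with `java.util.ServiceLoader`, so a custom transformer must
 * also be listed in a `META-INF/services/io.projectglow.DataFrameTransformer` resource file.
 *
 * Example (a sketch; the class, transformer name, and filter expression are illustrative and
 * assume Glow's VCF schema, where `filters` is an array of strings):
 * {{{
 *   class DropFilteredVariants extends DataFrameTransformer {
 *     override def name: String = "drop_filtered_variants"
 *     override def transform(df: DataFrame, options: Map[String, String]): DataFrame =
 *       df.filter("size(filters) = 0 or array_contains(filters, 'PASS')")
 *   }
 * }}}
 */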
trait DataFrameTransformer extends Named {
  def transform(df: DataFrame, options: Map[String, String]): DataFrame
}