package ch.ninecode.cim
import scala.reflect._
import scala.reflect.runtime.universe
import scala.reflect.runtime.universe._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import org.apache.spark.storage.StorageLevel
import ch.ninecode.model.Element
/**
* Subclass extractor.
*
* Extracts objects of the given type from the full Element Resilient Distributed Dataset (RDD)
* to create another RDD of just those elements, creates a DataFrame from that RDD,
* and registers it as a temporary table for access via SQL (e.g. JDBC and SparkR::sql()).
*
* Note: This must be serializable and can't depend on the companion objects
* for the CIM case classes.
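*
* @example A usage sketch, assuming a SparkSession <code>spark</code> and a previously
* loaded RDD[Element] named <code>elements</code> (both hypothetical here):
* {{{
* val subsetter = new CIMSubsetter[ch.ninecode.model.ACLineSegment] ()
* subsetter.make (spark.sqlContext, elements, StorageLevel.MEMORY_AND_DISK_SER)
* spark.sql ("select * from ACLineSegment").show ()
* }}}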
*/
class CIMSubsetter[A <: Product : ClassTag : TypeTag] () extends Serializable
{
type basetype = A
type rddtype = RDD[A]
val tag: universe.TypeTag[A] = typeTag [A]
val runtime_class: Class[_] = classTag [A].runtimeClass
val classname: String = runtime_class.getName
val cls: String =
{
classname.substring (classname.lastIndexOf (".") + 1)
}
/**
* Alter the schema so sup has the correct superclass name.
*
* @param rtc The runtime class for Typeclass A.
* @param schema The SQL schema for Typeclass A, e.g.
* org.apache.spark.sql.types.StructType = StructType(StructField(sup,StructType(StructField(sup,StructType(StructField(sup,...
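* @return the schema with each nested sup StructField renamed to its superclass name, e.g.
* StructType(StructField(SuperClass,StructType(StructField(SuperSuperClass,...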
*/
def modify_schema (rtc: Class[_], schema: StructType): StructType =
{
val sup = schema.fields (0)
val supcls = rtc.getMethod ("sup").getReturnType
val clsname = supcls.getName.substring (supcls.getName.lastIndexOf (".") + 1)
val suptyp = sup.dataType
val dataType = if (suptyp.typeName == "struct")
modify_schema (supcls, suptyp.asInstanceOf [StructType])
else
suptyp
val supersup = StructField (clsname, dataType, sup.nullable, sup.metadata)
schema.fields.update (0, supersup)
StructType (schema.fields)
}
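// For illustration (CIM class names assumed, not verified here): for a Breaker, whose
// sup method returns a ProtectedSwitch, the top-level "sup" column is renamed
// "ProtectedSwitch", its nested "sup" column "Switch", and so on up the hierarchy,
// so SQL can reference nested columns like ProtectedSwitch.Switch from the Breaker view.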
/**
* Create a predicate that matches RDD names of the form "name" or "name|xxx".
*
* @param name the base name to match
* @return a predicate that is true if the keyed RDD's name equals <code>name</code>
* or starts with <code>name|</code>
*/
def like (name: String): ((Int, RDD[_])) => Boolean =
{
val pattern = s"$name|"
(rdd: (Int, RDD[_])) => (rdd._2.name != null) && ((rdd._2.name == name) || rdd._2.name.startsWith (pattern))
}
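// For illustration (RDD names assumed): like ("ACLineSegment") returns a predicate that
// accepts a persistent RDD named "ACLineSegment" or "ACLineSegment|subset", but rejects
// one named "ACLineSegmentPhase" or an unnamed (null name) RDD.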
/**
* Persist the subset RDD and register its DataFrame as a temporary view for Typeclass A.
*
* @param context The SQL context for creating the views.
* @param rdd The already subset RDD of type A elements.
* @param storage The storage level to persist the subset RDD with.
*/
def save (context: SQLContext, rdd: rddtype, storage: StorageLevel): Unit =
{
// remove any previously named RDD
val matched = context.sparkContext.getPersistentRDDs.filter (like (cls))
matched.foreach (_._2.setName (null).unpersist (true))
// name and persist the subset RDD, checkpointing it if a checkpoint directory is set
rdd.name = cls
val _ = rdd.persist (storage)
if (context.sparkSession.sparkContext.getCheckpointDir.isDefined) rdd.checkpoint ()
// create a DataFrame to derive the schema, then rebuild the DataFrame with the
// schema altered so each nested sup column carries its superclass name
val df = context.sparkSession.createDataFrame (rdd)(tag)
val altered_schema = modify_schema (runtime_class, df.schema)
val data_frame = context.sparkSession.createDataFrame (rdd.asInstanceOf [RDD[Row]], altered_schema)
// register the DataFrame as a temporary view named for the class
data_frame.createOrReplaceTempView (cls)
}
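// For illustration (class name assumed): after save for A = Switch, the subset is
// available both as the named RDD "Switch" and via SQL, e.g. spark.sql ("select * from Switch").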
/**
* Return the provided Element as an object of this class, if possible.
*
* Recurses up the sup hierarchy, so a subclass element is returned as its contained A object.
*
* @param element the element to convert
* @return Some(A) if the element or one of its superclass objects is of type A, None otherwise
*/
def asThisClass (element: Element): Option[A] =
{
element match
{
case obj: A => Some (obj)
case e => if (null == e.sup) None else asThisClass (e.sup)
}
}
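// For illustration (CIM classes assumed): with A = Switch, an Element that is a Breaker
// is unwrapped through its sup chain (Breaker, ProtectedSwitch, Switch) and returned
// as Some(switch), while an Element with no Switch in its hierarchy yields None.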
/**
* Create the DataFrame for Typeclass A.
*
* Subsets the Element RDD to objects of type A, then persists and registers the result.
*
* @param context The SQL context for creating the views.
* @param rdd The raw Element RDD to subset.
* @param storage The storage level to persist the subset RDD with.
*/
def make (context: SQLContext, rdd: RDD[Element], storage: StorageLevel): Unit =
{
val subrdd = rdd.flatMap (asThisClass)
save (context, subrdd, storage)
}
}