
net.sansa_stack.inference.spark.utils.RDDUtils.scala
Apache Spark based inference layer for RDF and OWL
package net.sansa_stack.inference.spark.utils

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
/**
 * Utility class for RDD operations.
 *
 * @author Lorenz Buehmann
 */
object RDDUtils {
  implicit class RDDOps[T: ClassTag](rdd: RDD[T]) {

    /**
     * Splits an RDD into two parts at the first element that fails the given predicate.
     * Note that the split happens per partition: the first RDD contains, for each
     * partition, the longest prefix of elements that satisfy the predicate, and the
     * second RDD contains the remaining elements of that partition.
     *
     * @param f the boolean predicate
     * @return two RDDs
     */
    def span(f: T => Boolean): (RDD[T], RDD[T]) = {
      // split each partition into the prefix satisfying f and the remainder,
      // keeping both parts as a two-element partition
      val spanned = rdd.mapPartitions { iter =>
        val (left, right) = iter.span(f)
        Iterator(left, right)
      }
      // first element per partition: the prefix
      val left = spanned.mapPartitions { iter =>
        iter.next()
      }
      // second element per partition: the remainder
      val right = spanned.mapPartitions { iter =>
        iter.next()
        iter.next()
      }
      (left, right)
    }
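
    // A worked example (hypothetical data, not from the artifact): given an RDD with
    // two partitions (1, 2, 3) and (4, 5, 6), span(_ < 3) returns the per-partition
    // prefixes (1, 2) and () as the first RDD, and the remainders (3) and (4, 5, 6)
    // as the second RDD.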
    /**
     * Splits an RDD into two parts based on the given filter function. Note that the
     * filter is applied twice to the same data, so caching the RDD beforehand is
     * recommended!
     *
     * @param f the boolean filter function
     * @return two RDDs
     */
    def partitionBy(f: T => Boolean): (RDD[T], RDD[T]) = {
      val passes = rdd.filter(f)
      val fails = rdd.filter(e => !f(e)) // Spark doesn't have filterNot
      (passes, fails)
    }
  }
}
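
For illustration, a minimal usage sketch of both operations; the object name, the Spark session setup, and the sample data are assumptions for this example, not part of the artifact:

import org.apache.spark.sql.SparkSession

import net.sansa_stack.inference.spark.utils.RDDUtils._

object RDDUtilsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("RDDUtils example") // hypothetical app name
      .master("local[*]")
      .getOrCreate()

    // Sample data (hypothetical); cached because partitionBy filters it twice.
    val numbers = spark.sparkContext.parallelize(1 to 10, numSlices = 2).cache()

    // partitionBy: a global split into elements that pass resp. fail the predicate.
    val (evens, odds) = numbers.partitionBy(_ % 2 == 0)
    println(evens.collect().mkString(", ")) // 2, 4, 6, 8, 10
    println(odds.collect().mkString(", "))  // 1, 3, 5, 7, 9

    // span: a per-partition prefix split at the first failing element. With the two
    // partitions (1..5) and (6..10), the prefix RDD holds (1, 2) and the rest holds
    // (3, 4, 5, 6, 7, 8, 9, 10).
    val (prefix, rest) = numbers.span(_ < 3)
    println(prefix.collect().mkString(", "))
    println(rest.collect().mkString(", "))

    spark.stop()
  }
}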