
com.tribbloids.spookystuff.views.package.scala Maven / Gradle / Ivy
package com.tribbloids.spookystuff
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import com.tribbloids.spookystuff.entity.{Squashed, PageRow}
import com.tribbloids.spookystuff.entity.PageRow.WebCacheRDD
import com.tribbloids.spookystuff.utils.Utils
import scala.language.implicitConversions
import scala.reflect.ClassTag
/**
* Created by peng on 11/7/14.
* implicit conversions in this package are used for development only
*/
package object views {
implicit class RDDView[A](val self: A)(implicit ev1: A => RDD[_]) {
def persistDuring[T](newLevel: StorageLevel, blocking: Boolean = true)(fn: => T): T =
if (self.getStorageLevel == StorageLevel.NONE){
self.persist(newLevel)
val result = fn
self.unpersist(blocking)
result
}
else {
val result = fn
self.unpersist(blocking)
result
}
// def checkpointNow(): Unit = {
// persistDuring(StorageLevel.MEMORY_ONLY) {
// self.checkpoint()
// self.foreach(_ =>)
// self
// }
// Unit
// }
}
implicit class PairRDDView[K: ClassTag, V: ClassTag](val self: RDD[(K, V)]) {
import RDD._
//get 3 RDDs that shares key partitioning: leftExclusive, intersection, rightExclusive
//all 3 can be zipped directly as if joined by key, this has many applications like getting union, intersection and subtraction
// def logicalCombinationsByKey[S](
// other: RDD[(K, V)])(
// innerReducer: (V, V) => V)(
// staging: RDD[(K, (Option[V], Option[V]))] => S
// ): (RDD[(K, V)], RDD[(K, (V, V))], RDD[(K, V)], S) = {
//
// val cogrouped = self.cogroup(other)
//
// val mixed: RDD[(K, (Option[V], Option[V]))] = cogrouped.mapValues{
// tuple =>
// val leftOption = tuple._1.reduceLeftOption(innerReducer)
// val rightOption = tuple._2.reduceLeftOption(innerReducer)
//
// (leftOption, rightOption)
// }
// val stage = staging(mixed)
//
// val leftExclusive = mixed.flatMapValues {
// case (Some(left), None) => Some(left)
// case _ => None
// }
// val Intersection = mixed.flatMapValues {
// case (Some(left), Some(right)) => Some(left, right)
// case _ => None
// }
// val rightExclusive = mixed.flatMapValues {
// case (None, Some(right)) => Some(right)
// case _ => None
// }
// (leftExclusive, Intersection, rightExclusive, stage)
// }
def unionByKey(
other: RDD[(K, V)])(
innerReducer: (V, V) => V
): RDD[(K, V)] = {
val cogrouped = self.cogroup(other)
cogrouped.mapValues {
tuple =>
val reduced = (tuple._1 ++ tuple._2).reduce(innerReducer)
reduced
}
}
def intersectionByKey(
other: RDD[(K, V)])(
innerReducer: (V, V) => V
): RDD[(K, V)] = {
val cogrouped = self.cogroup(other)
cogrouped.flatMap {
triplet =>
val tuple = triplet._2
if (tuple._1.nonEmpty || tuple._2.nonEmpty) {
val reduced = (tuple._1 ++ tuple._2).reduce(innerReducer)
Some(triplet._1 -> reduced)
}
else {
None
}
}
}
}
implicit class WebCacheRDDView(self: WebCacheRDD) {
import RDD._
def discardRows: WebCacheRDD = {
self.mapValues {
_.copy(metadata = Array())
}
}
def getRows: RDD[PageRow] = {
val updated = self.flatMap {
_._2.metadata
}
updated
}
def putRows(
rows: RDD[PageRow],
seedFilter: Iterable[PageRow] => Option[PageRow] = _=>None
): WebCacheRDD = {
val dryRun_RowRDD = rows.keyBy(_.dryrun)
val cogrouped = self.cogroup(dryRun_RowRDD)
val result = cogrouped.map {
triplet =>
val tuple = triplet._2
assert(tuple._1.size <= 1)
val squashedRowOption = triplet._2._1.reduceOption(_ ++ _)
squashedRowOption match {
case None =>
val newRows = tuple._2
val seedRows = seedFilter(newRows)
.groupBy(_.uid)
.flatMap(tuple => seedFilter(tuple._2))
val newPageLikes = seedFilter(newRows).get.pageLikes
val newCached = triplet._1 -> Squashed(newPageLikes, seedRows.toArray)
newCached
case Some(squashedRow) =>
val newRows = tuple._2
val existingRowUIDs = squashedRow.metadata.map(_.uid)
val seedRows = newRows.filterNot(row => existingRowUIDs.contains(row.uid))
.groupBy(_.uid)
.flatMap(tuple => seedFilter(tuple._2))
val newCached = triplet._1 -> squashedRow.copy(metadata = squashedRow.metadata ++ seedRows)
newCached
}
}
result
}
}
implicit class MapView[K, V](m1: Map[K,V]) {
def getTyped[T: ClassTag](key: K): Option[T] = m1.get(key) match {
case Some(res) =>
res match {
case r: T => Some(r)
case _ => None
}
case _ => None
}
// def merge(m2: Map[K,_], strategy: MergeStrategy): Map[K,_] = {
// val halfMerged = m2.map{
// kv => {
// m1.get(kv._1) match {
// case None => kv
// case Some(v) => (kv._1, strategy.f(v, kv._2))
// }
// }
// }
//
// m1 ++ halfMerged
// }
def flattenKey(
key: K
): Seq[Map[K,_]] = {
val valueOption = m1.get(key)
val values: Iterable[_] = valueOption.toSeq.flatMap(Utils.encapsulateAsIterable)
val cleaned = m1 - key
val result = values.toSeq.map(value => cleaned + (key-> value))
result
}
def canonizeKeysToColumnNames: Map[String,V] = m1.map(
tuple =>
(Utils.canonizeColumnName(tuple._1.toString), tuple._2)
)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy