// com.tresata.spark.skewjoin.SkewJoinOperations.scala Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of spark-skewjoin_2.11 Show documentation
// spark-skewjoin
// The newest version!
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid }

/**
 * Settings used to build the sketch monoid (`CMSMonoid`) that estimates key frequencies.
 *
 * All three values are forwarded unchanged to `CMS.monoid`.
 *
 * @param eps   first argument of `CMS.monoid` (presumably the relative error bound of the
 *              sketch; confirm against the algebird documentation)
 * @param delta second argument of `CMS.monoid` (presumably the probability of exceeding
 *              that bound; confirm against the algebird documentation)
 * @param seed  third argument of `CMS.monoid`
 */
case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) {
  /**
   * Builds a sketch monoid for keys of type `K` from these settings.
   *
   * @tparam K key type; needs an `Ordering` and a `CMSHasher` in implicit scope
   * @return the monoid produced by `CMS.monoid[K](eps, delta, seed)`
   */
  def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = {
    val monoid: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed)
    monoid
  }
}

/**
 * Cogroup and join operations for a pair RDD that spread the records of a key over several
 * reduce-side groups instead of sending every record of that key to a single group.
 *
 * Key frequencies of both sides are estimated with an algebird `CMS` sketch, and a
 * `SkewReplication` turns the two estimates into per-key replication counts.
 *
 * The closures handed to Spark below call `getReplicationFactors`, an instance method, so
 * this instance is captured by them; the class is `Serializable` for that reason.
 *
 * @param rdd the left-hand side of every operation
 */
class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  /**
   * Produces the grid cells a single record is sent to.
   *
   * One random index in `[0, otherReplication)` is drawn for the record and paired with
   * every index in `[0, replication)`.
   *
   * @return `replication` pairs of the form `(randomOtherIndex, ownIndex)`
   * @throws IllegalArgumentException if either count is not positive
   */
  private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
    require(replication > 0 && otherReplication > 0, "replication must be positive")
    val rand = random.nextInt(otherReplication)
    (0 until replication).map(rep => (rand, rep))
  }

  /**
   * Sketches the frequency of every element of `keys`.
   *
   * Uses `fold` seeded with the monoid's `zero` rather than `reduce`, because `reduce`
   * throws on an RDD without elements; an empty input now yields the empty sketch.
   */
  private def createRddCMS[T](keys: RDD[T], cmsMonoid: CMSMonoid[T]): CMS[T] =
    keys.map(k => cmsMonoid.create(k)).fold(cmsMonoid.zero)(cmsMonoid.plus(_, _))

  /**
   * Cogroups this RDD with `other` while spreading each key over a grid of sub-keys.
   *
   * Every record is re-keyed as `(key, (leftIndex, rightIndex))`. A left record is copied
   * to every `leftIndex` at one random `rightIndex`; a right record is copied to every
   * `rightIndex` at one random `leftIndex`. Any left/right pair of the same key therefore
   * meets in exactly one cell. The sub-key is dropped from the result without merging the
   * cells, so a key may occur in more than one output record.
   *
   * Two extra passes (one over each side's keys) are run eagerly to build the sketches.
   *
   * @param other           right-hand side
   * @param partitioner     partitioner for the cogroup; its partition count is also passed
   *                        to `skewReplication`
   * @param skewReplication maps the estimated left/right counts of a key to replication counts
   * @param cmsParams       sketch settings
   */
  def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = {
    val numPartitions = partitioner.numPartitions
    val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K]))
    val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K]))

    // Both sides must derive the same (left, right) counts for a key, so the lookup is shared.
    def replicationsFor(key: K) = skewReplication.getReplications(
      broadcastedLeftCMS.value.frequency(key).estimate,
      broadcastedRightCMS.value.frequency(key).estimate,
      numPartitions)

    val rddSkewed = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = replicationsFor(kv._1)
        // getReplicationFactors returns (randomRightIndex, leftIndex); swap to (leftIndex, rightIndex)
        getReplicationFactors(random, leftReplication, rightReplication).map(rl => ((kv._1, rl.swap), kv._2))
      }
    }

    val otherSkewed = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        val (leftReplication, rightReplication) = replicationsFor(kv._1)
        // already (randomLeftIndex, rightIndex)
        getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2))
      }
    }

    rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2))
  }

  /** [[skewCogroup]] with the default partitioner for the two RDDs and default settings. */
  def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] =
    skewCogroup(other, defaultPartitioner(rdd, other))

  /**
   * Inner join built on [[skewCogroup]]: emits every left/right value pair of each cell.
   *
   * @return one `(key, (v, w))` per pair of values that share a key
   */
  def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] =
    skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair =>
      for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield
        (blockPair._1, (v, w))
    }

  /** [[skewJoin]] with the default partitioner for the two RDDs and default settings. */
  def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] =
    skewJoin(other, defaultPartitioner(rdd, other))

  /**
   * Left outer join built on [[skewCogroup]].
   *
   * The replication strategy is wrapped in `RightReplication` (defined elsewhere; presumably
   * it keeps each left record in a single cell so that an empty right side of a cell means
   * the key has no match; confirm against its definition).
   *
   * Emptiness is tested with `isEmpty` rather than a `Seq()` pattern: the values are typed
   * `Iterable`, and a `Seq()` pattern would miss an empty non-`Seq` collection, silently
   * dropping the unmatched left rows.
   */
  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] =
    skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{
      case (k, (itv, itw)) if itw.isEmpty =>
        itv.iterator.map(v => (k, (v, None: Option[W])))
      case (k, (itv, itw)) =>
        for (v <- itv.iterator; w <- itw.iterator) yield (k, (v, Some(w): Option[W]))
    }

  /** [[skewLeftOuterJoin]] with the default partitioner for the two RDDs and default settings. */
  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] =
    skewLeftOuterJoin(other, defaultPartitioner(rdd, other))

  /**
   * Right outer join built on [[skewCogroup]]; mirror image of [[skewLeftOuterJoin]].
   *
   * The replication strategy is wrapped in `LeftReplication` (defined elsewhere; presumably
   * it keeps each right record in a single cell; confirm against its definition).
   * Emptiness of the left side is tested with `isEmpty` for the same reason as in the left
   * outer join.
   */
  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] =
    skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{
      case (k, (itv, itw)) if itv.isEmpty =>
        itw.iterator.map(w => (k, (None: Option[V], w)))
      case (k, (itv, itw)) =>
        for (v <- itv.iterator; w <- itw.iterator) yield (k, (Some(v): Option[V], w))
    }

  /** [[skewRightOuterJoin]] with the default partitioner for the two RDDs and default settings. */
  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] =
    skewRightOuterJoin(other, defaultPartitioner(rdd, other))
}

/**
 * Implicit conversions that add the skew-join and block-join operations to pair RDDs.
 * Mix this trait in, or import the members of an object that extends it.
 */
trait Dsl {
  /** Wraps a pair RDD in [[SkewJoinOperations]]; the key type needs an `Ordering` and a `CMSHasher`. */
  implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](
    rdd: RDD[(K, V)]
  ): SkewJoinOperations[K, V] =
    new SkewJoinOperations[K, V](rdd)

  /** Wraps a pair RDD in `BlockJoinOperations`. */
  implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](
    rdd: RDD[(K, V)]
  ): BlockJoinOperations[K, V] =
    new BlockJoinOperations[K, V](rdd)
}

/** Importable instance of the `Dsl` trait: `import Dsl._` brings its implicit conversions into scope. */
object Dsl extends Dsl