All downloads are free. The search and download functionality uses the official Maven repository.

com.databricks.spark.sql.perf.JoinPerformance.scala Maven / Gradle / Ivy

The newest version!
package com.databricks.spark.sql.perf

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

/**
 * Join micro-benchmarks over tables of consecutive integers.
 *
 * The trait declares three integer tables built with `sqlContext.range`, a variation that
 * flips `spark.sql.planner.sortMergeJoin`, and four families of join queries:
 *
 *  - `singleKeyJoins`: every listed join type over every ordered pair of table names;
 *  - `varyDataSize`: a self-join where each row carries a string column of varying length;
 *  - `varyKeyType`: a self-join where the key is cast to different data types;
 *  - `varyNumMatches`: a join where the left side is the table unioned with itself N times.
 */
trait JoinPerformance extends Benchmark {

  import ExecutionMode._
  import sqlContext.implicits._

  // Shorthand for looking a table up by name through the SQL context.
  private val table = sqlContext.table _

  // 1.5 mb, 1 file
  // NOTE(review): this table is referenced by name in `singleKeyJoins` but is not a member
  // of `joinTables`; presumably it is picked up some other way. Confirm against Benchmark.
  val x = Table("1milints", sqlContext.range(0, 1000000).repartition(1))

  val joinTables = Seq(
    // 143.542mb, 10 files
    Table("100milints", sqlContext.range(0, 100000000).repartition(10)),
    // 1.4348gb, 10 files
    Table("1bilints", sqlContext.range(0, 1000000000).repartition(10))
  )

  /** Variation with the options "on" and "off", mapped to the conf values "true" and "false". */
  val sortMergeJoin = Variation("sortMergeJoin", Seq("on", "off")) {
    case "on" => sqlContext.setConf("spark.sql.planner.sortMergeJoin", "true")
    case "off" => sqlContext.setConf("spark.sql.planner.sortMergeJoin", "false")
  }

  /**
   * One `COUNT(*)` SQL query per (left table, right table, join type) combination, joining
   * on `id`. Ordering is left table first, then right table, then join type.
   *
   * NOTE(review): the description text below mentions an inner join of a small table with
   * a big one, yet it is attached to every join type and every table pairing.
   */
  val singleKeyJoins = {
    val tableNames = Seq("1milints", "100milints", "1bilints")
    val joinKinds = Seq("JOIN", "RIGHT JOIN", "LEFT JOIN", "FULL OUTER JOIN")

    for {
      lhs <- tableNames
      rhs <- tableNames
      joinKind <- joinKinds
    } yield Query(
      s"singleKey-$joinKind-$lhs-$rhs",
      s"SELECT COUNT(*) FROM $lhs a $joinKind $rhs b ON a.id = b.id",
      "equi-inner join a small table with a big table using a single key.",
      executionMode = CollectResults)
  }

  /**
   * Self-joins of `100milints` on `id`, where every row also carries a constant string
   * column made of `width` asterisks (column name `data<width>`).
   */
  val varyDataSize = for (width <- Seq(1, 128, 256, 512, 1024)) yield {
    val source = table("100milints")
    val padded = source.select($"id", lit("*" * width).as(s"data$width"))
    // The join is built inline as the constructor argument rather than bound to a local.
    new Query(
      s"join - datasize: $width",
      padded.as("a").join(padded.as("b"), $"a.id" === $"b.id"))
  }

  /** Self-joins of `100milints` on `id` after casting the key column to each listed type. */
  val varyKeyType = for (keyType <- Seq(StringType, IntegerType, LongType, DoubleType)) yield {
    val source = table("100milints")
    val recast = source.select($"id".cast(keyType).as("id"))
    new Query(
      s"join - keytype: $keyType",
      recast.as("a").join(recast.as("b"), $"a.id" === $"b.id"))
  }

  /**
   * Joins `copies` unioned copies of `100milints` (left side) against a single copy
   * (right side) on `id`, so each id occurs `copies` times on the left.
   */
  val varyNumMatches = for (copies <- Seq(1, 2, 4, 8, 16)) yield {
    val base = table("100milints")
    // `copies` is always >= 1 here, so the sequence being reduced is never empty.
    val stacked = Seq.fill(copies)(base).reduceLeft((acc, next) => acc.unionAll(next))
    new Query(
      s"join - numMatches: $copies",
      stacked.as("a").join(base.as("b"), $"a.id" === $"b.id"))
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy