org.ballistacompute.benchmarks.0.2.5.source-code.Benchmarks.kt (Maven / Gradle / Ivy)
JVM query engine based on Apache Arrow
package org.ballistacompute.benchmarks
import kotlinx.coroutines.GlobalScope
import kotlinx.coroutines.async
import kotlinx.coroutines.runBlocking
import org.ballistacompute.datasource.InMemoryDataSource
import org.ballistacompute.datatypes.RecordBatch
import org.ballistacompute.execution.ExecutionContext
import java.io.File
import java.io.FileWriter
/**
* Designed to be run from Docker. See top-level benchmarks folder for more info.
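*
* Reads the input directory from the BENCH_PATH environment variable and writes a one-line
* CSV timing summary to the file named by the BENCH_RESULT_FILE environment variable.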
*/
class Benchmarks {
companion object {
@JvmStatic
fun main(args: Array<String>) {
println("maxMemory=${Runtime.getRuntime().maxMemory()}")
println("totalMemory=${Runtime.getRuntime().totalMemory()}")
println("freeMemory=${Runtime.getRuntime().freeMemory()}")
// val sql = System.getenv("BENCH_SQL_PARTIAL")
// val sql = System.getenv("BENCH_SQL_FINAL")
//TODO parameterize
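// Two-phase aggregation: sqlPartial is executed once per input CSV file (in parallel),
// and sqlFinal re-aggregates the combined per-file results in memory.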
val sqlPartial = "SELECT passenger_count, " +
"MIN(CAST(fare_amount AS double)) AS min_fare, MAX(CAST(fare_amount AS double)) AS max_fare, SUM(CAST(fare_amount AS double)) AS sum_fare " +
"FROM tripdata " +
"GROUP BY passenger_count"
val sqlFinal = "SELECT passenger_count, " +
"MIN(max_fare), " +
"MAX(min_fare), " +
"SUM(max_fare) " +
"FROM tripdata " +
"GROUP BY passenger_count"
val path = System.getenv("BENCH_PATH")
val resultFile = System.getenv("BENCH_RESULT_FILE")
val settings = mapOf("ballista.csv.batchSize" to "1024")
//TODO iterations
sqlAggregate(path, sqlPartial, sqlFinal, resultFile, settings)
println("maxMemory=${Runtime.getRuntime().maxMemory()}")
println("totalMemory=${Runtime.getRuntime().totalMemory()}")
println("freeMemory=${Runtime.getRuntime().freeMemory()}")
}
}
}
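/**
* Returns the names of the CSV files directly under [path] (non-recursive, see TODO below).
*/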
private fun getFiles(path: String): List<String> {
//TODO improve to do recursion
val dir = File(path)
return dir.list().filter { it.endsWith(".csv") }
}
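// A possible recursive variant of getFiles, addressing the TODO above. This is a sketch only,
// not part of the original benchmark and not called anywhere; unlike getFiles, it returns
// absolute paths rather than bare file names.
private fun getFilesRecursive(path: String): List<String> {
return File(path).walkTopDown().filter { it.isFile && it.extension == "csv" }.map { it.absolutePath }.toList()
}
/**
* Runs [sqlPartial] against every CSV file under [path] in parallel, combines the partial
* results in memory, runs [sqlFinal] over them, and writes the elapsed time to [resultFile].
*/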
private fun sqlAggregate(path: String, sqlPartial: String, sqlFinal: String, resultFile: String, settings: Map<String, String>) {
val start = System.currentTimeMillis()
val files = getFiles(path)
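// Launch one coroutine per CSV file; each runs the partial aggregation query against its file.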
val deferred = files.map { file ->
GlobalScope.async {
println("Executing query against $file ...")
val partitionStart = System.currentTimeMillis()
val result = executeQuery(File(File(path), file).absolutePath, sqlPartial, settings)
val duration = System.currentTimeMillis() - partitionStart
println("Query against $file took $duration ms")
result
}
}
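// Wait for all partial queries to finish and flatten their record batches into a single list.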
val results: List<RecordBatch> = runBlocking {
deferred.flatMap { it.await() }
}
println(results.first().schema)
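// Register the combined partial results as an in-memory table, then run the final aggregation over them.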
val ctx = ExecutionContext(settings)
ctx.registerDataSource("tripdata", InMemoryDataSource(results.first().schema, results))
val df = ctx.sql(sqlFinal)
ctx.execute(df).forEach { println(it) }
val duration = System.currentTimeMillis() - start
println("Executed query in $duration ms")
val w = FileWriter(File(resultFile))
w.write("iterations,time_millis\n")
w.write("1,$duration\n")
w.close()
}
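/**
* Executes [sql] against the CSV file at [path] using a fresh ExecutionContext and returns
* the resulting record batches.
*/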
fun executeQuery(path: String, sql: String, settings: Map<String, String>): List<RecordBatch> {
val ctx = ExecutionContext(settings)
ctx.registerCsv("tripdata", path)
val df = ctx.sql(sql)
return ctx.execute(df).toList()
}