io.deephaven.benchmark.api.BenchTable Maven / Gradle / Ivy
/* Copyright (c) 2022-2023 Deephaven Data Labs and Patent Pending */
package io.deephaven.benchmark.api;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
import io.deephaven.benchmark.generator.*;
import io.deephaven.benchmark.metric.Metrics;
import io.deephaven.benchmark.util.Ids;
import io.deephaven.benchmark.util.Log;
/**
 * Represents the configuration of table name and columns.
 */
final public class BenchTable implements Closeable {
/** Owning bench instance; used here to look up properties and to run queries. */
final Bench bench;
/** Name of the table; passed to the generators and substituted into the queries. */
final String tableName;
/** Column definitions accumulated through the {@code add} methods. */
final ColumnDefs columns = new ColumnDefs();
/** Row count override; values of 0 or less mean "not overridden" (see {@code getRowCount}). */
private long rowCount = 0;
/** Run duration override in seconds; negative means "not overridden" (see {@code getRunDuration}). */
private int durationSecs = -1;
/** Pause per row in milliseconds; negative means "not overridden" (see {@code getRowPause}). */
private int rowPauseMillis = -1;
/** Compression codec override; {@code null} means "not overridden" (see {@code getCompression}). */
private String compression = null;
/** Generator created by the most recent generate call; {@code null} until one is started. */
private Generator generator = null;
/** Set by {@code fixed()}; when true the row count may be derived from the column definitions. */
private boolean isFixed = false;
/**
 * Create a table configuration with no columns and no overrides.
 * @param bench the bench instance used for property lookup and query execution
 * @param tableName the name of the table to be generated
 */
BenchTable(Bench bench, String tableName) {
    this.tableName = tableName;
    this.bench = bench;
}
* Add a column definition for the table schema
* @param name the name of the column
* @param type the type of the column ( string | long | int | double | float
* @param valuesDef range or combination of range and string
* @return this instance
public BenchTable add(String name, String type, String valuesDef) {
columns.add(name, type, valuesDef);
return this;
* Add a column definition for the table schema
* @param name the name of the column
* @param type the type of the column ( string | long | int | double | float
* @param valuesDef range or combination of range and string
* @param distribution the name of the distribution ( linearConv
* @return this instance
public BenchTable add(String name, String type, String valuesDef, String distribution) {
columns.add(name, type, valuesDef, distribution);
return this;
* Override the profile's row count (e.g. scale.row.count)
* @param generatedRowCount how many rows the table should have
* @return this instance
public BenchTable withRowCount(long generatedRowCount) {
rowCount = generatedRowCount;
return this;
* Override the profile's run duration (e.g. default.completion.timeout=5 minutes)
* @param duration the maximum run duration
* @param unit the unit of time for the duration
* @return this instance
public BenchTable withRunDuration(int duration, ChronoUnit unit) {
durationSecs = (int) Duration.of(duration, unit).toSeconds();
return this;
* Override the pause between producing records (e.g. generator.pause.per.row=0 millis). (Note: Usually, this should
* be left alone, since adding even 1 milli can make record generation take inordinately long.)
* @param duration the pause between sending records
* @param unit the unit of time for the duration
* @return this instance
public BenchTable withRowPause(int duration, ChronoUnit unit) {
rowPauseMillis = (int) Duration.of(duration, unit).toMillis();
return this;
* Override the default compression codec for record generation and parquet
* @param codec the compression codec (zstd | lz4 | lzo | gzip | snappy | none)
* @return this instance
public BenchTable withCompression(String codec) {
compression = codec;
return this;
* Direct the table generator to produce column values according to an incremental distribution and rows up the
* maximum defined by all column ranges. For example, if col1 has range [1-10] and col2 has range [1-20] the total
* number of rows generated will be 20, unless {@code withRowCount()} is used to override it.
* Calling this method will override the default of fixed = false and distribution = random.
* @return this instance
public BenchTable fixed() {
isFixed = true;
return this;
* Generate the table asynchronously through Kafka using Avro serialization
public void generateAvro() {
var future = generateWithAvro();
* Generate the table asynchronously through Kafka using JSON serialization
public void generateJson() {
var future = generateWithJson();
* Generate the table asynchronously through Kafka using Avro serialization
public void generateProtobuf() {
var future = generateWithProtobuf();
* Generate the table synchronously to a parquet file in the engine's data directory. If a parquet file already
* exists in the Deephaven data directory that matches this table definition, use it and skip generation.
public void generateParquet() {
String q = replaceTableAndGeneratorFields(useExistingParquetQuery);
AtomicBoolean usedExistingParquet = new AtomicBoolean(false);
bench.query(q).fetchAfter("result", table -> {
usedExistingParquet.set(table.getValue(0, "UsedExistingParquet").toString().equalsIgnoreCase("true"));
if (usedExistingParquet.get()) {"Table '%s' with %s rows already exists. Skipping", tableName, getRowCount());
}"Generating table '%s' with %s rows", tableName, getRowCount());
long beginTime = System.currentTimeMillis();
if (rowPauseMillis < 0)
withRowPause(0, ChronoUnit.MILLIS);
bench.awaitCompletion(generateWithAvro());"Produce Data Duration: " + (System.currentTimeMillis() - beginTime));
beginTime = System.currentTimeMillis();
q = replaceTableAndGeneratorFields(kafkaToParquetQuery);
bench.query(q).execute();"DH Write Table Duration: " + (System.currentTimeMillis() - beginTime));
* Shutdown and cleanup any running generator
public void close() {
if (generator != null)
private Future generateWithAvro() {
String bootstrapServer ="client.redpanda.addr", "localhost:9092");
String schemaRegistry = "http://" +"client.schema.registry.addr", "localhost:8081");
generator = new AvroKafkaGenerator(bootstrapServer, schemaRegistry, tableName, columns, getCompression());
return generator.produce(getRowPause(), getRowCount(), getRunDuration());
private Future generateWithJson() {
String bootstrapServer ="client.redpanda.addr", "localhost:9092");
String schemaRegistry = "http://" +"client.schema.registry.addr", "localhost:8081");
generator = new JsonKafkaGenerator(bootstrapServer, schemaRegistry, tableName, columns, getCompression());
return generator.produce(getRowPause(), getRowCount(), getRunDuration());
private Future generateWithProtobuf() {
String bootstrapServer ="client.redpanda.addr", "localhost:9092");
String schemaRegistry = "http://" +"client.schema.registry.addr", "localhost:8081");
generator = new ProtobufKafkaGenerator(bootstrapServer, schemaRegistry, tableName, columns, getCompression());
return generator.produce(getRowPause(), getRowCount(), getRunDuration());
private int getRowPause() {
if (rowPauseMillis >= 0)
return rowPauseMillis;
return (int) bench.propertyAsDuration("generator.pause.per.row", "1 millis").toMillis();
private long getRowCount() {
if (rowCount > 0)
return rowCount;
long count = isFixed ? columns.getMaxValueCount() : 0;
if (count > 0)
return count;
return bench.propertyAsIntegral("scale.row.count", "10000");
private int getRunDuration() {
if (durationSecs >= 0)
return durationSecs;
return (int) bench.propertyAsDuration("default.completion.timeout", "1 minute").toSeconds();
private String getCompression() {
String codec = (compression != null) ? compression :"record.compression", "NONE");
return codec.trim().toUpperCase();
private String getTableDefinition() {
return "row.count=" + getRowCount() + "\n"
+ "compression=" + getCompression() + "\n"
+ columns.describe();
private String getTableDefinitionId() {
return "benchmark." + Ids.uniqueName();
private String replaceTableAndGeneratorFields(String query) {
query = generatorDefValues + query;
String codec = getCompression();
codec = codec.equals("NONE") ? "UNCOMPRESSED" : codec;
codec = codec.equals("LZ4") ? "LZ4_RAW" : codec;
String compression = String.format(", compression_codec_name='%s'", codec);
return query.replace("${}", tableName)
.replace("${compression.codec}", compression)
.replace("${max.dict.keys}", ", max_dictionary_keys=2000000")
.replace("${max.dict.bytes}", ", max_dictionary_size=16777216")
.replace("${}", ", target_page_size=2097152")
.replace("${table.columns}", columns.getQuotedColumns())
.replace("${table.rowcount}", Long.toString(getRowCount()))
.replace("${table.duration}", Long.toString(getRunDuration()))
.replace("${table.definition}", getTableDefinition())
.replace("${}", getTableDefinitionId());
static final String generatorDefValues = """
# Define files and generator configuration
table_parquet = '/data/${}.parquet'
table_gen_parquet = '/data/${}.gen.parquet'
table_gen_def_text = '''${table.definition}'''
table_gen_def_file = '/data/${}.gen.def'
static final String useExistingParquetQuery = """
# Determine if a Parquet file already exists that fits the table configuration
import os, glob
from deephaven import new_table
from deephaven.column import string_col
def findMatchingGenParquet(gen_def_text):
for path in glob.glob('/data/benchmark.*.*.*.gen.def'):
with open(path) as f:
if == gen_def_text:
return os.path.splitext(os.path.splitext(path)[0])[0]
return None
if os.path.exists(table_parquet):
usedExisting = False
matching_gen_parquet = findMatchingGenParquet(table_gen_def_text)
if matching_gen_parquet is not None and os.path.exists(str(matching_gen_parquet) + '.gen.parquet'): + '.gen.parquet', table_parquet)
usedExisting = True
result = new_table([string_col("UsedExistingParquet", [str(usedExisting)])])
static final String kafkaToParquetQuery = """
# Create a Parquet file from a Kafka topic
import jpy, os
from deephaven import kafka_consumer as kc
from import TableType, KeyValueSpec
from deephaven.parquet import write
from deephaven.table import Table
from deephaven.update_graph import exclusive_lock
${} = kc.consume(
{ 'bootstrap.servers' : '${kafka.consumer.addr}', 'schema.registry.url' : 'http://${schema.registry.addr}' },
'${}', partitions=None, offsets=kc.ALL_PARTITIONS_SEEK_TO_BEGINNING,
key_spec=KeyValueSpec.IGNORE, value_spec=kc.avro_spec('${}_record', schema_version='1'),
def wait_ticking_table_update(table: Table, row_count: int):
with exclusive_lock(table):
while table.size < row_count:
wait_ticking_table_update(${}, ${table.rowcount})
if os.path.exists(table_parquet):
mymeta = ${}.meta_table
with open(table_gen_def_file, 'w') as f:
write(${}, table_gen_parquet ${compression.codec} ${max.dict.keys} ${max.dict.bytes} ${}), table_parquet)
del ${}
from deephaven import garbage_collect