
// ai.databand.spark.SparkColumnStats (Maven / Gradle / Ivy)
/*
* © Copyright Databand.ai, an IBM Company 2022
*/
package ai.databand.spark;
import ai.databand.log.HistogramRequest;
import ai.databand.log.LogDatasetRequest;
import ai.databand.parameters.Histogram;
import ai.databand.schema.ColumnStats;
import ai.databand.schema.histograms.NumericSummary;
import ai.databand.schema.histograms.Summary;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.StructField;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static ai.databand.DbndPropertyNames.DBND_INTERNAL_ALIAS;
public class SparkColumnStats {
private final Dataset> dataset;
private final LogDatasetRequest params;
public SparkColumnStats(Dataset> dataset, LogDatasetRequest params) {
this.dataset = dataset.alias(String.format("%s_%s", DBND_INTERNAL_ALIAS, "COLUMN_STATS"));
this.params = params;
}
public List values() {
if (!params.getWithStats()) {
return Collections.emptyList();
}
Histogram histogram = new Histogram("columnStat", dataset, new HistogramRequest().onlyStats());
Map columnTypes = new HashMap<>();
for (StructField field : dataset.schema().fields()) {
columnTypes.put(field.name(), field.dataType().typeName());
}
// calculate summary
histogram.summary();
Map summaries = histogram.getSummaries();
List result = new ArrayList<>(summaries.size());
for (Map.Entry col : summaries.entrySet()) {
Summary columnMetrics = col.getValue();
ColumnStats columnStats = new ColumnStats()
.setColumnName(col.getKey())
.setColumnType(columnTypes.get(col.getKey()))
.setRecordsCount(columnMetrics.getCount())
.setDistinctCount(columnMetrics.getDistinct());
if (columnMetrics instanceof NumericSummary) {
NumericSummary numericSummary = (NumericSummary) columnMetrics;
columnStats.setMeanValue(numericSummary.getMean())
.setMinValue(numericSummary.getMin())
.setMaxValue(numericSummary.getMax())
.setStdValue(numericSummary.getStd())
.setQuartile1(numericSummary.get_25())
.setQuartile2(numericSummary.get_50())
.setQuartile3(numericSummary.get_75());
}
result.add(columnStats);
}
return result;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy