// Source artifact: org.apache.iceberg:iceberg-spark-3.5_2.13 (class org.apache.iceberg.spark.actions.NDVSketchUtil)
// Iceberg: a table format for huge analytic datasets.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.actions;
import java.nio.ByteBuffer;
import java.util.List;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.theta.CompactSketch;
import org.apache.datasketches.theta.Sketch;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.puffin.Blob;
import org.apache.iceberg.puffin.PuffinCompressionCodec;
import org.apache.iceberg.puffin.StandardBlobTypes;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.stats.ThetaSketchAgg;
/**
 * Utility for computing per-column NDV (number of distinct values) estimates for an Iceberg table
 * using Apache DataSketches Theta sketches, evaluated via a Spark aggregation, and packaging the
 * resulting sketches as Puffin {@link Blob}s.
 */
public class NDVSketchUtil {

  private NDVSketchUtil() {}

  /** Puffin blob property key under which the rounded NDV estimate is stored. */
  public static final String APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY = "ndv";

  /**
   * Computes a Theta sketch for each requested column at the given snapshot and wraps each sketch
   * in a Puffin blob.
   *
   * @param spark the active Spark session used to run the aggregation
   * @param table the Iceberg table to scan
   * @param snapshot the snapshot whose data (and schema) is used
   * @param columns names of the columns to sketch; each must resolve in the snapshot's schema
   * @return one blob per requested column, in the same order as {@code columns}
   */
  static List<Blob> generateBlobs(
      SparkSession spark, Table table, Snapshot snapshot, List<String> columns) {
    // One row: column i of the result holds the serialized sketch for columns.get(i).
    Row sketches = computeNDVSketches(spark, table, snapshot, columns);
    // Use the schema associated with the snapshot, not the table's current schema,
    // so field IDs match the data that was actually scanned.
    Schema schema = table.schemas().get(snapshot.schemaId());
    List<Blob> blobs = Lists.newArrayList();
    for (int i = 0; i < columns.size(); i++) {
      Types.NestedField field = schema.findField(columns.get(i));
      Sketch sketch = CompactSketch.wrap(Memory.wrap((byte[]) sketches.get(i)));
      blobs.add(toBlob(field, sketch, snapshot));
    }
    return blobs;
  }

  /**
   * Wraps a single column's sketch in a Puffin blob, recording the snapshot lineage and the
   * rounded NDV estimate as a blob property.
   */
  private static Blob toBlob(Types.NestedField field, Sketch sketch, Snapshot snapshot) {
    return new Blob(
        StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1,
        ImmutableList.of(field.fieldId()),
        snapshot.snapshotId(),
        snapshot.sequenceNumber(),
        ByteBuffer.wrap(sketch.toByteArray()),
        PuffinCompressionCodec.ZSTD,
        ImmutableMap.of(
            APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY,
            // Theta estimates are doubles; truncate to a long for the property value.
            String.valueOf((long) sketch.getEstimate())));
  }

  /**
   * Runs a single Spark aggregation over the snapshot's data, producing one row whose i-th column
   * is the serialized Theta sketch for the i-th requested column name.
   */
  private static Row computeNDVSketches(
      SparkSession spark, Table table, Snapshot snapshot, List<String> colNames) {
    Dataset<Row> inputDF = SparkTableUtil.loadTable(spark, table, snapshot.snapshotId());
    return inputDF.select(toAggColumns(colNames)).first();
  }

  private static Column[] toAggColumns(List<String> colNames) {
    return colNames.stream().map(NDVSketchUtil::toAggColumn).toArray(Column[]::new);
  }

  /** Builds the Theta-sketch aggregate expression for a single column. */
  private static Column toAggColumn(String colName) {
    ThetaSketchAgg agg = new ThetaSketchAgg(colName);
    return new Column(agg.toAggregateExpression());
  }
}