org.apache.iceberg.SparkDistributedDataScan (iceberg-spark-3.5_2.13)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.ClosingIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.metrics.MetricsReporter;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.JobGroupUtils;
import org.apache.iceberg.spark.SparkReadConf;
import org.apache.iceberg.spark.actions.ManifestFileBean;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;
/**
* A batch data scan that can utilize Spark cluster resources for planning.
*
 * <p>This scan remotely filters manifests, fetching only the relevant data and delete files to the
 * driver. The delete file assignment is done locally after the remote filtering step. Such an
 * approach is beneficial if the remote parallelism is much higher than the number of driver cores.
 *
 * <p>This scan is best suited for queries with selective filters on lower/upper bounds across all
 * partitions, or against poorly clustered metadata. This allows job planning to benefit from highly
 * concurrent remote filtering while not incurring high serialization and data transfer costs. This
 * class is also useful for full table scans over large tables, but the cost of bringing data and
 * delete file details to the driver may become noticeable. Make sure to follow the performance tips
 * below in such cases.
 *
 * <p>Ensure the filtered metadata size doesn't exceed the driver's max result size. For large table
 * scans, consider increasing `spark.driver.maxResultSize` to avoid job failures.
 *
 * <p>Performance tips:
 *
 * <ul>
 *   <li>Enable Kryo serialization (`spark.serializer`)
 *   <li>Increase the number of driver cores (`spark.driver.cores`)
 *   <li>Tune the number of threads used to fetch task results (`spark.resultGetter.threads`)
 * </ul>
*/
public class SparkDistributedDataScan extends BaseDistributedDataScan {
private static final Joiner COMMA = Joiner.on(',');
private static final String DELETE_PLANNING_JOB_GROUP_ID = "DELETE-PLANNING";
private static final String DATA_PLANNING_JOB_GROUP_ID = "DATA-PLANNING";
private final SparkSession spark;
private final JavaSparkContext sparkContext;
private final SparkReadConf readConf;
  private Broadcast<Table> tableBroadcast = null;
public SparkDistributedDataScan(SparkSession spark, Table table, SparkReadConf readConf) {
this(spark, table, readConf, table.schema(), newTableScanContext(table));
}
private SparkDistributedDataScan(
SparkSession spark,
Table table,
SparkReadConf readConf,
Schema schema,
TableScanContext context) {
super(table, schema, context);
this.spark = spark;
this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
this.readConf = readConf;
}
@Override
protected BatchScan newRefinedScan(
Table newTable, Schema newSchema, TableScanContext newContext) {
return new SparkDistributedDataScan(spark, newTable, readConf, newSchema, newContext);
}
@Override
protected int remoteParallelism() {
return readConf.parallelism();
}
@Override
protected PlanningMode dataPlanningMode() {
return readConf.dataPlanningMode();
}
@Override
protected boolean shouldCopyRemotelyPlannedDataFiles() {
return false;
}
@Override
  protected Iterable<CloseableIterable<DataFile>> planDataRemotely(
      List<ManifestFile> dataManifests, boolean withColumnStats) {
JobGroupInfo info = new JobGroupInfo(DATA_PLANNING_JOB_GROUP_ID, jobDesc("data"));
return withJobGroupInfo(info, () -> doPlanDataRemotely(dataManifests, withColumnStats));
}
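  // Parallelizes one Spark partition per data manifest, filters manifest entries on executors,
  // and collects only the matching data files back to the driver, grouped per manifest.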
  private Iterable<CloseableIterable<DataFile>> doPlanDataRemotely(
      List<ManifestFile> dataManifests, boolean withColumnStats) {
scanMetrics().scannedDataManifests().increment(dataManifests.size());
    JavaRDD<DataFile> dataFileRDD =
sparkContext
.parallelize(toBeans(dataManifests), dataManifests.size())
.flatMap(new ReadDataManifest(tableBroadcast(), context(), withColumnStats));
    List<List<DataFile>> dataFileGroups = collectPartitions(dataFileRDD);
int matchingFilesCount = dataFileGroups.stream().mapToInt(List::size).sum();
int skippedFilesCount = liveFilesCount(dataManifests) - matchingFilesCount;
scanMetrics().skippedDataFiles().increment(skippedFilesCount);
return Iterables.transform(dataFileGroups, CloseableIterable::withNoopClose);
}
@Override
protected PlanningMode deletePlanningMode() {
return readConf.deletePlanningMode();
}
@Override
  protected DeleteFileIndex planDeletesRemotely(List<ManifestFile> deleteManifests) {
JobGroupInfo info = new JobGroupInfo(DELETE_PLANNING_JOB_GROUP_ID, jobDesc("deletes"));
return withJobGroupInfo(info, () -> doPlanDeletesRemotely(deleteManifests));
}
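  // Filters delete manifests on executors, collects the matching delete files to the driver,
  // and builds the delete file index locally.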
  private DeleteFileIndex doPlanDeletesRemotely(List<ManifestFile> deleteManifests) {
scanMetrics().scannedDeleteManifests().increment(deleteManifests.size());
    List<DeleteFile> deleteFiles =
sparkContext
.parallelize(toBeans(deleteManifests), deleteManifests.size())
.flatMap(new ReadDeleteManifest(tableBroadcast(), context()))
.collect();
int skippedFilesCount = liveFilesCount(deleteManifests) - deleteFiles.size();
scanMetrics().skippedDeleteFiles().increment(skippedFilesCount);
return DeleteFileIndex.builderFor(deleteFiles)
.specsById(table().specs())
.caseSensitive(isCaseSensitive())
.scanMetrics(scanMetrics())
.build();
}
  private <T> T withJobGroupInfo(JobGroupInfo info, Supplier<T> supplier) {
return JobGroupUtils.withJobGroupInfo(sparkContext, info, supplier);
}
private String jobDesc(String type) {
    List<String> options = Lists.newArrayList();
options.add("snapshot_id=" + snapshot().snapshotId());
String optionsAsString = COMMA.join(options);
return String.format("Planning %s (%s) for %s", type, optionsAsString, table().name());
}
  private List<ManifestFileBean> toBeans(List<ManifestFile> manifests) {
return manifests.stream().map(ManifestFileBean::fromManifest).collect(Collectors.toList());
}
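  // Lazily broadcasts a serializable copy of the table so executor-side tasks can access
  // its FileIO and partition specs.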
  private Broadcast<Table> tableBroadcast() {
if (tableBroadcast == null) {
Table serializableTable = SerializableTableWithSize.copyOf(table());
this.tableBroadcast = sparkContext.broadcast(serializableTable);
}
return tableBroadcast;
}
  private List<List<DataFile>> collectPartitions(JavaRDD<DataFile> rdd) {
int[] partitionIds = IntStream.range(0, rdd.getNumPartitions()).toArray();
return Arrays.asList(rdd.collectPartitions(partitionIds));
}
  private int liveFilesCount(List<ManifestFile> manifests) {
return manifests.stream().mapToInt(this::liveFilesCount).sum();
}
private int liveFilesCount(ManifestFile manifest) {
return manifest.existingFilesCount() + manifest.addedFilesCount();
}
private static TableScanContext newTableScanContext(Table table) {
if (table instanceof BaseTable) {
MetricsReporter reporter = ((BaseTable) table).reporter();
return ImmutableTableScanContext.builder().metricsReporter(reporter).build();
} else {
return TableScanContext.empty();
}
}
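  // Executor-side function that reads a single data manifest and emits the data files
  // matching the scan filter.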
  private static class ReadDataManifest implements FlatMapFunction<ManifestFileBean, DataFile> {
    private final Broadcast<Table> table;
private final Expression filter;
private final boolean withStats;
private final boolean isCaseSensitive;
    ReadDataManifest(Broadcast<Table> table, TableScanContext context, boolean withStats) {
this.table = table;
this.filter = context.rowFilter();
this.withStats = withStats;
this.isCaseSensitive = context.caseSensitive();
}
@Override
    public Iterator<DataFile> call(ManifestFileBean manifest) throws Exception {
FileIO io = table.value().io();
      Map<Integer, PartitionSpec> specs = table.value().specs();
return new ClosingIterator<>(
ManifestFiles.read(manifest, io, specs)
.select(withStats ? SCAN_WITH_STATS_COLUMNS : SCAN_COLUMNS)
.filterRows(filter)
.caseSensitive(isCaseSensitive)
.iterator());
}
}
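  // Executor-side function that reads a single delete manifest and emits the delete files
  // matching the scan filter.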
  private static class ReadDeleteManifest implements FlatMapFunction<ManifestFileBean, DeleteFile> {
    private final Broadcast<Table> table;
private final Expression filter;
private final boolean isCaseSensitive;
    ReadDeleteManifest(Broadcast<Table> table, TableScanContext context) {
this.table = table;
this.filter = context.rowFilter();
this.isCaseSensitive = context.caseSensitive();
}
@Override
    public Iterator<DeleteFile> call(ManifestFileBean manifest) throws Exception {
FileIO io = table.value().io();
      Map<Integer, PartitionSpec> specs = table.value().specs();
return new ClosingIterator<>(
ManifestFiles.readDeleteManifest(manifest, io, specs)
.select(DELETE_SCAN_WITH_STATS_COLUMNS)
.filterRows(filter)
.caseSensitive(isCaseSensitive)
.iterator());
}
}
}
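
Below is a minimal usage sketch (not part of the file above) showing how the scan might be constructed and how the Spark properties mentioned in the class Javadoc could be set. The warehouse path, column name, and filter value are hypothetical, and it assumes SparkReadConf exposes a (SparkSession, Table, Map) constructor; verify against the SparkReadConf API of your Iceberg release before copying.

// Additional imports assumed for this sketch: org.apache.iceberg.hadoop.HadoopTables,
// org.apache.iceberg.expressions.Expressions, java.util.Collections.
SparkSession spark =
    SparkSession.builder()
        .master("local[*]")
        // Performance tips from the class Javadoc
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.driver.cores", "4")
        .config("spark.driver.maxResultSize", "4g")
        .getOrCreate();

// Hypothetical Hadoop-tables location; a Catalog-loaded Table works the same way
Table table =
    new HadoopTables(spark.sparkContext().hadoopConfiguration()).load("/tmp/warehouse/db/events");
SparkReadConf readConf = new SparkReadConf(spark, table, Collections.emptyMap());

BatchScan scan =
    new SparkDistributedDataScan(spark, table, readConf)
        .filter(Expressions.greaterThanOrEqual("event_ts", "2024-01-01T00:00:00"));

// Manifests are filtered on executors; only matching file metadata reaches the driver
try (CloseableIterable<ScanTask> tasks = scan.planFiles()) {
  tasks.forEach(System.out::println);
}

Whether data and delete planning actually run distributed is decided by the SparkReadConf passed in (dataPlanningMode()/deletePlanningMode()), which resolves the planning mode from read options, session configuration, or table properties.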