// org.apache.iceberg.BaseDistributedDataScan Maven / Gradle / Ivy
// Show all versions of iceberg-core Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import static org.apache.iceberg.PlanningMode.AUTO;
import static org.apache.iceberg.TableProperties.DATA_PLANNING_MODE;
import static org.apache.iceberg.TableProperties.DELETE_PLANNING_MODE;
import static org.apache.iceberg.TableProperties.PLANNING_MODE_DEFAULT;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ManifestEvaluator;
import org.apache.iceberg.expressions.Projections;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.metrics.ScanMetricsUtil;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.ContentFileUtil;
import org.apache.iceberg.util.ParallelIterable;
import org.apache.iceberg.util.TableScanUtil;
import org.apache.iceberg.util.ThreadPools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * An abstract class for batch data scans that can utilize cluster resources for planning.
 *
 * <p>This class provides common logic to create data scans that are capable of reading and
 * filtering manifests remotely when the metadata size exceeds the threshold for local processing.
 * Also, it takes care of planning tasks locally if remote planning is not considered beneficial.
 *
 * <p>Note that this class is evolving and is subject to change even in minor releases.
 */
abstract class BaseDistributedDataScan
extends DataScan> implements BatchScan {
private static final Logger LOG = LoggerFactory.getLogger(BaseDistributedDataScan.class);
private static final long LOCAL_PLANNING_MAX_SLOT_SIZE = 128L * 1024 * 1024; // 128 MB
private static final int MONITOR_POOL_SIZE = 2;
private final int localParallelism;
private final long localPlanningSizeThreshold;
protected BaseDistributedDataScan(Table table, Schema schema, TableScanContext context) {
super(table, schema, context);
this.localParallelism = PLAN_SCANS_WITH_WORKER_POOL ? ThreadPools.WORKER_THREAD_POOL_SIZE : 1;
this.localPlanningSizeThreshold = localParallelism * LOCAL_PLANNING_MAX_SLOT_SIZE;
}
/**
 * Returns the cluster parallelism.
 *
 * <p>This value indicates the maximum number of manifests that can be processed concurrently by
 * the cluster. Implementations should take into account both the currently available processing
 * slots and potential dynamic allocation, if applicable.
 *
 * <p>The remote parallelism is compared against the size of the thread pool available locally to
 * determine the feasibility of remote planning. This value is ignored if the planning mode is set
 * explicitly as local or distributed.
 */
protected abstract int remoteParallelism();
/**
 * Returns which planning mode to use for data.
 *
 * <p>The mode name is read from the {@code DATA_PLANNING_MODE} table property and falls back to
 * {@code PLANNING_MODE_DEFAULT} if the property is not set.
 */
protected PlanningMode dataPlanningMode() {
  Map<String, String> properties = table().properties();
  String modeName = properties.getOrDefault(DATA_PLANNING_MODE, PLANNING_MODE_DEFAULT);
  return PlanningMode.fromName(modeName);
}
/**
 * Controls whether defensive copies are created for remotely planned data files.
 *
 * <p>By default, this class creates defensive copies for each data file that is planned remotely,
 * assuming the provided iterable can be lazy and may reuse objects. If unnecessary and data file
 * objects can be safely added into a collection, implementations can override this behavior.
 *
 * @return true in this base implementation
 */
protected boolean shouldCopyRemotelyPlannedDataFiles() {
return true;
}
/**
* Plans data remotely.
*
*
Implementations are encouraged to return groups of matching data files, enabling this class
* to process multiple groups concurrently to speed up the remaining work. This is particularly
* useful when dealing with equality deletes, as delete index lookups with such delete files
* require comparing bounds and typically benefit from parallelization.
*
*
If the result iterable reuses objects, {@link #shouldCopyRemotelyPlannedDataFiles()} must
* return true.
*
*
The input data manifests have been already filtered to include only potential matches based
* on the scan filter. Implementations are expected to further filter these manifests and only
* return files that may hold data matching the scan filter.
*
* @param dataManifests data manifests that may contain files matching the scan filter
* @param withColumnStats a flag whether to load column stats
* @return groups of data files planned remotely
*/
protected abstract Iterable> planDataRemotely(
List dataManifests, boolean withColumnStats);
/**
 * Returns which planning mode to use for deletes.
 *
 * <p>The mode name is read from the {@code DELETE_PLANNING_MODE} table property and falls back
 * to {@code PLANNING_MODE_DEFAULT} if the property is not set.
 */
protected PlanningMode deletePlanningMode() {
  Map<String, String> properties = table().properties();
  String modeName = properties.getOrDefault(DELETE_PLANNING_MODE, PLANNING_MODE_DEFAULT);
  return PlanningMode.fromName(modeName);
}
/**
 * Plans deletes remotely.
 *
 * <p>The input delete manifests have been already filtered to include only potential matches
 * based on the scan filter. Implementations are expected to further filter these manifests and
 * return files that may hold deletes matching the scan filter.
 *
 * @param deleteManifests delete manifests that may contain files matching the scan filter
 * @return a delete file index planned remotely
 */
protected abstract DeleteFileIndex planDeletesRemotely(List<ManifestFile> deleteManifests);
/**
 * Plans file scan tasks for the current snapshot.
 *
 * <p>If both data and deletes are to be planned locally, the work is delegated to {@code
 * planFileTasksLocally}. Otherwise, data and deletes are planned as two futures on a monitor pool
 * and the resulting data file groups are converted into scan tasks.
 *
 * @return the planned scan tasks
 * @throws RuntimeException if joining a planning future fails with a {@link CompletionException}
 */
@Override
protected CloseableIterable<ScanTask> doPlanFiles() {
  Snapshot snapshot = snapshot();

  List<ManifestFile> deleteManifests = findMatchingDeleteManifests(snapshot);
  boolean mayHaveEqualityDeletes = !deleteManifests.isEmpty() && mayHaveEqualityDeletes(snapshot);
  boolean planDeletesLocally = shouldPlanDeletesLocally(deleteManifests, mayHaveEqualityDeletes);

  List<ManifestFile> dataManifests = findMatchingDataManifests(snapshot);
  // column stats are loaded if equality deletes may be present or if stats were requested
  boolean loadColumnStats = mayHaveEqualityDeletes || shouldReturnColumnStats();
  boolean planDataLocally = shouldPlanDataLocally(dataManifests, loadColumnStats);
  boolean copyDataFiles = shouldCopyDataFiles(planDataLocally, loadColumnStats);

  if (planDataLocally && planDeletesLocally) {
    return planFileTasksLocally(dataManifests, deleteManifests);
  }

  ExecutorService monitorPool = newMonitorPool();

  CompletableFuture<DeleteFileIndex> deletesFuture =
      newDeletesFuture(deleteManifests, planDeletesLocally, monitorPool);

  CompletableFuture<Iterable<CloseableIterable<DataFile>>> dataFuture =
      newDataFuture(dataManifests, planDataLocally, loadColumnStats, monitorPool);

  try {
    Iterable<CloseableIterable<ScanTask>> fileTasks =
        toFileTasks(dataFuture, deletesFuture, copyDataFiles);

    if (shouldPlanWithExecutor() && (planDataLocally || mayHaveEqualityDeletes)) {
      return new ParallelIterable<>(fileTasks, planExecutor());
    } else {
      return CloseableIterable.concat(fileTasks);
    }

  } catch (CompletionException e) {
    // stop whichever planning future is still running before reporting the failure
    deletesFuture.cancel(true /* may interrupt */);
    dataFuture.cancel(true /* may interrupt */);
    throw new RuntimeException("Failed to plan files", e);

  } finally {
    monitorPool.shutdown();
  }
}
@Override
public CloseableIterable> planTasks() {
return TableScanUtil.planTaskGroups(
planFiles(), targetSplitSize(), splitLookback(), splitOpenFileCost());
}
/**
 * Loads the data manifests of the snapshot, keeps those accepted by {@code filterManifests} and
 * records the total and skipped manifest counts in the scan metrics.
 *
 * @param snapshot the snapshot whose data manifests are read
 * @return the data manifests that passed filtering
 */
private List<ManifestFile> findMatchingDataManifests(Snapshot snapshot) {
  List<ManifestFile> dataManifests = snapshot.dataManifests(io());
  scanMetrics().totalDataManifests().increment(dataManifests.size());

  List<ManifestFile> matchingDataManifests = filterManifests(dataManifests);
  int skippedDataManifestsCount = dataManifests.size() - matchingDataManifests.size();
  scanMetrics().skippedDataManifests().increment(skippedDataManifestsCount);

  return matchingDataManifests;
}
/**
 * Loads the delete manifests of the snapshot, keeps those accepted by {@code filterManifests}
 * and records the total and skipped manifest counts in the scan metrics.
 *
 * @param snapshot the snapshot whose delete manifests are read
 * @return the delete manifests that passed filtering
 */
private List<ManifestFile> findMatchingDeleteManifests(Snapshot snapshot) {
  List<ManifestFile> deleteManifests = snapshot.deleteManifests(io());
  scanMetrics().totalDeleteManifests().increment(deleteManifests.size());

  List<ManifestFile> matchingDeleteManifests = filterManifests(deleteManifests);
  int skippedDeleteManifestsCount = deleteManifests.size() - matchingDeleteManifests.size();
  scanMetrics().skippedDeleteManifests().increment(skippedDeleteManifestsCount);

  return matchingDeleteManifests;
}
/**
 * Keeps manifests that have added or existing files and that are accepted by the manifest
 * evaluator built for their partition spec.
 *
 * @param manifests manifests to filter
 * @return a new list with the manifests that passed both checks
 */
private List<ManifestFile> filterManifests(List<ManifestFile> manifests) {
  // one evaluator per partition spec ID of the table
  Map<Integer, ManifestEvaluator> evalCache = specCache(this::newManifestEvaluator);

  return manifests.stream()
      .filter(manifest -> manifest.hasAddedFiles() || manifest.hasExistingFiles())
      .filter(manifest -> evalCache.get(manifest.partitionSpecId()).eval(manifest))
      .collect(Collectors.toList());
}
/**
 * Decides whether deletes are planned locally.
 *
 * <p>Deletes are planned locally if the delete planning mode is AUTO and equality deletes may be
 * present, or if {@code shouldPlanLocally} says so for the given mode and manifests.
 *
 * @param deleteManifests delete manifests to plan
 * @param mayHaveEqualityDeletes whether the snapshot may contain equality deletes
 * @return true if deletes must be planned locally
 */
private boolean shouldPlanDeletesLocally(
    List<ManifestFile> deleteManifests, boolean mayHaveEqualityDeletes) {
  PlanningMode mode = deletePlanningMode();
  return (mode == AUTO && mayHaveEqualityDeletes) || shouldPlanLocally(mode, deleteManifests);
}
/**
 * Decides whether data is planned locally.
 *
 * <p>Data is planned locally if the data planning mode is AUTO and column stats must be loaded,
 * or if {@code shouldPlanLocally} says so for the given mode and manifests.
 *
 * @param dataManifests data manifests to plan
 * @param loadColumnStats whether column stats must be loaded
 * @return true if data must be planned locally
 */
private boolean shouldPlanDataLocally(List<ManifestFile> dataManifests, boolean loadColumnStats) {
  PlanningMode mode = dataPlanningMode();
  return (mode == AUTO && loadColumnStats) || shouldPlanLocally(mode, dataManifests);
}
/**
 * Decides whether the given manifests are planned locally under the given mode.
 *
 * <p>Planning is always local if the scan context reports a customized executor. Otherwise:
 *
 * <ul>
 *   <li>LOCAL: always local
 *   <li>DISTRIBUTED: local only if there are no manifests
 *   <li>AUTO: local if the remote parallelism does not exceed the local parallelism, if there are
 *       at most two manifests per local slot, or if the total manifest size does not exceed the
 *       local planning size threshold
 * </ul>
 *
 * @param mode the planning mode
 * @param manifests manifests to plan
 * @return true if planning must happen locally
 * @throws IllegalArgumentException if the mode is not one of the handled values
 */
private boolean shouldPlanLocally(PlanningMode mode, List<ManifestFile> manifests) {
  if (context().planWithCustomizedExecutor()) {
    return true;
  }

  switch (mode) {
    case LOCAL:
      return true;

    case DISTRIBUTED:
      return manifests.isEmpty();

    case AUTO:
      return remoteParallelism() <= localParallelism
          || manifests.size() <= 2 * localParallelism
          || totalSize(manifests) <= localPlanningSizeThreshold;

    default:
      throw new IllegalArgumentException("Unknown planning mode: " + mode);
  }
}
/**
 * Sums the lengths of the given manifests.
 *
 * @param manifests manifests to measure
 * @return the sum of {@code ManifestFile.length()} over all manifests, 0 for an empty list
 */
private long totalSize(List<ManifestFile> manifests) {
  return manifests.stream().mapToLong(ManifestFile::length).sum();
}
/**
 * Decides whether data files must be copied before being placed into scan tasks.
 *
 * <p>Copies are made if data is planned locally, if remotely planned files must be copied, or if
 * column stats are loaded but must not be returned.
 *
 * @param planDataLocally whether data is planned locally
 * @param loadColumnStats whether column stats are loaded during planning
 * @return true if data files must be copied
 */
private boolean shouldCopyDataFiles(boolean planDataLocally, boolean loadColumnStats) {
  if (planDataLocally) {
    return true;
  }

  if (shouldCopyRemotelyPlannedDataFiles()) {
    return true;
  }

  // stats were loaded only for planning purposes and are not meant to be returned
  return loadColumnStats && !shouldReturnColumnStats();
}
@SuppressWarnings("unchecked")
private CloseableIterable planFileTasksLocally(
List dataManifests, List deleteManifests) {
LOG.info("Planning file tasks locally for table {}", table().name());
ManifestGroup manifestGroup = newManifestGroup(dataManifests, deleteManifests);
CloseableIterable extends ScanTask> fileTasks = manifestGroup.planFiles();
return (CloseableIterable) fileTasks;
}
/**
 * Submits delete planning to the monitor pool.
 *
 * @param deleteManifests delete manifests to plan
 * @param planLocally whether to plan locally or remotely
 * @param monitorPool the executor that runs the planning task
 * @return a future that completes with the delete file index
 */
private CompletableFuture<DeleteFileIndex> newDeletesFuture(
    List<ManifestFile> deleteManifests, boolean planLocally, ExecutorService monitorPool) {

  return CompletableFuture.supplyAsync(
      () -> {
        if (planLocally) {
          LOG.info("Planning deletes locally for table {}", table().name());
          return planDeletesLocally(deleteManifests);
        } else {
          LOG.info("Planning deletes remotely for table {}", table().name());
          return planDeletesRemotely(deleteManifests);
        }
      },
      monitorPool);
}
/**
 * Builds a delete file index locally from the given delete manifests.
 *
 * <p>The plan executor is used only if planning with an executor is enabled and there is more
 * than one delete manifest.
 *
 * @param deleteManifests delete manifests to index
 * @return the delete file index
 */
private DeleteFileIndex planDeletesLocally(List<ManifestFile> deleteManifests) {
  DeleteFileIndex.Builder builder = DeleteFileIndex.builderFor(io(), deleteManifests);

  if (shouldPlanWithExecutor() && deleteManifests.size() > 1) {
    builder.planWith(planExecutor());
  }

  return builder
      .specsById(table().specs())
      .filterData(filter())
      .caseSensitive(isCaseSensitive())
      .scanMetrics(scanMetrics())
      .build();
}
private CompletableFuture>> newDataFuture(
List dataManifests,
boolean planLocally,
boolean withColumnStats,
ExecutorService monitorPool) {
return CompletableFuture.supplyAsync(
() -> {
if (planLocally) {
LOG.info("Planning data locally for table {}", table().name());
ManifestGroup manifestGroup = newManifestGroup(dataManifests, withColumnStats);
return manifestGroup.fileGroups();
} else {
LOG.info("Planning data remotely for table {}", table().name());
return planDataRemotely(dataManifests, withColumnStats);
}
},
monitorPool);
}
private Iterable> toFileTasks(
CompletableFuture>> dataFuture,
CompletableFuture deletesFuture,
boolean copyDataFiles) {
String schemaString = SchemaParser.toJson(tableSchema());
Map specStringCache = specCache(PartitionSpecParser::toJson);
Map residualCache = specCache(this::newResidualEvaluator);
Iterable> dataFileGroups = dataFuture.join();
return Iterables.transform(
dataFileGroups,
dataFiles ->
toFileTasks(
dataFiles,
deletesFuture,
copyDataFiles,
schemaString,
specStringCache,
residualCache));
}
/**
 * Converts one group of data files into scan tasks.
 *
 * <p>For each data file, the delete file index is obtained by joining the deletes future and is
 * asked for the delete files that apply to the data file. The scan metrics are updated for each
 * produced task.
 *
 * @param dataFiles a group of planned data files
 * @param deletesFuture a future with the delete file index
 * @param copyDataFiles whether data files must be copied before being placed into tasks
 * @param schemaString the table schema as JSON
 * @param specStringCache partition specs as JSON by spec ID
 * @param residualCache residual evaluators by spec ID
 * @return scan tasks for the given data files
 */
private CloseableIterable<ScanTask> toFileTasks(
    CloseableIterable<DataFile> dataFiles,
    CompletableFuture<DeleteFileIndex> deletesFuture,
    boolean copyDataFiles,
    String schemaString,
    Map<Integer, String> specStringCache,
    Map<Integer, ResidualEvaluator> residualCache) {

  return CloseableIterable.transform(
      dataFiles,
      dataFile -> {
        DeleteFile[] deleteFiles = deletesFuture.join().forDataFile(dataFile);

        String specString = specStringCache.get(dataFile.specId());
        ResidualEvaluator residuals = residualCache.get(dataFile.specId());

        ScanMetricsUtil.fileTask(scanMetrics(), dataFile, deleteFiles);

        return new BaseFileScanTask(
            copyDataFiles ? copy(dataFile) : dataFile,
            deleteFiles,
            schemaString,
            specString,
            residuals);
      });
}
private > F copy(F file) {
return ContentFileUtil.copy(file, shouldReturnColumnStats(), columnsToKeepStats());
}
/**
 * Creates a manifest evaluator for a partition spec.
 *
 * <p>The scan filter is projected onto the spec with an inclusive projection and the result is
 * used as the partition filter of the evaluator.
 *
 * @param spec the partition spec
 * @return a manifest evaluator for the projected filter
 */
private ManifestEvaluator newManifestEvaluator(PartitionSpec spec) {
  boolean caseSensitive = isCaseSensitive();
  Expression partitionFilter = Projections.inclusive(spec, caseSensitive).project(filter());
  return ManifestEvaluator.forPartitionFilter(partitionFilter, spec, caseSensitive);
}
/**
 * Creates a residual evaluator for a partition spec from the scan's residual filter and case
 * sensitivity.
 *
 * @param spec the partition spec
 * @return a residual evaluator for the spec
 */
private ResidualEvaluator newResidualEvaluator(PartitionSpec spec) {
  Expression residual = residualFilter();
  boolean caseSensitive = isCaseSensitive();
  return ResidualEvaluator.of(spec, residual, caseSensitive);
}
/**
 * Builds a map from spec ID to a value computed for every partition spec of the table.
 *
 * <p>Values are computed eagerly for all specs returned by {@code table().specs()}.
 *
 * @param load a function that computes the value for a partition spec
 * @param <R> the type of the computed values
 * @return a new map with one entry per partition spec of the table
 */
private <R> Map<Integer, R> specCache(Function<PartitionSpec, R> load) {
  Map<Integer, R> cache = Maps.newHashMap();
  table().specs().forEach((specId, spec) -> cache.put(specId, load.apply(spec)));
  return cache;
}
/**
 * Checks the snapshot summary to see whether the snapshot may contain equality deletes.
 *
 * <p>Equality deletes are ruled out only if the total equality deletes property is present and
 * equal to {@code "0"}. A missing property is treated as "may have equality deletes".
 *
 * @param snapshot the snapshot whose summary is inspected
 * @return false if the summary reports zero equality deletes, true otherwise
 */
private boolean mayHaveEqualityDeletes(Snapshot snapshot) {
  String eqDeleteCount = snapshot.summary().get(SnapshotSummary.TOTAL_EQ_DELETES_PROP);
  // a null count compares unequal to "0", so a missing property yields true
  return !"0".equals(eqDeleteCount);
}
// a monitor pool that enables planning data and deletes concurrently if remote planning is used;
// the pool is created with MONITOR_POOL_SIZE threads
private ExecutorService newMonitorPool() {
return ThreadPools.newWorkerPool("iceberg-planning-monitor-service", MONITOR_POOL_SIZE);
}
}