org.apache.iceberg.spark.actions.BaseSparkAction (iceberg-spark3)
A table format for huge analytic datasets
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.actions;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
import org.apache.iceberg.BaseTable;
import org.apache.iceberg.ManifestFiles;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.ReachableFileUtil;
import org.apache.iceberg.StaticTableOperations;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.actions.Action;
import org.apache.iceberg.io.ClosingIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.JobGroupUtils;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS;
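
/**
 * Base class for Spark-based Iceberg actions. It centralizes the pieces every action
 * needs: per-action string options, Spark job group handling so action runs show up
 * clearly in the Spark UI, and helpers for building DataFrames of the file paths
 * reachable from a table's metadata (data files, manifests, manifest lists, and
 * other metadata files).
 */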
abstract class BaseSparkAction<ThisT, R> implements Action<ThisT, R> {

  private static final AtomicInteger JOB_COUNTER = new AtomicInteger();

  private final SparkSession spark;
  private final JavaSparkContext sparkContext;
  private final Map<String, String> options = Maps.newHashMap();

  protected BaseSparkAction(SparkSession spark) {
    this.spark = spark;
    this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  }

  protected SparkSession spark() {
    return spark;
  }

  protected JavaSparkContext sparkContext() {
    return sparkContext;
  }

  protected abstract ThisT self();

  @Override
  public ThisT option(String name, String value) {
    options.put(name, value);
    return self();
  }

  @Override
  public ThisT options(Map<String, String> newOptions) {
    options.putAll(newOptions);
    return self();
  }

  protected Map<String, String> options() {
    return options;
  }
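
  /**
   * Invokes the supplier with the given job group info set on the Spark context and
   * restores the previous job group info afterwards, even if the supplier throws.
   */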
  protected <T> T withJobGroupInfo(JobGroupInfo info, Supplier<T> supplier) {
    SparkContext context = spark().sparkContext();
    JobGroupInfo previousInfo = JobGroupUtils.getJobGroupInfo(context);
    try {
      JobGroupUtils.setJobGroupInfo(context, info);
      return supplier.get();
    } finally {
      JobGroupUtils.setJobGroupInfo(context, previousInfo);
    }
  }
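
  /** Creates job group info with a unique group ID, so repeated runs of the same action are distinguishable in the Spark UI. */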
  protected JobGroupInfo newJobGroupInfo(String groupId, String desc) {
    return new JobGroupInfo(groupId + "-" + JOB_COUNTER.incrementAndGet(), desc, false);
  }
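
  /** Returns a read-only table fixed at the given metadata file, backed by {@link StaticTableOperations}. */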
  protected Table newStaticTable(TableMetadata metadata, FileIO io) {
    String metadataFileLocation = metadata.metadataFileLocation();
    StaticTableOperations ops = new StaticTableOperations(metadataFileLocation, io);
    return new BaseTable(ops, metadataFileLocation);
  }
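
  /**
   * Builds a DataFrame of the data file paths referenced by the table's manifests.
   * It loads the ALL_MANIFESTS metadata table, deduplicates manifests by path, and
   * reads each manifest on the executors through a broadcast {@link FileIO}.
   */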
  protected Dataset<Row> buildValidDataFileDF(Table table) {
    JavaSparkContext context = JavaSparkContext.fromSparkContext(spark.sparkContext());
    Broadcast<FileIO> ioBroadcast = context.broadcast(SparkUtil.serializableFileIO(table));

    Dataset<ManifestFileBean> allManifests = loadMetadataTable(table, ALL_MANIFESTS)
        .selectExpr("path", "length", "partition_spec_id as partitionSpecId", "added_snapshot_id as addedSnapshotId")
        .dropDuplicates("path")
        .repartition(spark.sessionState().conf().numShufflePartitions()) // avoid adaptive execution combining tasks
        .as(Encoders.bean(ManifestFileBean.class));

    return allManifests.flatMap(new ReadManifest(ioBroadcast), Encoders.STRING()).toDF("file_path");
  }
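
  /** Builds a DataFrame of all manifest file paths, taken from the ALL_MANIFESTS metadata table. */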
  protected Dataset<Row> buildManifestFileDF(Table table) {
    return loadMetadataTable(table, ALL_MANIFESTS).selectExpr("path as file_path");
  }
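
  /** Builds a DataFrame of the manifest list location of each snapshot of the table. */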
  protected Dataset<Row> buildManifestListDF(Table table) {
    List<String> manifestLists = ReachableFileUtil.manifestListLocations(table);
    return spark.createDataset(manifestLists, Encoders.STRING()).toDF("file_path");
  }
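
  /** Builds a DataFrame of the remaining metadata file paths: metadata JSON files and the version hint file. */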
  protected Dataset<Row> buildOtherMetadataFileDF(Table table) {
    List<String> otherMetadataFiles = Lists.newArrayList();
    otherMetadataFiles.addAll(ReachableFileUtil.metadataFileLocations(table, false));
    otherMetadataFiles.add(ReachableFileUtil.versionHintLocation(table));
    return spark.createDataset(otherMetadataFiles, Encoders.STRING()).toDF("file_path");
  }
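
  /** Builds a DataFrame of all valid metadata file paths by unioning the manifest, manifest list, and other metadata DataFrames. */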
  protected Dataset<Row> buildValidMetadataFileDF(Table table) {
    Dataset<Row> manifestDF = buildManifestFileDF(table);
    Dataset<Row> manifestListDF = buildManifestListDF(table);
    Dataset<Row> otherMetadataFileDF = buildOtherMetadataFileDF(table);

    return manifestDF.union(otherMetadataFileDF).union(manifestListDF);
  }

  protected Dataset<Row> loadMetadataTable(Table table, MetadataTableType type) {
    return SparkTableUtil.loadMetadataTable(spark, table, type);
  }
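
  /**
   * A serializable flat-map function that opens one manifest on an executor, using the
   * broadcast {@link FileIO}, and returns its file paths as an iterator that closes
   * the underlying reader once it is fully consumed.
   */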
  private static class ReadManifest implements FlatMapFunction<ManifestFileBean, String> {
    private final Broadcast<FileIO> io;

    ReadManifest(Broadcast<FileIO> io) {
      this.io = io;
    }

    @Override
    public Iterator<String> call(ManifestFileBean manifest) {
      return new ClosingIterator<>(ManifestFiles.readPaths(manifest, io.getValue()).iterator());
    }
  }
}
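
Subclasses wire the self-type parameter back to themselves so the fluent option(...)
calls return the concrete action type. A minimal sketch of the pattern, assuming a
class in the same package (BaseSparkAction is package-private); the subclass name and
execute() body here are hypothetical, not part of the Iceberg codebase:

class CountSnapshotsAction extends BaseSparkAction<CountSnapshotsAction, Long> {
  private final Table table;

  CountSnapshotsAction(SparkSession spark, Table table) {
    super(spark);
    this.table = table;
  }

  @Override
  protected CountSnapshotsAction self() {
    return this;
  }

  @Override
  public Long execute() {
    // count rows in the SNAPSHOTS metadata table as a trivial stand-in for real work
    return loadMetadataTable(table, MetadataTableType.SNAPSHOTS).count();
  }
}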