org.apache.iceberg.spark.SparkTableUtil Maven / Gradle / Ivy
Show all versions of iceberg-spark-3.3_2.13 Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark;
import static org.apache.spark.sql.functions.col;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.HasTableOperations;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.ManifestFiles;
import org.apache.iceberg.ManifestWriter;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.TableMigrationUtil;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.iceberg.hadoop.SerializableConfiguration;
import org.apache.iceberg.hadoop.Util;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
import org.apache.iceberg.relocated.com.google.common.base.Objects;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.source.SparkTable;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute;
import org.apache.spark.sql.catalyst.catalog.CatalogTable;
import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition;
import org.apache.spark.sql.catalyst.catalog.SessionCatalog;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.NamedExpression;
import org.apache.spark.sql.catalyst.parser.ParseException;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import scala.Function2;
import scala.Option;
import scala.Some;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.immutable.Map$;
import scala.collection.immutable.Seq;
import scala.collection.mutable.Builder;
import scala.runtime.AbstractPartialFunction;
/**
* Java version of the original SparkTableUtil.scala
* https://github.com/apache/iceberg/blob/apache-iceberg-0.8.0-incubating/spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
*/
public class SparkTableUtil {
private static final String DUPLICATE_FILE_MESSAGE =
"Cannot complete import because data files "
+ "to be imported already exist within the target table: %s. "
+ "This is disabled by default as Iceberg is not designed for multiple references to the same file"
+ " within the same table. If you are sure, you may set 'check_duplicate_files' to false to force the import.";
private SparkTableUtil() {}
/**
* Returns a DataFrame with a row for each partition in the table.
*
* The DataFrame has 3 columns, partition key (a=1/b=2), partition location, and format (avro
* or parquet).
*
* @param spark a Spark session
* @param table a table name and (optional) database
* @return a DataFrame of the table's partitions
*/
public static Dataset partitionDF(SparkSession spark, String table) {
List partitions = getPartitions(spark, table);
return spark
.createDataFrame(partitions, SparkPartition.class)
.toDF("partition", "uri", "format");
}
/**
* Returns a DataFrame with a row for each partition that matches the specified 'expression'.
*
* @param spark a Spark session.
* @param table name of the table.
* @param expression The expression whose matching partitions are returned.
* @return a DataFrame of the table partitions.
*/
public static Dataset partitionDFByFilter(
SparkSession spark, String table, String expression) {
List partitions = getPartitionsByFilter(spark, table, expression);
return spark
.createDataFrame(partitions, SparkPartition.class)
.toDF("partition", "uri", "format");
}
/**
* Returns all partitions in the table.
*
* @param spark a Spark session
* @param table a table name and (optional) database
* @return all table's partitions
*/
public static List getPartitions(SparkSession spark, String table) {
try {
TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table);
return getPartitions(spark, tableIdent, null);
} catch (ParseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unable to parse table identifier: %s", table);
}
}
/**
* Returns all partitions in the table.
*
* @param spark a Spark session
* @param tableIdent a table identifier
* @param partitionFilter partition filter, or null if no filter
* @return all table's partitions
*/
public static List getPartitions(
SparkSession spark, TableIdentifier tableIdent, Map partitionFilter) {
try {
SessionCatalog catalog = spark.sessionState().catalog();
CatalogTable catalogTable = catalog.getTableMetadata(tableIdent);
Option> scalaPartitionFilter;
if (partitionFilter != null && !partitionFilter.isEmpty()) {
Builder, scala.collection.immutable.Map> builder =
Map$.MODULE$.newBuilder();
partitionFilter.forEach((key, value) -> builder.$plus$eq(Tuple2.apply(key, value)));
scalaPartitionFilter = Option.apply(builder.result());
} else {
scalaPartitionFilter = Option.empty();
}
Seq partitions =
catalog.listPartitions(tableIdent, scalaPartitionFilter).toIndexedSeq();
return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream()
.map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable))
.collect(Collectors.toList());
} catch (NoSuchDatabaseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Database not found in catalog.", tableIdent);
} catch (NoSuchTableException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Table not found in catalog.", tableIdent);
}
}
/**
* Returns partitions that match the specified 'predicate'.
*
* @param spark a Spark session
* @param table a table name and (optional) database
* @param predicate a predicate on partition columns
* @return matching table's partitions
*/
public static List getPartitionsByFilter(
SparkSession spark, String table, String predicate) {
TableIdentifier tableIdent;
try {
tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table);
} catch (ParseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unable to parse the table identifier: %s", table);
}
Expression unresolvedPredicateExpr;
try {
unresolvedPredicateExpr = spark.sessionState().sqlParser().parseExpression(predicate);
} catch (ParseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unable to parse the predicate expression: %s", predicate);
}
Expression resolvedPredicateExpr = resolveAttrs(spark, table, unresolvedPredicateExpr);
return getPartitionsByFilter(spark, tableIdent, resolvedPredicateExpr);
}
/**
* Returns partitions that match the specified 'predicate'.
*
* @param spark a Spark session
* @param tableIdent a table identifier
* @param predicateExpr a predicate expression on partition columns
* @return matching table's partitions
*/
public static List getPartitionsByFilter(
SparkSession spark, TableIdentifier tableIdent, Expression predicateExpr) {
try {
SessionCatalog catalog = spark.sessionState().catalog();
CatalogTable catalogTable = catalog.getTableMetadata(tableIdent);
Expression resolvedPredicateExpr;
if (!predicateExpr.resolved()) {
resolvedPredicateExpr = resolveAttrs(spark, tableIdent.quotedString(), predicateExpr);
} else {
resolvedPredicateExpr = predicateExpr;
}
Seq predicates =
JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(resolvedPredicateExpr))
.asScala()
.toIndexedSeq();
Seq partitions =
catalog.listPartitionsByFilter(tableIdent, predicates).toIndexedSeq();
return JavaConverters.seqAsJavaListConverter(partitions).asJava().stream()
.map(catalogPartition -> toSparkPartition(catalogPartition, catalogTable))
.collect(Collectors.toList());
} catch (NoSuchDatabaseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Database not found in catalog.", tableIdent);
} catch (NoSuchTableException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Table not found in catalog.", tableIdent);
}
}
/**
* Returns the data files in a partition by listing the partition location.
*
* For Parquet and ORC partitions, this will read metrics from the file footer. For Avro
* partitions, metrics are set to null.
*
* @param partition a partition
* @param conf a serializable Hadoop conf
* @param metricsConfig a metrics conf
* @return a List of DataFile
* @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec,
* Configuration, MetricsConfig, NameMapping)}
*/
@Deprecated
public static List listPartition(
SparkPartition partition,
PartitionSpec spec,
SerializableConfiguration conf,
MetricsConfig metricsConfig) {
return listPartition(partition, spec, conf, metricsConfig, null, 1);
}
/**
* Returns the data files in a partition by listing the partition location.
*
* For Parquet and ORC partitions, this will read metrics from the file footer. For Avro
* partitions, metrics are set to null.
*
* @param partition a partition
* @param conf a serializable Hadoop conf
* @param metricsConfig a metrics conf
* @param mapping a name mapping
* @return a List of DataFile
* @deprecated use {@link TableMigrationUtil#listPartition(Map, String, String, PartitionSpec,
* Configuration, MetricsConfig, NameMapping)}
*/
@Deprecated
public static List listPartition(
SparkPartition partition,
PartitionSpec spec,
SerializableConfiguration conf,
MetricsConfig metricsConfig,
NameMapping mapping,
int parallelism) {
return TableMigrationUtil.listPartition(
partition.values,
partition.uri,
partition.format,
spec,
conf.get(),
metricsConfig,
mapping,
parallelism);
}
private static List listPartition(
SparkPartition partition,
PartitionSpec spec,
SerializableConfiguration conf,
MetricsConfig metricsConfig,
NameMapping mapping,
ExecutorService service) {
return TableMigrationUtil.listPartition(
partition.values,
partition.uri,
partition.format,
spec,
conf.get(),
metricsConfig,
mapping,
service);
}
private static SparkPartition toSparkPartition(
CatalogTablePartition partition, CatalogTable table) {
Option locationUri = partition.storage().locationUri();
Option serde = partition.storage().serde();
Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined");
Preconditions.checkArgument(
serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined");
String uri = Util.uriToString(locationUri.get());
String format = serde.nonEmpty() ? serde.get() : table.provider().get();
Map partitionSpec =
JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava();
return new SparkPartition(partitionSpec, uri, format);
}
private static Expression resolveAttrs(SparkSession spark, String table, Expression expr) {
Function2 resolver = spark.sessionState().analyzer().resolver();
LogicalPlan plan = spark.table(table).queryExecution().analyzed();
return expr.transform(
new AbstractPartialFunction() {
@Override
public Expression apply(Expression attr) {
UnresolvedAttribute unresolvedAttribute = (UnresolvedAttribute) attr;
Option namedExpressionOption =
plan.resolve(unresolvedAttribute.nameParts(), resolver);
if (namedExpressionOption.isDefined()) {
return (Expression) namedExpressionOption.get();
} else {
throw new IllegalArgumentException(
String.format("Could not resolve %s using columns: %s", attr, plan.output()));
}
}
@Override
public boolean isDefinedAt(Expression attr) {
return attr instanceof UnresolvedAttribute;
}
});
}
private static Iterator buildManifest(
SerializableConfiguration conf,
PartitionSpec spec,
String basePath,
Iterator> fileTuples) {
if (fileTuples.hasNext()) {
FileIO io = new HadoopFileIO(conf.get());
TaskContext ctx = TaskContext.get();
String suffix =
String.format(
"stage-%d-task-%d-manifest-%s",
ctx.stageId(), ctx.taskAttemptId(), UUID.randomUUID());
Path location = new Path(basePath, suffix);
String outputPath = FileFormat.AVRO.addExtension(location.toString());
OutputFile outputFile = io.newOutputFile(outputPath);
ManifestWriter writer = ManifestFiles.write(spec, outputFile);
try (ManifestWriter writerRef = writer) {
fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
} catch (IOException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unable to close the manifest writer: %s", outputPath);
}
ManifestFile manifestFile = writer.toManifestFile();
return ImmutableList.of(manifestFile).iterator();
} else {
return Collections.emptyIterator();
}
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
* The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param partitionFilter only import partitions whose values match those in the map, can be
* partially defined
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
Map partitionFilter,
boolean checkDuplicateFiles) {
importSparkTable(
spark, sourceTableIdent, targetTable, stagingDir, partitionFilter, checkDuplicateFiles, 1);
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
* The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param parallelism number of threads to use for file reading
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
int parallelism) {
importSparkTable(
spark,
sourceTableIdent,
targetTable,
stagingDir,
TableMigrationUtil.migrationService(parallelism));
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
*
The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param service executor service to use for file reading
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
ExecutorService service) {
importSparkTable(
spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false, service);
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
*
The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param partitionFilter only import partitions whose values match those in the map, can be
* partially defined
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
* @param parallelism number of threads to use for file reading
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
Map partitionFilter,
boolean checkDuplicateFiles,
int parallelism) {
importSparkTable(
spark,
sourceTableIdent,
targetTable,
stagingDir,
partitionFilter,
checkDuplicateFiles,
TableMigrationUtil.migrationService(parallelism));
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
* The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param partitionFilter only import partitions whose values match those in the map, can be
* partially defined
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
* @param service executor service to use for file reading
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
Map partitionFilter,
boolean checkDuplicateFiles,
ExecutorService service) {
SessionCatalog catalog = spark.sessionState().catalog();
String db =
sourceTableIdent.database().nonEmpty()
? sourceTableIdent.database().get()
: catalog.getCurrentDatabase();
TableIdentifier sourceTableIdentWithDB =
new TableIdentifier(sourceTableIdent.table(), Some.apply(db));
if (!catalog.tableExists(sourceTableIdentWithDB)) {
throw new org.apache.iceberg.exceptions.NoSuchTableException(
"Table %s does not exist", sourceTableIdentWithDB);
}
try {
PartitionSpec spec =
SparkSchemaUtil.specForTable(spark, sourceTableIdentWithDB.unquotedString());
if (Objects.equal(spec, PartitionSpec.unpartitioned())) {
importUnpartitionedSparkTable(
spark, sourceTableIdentWithDB, targetTable, checkDuplicateFiles, service);
} else {
List sourceTablePartitions =
getPartitions(spark, sourceTableIdent, partitionFilter);
if (sourceTablePartitions.isEmpty()) {
targetTable.newAppend().commit();
} else {
importSparkPartitions(
spark,
sourceTablePartitions,
targetTable,
spec,
stagingDir,
checkDuplicateFiles,
service);
}
}
} catch (AnalysisException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unable to get partition spec for table: %s", sourceTableIdentWithDB);
}
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
* The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
*/
public static void importSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
String stagingDir,
boolean checkDuplicateFiles) {
importSparkTable(
spark,
sourceTableIdent,
targetTable,
stagingDir,
Collections.emptyMap(),
checkDuplicateFiles,
1);
}
/**
* Import files from an existing Spark table to an Iceberg table.
*
*
The import uses the Spark session to get table metadata. It assumes no operation is going on
* the original and target table and thus is not thread-safe.
*
* @param spark a Spark session
* @param sourceTableIdent an identifier of the source Spark table
* @param targetTable an Iceberg table where to import the data
* @param stagingDir a staging directory to store temporary manifest files
*/
public static void importSparkTable(
SparkSession spark, TableIdentifier sourceTableIdent, Table targetTable, String stagingDir) {
importSparkTable(
spark, sourceTableIdent, targetTable, stagingDir, Collections.emptyMap(), false, 1);
}
private static void importUnpartitionedSparkTable(
SparkSession spark,
TableIdentifier sourceTableIdent,
Table targetTable,
boolean checkDuplicateFiles,
ExecutorService service) {
try {
CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent);
Option format =
sourceTable.storage().serde().nonEmpty()
? sourceTable.storage().serde()
: sourceTable.provider();
Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format");
Map partition = Collections.emptyMap();
PartitionSpec spec = PartitionSpec.unpartitioned();
Configuration conf = spark.sessionState().newHadoopConf();
MetricsConfig metricsConfig = MetricsConfig.forTable(targetTable);
String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
NameMapping nameMapping =
nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
List files =
TableMigrationUtil.listPartition(
partition,
Util.uriToString(sourceTable.location()),
format.get(),
spec,
conf,
metricsConfig,
nameMapping,
service);
if (checkDuplicateFiles) {
Dataset importedFiles =
spark
.createDataset(Lists.transform(files, f -> f.path().toString()), Encoders.STRING())
.toDF("file_path");
Dataset existingFiles =
loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES).filter("status != 2");
Column joinCond =
existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path"));
Dataset duplicates =
importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING());
Preconditions.checkState(
duplicates.isEmpty(),
String.format(
DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10))));
}
AppendFiles append = targetTable.newAppend();
files.forEach(append::appendFile);
append.commit();
} catch (NoSuchDatabaseException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Database not found in catalog.", sourceTableIdent);
} catch (NoSuchTableException e) {
throw SparkExceptionUtil.toUncheckedException(
e, "Unknown table: %s. Table not found in catalog.", sourceTableIdent);
}
}
/**
* Import files from given partitions to an Iceberg table.
*
* @param spark a Spark session
* @param partitions partitions to import
* @param targetTable an Iceberg table where to import the data
* @param spec a partition spec
* @param stagingDir a staging directory to store temporary manifest files
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
*/
public static void importSparkPartitions(
SparkSession spark,
List partitions,
Table targetTable,
PartitionSpec spec,
String stagingDir,
boolean checkDuplicateFiles) {
importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, checkDuplicateFiles, 1);
}
/**
* Import files from given partitions to an Iceberg table.
*
* @param spark a Spark session
* @param partitions partitions to import
* @param targetTable an Iceberg table where to import the data
* @param spec a partition spec
* @param stagingDir a staging directory to store temporary manifest files
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
* @param parallelism number of threads to use for file reading
*/
public static void importSparkPartitions(
SparkSession spark,
List partitions,
Table targetTable,
PartitionSpec spec,
String stagingDir,
boolean checkDuplicateFiles,
int parallelism) {
importSparkPartitions(
spark,
partitions,
targetTable,
spec,
stagingDir,
checkDuplicateFiles,
TableMigrationUtil.migrationService(parallelism));
}
/**
* Import files from given partitions to an Iceberg table.
*
* @param spark a Spark session
* @param partitions partitions to import
* @param targetTable an Iceberg table where to import the data
* @param spec a partition spec
* @param stagingDir a staging directory to store temporary manifest files
* @param checkDuplicateFiles if true, throw exception if import results in a duplicate data file
* @param service executor service to use for file reading
*/
public static void importSparkPartitions(
SparkSession spark,
List partitions,
Table targetTable,
PartitionSpec spec,
String stagingDir,
boolean checkDuplicateFiles,
ExecutorService service) {
Configuration conf = spark.sessionState().newHadoopConf();
SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
int listingParallelism =
Math.min(
partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());
String nameMappingString = targetTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
NameMapping nameMapping =
nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD partitionRDD = sparkContext.parallelize(partitions, listingParallelism);
Dataset partitionDS =
spark.createDataset(partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class));
Dataset filesToImport =
partitionDS.flatMap(
(FlatMapFunction)
sparkPartition ->
listPartition(
sparkPartition,
spec,
serializableConf,
metricsConfig,
nameMapping,
service)
.iterator(),
Encoders.javaSerialization(DataFile.class));
if (checkDuplicateFiles) {
Dataset importedFiles =
filesToImport
.map((MapFunction) f -> f.path().toString(), Encoders.STRING())
.toDF("file_path");
Dataset existingFiles =
loadMetadataTable(spark, targetTable, MetadataTableType.ENTRIES).filter("status != 2");
Column joinCond =
existingFiles.col("data_file.file_path").equalTo(importedFiles.col("file_path"));
Dataset duplicates =
importedFiles.join(existingFiles, joinCond).select("file_path").as(Encoders.STRING());
Preconditions.checkState(
duplicates.isEmpty(),
String.format(
DUPLICATE_FILE_MESSAGE, Joiner.on(",").join((String[]) duplicates.take(10))));
}
List manifests =
filesToImport
.repartition(numShufflePartitions)
.map(
(MapFunction>)
file -> Tuple2.apply(file.path().toString(), file),
Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
.orderBy(col("_1"))
.mapPartitions(
(MapPartitionsFunction, ManifestFile>)
fileTuple -> buildManifest(serializableConf, spec, stagingDir, fileTuple),
Encoders.javaSerialization(ManifestFile.class))
.collectAsList();
try {
TableOperations ops = ((HasTableOperations) targetTable).operations();
int formatVersion = ops.current().formatVersion();
boolean snapshotIdInheritanceEnabled =
PropertyUtil.propertyAsBoolean(
targetTable.properties(),
TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);
AppendFiles append = targetTable.newAppend();
manifests.forEach(append::appendManifest);
append.commit();
if (formatVersion == 1 && !snapshotIdInheritanceEnabled) {
// delete original manifests as they were rewritten before the commit
deleteManifests(targetTable.io(), manifests);
}
} catch (Throwable e) {
deleteManifests(targetTable.io(), manifests);
throw e;
}
}
/**
* Import files from given partitions to an Iceberg table.
*
* @param spark a Spark session
* @param partitions partitions to import
* @param targetTable an Iceberg table where to import the data
* @param spec a partition spec
* @param stagingDir a staging directory to store temporary manifest files
*/
public static void importSparkPartitions(
SparkSession spark,
List partitions,
Table targetTable,
PartitionSpec spec,
String stagingDir) {
importSparkPartitions(spark, partitions, targetTable, spec, stagingDir, false, 1);
}
public static List filterPartitions(
List partitions, Map partitionFilter) {
if (partitionFilter.isEmpty()) {
return partitions;
} else {
return partitions.stream()
.filter(p -> p.getValues().entrySet().containsAll(partitionFilter.entrySet()))
.collect(Collectors.toList());
}
}
private static void deleteManifests(FileIO io, List manifests) {
Tasks.foreach(manifests)
.executeWith(ThreadPools.getWorkerPool())
.noRetry()
.suppressFailureWhenFinished()
.run(item -> io.deleteFile(item.path()));
}
public static Dataset loadMetadataTable(
SparkSession spark, Table table, MetadataTableType type) {
return loadMetadataTable(spark, table, type, ImmutableMap.of());
}
public static Dataset loadMetadataTable(
SparkSession spark, Table table, MetadataTableType type, Map extraOptions) {
SparkTable metadataTable =
new SparkTable(MetadataTableUtils.createMetadataTableInstance(table, type), false);
CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(extraOptions);
return Dataset.ofRows(
spark, DataSourceV2Relation.create(metadataTable, Some.empty(), Some.empty(), options));
}
/**
* Determine the write branch.
*
* Validate wap config and determine the write branch.
*
* @param spark a Spark Session
* @param branch write branch if there is no WAP branch configured
* @return branch for write operation
*/
public static String determineWriteBranch(SparkSession spark, String branch) {
String wapId = spark.conf().get(SparkSQLProperties.WAP_ID, null);
String wapBranch = spark.conf().get(SparkSQLProperties.WAP_BRANCH, null);
ValidationException.check(
wapId == null || wapBranch == null,
"Cannot set both WAP ID and branch, but got ID [%s] and branch [%s]",
wapId,
wapBranch);
if (wapBranch != null) {
ValidationException.check(
branch == null,
"Cannot write to both branch and WAP branch, but got branch [%s] and WAP branch [%s]",
branch,
wapBranch);
return wapBranch;
}
return branch;
}
public static boolean wapEnabled(Table table) {
return PropertyUtil.propertyAsBoolean(
table.properties(),
TableProperties.WRITE_AUDIT_PUBLISH_ENABLED,
Boolean.parseBoolean(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT));
}
/** Class representing a table partition. */
public static class SparkPartition implements Serializable {
private final Map values;
private final String uri;
private final String format;
public SparkPartition(Map values, String uri, String format) {
this.values = Maps.newHashMap(values);
this.uri = uri;
this.format = format;
}
public Map getValues() {
return values;
}
public String getUri() {
return uri;
}
public String getFormat() {
return format;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("values", values)
.add("uri", uri)
.add("format", format)
.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
SparkPartition that = (SparkPartition) o;
return Objects.equal(values, that.values)
&& Objects.equal(uri, that.uri)
&& Objects.equal(format, that.format);
}
@Override
public int hashCode() {
return Objects.hashCode(values, uri, format);
}
}
}