org.apache.iceberg.spark.procedures.AddFilesProcedure (iceberg-spark-3.5_2.13)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.procedures;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.iceberg.util.LocationUtil;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.apache.spark.sql.connector.catalog.CatalogPlugin;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
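/**
 * A Spark stored procedure that adds existing data files to an Iceberg table without rewriting
 * or moving them. The source is either a catalog table or a file-based location written as
 * {@code `format`.`path`}, where the format is orc, parquet, or avro.
 *
 * <p>An illustrative SQL invocation (catalog, namespace, and path names are placeholders):
 *
 * <pre>
 *   CALL my_catalog.system.add_files(
 *     table => 'db.target_table',
 *     source_table => '`parquet`.`s3://bucket/path/to/files`',
 *     check_duplicate_files => true
 *   );
 * </pre>
 */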
class AddFilesProcedure extends BaseProcedure {
private static final ProcedureParameter TABLE_PARAM =
ProcedureParameter.required("table", DataTypes.StringType);
private static final ProcedureParameter SOURCE_TABLE_PARAM =
ProcedureParameter.required("source_table", DataTypes.StringType);
private static final ProcedureParameter PARTITION_FILTER_PARAM =
ProcedureParameter.optional("partition_filter", STRING_MAP);
private static final ProcedureParameter CHECK_DUPLICATE_FILES_PARAM =
ProcedureParameter.optional("check_duplicate_files", DataTypes.BooleanType);
private static final ProcedureParameter PARALLELISM =
ProcedureParameter.optional("parallelism", DataTypes.IntegerType);
private static final ProcedureParameter[] PARAMETERS =
new ProcedureParameter[] {
TABLE_PARAM,
SOURCE_TABLE_PARAM,
PARTITION_FILTER_PARAM,
CHECK_DUPLICATE_FILES_PARAM,
PARALLELISM
};
private static final StructType OUTPUT_TYPE =
new StructType(
new StructField[] {
new StructField("added_files_count", DataTypes.LongType, false, Metadata.empty()),
new StructField("changed_partition_count", DataTypes.LongType, true, Metadata.empty()),
});
private AddFilesProcedure(TableCatalog tableCatalog) {
super(tableCatalog);
}
public static SparkProcedures.ProcedureBuilder builder() {
return new BaseProcedure.Builder() {
@Override
protected AddFilesProcedure doBuild() {
return new AddFilesProcedure(tableCatalog());
}
};
}
@Override
public ProcedureParameter[] parameters() {
return PARAMETERS;
}
@Override
public StructType outputType() {
return OUTPUT_TYPE;
}
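// Resolves the target identifier, resolves the source against the Spark session catalog by
// default, reads the optional arguments with their defaults, and runs the import.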
@Override
public InternalRow[] call(InternalRow args) {
ProcedureInput input = new ProcedureInput(spark(), tableCatalog(), PARAMETERS, args);
Identifier tableIdent = input.ident(TABLE_PARAM);
CatalogPlugin sessionCat = spark().sessionState().catalogManager().v2SessionCatalog();
Identifier sourceIdent = input.ident(SOURCE_TABLE_PARAM, sessionCat);
Map<String, String> partitionFilter =
input.asStringMap(PARTITION_FILTER_PARAM, ImmutableMap.of());
boolean checkDuplicateFiles = input.asBoolean(CHECK_DUPLICATE_FILES_PARAM, true);
int parallelism = input.asInt(PARALLELISM, 1);
return importToIceberg(
tableIdent, sourceIdent, partitionFilter, checkDuplicateFiles, parallelism);
}
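// Converts the snapshot summary produced by the import into the procedure's single output row.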
private InternalRow[] toOutputRows(Snapshot snapshot) {
Map<String, String> summary = snapshot.summary();
return new InternalRow[] {
newInternalRow(addedFilesCount(summary), changedPartitionCount(summary))
};
}
private long addedFilesCount(Map<String, String> stats) {
return PropertyUtil.propertyAsLong(stats, SnapshotSummary.ADDED_FILES_PROP, 0L);
}
private Long changedPartitionCount(Map<String, String> stats) {
return PropertyUtil.propertyAsNullableLong(stats, SnapshotSummary.CHANGED_PARTITION_COUNT_PROP);
}
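// A file-based source is written as `format`.`path`, so its identifier carries a single
// namespace element naming the file format (orc, parquet, or avro).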
private boolean isFileIdentifier(Identifier ident) {
String[] namespace = ident.namespace();
return namespace.length == 1
&& (namespace[0].equalsIgnoreCase("orc")
|| namespace[0].equalsIgnoreCase("parquet")
|| namespace[0].equalsIgnoreCase("avro"));
}
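// Validates the partition filter against the target spec, ensures a name mapping is set, and
// imports either a file-based location or a catalog table into the target Iceberg table.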
private InternalRow[] importToIceberg(
Identifier destIdent,
Identifier sourceIdent,
Map<String, String> partitionFilter,
boolean checkDuplicateFiles,
int parallelism) {
return modifyIcebergTable(
destIdent,
table -> {
validatePartitionSpec(table, partitionFilter);
ensureNameMappingPresent(table);
if (isFileIdentifier(sourceIdent)) {
Path sourcePath = new Path(sourceIdent.name());
String format = sourceIdent.namespace()[0];
importFileTable(
table,
sourcePath,
format,
partitionFilter,
checkDuplicateFiles,
table.spec(),
parallelism);
} else {
importCatalogTable(
table, sourceIdent, partitionFilter, checkDuplicateFiles, parallelism);
}
Snapshot snapshot = table.currentSnapshot();
return toOutputRows(snapshot);
});
}
private static void ensureNameMappingPresent(Table table) {
if (table.properties().get(TableProperties.DEFAULT_NAME_MAPPING) == null) {
// Forces name-based resolution instead of position-based resolution
NameMapping mapping = MappingUtil.create(table.schema());
String mappingJson = NameMappingParser.toJson(mapping);
table.updateProperties().set(TableProperties.DEFAULT_NAME_MAPPING, mappingJson).commit();
}
}
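// Imports files from a raw file-based location. For a partitioned target, matching partitions
// are discovered under the location; for an unpartitioned target, the location itself is
// imported as a single partition.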
private void importFileTable(
Table table,
Path tableLocation,
String format,
Map<String, String> partitionFilter,
boolean checkDuplicateFiles,
PartitionSpec spec,
int parallelism) {
// List partitions via Spark's in-memory file index
List<SparkPartition> partitions =
Spark3Util.getPartitions(spark(), tableLocation, format, partitionFilter, spec);
if (table.spec().isUnpartitioned()) {
Preconditions.checkArgument(
partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table");
Preconditions.checkArgument(
partitionFilter.isEmpty(),
"Cannot use a partition filter when importing" + "to an unpartitioned table");
// Build a single global partition covering the whole source location
SparkPartition partition =
new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format);
importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles, parallelism);
} else {
Preconditions.checkArgument(
!partitions.isEmpty(), "Cannot find any matching partitions in table %s", table.name());
importPartitions(table, partitions, checkDuplicateFiles, parallelism);
}
}
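// Imports an existing Spark or Hive catalog table, staging manifests under the target table's
// metadata location.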
private void importCatalogTable(
Table table,
Identifier sourceIdent,
Map<String, String> partitionFilter,
boolean checkDuplicateFiles,
int parallelism) {
String stagingLocation = getMetadataLocation(table);
TableIdentifier sourceTableIdentifier = Spark3Util.toV1TableIdentifier(sourceIdent);
SparkTableUtil.importSparkTable(
spark(),
sourceTableIdentifier,
table,
stagingLocation,
partitionFilter,
checkDuplicateFiles,
parallelism);
}
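// Imports the given source partitions into the target table using its current partition spec.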
private void importPartitions(
Table table,
List<SparkPartition> partitions,
boolean checkDuplicateFiles,
int parallelism) {
String stagingLocation = getMetadataLocation(table);
SparkTableUtil.importSparkPartitions(
spark(),
partitions,
table,
table.spec(),
stagingLocation,
checkDuplicateFiles,
parallelism);
}
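// Returns the location used for staging metadata: the table's write.metadata.path property if
// set, otherwise "<table location>/metadata".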
private String getMetadataLocation(Table table) {
String defaultValue = LocationUtil.stripTrailingSlash(table.location()) + "/metadata";
return LocationUtil.stripTrailingSlash(
table.properties().getOrDefault(TableProperties.WRITE_METADATA_LOCATION, defaultValue));
}
@Override
public String description() {
return "AddFiles";
}
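// Rejects non-identity partition transforms on the target and verifies that any provided
// partition filter only references the target's partition columns.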
private void validatePartitionSpec(Table table, Map<String, String> partitionFilter) {
List<PartitionField> partitionFields = table.spec().fields();
Set<String> partitionNames =
table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toSet());
boolean tablePartitioned = !partitionFields.isEmpty();
boolean partitionSpecPassed = !partitionFilter.isEmpty();
// Check for any non-identity partition columns
List<PartitionField> nonIdentityFields =
partitionFields.stream()
.filter(x -> !x.transform().isIdentity())
.collect(Collectors.toList());
Preconditions.checkArgument(
nonIdentityFields.isEmpty(),
"Cannot add data files to target table %s because that table is partitioned and contains non-identity"
+ "partition transforms which will not be compatible. Found non-identity fields %s",
table.name(),
nonIdentityFields);
if (tablePartitioned && partitionSpecPassed) {
// Check that there are enough partition columns to satisfy the filter
Preconditions.checkArgument(
partitionFields.size() >= partitionFilter.size(),
"Cannot add data files to target table %s because that table is partitioned, "
+ "but the number of columns in the provided partition filter (%s) "
+ "is greater than the number of partitioned columns in table (%s)",
table.name(),
partitionFilter.size(),
partitionFields.size());
// Check for filters on non-existent partition columns
List<String> unMatchedFilters =
partitionFilter.keySet().stream()
.filter(filterName -> !partitionNames.contains(filterName))
.collect(Collectors.toList());
Preconditions.checkArgument(
unMatchedFilters.isEmpty(),
"Cannot add files to target table %s. %s is partitioned but the specified partition filter "
+ "refers to columns that are not partitioned: '%s' . Valid partition columns %s",
table.name(),
table.name(),
unMatchedFilters,
String.join(",", partitionNames));
} else {
Preconditions.checkArgument(
!partitionSpecPassed,
"Cannot use partition filter with an unpartitioned table %s",
table.name());
}
}
}