/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.delta;
import io.delta.standalone.DeltaLog;
import io.delta.standalone.VersionLog;
import io.delta.standalone.actions.Action;
import io.delta.standalone.actions.AddFile;
import io.delta.standalone.actions.RemoveFile;
import io.delta.standalone.exceptions.DeltaStandaloneException;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.DeleteFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.ManageSnapshots;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.OverwriteFiles;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.Transaction;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.exceptions.NotFoundException;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.parquet.ParquetUtil;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Takes a Delta Lake table's location and attempts to create an Iceberg table snapshot in an
* optional, user-specified location (defaulting to the Delta Lake table's location) with a
* different identifier.
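*
* <p>A minimal usage sketch; the catalog, identifier, and location below are placeholders, and it
* assumes the action is obtained through {@code
* DeltaLakeToIcebergMigrationActions.snapshotDeltaLakeTable}:
*
* <pre>{@code
* SnapshotDeltaLakeTable.Result result =
*     DeltaLakeToIcebergMigrationActions.snapshotDeltaLakeTable("s3://bucket/delta-table")
*         .as(TableIdentifier.of("db", "snapshot_table"))
*         .icebergCatalog(catalog)
*         .deltaLakeConfiguration(new Configuration())
*         .execute();
* }</pre>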
*/
class BaseSnapshotDeltaLakeTableAction implements SnapshotDeltaLakeTable {
private static final Logger LOG = LoggerFactory.getLogger(BaseSnapshotDeltaLakeTableAction.class);
private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
private static final String DELTA_SOURCE_VALUE = "delta";
private static final String ORIGINAL_LOCATION_PROP = "original_location";
private static final String PARQUET_SUFFIX = ".parquet";
private static final String DELTA_VERSION_TAG_PREFIX = "delta-version-";
private static final String DELTA_TIMESTAMP_TAG_PREFIX = "delta-ts-";
private final ImmutableMap.Builder<String, String> additionalPropertiesBuilder =
ImmutableMap.builder();
private DeltaLog deltaLog;
private Catalog icebergCatalog;
private final String deltaTableLocation;
private TableIdentifier newTableIdentifier;
private String newTableLocation;
private HadoopFileIO deltaLakeFileIO;
private long deltaStartVersion;
/**
* Snapshots a Delta Lake table as an Iceberg table. The action reads the Delta Lake table's
* log through the table's path, creates a new Iceberg table using the given icebergCatalog and
* newTableIdentifier, and commits all changes in one Iceberg transaction.
*
* <p>The new table will only be created if the snapshot is successful.
*
* @param deltaTableLocation the delta lake table's path
*/
BaseSnapshotDeltaLakeTableAction(String deltaTableLocation) {
this.deltaTableLocation = deltaTableLocation;
this.newTableLocation = deltaTableLocation;
}
@Override
public SnapshotDeltaLakeTable tableProperties(Map<String, String> properties) {
additionalPropertiesBuilder.putAll(properties);
return this;
}
@Override
public SnapshotDeltaLakeTable tableProperty(String name, String value) {
additionalPropertiesBuilder.put(name, value);
return this;
}
@Override
public SnapshotDeltaLakeTable tableLocation(String location) {
this.newTableLocation = location;
return this;
}
@Override
public SnapshotDeltaLakeTable as(TableIdentifier identifier) {
this.newTableIdentifier = identifier;
return this;
}
@Override
public SnapshotDeltaLakeTable icebergCatalog(Catalog catalog) {
this.icebergCatalog = catalog;
return this;
}
@Override
public SnapshotDeltaLakeTable deltaLakeConfiguration(Configuration conf) {
this.deltaLog = DeltaLog.forTable(conf, deltaTableLocation);
this.deltaLakeFileIO = new HadoopFileIO(conf);
// get the earliest version available in the delta lake table
this.deltaStartVersion = deltaLog.getVersionAtOrAfterTimestamp(0L);
return this;
}
@Override
public SnapshotDeltaLakeTable.Result execute() {
Preconditions.checkArgument(
icebergCatalog != null && newTableIdentifier != null,
"Iceberg catalog and identifier cannot be null. Make sure to configure the action with a valid Iceberg catalog and identifier.");
Preconditions.checkArgument(
deltaLog != null && deltaLakeFileIO != null,
"Make sure to configure the action with a valid deltaLakeConfiguration");
Preconditions.checkArgument(
deltaLog.tableExists(),
"Delta Lake table does not exist at the given location: %s",
deltaTableLocation);
ImmutableSet.Builder<String> migratedDataFilesBuilder = ImmutableSet.builder();
io.delta.standalone.Snapshot updatedSnapshot = deltaLog.update();
Schema schema = convertDeltaLakeSchema(updatedSnapshot.getMetadata().getSchema());
PartitionSpec partitionSpec = getPartitionSpecFromDeltaSnapshot(schema, updatedSnapshot);
Transaction icebergTransaction =
icebergCatalog.newCreateTableTransaction(
newTableIdentifier,
schema,
partitionSpec,
newTableLocation,
destTableProperties(updatedSnapshot, deltaTableLocation));
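// Set a name-to-field-id mapping so readers can resolve columns in the migrated Parquet
// files, which were not written with Iceberg field IDs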
icebergTransaction
.table()
.updateProperties()
.set(
TableProperties.DEFAULT_NAME_MAPPING,
NameMappingParser.toJson(MappingUtil.create(icebergTransaction.table().schema())))
.commit();
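// Commit the earliest constructable Delta snapshot as the table's first Iceberg snapshot,
// then replay each later Delta version as its own Iceberg commit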
long constructableStartVersion =
commitInitialDeltaSnapshotToIcebergTransaction(
updatedSnapshot.getVersion(), icebergTransaction, migratedDataFilesBuilder);
Iterator<VersionLog> versionLogIterator =
deltaLog.getChanges(
constructableStartVersion + 1, false // do not throw an exception when data loss is detected
);
while (versionLogIterator.hasNext()) {
VersionLog versionLog = versionLogIterator.next();
commitDeltaVersionLogToIcebergTransaction(
versionLog, icebergTransaction, migratedDataFilesBuilder);
}
icebergTransaction.commitTransaction();
long totalDataFiles = migratedDataFilesBuilder.build().size();
LOG.info(
"Successfully created Iceberg table {} from Delta Lake table at {}, total data file count: {}",
newTableIdentifier,
deltaTableLocation,
totalDataFiles);
return ImmutableSnapshotDeltaLakeTable.Result.builder()
.snapshotDataFilesCount(totalDataFiles)
.build();
}
private Schema convertDeltaLakeSchema(io.delta.standalone.types.StructType deltaSchema) {
Type converted =
DeltaLakeDataTypeVisitor.visit(deltaSchema, new DeltaLakeTypeToType(deltaSchema));
return new Schema(converted.asNestedType().asStructType().fields());
}
private PartitionSpec getPartitionSpecFromDeltaSnapshot(
Schema schema, io.delta.standalone.Snapshot deltaSnapshot) {
List<String> partitionNames = deltaSnapshot.getMetadata().getPartitionColumns();
if (partitionNames.isEmpty()) {
return PartitionSpec.unpartitioned();
}
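// Delta Lake partitions only by plain column values, so each partition column maps to an
// Iceberg identity transform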
PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);
for (String partitionName : partitionNames) {
builder.identity(partitionName);
}
return builder.build();
}
/**
* Commit the initial delta snapshot to the iceberg transaction. It tries snapshots starting from
* {@code deltaStartVersion} up to {@code latestVersion} and commits the first constructable one.
*
* <p>There are two cases that the delta snapshot is not constructable:
*
* <ul>
*   <li>the version is earlier than the earliest checkpoint
*   <li>the corresponding data files are deleted by {@code VACUUM}
* </ul>
*
* <p>For more information, please refer to Delta Lake's <a
* href="https://docs.delta.io/latest/delta-batch.html#data-retention">Data Retention</a>.
*
* @param latestVersion the latest version of the delta lake table
* @param transaction the iceberg transaction
* @return the initial version of the delta lake table that is successfully committed to iceberg
*/
private long commitInitialDeltaSnapshotToIcebergTransaction(
long latestVersion,
Transaction transaction,
ImmutableSet.Builder<String> migratedDataFilesBuilder) {
long constructableStartVersion = deltaStartVersion;
while (constructableStartVersion <= latestVersion) {
try {
List<AddFile> initDataFiles =
deltaLog.getSnapshotForVersionAsOf(constructableStartVersion).getAllFiles();
List<DataFile> filesToAdd = Lists.newArrayList();
for (AddFile addFile : initDataFiles) {
DataFile dataFile = buildDataFileFromAction(addFile, transaction.table());
filesToAdd.add(dataFile);
migratedDataFilesBuilder.add(dataFile.path().toString());
}
// AppendFiles case
AppendFiles appendFiles = transaction.newAppend();
filesToAdd.forEach(appendFiles::appendFile);
appendFiles.commit();
tagCurrentSnapshot(constructableStartVersion, transaction);
return constructableStartVersion;
} catch (NotFoundException | IllegalArgumentException | DeltaStandaloneException e) {
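// This snapshot is not constructable (e.g., the version predates the earliest checkpoint
// or its data files were removed by VACUUM); try the next version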
constructableStartVersion++;
}
}
throw new ValidationException(
"Delta Lake table at %s contains no constructable snapshot", deltaTableLocation);
}
/**
* Iterate through the {@code VersionLog} to determine the update type and commit the update to
* the given {@code Transaction}.
*
* <p>There are 3 cases:
*
* <ol>
*   <li>AppendFiles - when there are only AddFile instances (an INSERT on the table)
*   <li>DeleteFiles - when there are only RemoveFile instances (a DELETE where all the records of
*       file(s) were removed)
*   <li>OverwriteFiles - when there is a mix of AddFile and RemoveFile (a DELETE/UPDATE)
* </ol>
*
* @param versionLog the delta log version to commit to iceberg table transaction
* @param transaction the iceberg table transaction to commit to
*/
private void commitDeltaVersionLogToIcebergTransaction(
VersionLog versionLog,
Transaction transaction,
ImmutableSet.Builder<String> migratedDataFilesBuilder) {
// Only need actions related to data change: AddFile and RemoveFile
List<Action> dataFileActions =
versionLog.getActions().stream()
.filter(action -> action instanceof AddFile || action instanceof RemoveFile)
.collect(Collectors.toList());
List<DataFile> filesToAdd = Lists.newArrayList();
List<DataFile> filesToRemove = Lists.newArrayList();
for (Action action : dataFileActions) {
DataFile dataFile = buildDataFileFromAction(action, transaction.table());
if (action instanceof AddFile) {
filesToAdd.add(dataFile);
} else if (action instanceof RemoveFile) {
filesToRemove.add(dataFile);
} else {
throw new ValidationException(
"The action %s's is unsupported", action.getClass().getSimpleName());
}
migratedDataFilesBuilder.add(dataFile.path().toString());
}
if (!filesToAdd.isEmpty() && !filesToRemove.isEmpty()) {
// OverwriteFiles case
OverwriteFiles overwriteFiles = transaction.newOverwrite();
filesToAdd.forEach(overwriteFiles::addFile);
filesToRemove.forEach(overwriteFiles::deleteFile);
overwriteFiles.commit();
} else if (!filesToAdd.isEmpty()) {
// AppendFiles case
AppendFiles appendFiles = transaction.newAppend();
filesToAdd.forEach(appendFiles::appendFile);
appendFiles.commit();
} else if (!filesToRemove.isEmpty()) {
// DeleteFiles case
DeleteFiles deleteFiles = transaction.newDelete();
filesToRemove.forEach(deleteFiles::deleteFile);
deleteFiles.commit();
} else {
// No data change: commit an empty append so this Delta version still gets an Iceberg snapshot to tag
transaction.newAppend().commit();
}
tagCurrentSnapshot(versionLog.getVersion(), transaction);
}
private DataFile buildDataFileFromAction(Action action, Table table) {
PartitionSpec spec = table.spec();
String path;
long fileSize;
Long nullableFileSize;
Map<String, String> partitionValues;
if (action instanceof AddFile) {
AddFile addFile = (AddFile) action;
path = addFile.getPath();
nullableFileSize = addFile.getSize();
partitionValues = addFile.getPartitionValues();
} else if (action instanceof RemoveFile) {
RemoveFile removeFile = (RemoveFile) action;
path = removeFile.getPath();
nullableFileSize = removeFile.getSize().orElse(null);
partitionValues = removeFile.getPartitionValues();
} else {
throw new ValidationException(
"Unexpected action type for Delta Lake: %s", action.getClass().getSimpleName());
}
String fullFilePath = getFullFilePath(path, deltaLog.getPath().toString());
// For an unpartitioned table, partitionValues should be an empty map rather than null
Preconditions.checkArgument(
partitionValues != null,
String.format("File %s does not specify partitionValues", fullFilePath));
FileFormat format = determineFileFormatFromPath(fullFilePath);
InputFile file = deltaLakeFileIO.newInputFile(fullFilePath);
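// Fail fast if the file referenced by the Delta log is no longer present in storage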
if (!file.exists()) {
throw new NotFoundException(
"File %s is referenced in the logs of Delta Lake table at %s, but cannot be found in the storage",
fullFilePath, deltaTableLocation);
}
// If the file size is not specified, the size should be read from the file
if (nullableFileSize != null) {
fileSize = nullableFileSize;
} else {
fileSize = file.getLength();
}
// collect column metrics from the file; the name mapping lets the reader resolve columns in
// Parquet files that were not written with Iceberg field IDs
MetricsConfig metricsConfig = MetricsConfig.forTable(table);
String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
NameMapping nameMapping =
nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
Metrics metrics = getMetricsForFile(file, format, metricsConfig, nameMapping);
List<String> partitionValueList =
spec.fields().stream()
.map(PartitionField::name)
.map(partitionValues::get)
.collect(Collectors.toList());
return DataFiles.builder(spec)
.withPath(fullFilePath)
.withFormat(format)
.withFileSizeInBytes(fileSize)
.withMetrics(metrics)
.withPartitionValues(partitionValueList)
.build();
}
private FileFormat determineFileFormatFromPath(String path) {
if (path.endsWith(PARQUET_SUFFIX)) {
return FileFormat.PARQUET;
} else {
throw new ValidationException("Do not support file format in path %s", path);
}
}
private Metrics getMetricsForFile(
InputFile file, FileFormat format, MetricsConfig metricsSpec, NameMapping mapping) {
if (format == FileFormat.PARQUET) {
return ParquetUtil.fileMetrics(file, metricsSpec, mapping);
}
throw new ValidationException("Cannot get metrics from file format: %s", format);
}
private Map<String, String> destTableProperties(
io.delta.standalone.Snapshot deltaSnapshot, String originalLocation) {
additionalPropertiesBuilder.putAll(deltaSnapshot.getMetadata().getConfiguration());
additionalPropertiesBuilder.putAll(
ImmutableMap.of(
SNAPSHOT_SOURCE_PROP, DELTA_SOURCE_VALUE, ORIGINAL_LOCATION_PROP, originalLocation));
return additionalPropertiesBuilder.build();
}
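/**
* Tags the transaction's current Iceberg snapshot with the Delta Lake version it was created
* from and, when the commit timestamp is available, with that timestamp as well.
*/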
private void tagCurrentSnapshot(long deltaVersion, Transaction transaction) {
long currentSnapshotId = transaction.table().currentSnapshot().snapshotId();
ManageSnapshots manageSnapshots = transaction.manageSnapshots();
manageSnapshots.createTag(DELTA_VERSION_TAG_PREFIX + deltaVersion, currentSnapshotId);
Timestamp deltaVersionTimestamp = deltaLog.getCommitInfoAt(deltaVersion).getTimestamp();
if (deltaVersionTimestamp != null) {
manageSnapshots.createTag(
DELTA_TIMESTAMP_TAG_PREFIX + deltaVersionTimestamp.getTime(), currentSnapshotId);
}
manageSnapshots.commit();
}
/**
* Get the full file path. The input {@code String} path can be either a relative or an absolute
* path of a data file in the Delta table.
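*
* <p>For example (illustrative values only): a relative {@code part-00000.parquet} resolves to
* {@code <tableRoot>/part-00000.parquet}, while an absolute {@code s3://bucket/part-00000.parquet}
* is returned unchanged after percent-decoding.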
*
* @param path the return value of {@link AddFile#getPath()} or {@link RemoveFile#getPath()}
* (either absolute or relative)
* @param tableRoot the root path of the delta table
*/
private static String getFullFilePath(String path, String tableRoot) {
URI dataFileUri = URI.create(path);
try {
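// Delta log entries store percent-encoded paths; decode before resolving against the table root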
String decodedPath = URLDecoder.decode(path, StandardCharsets.UTF_8.name());
if (dataFileUri.isAbsolute()) {
return decodedPath;
} else {
return tableRoot + File.separator + decodedPath;
}
} catch (UnsupportedEncodingException e) {
throw new IllegalArgumentException(String.format("Cannot decode path %s", path), e);
}
}
}