/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.actions;
import static org.apache.iceberg.MetadataTableType.ENTRIES;
import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.HasTableOperations;
import org.apache.iceberg.ManifestContent;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.ManifestFiles;
import org.apache.iceberg.ManifestWriter;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.RollingManifestWriter;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableOperations;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.actions.ImmutableRewriteManifests;
import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.exceptions.CleanableFailure;
import org.apache.iceberg.exceptions.CommitStateUnknownException;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.io.SupportsBulkOperations;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.SparkContentFile;
import org.apache.iceberg.spark.SparkDataFile;
import org.apache.iceberg.spark.SparkDeleteFile;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.ThreadPools;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An action that rewrites manifests in a distributed manner and co-locates metadata for partitions.
 *
 * <p>By default, this action rewrites all manifests for the current partition spec and writes the
* result to the metadata folder. The behavior can be modified by passing a custom predicate to
* {@link #rewriteIf(Predicate)} and a custom spec ID to {@link #specId(int)}. In addition, there is
* a way to configure a custom location for staged manifests via {@link #stagingLocation(String)}.
* The provided staging location will be ignored if snapshot ID inheritance is enabled. In such
* cases, the manifests are always written to the metadata folder and committed without staging.
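 *
 * <p>A minimal usage sketch (hypothetical, assuming the {@code SparkActions} entry point in
 * this module):
 *
 * <pre>{@code
 * RewriteManifests.Result result =
 *     SparkActions.get(spark)
 *         .rewriteManifests(table)
 *         .rewriteIf(manifest -> manifest.length() < 10 * 1024 * 1024) // only small manifests
 *         .execute();
 * }</pre>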
*/
public class RewriteManifestsSparkAction
    extends BaseSnapshotUpdateSparkAction<RewriteManifestsSparkAction> implements RewriteManifests {
public static final String USE_CACHING = "use-caching";
public static final boolean USE_CACHING_DEFAULT = false;
private static final Logger LOG = LoggerFactory.getLogger(RewriteManifestsSparkAction.class);
private static final RewriteManifests.Result EMPTY_RESULT =
ImmutableRewriteManifests.Result.builder()
.rewrittenManifests(ImmutableList.of())
.addedManifests(ImmutableList.of())
.build();
private final Table table;
private final int formatVersion;
private final long targetManifestSizeBytes;
private final boolean shouldStageManifests;
private PartitionSpec spec;
  private Predicate<ManifestFile> predicate = manifest -> true;
private String outputLocation;
RewriteManifestsSparkAction(SparkSession spark, Table table) {
super(spark);
this.table = table;
this.spec = table.spec();
this.targetManifestSizeBytes =
PropertyUtil.propertyAsLong(
table.properties(),
TableProperties.MANIFEST_TARGET_SIZE_BYTES,
TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
// default the output location to the metadata location
TableOperations ops = ((HasTableOperations) table).operations();
Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
this.outputLocation = metadataFilePath.getParent().toString();
// use the current table format version for new manifests
this.formatVersion = ops.current().formatVersion();
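    // v1 tables without snapshot ID inheritance must stage manifests, as entries only
    // receive their snapshot IDs when the rewrite is committed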
boolean snapshotIdInheritanceEnabled =
PropertyUtil.propertyAsBoolean(
table.properties(),
TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);
this.shouldStageManifests = formatVersion == 1 && !snapshotIdInheritanceEnabled;
}
@Override
protected RewriteManifestsSparkAction self() {
return this;
}
@Override
public RewriteManifestsSparkAction specId(int specId) {
Preconditions.checkArgument(table.specs().containsKey(specId), "Invalid spec id %s", specId);
this.spec = table.specs().get(specId);
return this;
}
@Override
  public RewriteManifestsSparkAction rewriteIf(Predicate<ManifestFile> newPredicate) {
this.predicate = newPredicate;
return this;
}
@Override
public RewriteManifestsSparkAction stagingLocation(String newStagingLocation) {
if (shouldStageManifests) {
this.outputLocation = newStagingLocation;
} else {
LOG.warn("Ignoring provided staging location as new manifests will be committed directly");
}
return this;
}
@Override
public RewriteManifests.Result execute() {
String desc = String.format("Rewriting manifests in %s", table.name());
JobGroupInfo info = newJobGroupInfo("REWRITE-MANIFESTS", desc);
return withJobGroupInfo(info, this::doExecute);
}
private RewriteManifests.Result doExecute() {
    List<ManifestFile> rewrittenManifests = Lists.newArrayList();
    List<ManifestFile> addedManifests = Lists.newArrayList();
RewriteManifests.Result dataResult = rewriteManifests(ManifestContent.DATA);
Iterables.addAll(rewrittenManifests, dataResult.rewrittenManifests());
Iterables.addAll(addedManifests, dataResult.addedManifests());
RewriteManifests.Result deletesResult = rewriteManifests(ManifestContent.DELETES);
Iterables.addAll(rewrittenManifests, deletesResult.rewrittenManifests());
Iterables.addAll(addedManifests, deletesResult.addedManifests());
if (rewrittenManifests.isEmpty()) {
return EMPTY_RESULT;
}
replaceManifests(rewrittenManifests, addedManifests);
return ImmutableRewriteManifests.Result.builder()
.rewrittenManifests(rewrittenManifests)
.addedManifests(addedManifests)
.build();
}
private RewriteManifests.Result rewriteManifests(ManifestContent content) {
    List<ManifestFile> matchingManifests = findMatchingManifests(content);
if (matchingManifests.isEmpty()) {
return EMPTY_RESULT;
}
int targetNumManifests = targetNumManifests(totalSizeBytes(matchingManifests));
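    // rewriting a single manifest into a single manifest would be a no-op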
if (targetNumManifests == 1 && matchingManifests.size() == 1) {
return EMPTY_RESULT;
}
    Dataset<Row> manifestEntryDF = buildManifestEntryDF(matchingManifests);
    List<ManifestFile> newManifests;
if (spec.isUnpartitioned()) {
newManifests = writeUnpartitionedManifests(content, manifestEntryDF, targetNumManifests);
} else {
newManifests = writePartitionedManifests(content, manifestEntryDF, targetNumManifests);
}
return ImmutableRewriteManifests.Result.builder()
.rewrittenManifests(matchingManifests)
.addedManifests(newManifests)
.build();
}
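  // Builds a dataset of live manifest entries (status < 2), restricted via a left-semi join
  // to entries that originate from the manifests selected for rewrite.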
  private Dataset<Row> buildManifestEntryDF(List<ManifestFile> manifests) {
    Dataset<Row> manifestDF =
spark()
.createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING())
.toDF("manifest");
    Dataset<Row> manifestEntryDF =
loadMetadataTable(table, ENTRIES)
.filter("status < 2") // select only live entries
.selectExpr(
"input_file_name() as manifest",
"snapshot_id",
"sequence_number",
"file_sequence_number",
"data_file");
Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest"));
return manifestEntryDF
.join(manifestDF, joinCond, "left_semi")
.select("snapshot_id", "sequence_number", "file_sequence_number", "data_file");
}
  private List<ManifestFile> writeUnpartitionedManifests(
      ManifestContent content, Dataset<Row> manifestEntryDF, int numManifests) {
    WriteManifests<?> writeFunc = newWriteManifestsFunc(content, manifestEntryDF.schema());
    Dataset<Row> transformedManifestEntryDF = manifestEntryDF.repartition(numManifests);
return writeFunc.apply(transformedManifestEntryDF).collectAsList();
}
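  // For partitioned specs, entries are range-partitioned and sorted by the data file's
  // partition, so metadata for the same partition is co-located in the output manifests.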
  private List<ManifestFile> writePartitionedManifests(
      ManifestContent content, Dataset<Row> manifestEntryDF, int numManifests) {
return withReusableDS(
manifestEntryDF,
df -> {
          WriteManifests<?> writeFunc = newWriteManifestsFunc(content, df.schema());
Column partitionColumn = df.col("data_file.partition");
          Dataset<Row> transformedDF = repartitionAndSort(df, partitionColumn, numManifests);
return writeFunc.apply(transformedDF).collectAsList();
});
}
  private WriteManifests<?> newWriteManifestsFunc(ManifestContent content, StructType sparkType) {
ManifestWriterFactory writers = manifestWriters();
StructType sparkFileType = (StructType) sparkType.apply("data_file").dataType();
Types.StructType combinedFileType = DataFile.getType(Partitioning.partitionType(table));
Types.StructType fileType = DataFile.getType(spec.partitionType());
if (content == ManifestContent.DATA) {
return new WriteDataManifests(writers, combinedFileType, fileType, sparkFileType);
} else {
return new WriteDeleteManifests(writers, combinedFileType, fileType, sparkFileType);
}
}
  private Dataset<Row> repartitionAndSort(Dataset<Row> df, Column col, int numPartitions) {
return df.repartitionByRange(numPartitions, col).sortWithinPartitions(col);
}
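  // Optionally caches the dataset so it can be scanned multiple times without recomputation;
  // controlled by the "use-caching" option and disabled by default.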
  private <T, U> U withReusableDS(Dataset<T> ds, Function<Dataset<T>, U> func) {
boolean useCaching =
PropertyUtil.propertyAsBoolean(options(), USE_CACHING, USE_CACHING_DEFAULT);
    Dataset<T> reusableDS = useCaching ? ds.cache() : ds;
try {
return func.apply(reusableDS);
} finally {
if (useCaching) {
reusableDS.unpersist(false);
}
}
}
  private List<ManifestFile> findMatchingManifests(ManifestContent content) {
Snapshot currentSnapshot = table.currentSnapshot();
if (currentSnapshot == null) {
return ImmutableList.of();
}
    List<ManifestFile> manifests = loadManifests(content, currentSnapshot);
return manifests.stream()
.filter(manifest -> manifest.partitionSpecId() == spec.specId() && predicate.test(manifest))
.collect(Collectors.toList());
}
  private List<ManifestFile> loadManifests(ManifestContent content, Snapshot snapshot) {
switch (content) {
case DATA:
return snapshot.dataManifests(table.io());
case DELETES:
return snapshot.deleteManifests(table.io());
default:
throw new IllegalArgumentException("Unknown manifest content: " + content);
}
}
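  // Ceiling division: e.g. totalSizeBytes = 100 and targetManifestSizeBytes = 8 yield
  // (100 + 8 - 1) / 8 = 13 target manifests (illustrative numbers only).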
private int targetNumManifests(long totalSizeBytes) {
return (int) ((totalSizeBytes + targetManifestSizeBytes - 1) / targetManifestSizeBytes);
}
  private long totalSizeBytes(Iterable<ManifestFile> manifests) {
long totalSizeBytes = 0L;
for (ManifestFile manifest : manifests) {
ValidationException.check(
hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path());
totalSizeBytes += manifest.length();
}
return totalSizeBytes;
}
private boolean hasFileCounts(ManifestFile manifest) {
return manifest.addedFilesCount() != null
&& manifest.existingFilesCount() != null
&& manifest.deletedFilesCount() != null;
}
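  // Commits the rewrite; whether newly written manifests are cleaned up on failure depends
  // on the failure kind (see the catch blocks below).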
  private void replaceManifests(
      Iterable<ManifestFile> deletedManifests, Iterable<ManifestFile> addedManifests) {
try {
org.apache.iceberg.RewriteManifests rewriteManifests = table.rewriteManifests();
deletedManifests.forEach(rewriteManifests::deleteManifest);
addedManifests.forEach(rewriteManifests::addManifest);
commit(rewriteManifests);
if (shouldStageManifests) {
// delete new manifests as they were rewritten before the commit
deleteFiles(Iterables.transform(addedManifests, ManifestFile::path));
}
} catch (CommitStateUnknownException commitStateUnknownException) {
// don't clean up added manifest files, because they may have been successfully committed.
throw commitStateUnknownException;
} catch (Exception e) {
if (e instanceof CleanableFailure) {
// delete all new manifests because the rewrite failed
deleteFiles(Iterables.transform(addedManifests, ManifestFile::path));
}
throw e;
}
}
  private void deleteFiles(Iterable<String> locations) {
    Iterable<FileInfo> files =
Iterables.transform(locations, location -> new FileInfo(location, MANIFEST));
if (table.io() instanceof SupportsBulkOperations) {
deleteFiles((SupportsBulkOperations) table.io(), files.iterator());
} else {
deleteFiles(
ThreadPools.getWorkerPool(), file -> table.io().deleteFile(file), files.iterator());
}
}
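  // The factory is serializable and holds a broadcast copy of the table so that manifest
  // writers can be created on executors.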
private ManifestWriterFactory manifestWriters() {
return new ManifestWriterFactory(
sparkContext().broadcast(SerializableTableWithSize.copyOf(table)),
formatVersion,
spec.specId(),
outputLocation,
// allow the actual size of manifests to be 20% higher as the estimation is not precise
(long) (1.2 * targetManifestSizeBytes));
}
  private static class WriteDataManifests extends WriteManifests<DataFile> {
WriteDataManifests(
ManifestWriterFactory manifestWriters,
Types.StructType combinedPartitionType,
Types.StructType partitionType,
StructType sparkFileType) {
super(manifestWriters, combinedPartitionType, partitionType, sparkFileType);
}
@Override
protected SparkDataFile newFileWrapper() {
return new SparkDataFile(combinedFileType(), fileType(), sparkFileType());
}
@Override
    protected RollingManifestWriter<DataFile> newManifestWriter() {
return writers().newRollingManifestWriter();
}
}
  private static class WriteDeleteManifests extends WriteManifests<DeleteFile> {
WriteDeleteManifests(
ManifestWriterFactory manifestWriters,
Types.StructType combinedFileType,
Types.StructType fileType,
StructType sparkFileType) {
super(manifestWriters, combinedFileType, fileType, sparkFileType);
}
@Override
protected SparkDeleteFile newFileWrapper() {
return new SparkDeleteFile(combinedFileType(), fileType(), sparkFileType());
}
@Override
    protected RollingManifestWriter<DeleteFile> newManifestWriter() {
return writers().newRollingDeleteManifestWriter();
}
}
  private abstract static class WriteManifests<F extends ContentFile<F>>
      implements MapPartitionsFunction<Row, ManifestFile> {
    private static final Encoder<ManifestFile> MANIFEST_ENCODER =
        Encoders.javaSerialization(ManifestFile.class);
private final ManifestWriterFactory writers;
private final Types.StructType combinedFileType;
private final Types.StructType fileType;
private final StructType sparkFileType;
WriteManifests(
ManifestWriterFactory writers,
Types.StructType combinedFileType,
Types.StructType fileType,
StructType sparkFileType) {
this.writers = writers;
this.combinedFileType = combinedFileType;
this.fileType = fileType;
this.sparkFileType = sparkFileType;
}
    protected abstract SparkContentFile<F> newFileWrapper();
    protected abstract RollingManifestWriter<F> newManifestWriter();
    public Dataset<ManifestFile> apply(Dataset<Row> input) {
return input.mapPartitions(this, MANIFEST_ENCODER);
}
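    // Replays each entry as an "existing" entry, preserving its original snapshot ID and
    // sequence numbers so the rewrite does not change entry lineage.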
@Override
    public Iterator<ManifestFile> call(Iterator<Row> rows) throws Exception {
      SparkContentFile<F> fileWrapper = newFileWrapper();
      RollingManifestWriter<F> writer = newManifestWriter();
try {
while (rows.hasNext()) {
Row row = rows.next();
long snapshotId = row.getLong(0);
long sequenceNumber = row.getLong(1);
Long fileSequenceNumber = row.isNullAt(2) ? null : row.getLong(2);
Row file = row.getStruct(3);
writer.existing(fileWrapper.wrap(file), snapshotId, sequenceNumber, fileSequenceNumber);
}
} finally {
writer.close();
}
return writer.toManifestFiles().iterator();
}
protected ManifestWriterFactory writers() {
return writers;
}
protected Types.StructType combinedFileType() {
return combinedFileType;
}
protected Types.StructType fileType() {
return fileType;
}
protected StructType sparkFileType() {
return sparkFileType;
}
}
private static class ManifestWriterFactory implements Serializable {
    private final Broadcast<Table> tableBroadcast;
private final int formatVersion;
private final int specId;
private final String outputLocation;
private final long maxManifestSizeBytes;
ManifestWriterFactory(
        Broadcast<Table>