org.apache.iceberg.ManifestMergeManager Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-core Show documentation
Show all versions of iceberg-core Show documentation
A table format for huge analytic datasets
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.function.Supplier;
import org.apache.iceberg.ManifestEntry.Status;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.ListMultimap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Multimaps;
import org.apache.iceberg.util.BinPacking.ListPacker;
import org.apache.iceberg.util.Exceptions;
import org.apache.iceberg.util.Tasks;
abstract class ManifestMergeManager> {
private final long targetSizeBytes;
private final int minCountToMerge;
private final boolean mergeEnabled;
// cache merge results to reuse when retrying
private final Map, ManifestFile> mergedManifests = Maps.newConcurrentMap();
private final Supplier workerPoolSupplier;
ManifestMergeManager(
long targetSizeBytes,
int minCountToMerge,
boolean mergeEnabled,
Supplier executorSupplier) {
this.targetSizeBytes = targetSizeBytes;
this.minCountToMerge = minCountToMerge;
this.mergeEnabled = mergeEnabled;
this.workerPoolSupplier = executorSupplier;
}
protected abstract long snapshotId();
protected abstract PartitionSpec spec(int specId);
protected abstract void deleteFile(String location);
protected abstract ManifestWriter newManifestWriter(PartitionSpec spec);
protected abstract ManifestReader newManifestReader(ManifestFile manifest);
Iterable mergeManifests(Iterable manifests) {
Iterator manifestIter = manifests.iterator();
if (!mergeEnabled || !manifestIter.hasNext()) {
return manifests;
}
ManifestFile first = manifestIter.next();
List merged = Lists.newArrayList();
ListMultimap groups = groupBySpec(first, manifestIter);
for (Integer specId : groups.keySet()) {
Iterables.addAll(merged, mergeGroup(first, specId, groups.get(specId)));
}
return merged;
}
void cleanUncommitted(Set committed) {
// iterate over a copy of entries to avoid concurrent modification
List, ManifestFile>> entries =
Lists.newArrayList(mergedManifests.entrySet());
for (Map.Entry, ManifestFile> entry : entries) {
// delete any new merged manifests that aren't in the committed list
ManifestFile merged = entry.getValue();
if (!committed.contains(merged)) {
deleteFile(merged.path());
// remove the deleted file from the cache
mergedManifests.remove(entry.getKey());
}
}
}
private ListMultimap groupBySpec(
ManifestFile first, Iterator remaining) {
ListMultimap groups =
Multimaps.newListMultimap(
Maps.newTreeMap(Comparator.reverseOrder()), Lists::newArrayList);
groups.put(first.partitionSpecId(), first);
remaining.forEachRemaining(manifest -> groups.put(manifest.partitionSpecId(), manifest));
return groups;
}
@SuppressWarnings("unchecked")
private Iterable mergeGroup(
ManifestFile first, int specId, List group) {
// use a lookback of 1 to avoid reordering the manifests. using 1 also means this should pack
// from the end so that the manifest that gets under-filled is the first one, which will be
// merged the next time.
ListPacker packer = new ListPacker<>(targetSizeBytes, 1, false);
List> bins = packer.packEnd(group, ManifestFile::length);
// process bins in parallel, but put results in the order of the bins into an array to preserve
// the order of manifests and contents. preserving the order helps avoid random deletes when
// data files are eventually aged off.
List[] binResults =
(List[]) Array.newInstance(List.class, bins.size());
Tasks.range(bins.size())
.stopOnFailure()
.throwFailureWhenFinished()
.executeWith(workerPoolSupplier.get())
.run(
index -> {
List bin = bins.get(index);
List outputManifests = Lists.newArrayList();
binResults[index] = outputManifests;
if (bin.size() == 1) {
// no need to rewrite
outputManifests.add(bin.get(0));
return;
}
// if the bin has the first manifest (the new data files or an appended manifest file)
// then only merge it
// if the number of manifests is above the minimum count. this is applied only to bins
// with an in-memory
// manifest so that large manifests don't prevent merging older groups.
if (bin.contains(first) && bin.size() < minCountToMerge) {
// not enough to merge, add all manifest files to the output list
outputManifests.addAll(bin);
} else {
// merge the group
outputManifests.add(createManifest(specId, bin));
}
});
return Iterables.concat(binResults);
}
private ManifestFile createManifest(int specId, List bin) {
// if this merge was already rewritten, use the existing file.
// if the new files are in this merge, then the ManifestFile for the new files has changed and
// will be a cache miss.
if (mergedManifests.containsKey(bin)) {
return mergedManifests.get(bin);
}
ManifestWriter writer = newManifestWriter(spec(specId));
boolean threw = true;
try {
for (ManifestFile manifest : bin) {
try (ManifestReader reader = newManifestReader(manifest)) {
for (ManifestEntry entry : reader.entries()) {
if (entry.status() == Status.DELETED) {
// suppress deletes from previous snapshots. only files deleted by this snapshot
// should be added to the new manifest
if (entry.snapshotId() == snapshotId()) {
writer.delete(entry);
}
} else if (entry.status() == Status.ADDED && entry.snapshotId() == snapshotId()) {
// adds from this snapshot are still adds, otherwise they should be existing
writer.add(entry);
} else {
// add all files from the old manifest as existing files
writer.existing(entry);
}
}
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to close manifest reader");
}
}
threw = false;
} finally {
Exceptions.close(writer, threw);
}
ManifestFile manifest = writer.toManifestFile();
// update the cache
mergedManifests.put(bin, manifest);
return manifest;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy