com.netease.arctic.shade.org.apache.iceberg.DeleteFileIndex

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.netease.arctic.shade.org.apache.iceberg;

import com.netease.arctic.shade.com.github.benmanes.caffeine.cache.Caffeine;
import com.netease.arctic.shade.com.github.benmanes.caffeine.cache.LoadingCache;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.netease.arctic.shade.org.apache.iceberg.exceptions.RuntimeIOException;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expression;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Expressions;
import com.netease.arctic.shade.org.apache.iceberg.expressions.ManifestEvaluator;
import com.netease.arctic.shade.org.apache.iceberg.expressions.Projections;
import com.netease.arctic.shade.org.apache.iceberg.io.CloseableIterable;
import com.netease.arctic.shade.org.apache.iceberg.io.FileIO;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.ListMultimap;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Lists;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Maps;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Multimaps;
import com.netease.arctic.shade.org.apache.iceberg.relocated.com.google.common.collect.Sets;
import com.netease.arctic.shade.org.apache.iceberg.types.Comparators;
import com.netease.arctic.shade.org.apache.iceberg.types.Conversions;
import com.netease.arctic.shade.org.apache.iceberg.types.Type;
import com.netease.arctic.shade.org.apache.iceberg.types.Types;
import com.netease.arctic.shade.org.apache.iceberg.util.Pair;
import com.netease.arctic.shade.org.apache.iceberg.util.StructLikeWrapper;
import com.netease.arctic.shade.org.apache.iceberg.util.Tasks;

/**
 * An index of {@link DeleteFile delete files} by sequence number.
 * <p>
 * Use {@link #builderFor(FileIO, Iterable)} to construct an index, and {@link #forDataFile(long, DataFile)} or
 * {@link #forEntry(ManifestEntry)} to get the delete files to apply to a given data file.
 */
class DeleteFileIndex {
  private final Map<Integer, PartitionSpec> specsById;
  private final Map<Integer, Types.StructType> partitionTypeById;
  private final Map<Integer, ThreadLocal<StructLikeWrapper>> wrapperById;
  private final long[] globalSeqs;
  private final DeleteFile[] globalDeletes;
  private final Map<Pair<Integer, StructLikeWrapper>, Pair<long[], DeleteFile[]>> sortedDeletesByPartition;

  DeleteFileIndex(Map<Integer, PartitionSpec> specsById, long[] globalSeqs, DeleteFile[] globalDeletes,
                  Map<Pair<Integer, StructLikeWrapper>, Pair<long[], DeleteFile[]>> sortedDeletesByPartition) {
    this.specsById = specsById;
    ImmutableMap.Builder<Integer, Types.StructType> builder = ImmutableMap.builder();
    specsById.forEach((specId, spec) -> builder.put(specId, spec.partitionType()));
    this.partitionTypeById = builder.build();
    this.wrapperById = Maps.newConcurrentMap();
    this.globalSeqs = globalSeqs;
    this.globalDeletes = globalDeletes;
    this.sortedDeletesByPartition = sortedDeletesByPartition;
  }

  public boolean isEmpty() {
    return (globalDeletes == null || globalDeletes.length == 0) && sortedDeletesByPartition.isEmpty();
  }

  public Iterable<DeleteFile> referencedDeleteFiles() {
    Iterable<DeleteFile> deleteFiles = Collections.emptyList();

    if (globalDeletes != null) {
      deleteFiles = Iterables.concat(deleteFiles, Arrays.asList(globalDeletes));
    }

    for (Pair<long[], DeleteFile[]> partitionDeletes : sortedDeletesByPartition.values()) {
      deleteFiles = Iterables.concat(deleteFiles, Arrays.asList(partitionDeletes.second()));
    }

    return deleteFiles;
  }

  private StructLikeWrapper newWrapper(int specId) {
    return StructLikeWrapper.forType(partitionTypeById.get(specId));
  }

  private Pair<Integer, StructLikeWrapper> partition(int specId, StructLike struct) {
    ThreadLocal<StructLikeWrapper> wrapper = wrapperById.computeIfAbsent(specId,
        id -> ThreadLocal.withInitial(() -> newWrapper(id)));
    return Pair.of(specId, wrapper.get().set(struct));
  }

  DeleteFile[] forEntry(ManifestEntry<DataFile> entry) {
    return forDataFile(entry.sequenceNumber(), entry.file());
  }

  DeleteFile[] forDataFile(long sequenceNumber, DataFile file) {
    Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), file.partition());
    Pair<long[], DeleteFile[]> partitionDeletes = sortedDeletesByPartition.get(partition);

    Stream<DeleteFile> matchingDeletes;
    if (partitionDeletes == null) {
      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
    } else if (globalDeletes == null) {
      matchingDeletes = limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second());
    } else {
      matchingDeletes = Stream.concat(
          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()));
    }

    return matchingDeletes
        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, specsById.get(file.specId()).schema()))
        .toArray(DeleteFile[]::new);
  }

  private static boolean canContainDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) {
    switch (deleteFile.content()) {
      case POSITION_DELETES:
        return canContainPosDeletesForFile(dataFile, deleteFile);

      case EQUALITY_DELETES:
        return canContainEqDeletesForFile(dataFile, deleteFile, schema);
    }

    return true;
  }

  private static boolean canContainPosDeletesForFile(DataFile dataFile, DeleteFile deleteFile) {
    // check that the delete file can contain the data file's file_path
    Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
    Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
    if (lowers == null || uppers == null) {
      return true;
    }

    Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
    int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
    Comparator<CharSequence> comparator = Comparators.charSequences();

    ByteBuffer lower = lowers.get(pathId);
    if (lower != null && comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) {
      return false;
    }

    ByteBuffer upper = uppers.get(pathId);
    if (upper != null && comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) {
      return false;
    }

    return true;
  }

  @SuppressWarnings("checkstyle:CyclomaticComplexity")
  private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) {
    // whether to check data ranges or to assume that the ranges match
    // if upper/lower bounds are missing, null counts may still be used to determine delete files can be skipped
    boolean checkRanges = dataFile.lowerBounds() != null && dataFile.upperBounds() != null &&
        deleteFile.lowerBounds() != null && deleteFile.upperBounds() != null;

    Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
    Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
    Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
    Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();

    Map<Integer, Long> dataNullCounts = dataFile.nullValueCounts();
    Map<Integer, Long> dataValueCounts = dataFile.valueCounts();
    Map<Integer, Long> deleteNullCounts = deleteFile.nullValueCounts();
    Map<Integer, Long> deleteValueCounts = deleteFile.valueCounts();

    for (int id : deleteFile.equalityFieldIds()) {
      Types.NestedField field = schema.findField(id);
      if (!field.type().isPrimitiveType()) {
        // stats are not kept for nested types. assume that the delete file may match
        continue;
      }

      if (containsNull(dataNullCounts, field) && containsNull(deleteNullCounts, field)) {
        // the data has null values and null has been deleted, so the deletes must be applied
        continue;
      }

      if (allNull(dataNullCounts, dataValueCounts, field) && allNonNull(deleteNullCounts, field)) {
        // the data file contains only null values for this field, but there are no deletes for null values
        return false;
      }

      if (allNull(deleteNullCounts, deleteValueCounts, field) && allNonNull(dataNullCounts, field)) {
        // the delete file removes only null rows with null for this field, but there are no data rows with null
        return false;
      }

      if (!checkRanges) {
        // some upper and lower bounds are missing, assume they match
        continue;
      }

      ByteBuffer dataLower = dataLowers.get(id);
      ByteBuffer dataUpper = dataUppers.get(id);
      ByteBuffer deleteLower = deleteLowers.get(id);
      ByteBuffer deleteUpper = deleteUppers.get(id);
      if (dataLower == null || dataUpper == null || deleteLower == null || deleteUpper == null) {
        // at least one bound is not known, assume the delete file may match
        continue;
      }

      if (!rangesOverlap(field.type().asPrimitiveType(), dataLower, dataUpper, deleteLower, deleteUpper)) {
        // no values overlap between the data file and the deletes
        return false;
      }
    }

    return true;
  }

  private static <T> boolean rangesOverlap(Type.PrimitiveType type,
                                           ByteBuffer dataLowerBuf, ByteBuffer dataUpperBuf,
                                           ByteBuffer deleteLowerBuf, ByteBuffer deleteUpperBuf) {
    Comparator<T> comparator = Comparators.forType(type);
    T dataLower = Conversions.fromByteBuffer(type, dataLowerBuf);
    T dataUpper = Conversions.fromByteBuffer(type, dataUpperBuf);
    T deleteLower = Conversions.fromByteBuffer(type, deleteLowerBuf);
    T deleteUpper = Conversions.fromByteBuffer(type, deleteUpperBuf);

    return comparator.compare(deleteLower, dataUpper) <= 0 && comparator.compare(dataLower, deleteUpper) <= 0;
  }

  private static boolean allNonNull(Map<Integer, Long> nullValueCounts, Types.NestedField field) {
    if (field.isRequired()) {
      return true;
    }

    if (nullValueCounts == null) {
      return false;
    }

    Long nullValueCount = nullValueCounts.get(field.fieldId());
    if (nullValueCount == null) {
      return false;
    }

    return nullValueCount <= 0;
  }

  private static boolean allNull(Map<Integer, Long> nullValueCounts, Map<Integer, Long> valueCounts,
                                 Types.NestedField field) {
    if (field.isRequired()) {
      return false;
    }

    if (nullValueCounts == null || valueCounts == null) {
      return false;
    }

    Long nullValueCount = nullValueCounts.get(field.fieldId());
    Long valueCount = valueCounts.get(field.fieldId());
    if (nullValueCount == null || valueCount == null) {
      return false;
    }

    return nullValueCount.equals(valueCount);
  }

  private static boolean containsNull(Map<Integer, Long> nullValueCounts, Types.NestedField field) {
    if (field.isRequired()) {
      return false;
    }

    if (nullValueCounts == null) {
      return true;
    }

    Long nullValueCount = nullValueCounts.get(field.fieldId());
    if (nullValueCount == null) {
      return true;
    }

    return nullValueCount > 0;
  }

  private static Stream<DeleteFile> limitBySequenceNumber(long sequenceNumber, long[] seqs, DeleteFile[] files) {
    if (files == null) {
      return Stream.empty();
    }

    int pos = Arrays.binarySearch(seqs, sequenceNumber);
    int start;
    if (pos < 0) {
      // the sequence number was not found, where it would be inserted is -(pos + 1)
      start = -(pos + 1);
    } else {
      // the sequence number was found, but may not be the first
      // find the first delete file with the given sequence number by decrementing the position
      start = pos;
      while (start > 0 && seqs[start - 1] >= sequenceNumber) {
        start -= 1;
      }
    }

    return Arrays.stream(files, start, files.length);
  }

  static Builder builderFor(FileIO io, Iterable<ManifestFile> deleteManifests) {
    return new Builder(io, Sets.newHashSet(deleteManifests));
  }

  static class Builder {
    private final FileIO io;
    private final Set<ManifestFile> deleteManifests;
    private long minSequenceNumber = 0L;
    private Map<Integer, PartitionSpec> specsById = null;
    private Expression dataFilter = Expressions.alwaysTrue();
    private Expression partitionFilter = Expressions.alwaysTrue();
    private boolean caseSensitive = true;
    private ExecutorService executorService = null;

    Builder(FileIO io, Set<ManifestFile> deleteManifests) {
      this.io = io;
      this.deleteManifests = Sets.newHashSet(deleteManifests);
    }

    Builder afterSequenceNumber(long seq) {
      this.minSequenceNumber = seq;
      return this;
    }

    Builder specsById(Map<Integer, PartitionSpec> newSpecsById) {
      this.specsById = newSpecsById;
      return this;
    }

    Builder filterData(Expression newDataFilter) {
      this.dataFilter = Expressions.and(dataFilter, newDataFilter);
      return this;
    }

    Builder filterPartitions(Expression newPartitionFilter) {
      this.partitionFilter = Expressions.and(partitionFilter, newPartitionFilter);
      return this;
    }

    Builder caseSensitive(boolean newCaseSensitive) {
      this.caseSensitive = newCaseSensitive;
      return this;
    }

    Builder planWith(ExecutorService newExecutorService) {
      this.executorService = newExecutorService;
      return this;
    }

    DeleteFileIndex build() {
      // read all of the matching delete manifests in parallel and accumulate the matching files in a queue
      Queue<ManifestEntry<DeleteFile>> deleteEntries = new ConcurrentLinkedQueue<>();
      Tasks.foreach(deleteManifestReaders())
          .stopOnFailure().throwFailureWhenFinished()
          .executeWith(executorService)
          .run(deleteFile -> {
            try (CloseableIterable<ManifestEntry<DeleteFile>> reader = deleteFile) {
              for (ManifestEntry<DeleteFile> entry : reader) {
                if (entry.sequenceNumber() > minSequenceNumber) {
                  // copy with stats for better filtering against data file stats
                  deleteEntries.add(entry.copy());
                }
              }
            } catch (IOException e) {
              throw new RuntimeIOException(e, "Failed to close");
            }
          });

      // build a map from (specId, partition) to delete file entries
      ListMultimap<Pair<Integer, StructLikeWrapper>, ManifestEntry<DeleteFile>> deleteFilesByPartition =
          Multimaps.newListMultimap(Maps.newHashMap(), Lists::newArrayList);
      for (ManifestEntry<DeleteFile> entry : deleteEntries) {
        int specId = entry.file().specId();
        StructLikeWrapper wrapper = StructLikeWrapper.forType(specsById.get(specId).partitionType())
            .set(entry.file().partition());
        deleteFilesByPartition.put(Pair.of(specId, wrapper), entry);
      }

      // sort the entries in each map value by sequence number and split into sequence numbers and delete files lists
      Map<Pair<Integer, StructLikeWrapper>, Pair<long[], DeleteFile[]>> sortedDeletesByPartition = Maps.newHashMap();
      // also, separate out equality deletes in an unpartitioned spec that should be applied globally
      long[] globalApplySeqs = null;
      DeleteFile[] globalDeletes = null;
      for (Pair<Integer, StructLikeWrapper> partition : deleteFilesByPartition.keySet()) {
        if (specsById.get(partition.first()).isUnpartitioned()) {
          Preconditions.checkState(globalDeletes == null, "Detected multiple partition specs with no partitions");

          List<Pair<Long, DeleteFile>> eqFilesSortedBySeq = deleteFilesByPartition.get(partition).stream()
              .filter(entry -> entry.file().content() == FileContent.EQUALITY_DELETES)
              .map(entry ->
                  // a delete file is indexed by the sequence number it should be applied to
                  Pair.of(entry.sequenceNumber() - 1, entry.file()))
              .sorted(Comparator.comparingLong(Pair::first))
              .collect(Collectors.toList());

          globalApplySeqs = eqFilesSortedBySeq.stream().mapToLong(Pair::first).toArray();
          globalDeletes = eqFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new);

          List<Pair<Long, DeleteFile>> posFilesSortedBySeq = deleteFilesByPartition.get(partition).stream()
              .filter(entry -> entry.file().content() == FileContent.POSITION_DELETES)
              .map(entry -> Pair.of(entry.sequenceNumber(), entry.file()))
              .sorted(Comparator.comparingLong(Pair::first))
              .collect(Collectors.toList());

          long[] seqs = posFilesSortedBySeq.stream().mapToLong(Pair::first).toArray();
          DeleteFile[] files = posFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new);

          sortedDeletesByPartition.put(partition, Pair.of(seqs, files));

        } else {
          List<Pair<Long, DeleteFile>> filesSortedBySeq = deleteFilesByPartition.get(partition).stream()
              .map(entry -> {
                // a delete file is indexed by the sequence number it should be applied to
                long applySeq = entry.sequenceNumber() -
                    (entry.file().content() == FileContent.EQUALITY_DELETES ? 1 : 0);
                return Pair.of(applySeq, entry.file());
              })
              .sorted(Comparator.comparingLong(Pair::first))
              .collect(Collectors.toList());

          long[] seqs = filesSortedBySeq.stream().mapToLong(Pair::first).toArray();
          DeleteFile[] files = filesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new);

          sortedDeletesByPartition.put(partition, Pair.of(seqs, files));
        }
      }

      return new DeleteFileIndex(specsById, globalApplySeqs, globalDeletes, sortedDeletesByPartition);
    }

    private Iterable<CloseableIterable<ManifestEntry<DeleteFile>>> deleteManifestReaders() {
      LoadingCache<Integer, ManifestEvaluator> evalCache = specsById == null ? null :
          Caffeine.newBuilder().build(specId -> {
            PartitionSpec spec = specsById.get(specId);
            return ManifestEvaluator.forPartitionFilter(
                Expressions.and(partitionFilter, Projections.inclusive(spec, caseSensitive).project(dataFilter)),
                spec, caseSensitive);
          });

      Iterable<ManifestFile> matchingManifests = evalCache == null ? deleteManifests :
          Iterables.filter(deleteManifests, manifest ->
              manifest.content() == ManifestContent.DELETES &&
                  (manifest.hasAddedFiles() || manifest.hasExistingFiles()) &&
                  evalCache.get(manifest.partitionSpecId()).eval(manifest));

      return Iterables.transform(
          matchingManifests,
          manifest -> ManifestFiles.readDeleteManifest(manifest, io, specsById)
              .filterRows(dataFilter)
              .filterPartitions(partitionFilter)
              .caseSensitive(caseSensitive)
              .liveEntries()
      );
    }
  }
}
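
For reference, a minimal usage sketch of this class (not part of the shaded artifact). Because DeleteFileIndex and its Builder are package-private, the sketch assumes a caller in the same shaded package, and the class name DeleteFileIndexUsageSketch as well as the parameters (io, deleteManifests, specsById, dataSequenceNumber, dataFile) are hypothetical placeholders for values a scan planner would already have.

package com.netease.arctic.shade.org.apache.iceberg;

import java.util.Map;
import com.netease.arctic.shade.org.apache.iceberg.io.FileIO;

// Hypothetical illustration class, not part of the artifact.
class DeleteFileIndexUsageSketch {
  static DeleteFile[] deletesFor(FileIO io,
                                 Iterable<ManifestFile> deleteManifests,
                                 Map<Integer, PartitionSpec> specsById,
                                 long dataSequenceNumber,
                                 DataFile dataFile) {
    // build the index once per scan: matching delete manifests are read (in parallel when
    // planWith(ExecutorService) is set) and their delete files are grouped by (spec id, partition),
    // sorted by the sequence number at which each delete file starts to apply
    DeleteFileIndex index = DeleteFileIndex.builderFor(io, deleteManifests)
        .specsById(specsById)
        .caseSensitive(true)
        .build();

    // query per data file: returns the position/equality deletes whose apply sequence number
    // is at least the data file's sequence number and whose column stats may overlap it
    return index.forDataFile(dataSequenceNumber, dataFile);
  }
}

The same index can also be queried with forEntry(ManifestEntry) or checked with isEmpty() before planning delete application.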




