
org.apache.iceberg.spark.actions.DeleteOrphanFilesSparkAction Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.actions;

import static org.apache.iceberg.TableProperties.GC_ENABLED;
import static org.apache.iceberg.TableProperties.GC_ENABLED_DEFAULT;

import java.io.IOException;
import java.io.Serializable;
import java.io.UncheckedIOException;
import java.net.URI;
import java.sql.Timestamp;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.actions.ImmutableDeleteOrphanFiles;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.hadoop.HiddenPathFilter;
import org.apache.iceberg.io.BulkDeletionFailureException;
import org.apache.iceberg.io.SupportsBulkOperations;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Strings;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterators;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.Tasks;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.util.SerializableConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

/**
 * An action that removes orphan metadata, data and delete files by listing a given location and
 * comparing the actual files in that location with content and metadata files referenced by all
 * valid snapshots. The location must be accessible for listing via the Hadoop {@link FileSystem}.
 *
 * <p>By default, this action cleans up the table location returned by {@link Table#location()} and
 * removes unreachable files that are older than 3 days using {@link Table#io()}. The behavior can
 * be modified by passing a custom location to {@link #location} and a custom timestamp to {@link
 * #olderThan(long)}. For example, someone might point this action to the data folder to clean up
 * only orphan data files.
 *
 * <p>Configure an alternative delete method using {@link #deleteWith(Consumer)}.
 *
 * <p>For full control of the set of files being evaluated, use the {@link
 * #compareToFileList(Dataset)} argument. This skips the directory listing - any files in the
 * dataset provided which are not found in table metadata will be deleted, using the same {@link
 * Table#location()} and {@link #olderThan(long)} filtering as above.
 *
 * <p>Note: It is dangerous to call this action with a short retention interval as it might
 * corrupt the state of the table if another operation is writing at the same time.
 */
public class DeleteOrphanFilesSparkAction extends BaseSparkAction<DeleteOrphanFilesSparkAction>
    implements DeleteOrphanFiles {

  private static final Logger LOG = LoggerFactory.getLogger(DeleteOrphanFilesSparkAction.class);
  private static final Map<String, String> EQUAL_SCHEMES_DEFAULT = ImmutableMap.of("s3n,s3a", "s3");
  private static final int MAX_DRIVER_LISTING_DEPTH = 3;
  private static final int MAX_DRIVER_LISTING_DIRECT_SUB_DIRS = 10;
  private static final int MAX_EXECUTOR_LISTING_DEPTH = 2000;
  private static final int MAX_EXECUTOR_LISTING_DIRECT_SUB_DIRS = Integer.MAX_VALUE;

  private final SerializableConfiguration hadoopConf;
  private final int listingParallelism;
  private final Table table;
  private Map<String, String> equalSchemes = flattenMap(EQUAL_SCHEMES_DEFAULT);
  private Map<String, String> equalAuthorities = Collections.emptyMap();
  private PrefixMismatchMode prefixMismatchMode = PrefixMismatchMode.ERROR;
  private String location;
  private long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3);
  private Dataset<Row> compareToFileList;
  private Consumer<String> deleteFunc = null;
  private ExecutorService deleteExecutorService = null;

  DeleteOrphanFilesSparkAction(SparkSession spark, Table table) {
    super(spark);

    this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf());
    this.listingParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism();
    this.table = table;
    this.location = table.location();

    ValidationException.check(
        PropertyUtil.propertyAsBoolean(table.properties(), GC_ENABLED, GC_ENABLED_DEFAULT),
        "Cannot delete orphan files: GC is disabled (deleting files may corrupt other tables)");
  }

  @Override
  protected DeleteOrphanFilesSparkAction self() {
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction executeDeleteWith(ExecutorService executorService) {
    this.deleteExecutorService = executorService;
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction prefixMismatchMode(PrefixMismatchMode newPrefixMismatchMode) {
    this.prefixMismatchMode = newPrefixMismatchMode;
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction equalSchemes(Map<String, String> newEqualSchemes) {
    this.equalSchemes = Maps.newHashMap();
    equalSchemes.putAll(flattenMap(EQUAL_SCHEMES_DEFAULT));
    equalSchemes.putAll(flattenMap(newEqualSchemes));
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction equalAuthorities(Map<String, String> newEqualAuthorities) {
    this.equalAuthorities = Maps.newHashMap();
    equalAuthorities.putAll(flattenMap(newEqualAuthorities));
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction location(String newLocation) {
    this.location = newLocation;
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction olderThan(long newOlderThanTimestamp) {
    this.olderThanTimestamp = newOlderThanTimestamp;
    return this;
  }

  @Override
  public DeleteOrphanFilesSparkAction deleteWith(Consumer<String> newDeleteFunc) {
    this.deleteFunc = newDeleteFunc;
    return this;
  }

  public DeleteOrphanFilesSparkAction compareToFileList(Dataset<Row> files) {
    StructType schema = files.schema();

    StructField filePathField = schema.apply(FILE_PATH);
    Preconditions.checkArgument(
        filePathField.dataType() == DataTypes.StringType,
        "Invalid %s column: %s is not a string",
        FILE_PATH,
        filePathField.dataType());

    StructField lastModifiedField = schema.apply(LAST_MODIFIED);
    Preconditions.checkArgument(
        lastModifiedField.dataType() == DataTypes.TimestampType,
        "Invalid %s column: %s is not a timestamp",
        LAST_MODIFIED,
        lastModifiedField.dataType());

    this.compareToFileList = files;
    return this;
  }

  private Dataset<String> filteredCompareToFileList() {
    Dataset<Row> files = compareToFileList;

    if (location != null) {
      files = files.filter(files.col(FILE_PATH).startsWith(location));
    }

    return files
        .filter(files.col(LAST_MODIFIED).lt(new Timestamp(olderThanTimestamp)))
        .select(files.col(FILE_PATH))
        .as(Encoders.STRING());
  }

  @Override
  public DeleteOrphanFiles.Result execute() {
    JobGroupInfo info = newJobGroupInfo("DELETE-ORPHAN-FILES", jobDesc());
    return withJobGroupInfo(info, this::doExecute);
  }

  private String jobDesc() {
    List<String> options = Lists.newArrayList();
    options.add("older_than=" + olderThanTimestamp);
    if (location != null) {
      options.add("location=" + location);
    }
    String optionsAsString = COMMA_JOINER.join(options);
    return String.format("Deleting orphan files (%s) from %s", optionsAsString, table.name());
  }

  private void deleteFiles(SupportsBulkOperations io, List<String> paths) {
    try {
      io.deleteFiles(paths);
      LOG.info("Deleted {} files using bulk deletes", paths.size());
    } catch (BulkDeletionFailureException e) {
      int deletedFilesCount = paths.size() - e.numberFailedObjects();
      LOG.warn("Deleted only {} of {} files using bulk deletes", deletedFilesCount, paths.size());
    }
  }

  private DeleteOrphanFiles.Result doExecute() {
    Dataset<FileURI> actualFileIdentDS = actualFileIdentDS();
    Dataset<FileURI> validFileIdentDS = validFileIdentDS();

    List<String> orphanFiles =
        findOrphanFiles(spark(), actualFileIdentDS, validFileIdentDS, prefixMismatchMode);

    if (deleteFunc == null && table.io() instanceof SupportsBulkOperations) {
      deleteFiles((SupportsBulkOperations) table.io(), orphanFiles);
    } else {
      Tasks.Builder<String> deleteTasks =
          Tasks.foreach(orphanFiles)
              .noRetry()
              .executeWith(deleteExecutorService)
              .suppressFailureWhenFinished()
              .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc));

      if (deleteFunc == null) {
        LOG.info(
            "Table IO {} does not support bulk operations. Using non-bulk deletes.",
            table.io().getClass().getName());
        deleteTasks.run(table.io()::deleteFile);
      } else {
        LOG.info("Custom delete function provided. Using non-bulk deletes");
        deleteTasks.run(deleteFunc::accept);
      }
    }

    return ImmutableDeleteOrphanFiles.Result.builder().orphanFileLocations(orphanFiles).build();
  }

  private Dataset<FileURI> validFileIdentDS() {
    // transform before union to avoid extra serialization/deserialization
    FileInfoToFileURI toFileURI = new FileInfoToFileURI(equalSchemes, equalAuthorities);

    Dataset<FileURI> contentFileIdentDS = toFileURI.apply(contentFileDS(table));
    Dataset<FileURI> manifestFileIdentDS = toFileURI.apply(manifestDS(table));
    Dataset<FileURI> manifestListIdentDS = toFileURI.apply(manifestListDS(table));
    Dataset<FileURI> otherMetadataFileIdentDS = toFileURI.apply(otherMetadataFileDS(table));

    return contentFileIdentDS
        .union(manifestFileIdentDS)
        .union(manifestListIdentDS)
        .union(otherMetadataFileIdentDS);
  }

  private Dataset<FileURI> actualFileIdentDS() {
    StringToFileURI toFileURI = new StringToFileURI(equalSchemes, equalAuthorities);
    if (compareToFileList == null) {
      return toFileURI.apply(listedFileDS());
    } else {
      return toFileURI.apply(filteredCompareToFileList());
    }
  }

  private Dataset<String> listedFileDS() {
    List<String> subDirs = Lists.newArrayList();
    List<String> matchingFiles = Lists.newArrayList();

    Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;
    PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs());

    // list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have
    // less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver
    listDirRecursively(
        location,
        predicate,
        hadoopConf.value(),
        MAX_DRIVER_LISTING_DEPTH,
        MAX_DRIVER_LISTING_DIRECT_SUB_DIRS,
        subDirs,
        pathFilter,
        matchingFiles);

    JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1);

    if (subDirs.isEmpty()) {
      return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING());
    }

    int parallelism = Math.min(subDirs.size(), listingParallelism);
    JavaRDD<String> subDirRDD = sparkContext().parallelize(subDirs, parallelism);

    Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf);
    ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter);
    JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs);

    JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
    return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING());
  }

  private static void listDirRecursively(
      String dir,
      Predicate<FileStatus> predicate,
      Configuration conf,
      int maxDepth,
      int maxDirectSubDirs,
      List<String> remainingSubDirs,
      PathFilter pathFilter,
      List<String> matchingFiles) {

    // stop listing whenever we reach the max depth
    if (maxDepth <= 0) {
      remainingSubDirs.add(dir);
      return;
    }

    try {
      Path path = new Path(dir);
      FileSystem fs = path.getFileSystem(conf);

      List<String> subDirs = Lists.newArrayList();

      for (FileStatus file : fs.listStatus(path, pathFilter)) {
        if (file.isDirectory()) {
          subDirs.add(file.getPath().toString());
        } else if (file.isFile() && predicate.test(file)) {
          matchingFiles.add(file.getPath().toString());
        }
      }

      // stop listing if the number of direct sub dirs is bigger than maxDirectSubDirs
      if (subDirs.size() > maxDirectSubDirs) {
        remainingSubDirs.addAll(subDirs);
        return;
      }

      for (String subDir : subDirs) {
        listDirRecursively(
            subDir,
            predicate,
            conf,
            maxDepth - 1,
            maxDirectSubDirs,
            remainingSubDirs,
            pathFilter,
            matchingFiles);
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  @VisibleForTesting
  static List<String> findOrphanFiles(
      SparkSession spark,
      Dataset<FileURI> actualFileIdentDS,
      Dataset<FileURI> validFileIdentDS,
      PrefixMismatchMode prefixMismatchMode) {
    SetAccumulator<Pair<String, String>> conflicts = new SetAccumulator<>();
    spark.sparkContext().register(conflicts);

    Column joinCond = actualFileIdentDS.col("path").equalTo(validFileIdentDS.col("path"));

    List<String> orphanFiles =
        actualFileIdentDS
            .joinWith(validFileIdentDS, joinCond, "leftouter")
            .mapPartitions(new FindOrphanFiles(prefixMismatchMode, conflicts), Encoders.STRING())
            .collectAsList();

    if (prefixMismatchMode == PrefixMismatchMode.ERROR && !conflicts.value().isEmpty()) {
      throw new ValidationException(
          "Unable to determine whether certain files are orphan. "
              + "Metadata references files that match listed/provided files except for authority/scheme. "
              + "Please, inspect the conflicting authorities/schemes and provide which of them are equal "
              + "by further configuring the action via equalSchemes() and equalAuthorities() methods. "
              + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting "
              + "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting "
              + "authorities/schemes are different. It will be impossible to recover deleted files. "
              + "Conflicting authorities/schemes: %s.",
          conflicts.value());
    }

    return orphanFiles;
  }

  private static Map<String, String> flattenMap(Map<String, String> map) {
    Map<String, String> flattenedMap = Maps.newHashMap();
    if (map != null) {
      for (String key : map.keySet()) {
        String value = map.get(key);
        for (String splitKey : COMMA_SPLITTER.split(key)) {
          flattenedMap.put(splitKey.trim(), value.trim());
        }
      }
    }
    return flattenedMap;
  }

  private static class ListDirsRecursively implements FlatMapFunction<Iterator<String>, String> {

    private final Broadcast<SerializableConfiguration> hadoopConf;
    private final long olderThanTimestamp;
    private final PathFilter pathFilter;

    ListDirsRecursively(
        Broadcast<SerializableConfiguration> hadoopConf,
        long olderThanTimestamp,
        PathFilter pathFilter) {
      this.hadoopConf = hadoopConf;
      this.olderThanTimestamp = olderThanTimestamp;
      this.pathFilter = pathFilter;
    }

    @Override
    public Iterator<String> call(Iterator<String> dirs) throws Exception {
      List<String> subDirs = Lists.newArrayList();
      List<String> files = Lists.newArrayList();

      Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

      while (dirs.hasNext()) {
        listDirRecursively(
            dirs.next(),
            predicate,
            hadoopConf.value().value(),
            MAX_EXECUTOR_LISTING_DEPTH,
            MAX_EXECUTOR_LISTING_DIRECT_SUB_DIRS,
            subDirs,
            pathFilter,
            files);
      }

      if (!subDirs.isEmpty()) {
        throw new RuntimeException(
            "Could not list sub directories, reached maximum depth: " + MAX_EXECUTOR_LISTING_DEPTH);
      }

      return files.iterator();
    }
  }

  private static class FindOrphanFiles
      implements MapPartitionsFunction<Tuple2<FileURI, FileURI>, String> {

    private final PrefixMismatchMode mode;
    private final SetAccumulator<Pair<String, String>> conflicts;

    FindOrphanFiles(PrefixMismatchMode mode, SetAccumulator<Pair<String, String>> conflicts) {
      this.mode = mode;
      this.conflicts = conflicts;
    }

    @Override
    public Iterator<String> call(Iterator<Tuple2<FileURI, FileURI>> rows) throws Exception {
      Iterator<String> orphanFiles = Iterators.transform(rows, this::toOrphanFile);
      return Iterators.filter(orphanFiles, Objects::nonNull);
    }

    private String toOrphanFile(Tuple2<FileURI, FileURI> row) {
      FileURI actual = row._1;
      FileURI valid = row._2;

      if (valid == null) {
        return actual.uriAsString;
      }

      boolean schemeMatch = uriComponentMatch(valid.scheme, actual.scheme);
      boolean authorityMatch = uriComponentMatch(valid.authority, actual.authority);

      if ((!schemeMatch || !authorityMatch) && mode == PrefixMismatchMode.DELETE) {
        return actual.uriAsString;
      } else {
        if (!schemeMatch) {
          conflicts.add(Pair.of(valid.scheme, actual.scheme));
        }

        if (!authorityMatch) {
          conflicts.add(Pair.of(valid.authority, actual.authority));
        }

        return null;
      }
    }

    private boolean uriComponentMatch(String valid, String actual) {
      return Strings.isNullOrEmpty(valid) || valid.equalsIgnoreCase(actual);
    }
  }

  @VisibleForTesting
  static class StringToFileURI extends ToFileURI<String> {
    StringToFileURI(Map<String, String> equalSchemes, Map<String, String> equalAuthorities) {
      super(equalSchemes, equalAuthorities);
    }

    @Override
    protected String uriAsString(String input) {
      return input;
    }
  }

  @VisibleForTesting
  static class FileInfoToFileURI extends ToFileURI<FileInfo> {
    FileInfoToFileURI(Map<String, String> equalSchemes, Map<String, String> equalAuthorities) {
      super(equalSchemes, equalAuthorities);
    }

    @Override
    protected String uriAsString(FileInfo fileInfo) {
      return fileInfo.getPath();
    }
  }

  private abstract static class ToFileURI<I> implements MapPartitionsFunction<I, FileURI> {

    private final Map<String, String> equalSchemes;
    private final Map<String, String> equalAuthorities;

    ToFileURI(Map<String, String> equalSchemes, Map<String, String> equalAuthorities) {
      this.equalSchemes = equalSchemes;
      this.equalAuthorities = equalAuthorities;
    }

    protected abstract String uriAsString(I input);

    Dataset<FileURI> apply(Dataset<I> ds) {
      return ds.mapPartitions(this, FileURI.ENCODER);
    }

    @Override
    public Iterator<FileURI> call(Iterator<I> rows) throws Exception {
      return Iterators.transform(rows, this::toFileURI);
    }

    private FileURI toFileURI(I input) {
      String uriAsString = uriAsString(input);
      URI uri = new Path(uriAsString).toUri();
      String scheme = equalSchemes.getOrDefault(uri.getScheme(), uri.getScheme());
      String authority = equalAuthorities.getOrDefault(uri.getAuthority(), uri.getAuthority());
      return new FileURI(scheme, authority, uri.getPath(), uriAsString);
    }
  }

  /**
   * A {@link PathFilter} that filters out hidden paths, but does not filter out paths that would
   * be marked as hidden by {@link HiddenPathFilter} due to a partition field that starts with one
   * of the characters that indicate a hidden path.
   */
  @VisibleForTesting
  static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable {

    private final Set<String> hiddenPathPartitionNames;

    PartitionAwareHiddenPathFilter(Set<String> hiddenPathPartitionNames) {
      this.hiddenPathPartitionNames = hiddenPathPartitionNames;
    }

    @Override
    public boolean accept(Path path) {
      return isHiddenPartitionPath(path) || HiddenPathFilter.get().accept(path);
    }

    private boolean isHiddenPartitionPath(Path path) {
      return hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith);
    }

    static PathFilter forSpecs(Map<Integer, PartitionSpec> specs) {
      if (specs == null) {
        return HiddenPathFilter.get();
      }

      Set<String> partitionNames =
          specs.values().stream()
              .map(PartitionSpec::fields)
              .flatMap(List::stream)
              .filter(field -> field.name().startsWith("_") || field.name().startsWith("."))
              .map(field -> field.name() + "=")
              .collect(Collectors.toSet());

      if (partitionNames.isEmpty()) {
        return HiddenPathFilter.get();
      } else {
        return new PartitionAwareHiddenPathFilter(partitionNames);
      }
    }
  }

  public static class FileURI {
    public static final Encoder<FileURI> ENCODER = Encoders.bean(FileURI.class);

    private String scheme;
    private String authority;
    private String path;
    private String uriAsString;

    public FileURI(String scheme, String authority, String path, String uriAsString) {
      this.scheme = scheme;
      this.authority = authority;
      this.path = path;
      this.uriAsString = uriAsString;
    }

    public FileURI() {}

    public void setScheme(String scheme) {
      this.scheme = scheme;
    }

    public void setAuthority(String authority) {
      this.authority = authority;
    }

    public void setPath(String path) {
      this.path = path;
    }

    public void setUriAsString(String uriAsString) {
      this.uriAsString = uriAsString;
    }

    public String getScheme() {
      return scheme;
    }

    public String getAuthority() {
      return authority;
    }

    public String getPath() {
      return path;
    }

    public String getUriAsString() {
      return uriAsString;
    }
  }
}
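
For orientation, below is a minimal usage sketch of this action. It assumes the SparkActions facade from the same module as the entry point; the example class name, the table identifier "db.events", and the logging-only delete function are hypothetical placeholders, so adapt them to your own catalog setup before running.

import java.util.concurrent.TimeUnit;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class DeleteOrphanFilesExample {
  public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().appName("delete-orphan-files").getOrCreate();

    // Hypothetical table identifier; resolve the Iceberg Table however your catalog is configured.
    Table table = Spark3Util.loadIcebergTable(spark, "db.events");

    DeleteOrphanFiles.Result result =
        SparkActions.get(spark)
            .deleteOrphanFiles(table)
            // keep files modified within the last 7 days (the action defaults to 3 days)
            .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
            // log instead of deleting: a custom delete function turns this into a dry run
            .deleteWith(path -> System.out.println("Would delete: " + path))
            .execute();

    result.orphanFileLocations().forEach(path -> System.out.println("Orphan: " + path));

    spark.stop();
  }
}

If the run fails with the prefix-mismatch ValidationException shown in findOrphanFiles above, the equalSchemes() and equalAuthorities() builder methods are the knobs that message points you to for declaring which URI schemes and authorities should be treated as equivalent.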