All downloads are free. Search and download functionality uses the official Maven repository.

com.facebook.presto.iceberg.procedure.RemoveOrphanFiles Maven / Gradle / Ivy

The newest version!
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.iceberg.procedure;

import com.facebook.airlift.log.Logger;
import com.facebook.presto.common.type.SqlTimestamp;
import com.facebook.presto.hive.HdfsContext;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.iceberg.IcebergAbstractMetadata;
import com.facebook.presto.iceberg.IcebergMetadataFactory;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.SchemaTableName;
import com.facebook.presto.spi.classloader.ThreadContextClassLoader;
import com.facebook.presto.spi.procedure.Procedure;
import com.facebook.presto.spi.procedure.Procedure.Argument;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.ManifestFiles;
import org.apache.iceberg.ManifestReader;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;

import javax.inject.Inject;
import javax.inject.Provider;

import java.io.IOException;
import java.lang.invoke.MethodHandle;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import static com.facebook.presto.common.block.MethodHandleUtil.methodHandle;
import static com.facebook.presto.common.type.StandardTypes.TIMESTAMP;
import static com.facebook.presto.common.type.StandardTypes.VARCHAR;
import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR;
import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_UNKNOWN_MANIFEST_TYPE;
import static com.facebook.presto.iceberg.IcebergUtil.dataLocation;
import static com.facebook.presto.iceberg.IcebergUtil.getIcebergTable;
import static com.facebook.presto.iceberg.IcebergUtil.metadataLocation;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.DAYS;
import static org.apache.hadoop.fs.Path.SEPARATOR;
import static org.apache.iceberg.ReachableFileUtil.metadataFileLocations;
import static org.apache.iceberg.ReachableFileUtil.statisticsFilesLocations;

public class RemoveOrphanFiles
        implements Provider
{
    private static final int REMOVE_UNUSED_FILES_OLDER_THAN_IN_DAYS = 3;
    private static final int BATCH_DELETE_FILES_COUNT = 100;
    private static final Logger LOGGER = Logger.get(RemoveOrphanFiles.class);
    private static final MethodHandle DELETE_ORPHAN_FILES = methodHandle(
            RemoveOrphanFiles.class,
            "removeOrphanFiles",
            ConnectorSession.class,
            String.class,
            String.class,
            SqlTimestamp.class);
    private final IcebergMetadataFactory metadataFactory;
    private final HdfsEnvironment hdfsEnvironment;

    @Inject
    public RemoveOrphanFiles(IcebergMetadataFactory metadataFactory,
                             HdfsEnvironment hdfsEnvironment)
    {
        this.metadataFactory = requireNonNull(metadataFactory, "metadataFactory is null");
        this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
    }

    @Override
    public Procedure get()
    {
        return new Procedure(
                "system",
                "remove_orphan_files",
                ImmutableList.of(
                        new Argument("schema", VARCHAR),
                        new Argument("table_name", VARCHAR),
                        new Argument("older_than", TIMESTAMP, false, null)),
                DELETE_ORPHAN_FILES.bindTo(this));
    }

    public void removeOrphanFiles(ConnectorSession clientSession, String schema, String tableName, SqlTimestamp olderThan)
    {
        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
            doRemoveOrphanFiles(clientSession, schema, tableName, olderThan);
        }
    }

    private void doRemoveOrphanFiles(ConnectorSession clientSession, String schema, String tableName, SqlTimestamp olderThan)
    {
        IcebergAbstractMetadata metadata = (IcebergAbstractMetadata) metadataFactory.create();
        SchemaTableName schemaTableName = new SchemaTableName(schema, tableName);
        Table icebergTable = getIcebergTable(metadata, clientSession, schemaTableName);

        Set processedManifestFilePaths = new HashSet<>();
        ImmutableSet.Builder validMetadataFileNames = ImmutableSet.builder();
        ImmutableSet.Builder validDataFileNames = ImmutableSet.builder();

        for (Snapshot snapshot : icebergTable.snapshots()) {
            if (snapshot.manifestListLocation() != null) {
                validMetadataFileNames.add(extractFileName(snapshot.manifestListLocation()));
            }

            for (ManifestFile manifest : snapshot.allManifests(icebergTable.io())) {
                if (!processedManifestFilePaths.add(manifest.path())) {
                    // Already read this manifest
                    continue;
                }

                validMetadataFileNames.add(extractFileName(manifest.path()));
                try (ManifestReader> manifestReader = readerForManifest(icebergTable, manifest)) {
                    for (ContentFile contentFile : manifestReader) {
                        validDataFileNames.add(extractFileName(contentFile.path().toString()));
                    }
                }
                catch (IOException e) {
                    throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, "Unable to list manifest file content from " + manifest.path(), e);
                }
            }
        }

        metadataFileLocations(icebergTable, false).stream()
                .map(RemoveOrphanFiles::extractFileName)
                .forEach(validMetadataFileNames::add);

        statisticsFilesLocations(icebergTable).stream()
                .map(RemoveOrphanFiles::extractFileName)
                .forEach(validMetadataFileNames::add);

        // Always reserve `version-hint.text` as it's a shortcut to find the newest version
        validMetadataFileNames.add("version-hint.text");

        // Remove unused metadata and data files older than 3 days by default
        // This default value is consistent with Spark procedure `remove_orphan_files` on Iceberg, see:
        //  https://iceberg.apache.org/docs/1.5.2/spark-procedures/#remove_orphan_files
        long expiration = olderThan == null ?
                System.currentTimeMillis() - DAYS.toMillis(REMOVE_UNUSED_FILES_OLDER_THAN_IN_DAYS) :
                olderThan.isLegacyTimestamp() ? olderThan.getMillisUtc() : olderThan.getMillis();
        scanAndDeleteInvalidFiles(icebergTable, clientSession, schemaTableName, expiration, validDataFileNames.build(), dataLocation(icebergTable));
        scanAndDeleteInvalidFiles(icebergTable, clientSession, schemaTableName, expiration, validMetadataFileNames.build(), metadataLocation(icebergTable));
    }

    private void scanAndDeleteInvalidFiles(Table table, ConnectorSession session, SchemaTableName schemaTableName, long expiration, Set validFiles, String folderFullPath)
    {
        try {
            List filesToDelete = new ArrayList<>();
            FileSystem fileSystem = getFileSystem(session, this.hdfsEnvironment, schemaTableName, new Path(table.location()));
            Path fullPath = new Path(folderFullPath);
            if (!fileSystem.exists(fullPath)) {
                return;
            }
            RemoteIterator allFiles = fileSystem.listFiles(fullPath, true);
            while (allFiles.hasNext()) {
                LocatedFileStatus entry = allFiles.next();
                if (entry.getModificationTime() <= expiration && !validFiles.contains(entry.getPath().getName())) {
                    filesToDelete.add(entry.getPath().toString());
                    if (filesToDelete.size() >= BATCH_DELETE_FILES_COUNT) {
                        LOGGER.debug("Deleting files while removing orphan files for table %s : %s", schemaTableName, filesToDelete);
                        CatalogUtil.deleteFiles(table.io(), filesToDelete, folderFullPath, true);
                        filesToDelete.clear();
                    }
                }
                else {
                    LOGGER.debug("%s retained while removing orphan files %s", entry.getPath().toString(), schemaTableName.getTableName());
                }
            }

            if (!filesToDelete.isEmpty()) {
                LOGGER.debug("Deleting files while removing orphan files for table %s : %s", schemaTableName, filesToDelete);
                CatalogUtil.deleteFiles(table.io(), filesToDelete, folderFullPath, true);
            }
        }
        catch (IOException e) {
            throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, "Failed accessing data for table: " + schemaTableName, e);
        }
    }

    private static String extractFileName(String path)
    {
        return path.substring(path.lastIndexOf(SEPARATOR) + 1);
    }

    private static FileSystem getFileSystem(ConnectorSession clientSession, HdfsEnvironment hdfsEnvironment, SchemaTableName schemaTableName, Path location)
    {
        HdfsContext hdfsContext = new HdfsContext(
                clientSession,
                schemaTableName.getSchemaName(),
                schemaTableName.getTableName(),
                location.getName(),
                true);

        try {
            return hdfsEnvironment.getFileSystem(hdfsContext, location);
        }
        catch (Exception e) {
            throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, format("Error getting file system at path %s", location), e);
        }
    }

    private static ManifestReader> readerForManifest(Table table, ManifestFile manifest)
    {
        switch (manifest.content()) {
            case DATA:
                return ManifestFiles.read(manifest, table.io());
            case DELETES:
                ManifestFiles.readDeleteManifest(manifest, table.io(), table.specs());
            default:
                throw new PrestoException(ICEBERG_UNKNOWN_MANIFEST_TYPE, "Unknown manifest file content: " + manifest.content());
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy