package io.delta.flink.source.internal.file;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import io.delta.flink.source.internal.exceptions.DeltaSourceExceptions;
import io.delta.flink.source.internal.state.DeltaSourceSplit;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.core.fs.BlockLocation;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.delta.standalone.actions.AddFile;
/**
* The implementation of {@link AddFileEnumerator} for {@link DeltaSourceSplit}.
*
* This implementation converts all discovered Delta {@link AddFile} objects to a set of {@link
* DeltaSourceSplit}s. During the conversion, every {@code AddFile} is filtered using a {@link
* SplitFilter}.
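*
* <p>A minimal usage sketch (the {@code context} value is assumed to be prepared elsewhere, and
* the accept-all lambda assumes {@code SplitFilter} is a functional interface):
* <pre>{@code
* AddFileEnumerator<DeltaSourceSplit> enumerator = new DeltaFileEnumerator();
* List<DeltaSourceSplit> splits = enumerator.enumerateSplits(context, path -> true);
* }</pre>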
*/
public class DeltaFileEnumerator implements AddFileEnumerator<DeltaSourceSplit> {
/**
* The directory separator, a slash.
*/
public static final String SEPARATOR = "/";
private static final Logger LOG = LoggerFactory.getLogger(DeltaFileEnumerator.class);
/**
* The current Id as a mutable string representation. This covers more values than the integer
* value range, so we should never overflow.
*/
// This is copied from Flink's NonSplittingRecursiveEnumerator
private final char[] currentId = "0000000000".toCharArray();
/**
* @param context {@link AddFileEnumeratorContext} input object for Split conversion.
* @param splitFilter {@link SplitFilter} instance that will be used to filter out {@link
* AddFile} objects from split conversion. The {@code SplitFilter} operates on the {@link
* Path} created from {@link AddFile#getPath()}.
* @return List of {@link DeltaSourceSplit} objects.
*/
@Override
public List<DeltaSourceSplit> enumerateSplits(
AddFileEnumeratorContext context, SplitFilter<Path> splitFilter) {
ArrayList<DeltaSourceSplit> splitsToReturn = new ArrayList<>(context.getAddFiles().size());
for (AddFile addFile : context.getAddFiles()) {
Path path = acquireFilePath(context.getTablePath(), addFile);
if (splitFilter.test(path)) {
tryConvertToSourceSplits(context, splitsToReturn, addFile, path);
}
}
return splitsToReturn;
}
private void tryConvertToSourceSplits(
AddFileEnumeratorContext context, ArrayList<DeltaSourceSplit> splitsToReturn,
AddFile addFile, Path path) {
try {
FileSystem fs = path.getFileSystem();
FileStatus status = fs.getFileStatus(path);
convertToSourceSplits(status, fs, addFile.getPartitionValues(), splitsToReturn);
} catch (IOException e) {
throw DeltaSourceExceptions.fileEnumerationException(context, path, e);
}
}
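/**
* Resolves the {@link Path} for the given {@link AddFile}. Paths stored in the Delta log can be
* relative to the table root or absolute; a relative path is resolved against the table path.
* For example (the paths here are illustrative assumptions, not taken from the source): a
* relative path "part-0001.snappy.parquet" under table path "s3://bucket/table/" resolves to
* "s3://bucket/table/part-0001.snappy.parquet", while an absolute URI such as
* "hdfs://namenode:8020/warehouse/part-0001.snappy.parquet" is used as-is.
*/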
@VisibleForTesting
Path acquireFilePath(String tablePath, AddFile addFile) {
String addFilePath = addFile.getPath();
URI addFileUri = URI.create(addFilePath);
if (!addFileUri.isAbsolute()) {
addFileUri = URI.create(getTablePath(tablePath) + addFilePath);
}
return new Path(addFileUri);
}
private String getTablePath(String tablePath) {
// When we deserialize DeltaTablePath as a String during recovery,
// Flink's Path(String path) constructor removes the last '/' from the String.
return (tablePath.endsWith(SEPARATOR))
? tablePath : tablePath + SEPARATOR;
}
// ------------------------------------------------------------------------
// Copied from Flink's BlockSplittingRecursiveEnumerator and adjusted.
// ------------------------------------------------------------------------
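/**
* Converts a single file into one or more {@link DeltaSourceSplit} objects. When the file
* system reports consistent block locations, one split is created per block, carrying the
* block's host information for locality-aware assignment; otherwise the entire file becomes a
* single split starting at offset 0.
*/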
private void convertToSourceSplits(final FileStatus fileStatus, final FileSystem fileSystem,
Map<String, String> partitionValues, final List<DeltaSourceSplit> target)
throws IOException {
final BlockLocation[] blocks = getBlockLocationsForFile(fileStatus, fileSystem);
if (blocks == null) {
target.add(
new DeltaSourceSplit(
partitionValues,
getNextId(),
fileStatus.getPath(),
0L,
fileStatus.getLen()));
} else {
for (BlockLocation block : blocks) {
target.add(new DeltaSourceSplit(
partitionValues,
getNextId(),
fileStatus.getPath(),
block.getOffset(),
block.getLength(),
block.getHosts()));
}
}
}
@VisibleForTesting
String getNextId() {
// Because we just increment numbers, we increment the char representation directly
// rather than incrementing an integer and converting it to a string representation
// each time (which would require fairly expensive conversion logic).
incrementCharArrayByOne(currentId, currentId.length - 1);
return new String(currentId);
}
// ------------------------------------------------------------------------
// Copied as is from Flink's BlockSplittingRecursiveEnumerator
// ------------------------------------------------------------------------
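// Example: "0000000009" increments to "0000000010"; the rightmost '9' rolls over to '0'
// and the carry propagates one position to the left.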
private void incrementCharArrayByOne(char[] array, int pos) {
char c = array[pos];
c++;
if (c > '9') {
c = '0';
incrementCharArrayByOne(array, pos - 1);
}
array[pos] = c;
}
/**
* This method tries to get all blocks for the given file from the underlying file system. If
* the total block size does not match the total file size, a warning is logged and the file is
* not split into blocks; it is processed as a single split instead.
*/
private BlockLocation[] getBlockLocationsForFile(FileStatus file, FileSystem fs)
throws IOException {
final long len = file.getLen();
final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
if (blocks == null || blocks.length == 0) {
return null;
}
// A cheap check whether we have all blocks.
// We don't check whether the blocks fully cover the file (too expensive)
// but make some sanity checks to catch early the common cases where incorrect
// block info is returned by the implementation.
long totalLen = 0L;
for (BlockLocation block : blocks) {
totalLen += block.getLength();
}
if (totalLen != len) {
LOG.warn(
"Block lengths do not match file length for {}. File length is {}, blocks are {}",
file.getPath(), len, Arrays.toString(blocks));
return null;
}
return blocks;
}
// ------------------------------------------------------------------------
// End of code copied from Flink's BlockSplittingRecursiveEnumerator
// ------------------------------------------------------------------------
}