All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.paimon.flink.compact.changelog.ChangelogCompactTask Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.compact.changelog;

import org.apache.paimon.data.BinaryRow;
import org.apache.paimon.flink.compact.changelog.format.CompactedChangelogReadOnlyFormat;
import org.apache.paimon.flink.sink.Committable;
import org.apache.paimon.fs.Path;
import org.apache.paimon.fs.PositionOutputStream;
import org.apache.paimon.fs.SeekableInputStream;
import org.apache.paimon.io.CompactIncrement;
import org.apache.paimon.io.DataFileMeta;
import org.apache.paimon.io.DataFilePathFactory;
import org.apache.paimon.io.DataIncrement;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.sink.CommitMessageImpl;
import org.apache.paimon.utils.FileStorePathFactory;
import org.apache.paimon.utils.IOUtils;
import org.apache.paimon.utils.Preconditions;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;

/**
 * {@link ChangelogCompactTask} to compact several changelog files from the same partition into one
 * file, in order to reduce the number of small files.
 */
public class ChangelogCompactTask implements Serializable {
    private final long checkpointId;
    private final BinaryRow partition;
    private final Map> newFileChangelogFiles;
    private final Map> compactChangelogFiles;

    public ChangelogCompactTask(
            long checkpointId,
            BinaryRow partition,
            Map> newFileChangelogFiles,
            Map> compactChangelogFiles) {
        this.checkpointId = checkpointId;
        this.partition = partition;
        this.newFileChangelogFiles = newFileChangelogFiles;
        this.compactChangelogFiles = compactChangelogFiles;
    }

    public long checkpointId() {
        return checkpointId;
    }

    public BinaryRow partition() {
        return partition;
    }

    public Map> newFileChangelogFiles() {
        return newFileChangelogFiles;
    }

    public Map> compactChangelogFiles() {
        return compactChangelogFiles;
    }

    public List doCompact(FileStoreTable table) throws Exception {
        FileStorePathFactory pathFactory = table.store().pathFactory();
        OutputStream outputStream = new OutputStream();
        List results = new ArrayList<>();

        // copy all changelog files to a new big file
        for (Map.Entry> entry : newFileChangelogFiles.entrySet()) {
            int bucket = entry.getKey();
            DataFilePathFactory dataFilePathFactory =
                    pathFactory.createDataFilePathFactory(partition, bucket);
            for (DataFileMeta meta : entry.getValue()) {
                copyFile(
                        outputStream,
                        results,
                        table,
                        dataFilePathFactory.toPath(meta),
                        bucket,
                        false,
                        meta);
            }
        }
        for (Map.Entry> entry : compactChangelogFiles.entrySet()) {
            Integer bucket = entry.getKey();
            DataFilePathFactory dataFilePathFactory =
                    pathFactory.createDataFilePathFactory(partition, bucket);
            for (DataFileMeta meta : entry.getValue()) {
                copyFile(
                        outputStream,
                        results,
                        table,
                        dataFilePathFactory.toPath(meta),
                        bucket,
                        true,
                        meta);
            }
        }
        outputStream.out.close();

        return produceNewCommittables(results, table, pathFactory, outputStream.path);
    }

    private void copyFile(
            OutputStream outputStream,
            List results,
            FileStoreTable table,
            Path path,
            int bucket,
            boolean isCompactResult,
            DataFileMeta meta)
            throws Exception {
        if (!outputStream.isInitialized) {
            Path outputPath =
                    new Path(path.getParent(), "tmp-compacted-changelog-" + UUID.randomUUID());
            outputStream.init(outputPath, table.fileIO().newOutputStream(outputPath, false));
        }
        long offset = outputStream.out.getPos();
        try (SeekableInputStream in = table.fileIO().newInputStream(path)) {
            IOUtils.copyBytes(in, outputStream.out, IOUtils.BLOCKSIZE, false);
        }
        table.fileIO().deleteQuietly(path);
        results.add(
                new Result(
                        bucket, isCompactResult, meta, offset, outputStream.out.getPos() - offset));
    }

    private List produceNewCommittables(
            List results,
            FileStoreTable table,
            FileStorePathFactory pathFactory,
            Path changelogTempPath)
            throws IOException {
        Result baseResult = results.get(0);
        Preconditions.checkArgument(baseResult.offset == 0);
        DataFilePathFactory dataFilePathFactory =
                pathFactory.createDataFilePathFactory(partition, baseResult.bucket);
        // see Java docs of `CompactedChangelogFormatReaderFactory`
        String realName =
                "compacted-changelog-"
                        + UUID.randomUUID()
                        + "$"
                        + baseResult.bucket
                        + "-"
                        + baseResult.length;
        table.fileIO()
                .rename(
                        changelogTempPath,
                        dataFilePathFactory.toAlignedPath(
                                realName
                                        + "."
                                        + CompactedChangelogReadOnlyFormat.getIdentifier(
                                                baseResult.meta.fileFormat()),
                                baseResult.meta));

        List newCommittables = new ArrayList<>();

        Map> bucketedResults = new HashMap<>();
        for (Result result : results) {
            bucketedResults.computeIfAbsent(result.bucket, b -> new ArrayList<>()).add(result);
        }

        for (Map.Entry> entry : bucketedResults.entrySet()) {
            List newFilesChangelog = new ArrayList<>();
            List compactChangelog = new ArrayList<>();
            for (Result result : entry.getValue()) {
                // see Java docs of `CompactedChangelogFormatReaderFactory`
                String name =
                        (result.offset == 0
                                        ? realName
                                        : realName + "-" + result.offset + "-" + result.length)
                                + "."
                                + CompactedChangelogReadOnlyFormat.getIdentifier(
                                        result.meta.fileFormat());
                if (result.isCompactResult) {
                    compactChangelog.add(result.meta.rename(name));
                } else {
                    newFilesChangelog.add(result.meta.rename(name));
                }
            }

            CommitMessageImpl newMessage =
                    new CommitMessageImpl(
                            partition,
                            entry.getKey(),
                            new DataIncrement(
                                    Collections.emptyList(),
                                    Collections.emptyList(),
                                    newFilesChangelog),
                            new CompactIncrement(
                                    Collections.emptyList(),
                                    Collections.emptyList(),
                                    compactChangelog));
            newCommittables.add(new Committable(checkpointId, Committable.Kind.FILE, newMessage));
        }
        return newCommittables;
    }

    public int hashCode() {
        return Objects.hash(checkpointId, partition, newFileChangelogFiles, compactChangelogFiles);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        ChangelogCompactTask that = (ChangelogCompactTask) o;
        return checkpointId == that.checkpointId
                && Objects.equals(partition, that.partition)
                && Objects.equals(newFileChangelogFiles, that.newFileChangelogFiles)
                && Objects.equals(compactChangelogFiles, that.compactChangelogFiles);
    }

    @Override
    public String toString() {
        return String.format(
                "ChangelogCompactionTask {"
                        + "partition = %s, "
                        + "newFileChangelogFiles = %s, "
                        + "compactChangelogFiles = %s}",
                partition, newFileChangelogFiles, compactChangelogFiles);
    }

    private static class OutputStream {

        private Path path;
        private PositionOutputStream out;
        private boolean isInitialized;

        private OutputStream() {
            this.isInitialized = false;
        }

        private void init(Path path, PositionOutputStream out) {
            this.path = path;
            this.out = out;
            this.isInitialized = true;
        }
    }

    private static class Result {

        private final int bucket;
        private final boolean isCompactResult;
        private final DataFileMeta meta;
        private final long offset;
        private final long length;

        private Result(
                int bucket, boolean isCompactResult, DataFileMeta meta, long offset, long length) {
            this.bucket = bucket;
            this.isCompactResult = isCompactResult;
            this.meta = meta;
            this.offset = offset;
            this.length = length;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy