/*
* Copyright (C) 2023-2024 Hedera Hashgraph, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.swirlds.merkledb.files;
import static com.swirlds.logging.legacy.LogMarker.EXCEPTION;
import static com.swirlds.logging.legacy.LogMarker.MERKLE_DB;
import static com.swirlds.merkledb.files.DataFileCommon.formatSizeBytes;
import static com.swirlds.merkledb.files.DataFileCommon.getSizeOfFiles;
import static com.swirlds.merkledb.files.DataFileCommon.getSizeOfFilesByPath;
import static com.swirlds.merkledb.files.DataFileCommon.logCompactStats;
import com.hedera.pbj.runtime.io.buffer.BufferedData;
import com.swirlds.base.units.UnitConstants;
import com.swirlds.merkledb.KeyRange;
import com.swirlds.merkledb.collections.CASableLongIndex;
import com.swirlds.merkledb.config.MerkleDbConfig;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* This class is responsible for performing compaction of data files in a {@link DataFileCollection}.
* The compaction is supposed to happen in the background and can be paused and resumed with {@link #pauseCompaction()}
* and {@link #resumeCompaction()} to prevent compaction from interfering with snapshots.
*/
public class DataFileCompactor {
private static final Logger logger = LogManager.getLogger(DataFileCompactor.class);
/**
* This is the compaction level that non-compacted files have.
*/
public static final int INITIAL_COMPACTION_LEVEL = 0;
private final MerkleDbConfig dbConfig;
/**
* Name of the file store to compact.
*/
private final String storeName;
/**
* The data file collection to compact
*/
private final DataFileCollection dataFileCollection;
/**
* Index to update during compaction
*/
private final CASableLongIndex index;
/**
* A function that will be called to report the duration of the compaction
*/
@Nullable
private final BiConsumer<Integer, Long> reportDurationMetricFunction;
/**
* A function that will be called to report the amount of space saved by the compaction
*/
@Nullable
private final BiConsumer<Integer, Double> reportSavedSpaceMetricFunction;
/**
* A function that will be called to report the size of the store files per compaction level
*/
@Nullable
private final BiConsumer<Integer, Double> reportFileSizeByLevelMetricFunction;
/**
* A function that updates statistics of total usage of disk space and off-heap space
*/
@Nullable
private final Runnable updateTotalStatsFunction;
/**
* A lock used for synchronization between snapshots and compactions. While a compaction is in
* progress, it runs on its own without any synchronization. However, a few critical sections
* are protected with this lock: to create a new compaction writer/reader when compaction is
* started, to copy data items to the current writer and update the corresponding index item,
* and to close the compaction writer. This mechanism allows snapshots to effectively put
* compaction on hold, which is critical as snapshots should be as fast as possible, while
* compactions are just background processes.
*/
private final Semaphore snapshotCompactionLock = new Semaphore(1);
/**
* Start time of the current compaction, or null if compaction isn't running
*/
private final AtomicReference<Instant> currentCompactionStartTime = new AtomicReference<>();
/**
* Current data file writer during compaction, or null if compaction isn't running. The writer
* is created at compaction start. If compaction is interrupted by a snapshot, the writer is
* closed before the snapshot, and then a new writer / new file is created after the snapshot is
* taken.
*/
private final AtomicReference<DataFileWriter> currentWriter = new AtomicReference<>();
/**
* Current data file reader for the compaction writer above.
*/
private final AtomicReference<DataFileReader> currentReader = new AtomicReference<>();
/**
* The list of new files created during compaction. Usually, all files to process are compacted
* to a single new file, but if compaction is interrupted by a snapshot, there may be more than
* one file created.
*/
private final List<Path> newCompactedFiles = new ArrayList<>();
/**
* Indicates whether compaction is in progress at the time when {@link #pauseCompaction()}
* is called. This flag is then checked in {@link DataFileCompactor#resumeCompaction()} to decide
* whether to start a new compacted file.
*/
private final AtomicBoolean compactionWasInProgress = new AtomicBoolean(false);
/**
* This variable keeps track of the compaction level that was in progress at the time when it was suspended.
* Once the compaction is resumed, this level is used to start a new compacted file, and then it's reset to 0.
*/
private final AtomicInteger compactionLevelInProgress = new AtomicInteger(0);
/**
* @param dbConfig MerkleDb config
* @param storeName name of the store to compact
* @param dataFileCollection data file collection to compact
* @param index index to update during compaction
* @param reportDurationMetricFunction function to report how long compaction took, in ms
* @param reportSavedSpaceMetricFunction function to report how much space was saved by compaction, in Mb
* @param reportFileSizeByLevelMetricFunction function to report how much space is used by the store per compaction level, in Mb
* @param updateTotalStatsFunction A function that updates statistics of total usage of disk space and off-heap space
*/
public DataFileCompactor(
final MerkleDbConfig dbConfig,
final String storeName,
final DataFileCollection dataFileCollection,
CASableLongIndex index,
@Nullable final BiConsumer<Integer, Long> reportDurationMetricFunction,
@Nullable final BiConsumer<Integer, Double> reportSavedSpaceMetricFunction,
@Nullable final BiConsumer<Integer, Double> reportFileSizeByLevelMetricFunction,
@Nullable Runnable updateTotalStatsFunction) {
this.dbConfig = dbConfig;
this.storeName = storeName;
this.dataFileCollection = dataFileCollection;
this.index = index;
this.reportDurationMetricFunction = reportDurationMetricFunction;
this.reportSavedSpaceMetricFunction = reportSavedSpaceMetricFunction;
this.reportFileSizeByLevelMetricFunction = reportFileSizeByLevelMetricFunction;
this.updateTotalStatsFunction = updateTotalStatsFunction;
}
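// Illustrative construction sketch: dbConfig, dataFileCollection, and index are assumed to be
// provided by the surrounding MerkleDb code, the store name is hypothetical, and the metric
// callbacks are optional (they may be null).
//
//     final DataFileCompactor compactor = new DataFileCompactor(
//             dbConfig,                                      // MerkleDbConfig
//             "objectKeyToPath",                             // hypothetical store name
//             dataFileCollection,                            // DataFileCollection to compact
//             index,                                         // CASableLongIndex to update
//             (level, millis) -> { /* report duration, ms */ },
//             (level, mb) -> { /* report saved space, Mb */ },
//             (level, mb) -> { /* report file size by level, Mb */ },
//             null);                                         // no total-stats callback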
/**
* Compacts all the given files into a new file (or several files, if interrupted by snapshots).
*
* @param index index to update with new data locations. Once this method returns, it is assumed
* that readers will no longer be looking at the old locations, so the old files can be
* safely deleted.
* @param filesToCompact list of files to compact
* @param targetCompactionLevel target compaction level
* @return list of files created during the compaction
* @throws IOException If there was a problem with the compaction
* @throws InterruptedException If the compaction thread was interrupted
*/
synchronized List<Path> compactFiles(
final CASableLongIndex index,
final List<? extends DataFileReader> filesToCompact,
final int targetCompactionLevel)
throws IOException, InterruptedException {
if (filesToCompact.size() < getMinNumberOfFilesToCompact()) {
// nothing to do, the files have already been merged since the last data update
logger.debug(MERKLE_DB.getMarker(), "No files were available for merging [{}]", storeName);
return Collections.emptyList();
}
// create a merge time stamp, this timestamp is the newest time of the set of files we are
// merging
final Instant startTime = filesToCompact.stream()
.map(file -> file.getMetadata().getCreationDate())
.max(Instant::compareTo)
.orElseGet(Instant::now);
snapshotCompactionLock.acquire();
try {
currentCompactionStartTime.set(startTime);
newCompactedFiles.clear();
startNewCompactionFile(targetCompactionLevel);
} finally {
snapshotCompactionLock.release();
}
// We need a map to find readers by file index below. It doesn't have to be synchronized
// as it will be accessed in this thread only, so it can be a simple HashMap or alike.
// However, standard Java maps can only work with Integer, not int (yet), so auto-boxing
// will put significant load on GC. Let's do something different
int minFileIndex = Integer.MAX_VALUE;
int maxFileIndex = 0;
for (final DataFileReader r : filesToCompact) {
minFileIndex = Math.min(minFileIndex, r.getIndex());
maxFileIndex = Math.max(maxFileIndex, r.getIndex());
}
final int firstIndexInc = minFileIndex;
final int lastIndexExc = maxFileIndex + 1;
final DataFileReader[] readers = new DataFileReader[lastIndexExc - firstIndexInc];
for (DataFileReader r : filesToCompact) {
readers[r.getIndex() - firstIndexInc] = r;
}
boolean allDataItemsProcessed = false;
try {
final KeyRange keyRange = dataFileCollection.getValidKeyRange();
index.forEach((path, dataLocation) -> {
if (!keyRange.withinRange(path)) {
return;
}
final int fileIndex = DataFileCommon.fileIndexFromDataLocation(dataLocation);
if ((fileIndex < firstIndexInc) || (fileIndex >= lastIndexExc)) {
return;
}
final DataFileReader reader = readers[fileIndex - firstIndexInc];
if (reader == null) {
return;
}
final long fileOffset = DataFileCommon.byteOffsetFromDataLocation(dataLocation);
// Take the lock. If a snapshot is started in a different thread, this call
// will block until the snapshot is done. The current file will be flushed,
// and current data file writer and reader will point to a new file
snapshotCompactionLock.acquire();
try {
final DataFileWriter newFileWriter = currentWriter.get();
final BufferedData itemBytes = reader.readDataItem(fileOffset);
assert itemBytes != null;
long newLocation = newFileWriter.storeDataItem(itemBytes);
// update the index
index.putIfEqual(path, dataLocation, newLocation);
} catch (final ClosedByInterruptException e) {
logger.info(
MERKLE_DB.getMarker(),
"Failed to copy data item {} / {} due to thread interruption",
fileIndex,
fileOffset,
e);
throw e;
} catch (final IOException z) {
logger.error(EXCEPTION.getMarker(), "Failed to copy data item {} / {}", fileIndex, fileOffset, z);
throw z;
} finally {
snapshotCompactionLock.release();
}
});
allDataItemsProcessed = true;
} finally {
// Even if the thread is interrupted, make sure the new compacted file is properly closed
// and is included in future compactions
snapshotCompactionLock.acquire();
try {
// Finish writing the last file. In rare cases, it may be an empty file
finishCurrentCompactionFile();
// Clear compaction start time
currentCompactionStartTime.set(null);
if (allDataItemsProcessed) {
// Close the readers and delete compacted files
dataFileCollection.deleteFiles(filesToCompact);
}
} finally {
snapshotCompactionLock.release();
}
}
return newCompactedFiles;
}
// visible for testing
int getMinNumberOfFilesToCompact() {
return dbConfig.minNumberOfFilesInCompaction();
}
/**
* Opens a new file for writing during compaction. This method is called when compaction is
* started. If compaction is interrupted and then resumed by a data source snapshot using {@link
* #pauseCompaction()} and {@link #resumeCompaction()}, a new file is created for writing with
* this method before compaction is resumed.
*
* This method must be called under snapshot/compaction lock.
*
* @throws IOException If an I/O error occurs
*/
private void startNewCompactionFile(int compactionLevel) throws IOException {
final Instant startTime = currentCompactionStartTime.get();
assert startTime != null;
final DataFileWriter newFileWriter = dataFileCollection.newDataFile(startTime, compactionLevel);
currentWriter.set(newFileWriter);
final Path newFileCreated = newFileWriter.getPath();
newCompactedFiles.add(newFileCreated);
final DataFileMetadata newFileMetadata = newFileWriter.getMetadata();
final DataFileReader newFileReader = dataFileCollection.addNewDataFileReader(newFileCreated, newFileMetadata);
currentReader.set(newFileReader);
}
/**
* Closes the current compaction file. This method is called at the end of the compaction process,
* and also before a snapshot is taken, to make sure the current file is fully written and safe
* to include in snapshots.
*
* This method must be called under snapshot/compaction lock.
*
* @throws IOException If an I/O error occurs
*/
private void finishCurrentCompactionFile() throws IOException {
currentWriter.get().finishWriting();
currentWriter.set(null);
// Now include the file in future compactions
currentReader.get().setFileCompleted();
currentReader.set(null);
}
/**
* Puts file compaction on hold, if it's currently in progress. If not in progress, it
* prevents compaction from starting until {@link #resumeCompaction()} is called. The most
* important effect of this method is that it makes data files consistent and read only, so they
* can be included in snapshots simply by creating hard links. In particular, if compaction is
* in progress and a new data file is being written to, that file is flushed to disk, and no new
* files are created and no index entries are updated until compaction is resumed.
*
* This method should not be called on the compaction thread.
*
* This method must always be balanced with, and called before, {@link DataFileCompactor#resumeCompaction()}. If
* there are more or fewer calls to resume compaction than to pause, or if they are called in the
* wrong order, the result is a deadlock.
*
* @throws IOException If an I/O error occurs
* @see #resumeCompaction()
*/
public void pauseCompaction() throws IOException {
snapshotCompactionLock.acquireUninterruptibly();
// Check if compaction is currently in progress. If so, flush and close the current file, so
// it's included in the snapshot
final DataFileWriter compactionWriter = currentWriter.get();
if (compactionWriter != null) {
compactionWasInProgress.set(true);
compactionLevelInProgress.set(compactionWriter.getMetadata().getCompactionLevel());
finishCurrentCompactionFile();
// Don't start a new compaction file here, as it would be included in snapshots, but
// it shouldn't be, since it isn't fully written yet. Instead, a new file will be started
// right after the snapshot is taken, in resumeCompaction()
}
// Don't release the lock here, it will be done later in resumeCompaction(). If there is no
// compaction currently running, the lock will prevent starting a new one until snapshot is
// done
}
/**
* Resumes compaction previously put on hold with {@link #pauseCompaction()}. If there was no
* compaction running at that moment, but a new compaction has been started (and blocked) since
* {@link #pauseCompaction()}, this new compaction is allowed to proceed.
*
* This method must always be balanced with, and called after, {@link #pauseCompaction()}. If
* there are more or fewer calls to resume compaction than to pause, or if they are called in the
* wrong order, the result is a deadlock.
*
* @throws IOException If an I/O error occurs
*/
public void resumeCompaction() throws IOException {
try {
if (compactionWasInProgress.getAndSet(false)) {
assert currentWriter.get() == null;
assert currentReader.get() == null;
startNewCompactionFile(compactionLevelInProgress.getAndSet(0));
}
} finally {
snapshotCompactionLock.release();
}
}
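// Illustrative usage sketch for snapshots: pauseCompaction() and resumeCompaction() are expected
// to be balanced around the actual file copy. snapshotDataFiles() below is a hypothetical
// placeholder for whatever hard-links or copies the completed data files; both pause and resume
// may throw IOException, which the caller is expected to handle.
//
//     compactor.pauseCompaction();
//     try {
//         snapshotDataFiles();
//     } finally {
//         compactor.resumeCompaction();
//     }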
/**
* Compact data files in the collection according to the compaction algorithm.
*
* @throws IOException if there was a problem merging
* @throws InterruptedException if the merge thread was interrupted
* @return true if compaction was performed, false otherwise
*/
public boolean compact() throws IOException, InterruptedException {
final List<DataFileReader> completedFiles = dataFileCollection.getAllCompletedFiles();
reportFileSizeByLevel(completedFiles);
final List<DataFileReader> filesToCompact =
compactionPlan(completedFiles, getMinNumberOfFilesToCompact(), dbConfig.maxCompactionLevel());
if (filesToCompact.isEmpty()) {
logger.debug(MERKLE_DB.getMarker(), "[{}] No need to compact, as the compaction plan is empty", storeName);
return false;
}
final int filesCount = filesToCompact.size();
logger.info(MERKLE_DB.getMarker(), "[{}] Starting compaction", storeName);
final int targetCompactionLevel = getTargetCompactionLevel(filesToCompact, filesCount);
final long start = System.currentTimeMillis();
final long filesToCompactSize = getSizeOfFiles(filesToCompact);
logger.debug(
MERKLE_DB.getMarker(),
"[{}] Starting merging {} files / {}",
storeName,
filesCount,
formatSizeBytes(filesToCompactSize));
final List<Path> newFilesCreated = compactFiles(index, filesToCompact, targetCompactionLevel);
final long end = System.currentTimeMillis();
final long tookMillis = end - start;
if (reportDurationMetricFunction != null) {
reportDurationMetricFunction.accept(targetCompactionLevel, tookMillis);
}
final long compactedFilesSize = getSizeOfFilesByPath(newFilesCreated);
if (reportSavedSpaceMetricFunction != null) {
reportSavedSpaceMetricFunction.accept(
targetCompactionLevel,
(filesToCompactSize - compactedFilesSize) * UnitConstants.BYTES_TO_MEBIBYTES);
}
reportFileSizeByLevel(dataFileCollection.getAllCompletedFiles());
logCompactStats(
storeName,
tookMillis,
filesToCompact,
filesToCompactSize,
newFilesCreated,
targetCompactionLevel,
dataFileCollection);
logger.info(
MERKLE_DB.getMarker(),
"[{}] Finished compaction {} files / {} in {} ms",
storeName,
filesCount,
formatSizeBytes(filesToCompactSize),
tookMillis);
if (updateTotalStatsFunction != null) {
updateTotalStatsFunction.run();
}
return true;
}
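// Illustrative background usage: this class doesn't prescribe how compaction is scheduled, so the
// compactionExecutor below is a hypothetical single-threaded executor. Concurrent compact() calls
// on the same instance serialize at the synchronized compactFiles() step.
//
//     compactionExecutor.submit(() -> {
//         try {
//             compactor.compact();
//         } catch (final InterruptedException e) {
//             Thread.currentThread().interrupt();
//         } catch (final IOException e) {
//             logger.error(EXCEPTION.getMarker(), "[{}] Compaction failed", storeName, e);
//         }
//     });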
private void reportFileSizeByLevel(List<DataFileReader> allCompletedFiles) {
if (reportFileSizeByLevelMetricFunction != null) {
final Map<Integer, List<DataFileReader>> readersByLevel = getReadersByLevel(allCompletedFiles);
for (int i = 0; i < readersByLevel.size(); i++) {
final List<DataFileReader> readers = readersByLevel.get(i);
if (readers != null) {
reportFileSizeByLevelMetricFunction.accept(
i, getSizeOfFiles(readers) * UnitConstants.BYTES_TO_MEBIBYTES);
}
}
}
}
/**
* The target compaction level should not exceed the maxCompactionLevel configuration parameter.
* We need a limit on compaction levels for two reasons:
* - To ensure a reasonably predictable frequency for full compactions, even for data that changes infrequently.
* - We maintain metrics for each level, and there should be a cap on the number of these metrics.
*/
private int getTargetCompactionLevel(List<? extends DataFileReader> filesToCompact, int filesCount) {
int highestExistingCompactionLevel =
filesToCompact.get(filesCount - 1).getMetadata().getCompactionLevel();
return Math.min(highestExistingCompactionLevel + 1, dbConfig.maxCompactionLevel());
}
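// Worked example (for illustration): if the highest compaction level among the files to compact
// is 2 and maxCompactionLevel is 5, the new file is written at level 3; if the highest level
// already equals maxCompactionLevel, the files are re-compacted at that same level.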
/**
* This method creates a compaction plan (a set of files to be compacted). The plan is organized by compaction levels
* in ascending order. If there are not enough files to compact, then no files are compacted and the plan will be empty.
* If a level doesn't reach the minNumberOfFilesToCompact threshold (with an allowance of one for
* the file produced by compacting the previous level), this level and the levels above it are not included in the plan.
* @return the compaction plan as a list of files to compact, or an empty list if there is nothing to compact
*/
static List<DataFileReader> compactionPlan(
List<DataFileReader> dataFileReaders, int minNumberOfFilesToCompact, int maxCompactionLevel) {
if (dataFileReaders.isEmpty()) {
return dataFileReaders;
}
final Map<Integer, List<DataFileReader>> readersByLevel = getReadersByLevel(dataFileReaders);
final List<DataFileReader> nonCompactedReaders = readersByLevel.get(INITIAL_COMPACTION_LEVEL);
if (nonCompactedReaders == null || nonCompactedReaders.size() < minNumberOfFilesToCompact) {
return Collections.emptyList();
}
// we always compact files from level 0 if we have enough files
final List<DataFileReader> readersToCompact = new ArrayList<>(nonCompactedReaders);
for (int i = 1; i <= maxCompactionLevel; i++) {
final List<DataFileReader> readers = readersByLevel.get(i);
// One of the files at this level presumably comes from compacting the previous level.
// If, counting that file in, the level still doesn't have enough files, stop collecting.
if (readers == null || readers.size() < minNumberOfFilesToCompact - 1) {
break;
}
readersToCompact.addAll(readers);
}
return readersToCompact;
}
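// Worked example (for illustration), with minNumberOfFilesToCompact = 4 and files grouped by
// level as {0: 5, 1: 3, 2: 1}: level 0 qualifies (5 >= 4), level 1 qualifies with the allowance
// of one for the file produced by compacting the previous level (3 >= 4 - 1), and level 2 stops
// the scan (1 < 4 - 1). The plan therefore contains the 8 files from levels 0 and 1, which
// getTargetCompactionLevel() will aim to compact into a single level-2 file.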
private static Map<Integer, List<DataFileReader>> getReadersByLevel(final List<DataFileReader> dataFileReaders) {
return dataFileReaders.stream()
.collect(Collectors.groupingBy(r -> r.getMetadata().getCompactionLevel()));
}
}