
com.swirlds.merkledb.files.DataFileCompactor

/*
 * Copyright (C) 2023-2024 Hedera Hashgraph, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.swirlds.merkledb.files;

import static com.swirlds.logging.legacy.LogMarker.EXCEPTION;
import static com.swirlds.logging.legacy.LogMarker.MERKLE_DB;
import static com.swirlds.merkledb.files.DataFileCommon.formatSizeBytes;
import static com.swirlds.merkledb.files.DataFileCommon.getSizeOfFiles;
import static com.swirlds.merkledb.files.DataFileCommon.getSizeOfFilesByPath;
import static com.swirlds.merkledb.files.DataFileCommon.logCompactStats;

import com.hedera.pbj.runtime.io.buffer.BufferedData;
import com.swirlds.base.units.UnitConstants;
import com.swirlds.merkledb.KeyRange;
import com.swirlds.merkledb.collections.CASableLongIndex;
import com.swirlds.merkledb.config.MerkleDbConfig;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * This class is responsible for performing compaction of data files in a {@link DataFileCollection}.
 * The compaction is supposed to happen in the background and can be paused and resumed with {@link #pauseCompaction()}
 * and {@link #resumeCompaction()} to prevent compaction from interfering with snapshots.
 */
public class DataFileCompactor {

    private static final Logger logger = LogManager.getLogger(DataFileCompactor.class);

    /**
     * This is the compaction level that non-compacted files have.
     */
    public static final int INITIAL_COMPACTION_LEVEL = 0;

    private final MerkleDbConfig dbConfig;

    /**
     * Name of the file store to compact.
     */
    private final String storeName;
    /**
     * The data file collection to compact
     */
    private final DataFileCollection dataFileCollection;

    /**
     * Index to update during compaction
     */
    private final CASableLongIndex index;
    /**
     * A function that will be called to report the duration of the compaction
     */
    @Nullable
    private final BiConsumer<Integer, Long> reportDurationMetricFunction;
    /**
     * A function that will be called to report the amount of space saved by the compaction
     */
    @Nullable
    private final BiConsumer<Integer, Double> reportSavedSpaceMetricFunction;

    /**
     * A function that will be called to report the size of the files at each compaction level
     */
    private final BiConsumer<Integer, Double> reportFileSizeByLevelMetricFunction;

    /**
     * A function that updates statistics of total usage of disk space and off-heap space
     */
    @Nullable
    private final Runnable updateTotalStatsFunction;

    /**
     * A lock used for synchronization between snapshots and compactions. While a compaction is in
     * progress, it runs on its own without any synchronization. However, a few critical sections
     * are protected with this lock: to create a new compaction writer/reader when compaction is
     * started, to copy data items to the current writer and update the corresponding index item,
     * and to close the compaction writer. This mechanism allows snapshots to effectively put
     * compaction on hold, which is critical as snapshots should be as fast as possible, while
     * compactions are just background processes.
     */
    private final Semaphore snapshotCompactionLock = new Semaphore(1);
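    // Illustrative sketch only (not part of the original source): the call pattern a snapshot
    // thread is expected to follow around this lock, based on the javadoc above. The
    // `takeSnapshot` callback and the method name are hypothetical stand-ins.
    @SuppressWarnings("unused")
    private void snapshotUsageSketch(final Runnable takeSnapshot) throws IOException {
        pauseCompaction(); // acquires the lock; flushes and closes any in-progress compaction file
        try {
            takeSnapshot.run(); // data files are now immutable and safe to hard-link into a snapshot
        } finally {
            resumeCompaction(); // starts a new compaction file if one was interrupted, releases the lock
        }
    }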

    /**
     * Start time of the current compaction, or null if compaction isn't running
     */
    private final AtomicReference<Instant> currentCompactionStartTime = new AtomicReference<>();

    /**
     * Current data file writer during compaction, or null if compaction isn't running. The writer
     * is created at compaction start. If compaction is interrupted by a snapshot, the writer is
     * closed before the snapshot, and then a new writer / new file is created after the snapshot is
     * taken.
     */
    private final AtomicReference<DataFileWriter> currentWriter = new AtomicReference<>();
    /**
     * Current data file reader for the compaction writer above.
     */
    private final AtomicReference<DataFileReader> currentReader = new AtomicReference<>();
    /**
     * The list of new files created during compaction. Usually, all files to process are compacted
     * to a single new file, but if compaction is interrupted by a snapshot, there may be more than
     * one file created.
     */
    private final List<Path> newCompactedFiles = new ArrayList<>();

    /**
     * Indicates whether compaction is in progress at the time when {@link #pauseCompaction()}
     * is called. This flag is then checked in {@link DataFileCompactor#resumeCompaction()} to decide whether to
     * start a new compacted file.
     */
    private final AtomicBoolean compactionWasInProgress = new AtomicBoolean(false);

    /**
     * This variable keeps track of the compaction level that was in progress at the time when it was suspended.
     * Once the compaction is resumed, this level is used to start a new compacted file, and then it's reset to 0.
     */
    private final AtomicInteger compactionLevelInProgress = new AtomicInteger(0);

    /**
     * @param dbConfig                       MerkleDb config
     * @param storeName                      name of the store to compact
     * @param dataFileCollection             data file collection to compact
     * @param index                          index to update during compaction
     * @param reportDurationMetricFunction   function to report how long compaction took, in ms
     * @param reportSavedSpaceMetricFunction function to report how much space was compacted, in Mb
     * @param reportFileSizeByLevelMetricFunction function to report how much space is used by the store by compaction level, in Mb
     * @param updateTotalStatsFunction       A function that updates statistics of total usage of disk space and off-heap space
     */
    public DataFileCompactor(
            final MerkleDbConfig dbConfig,
            final String storeName,
            final DataFileCollection dataFileCollection,
            CASableLongIndex index,
            @Nullable final BiConsumer<Integer, Long> reportDurationMetricFunction,
            @Nullable final BiConsumer<Integer, Double> reportSavedSpaceMetricFunction,
            @Nullable final BiConsumer<Integer, Double> reportFileSizeByLevelMetricFunction,
            @Nullable Runnable updateTotalStatsFunction) {
        this.dbConfig = dbConfig;
        this.storeName = storeName;
        this.dataFileCollection = dataFileCollection;
        this.index = index;
        this.reportDurationMetricFunction = reportDurationMetricFunction;
        this.reportSavedSpaceMetricFunction = reportSavedSpaceMetricFunction;
        this.reportFileSizeByLevelMetricFunction = reportFileSizeByLevelMetricFunction;
        this.updateTotalStatsFunction = updateTotalStatsFunction;
    }
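
    // Illustrative construction sketch only (not part of the original source): wiring a compactor
    // with simple logging callbacks. The method name, the "exampleStore" label, and the `config`,
    // `collection`, and `longIndex` parameters are hypothetical; the constructor call itself
    // mirrors the signature above.
    @SuppressWarnings("unused")
    private static DataFileCompactor newCompactorSketch(
            final MerkleDbConfig config, final DataFileCollection collection, final CASableLongIndex longIndex) {
        return new DataFileCompactor(
                config,
                "exampleStore",
                collection,
                longIndex,
                (level, ms) -> logger.info(MERKLE_DB.getMarker(), "level {} compaction took {} ms", level, ms),
                (level, savedMb) -> logger.info(MERKLE_DB.getMarker(), "level {} saved {} Mb", level, savedMb),
                (level, sizeMb) -> logger.info(MERKLE_DB.getMarker(), "level {} files use {} Mb", level, sizeMb),
                null); // no total-stats callback in this sketch
    }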

    /**
     * Compacts all files in compactionPlan.
     *
     * @param index          takes a map of moves from old location to new location. Once it is finished and
     *                       returns, it is assumed that readers will no longer be looking in the old locations,
     *                       so the old files can be safely deleted.
     * @param filesToCompact list of files to compact
     * @param targetCompactionLevel target compaction level
     * @return list of files created during the compaction
     * @throws IOException          If there was a problem with the compaction
     * @throws InterruptedException If the compaction thread was interrupted
     */
    synchronized List<Path> compactFiles(
            final CASableLongIndex index,
            final List<DataFileReader> filesToCompact,
            final int targetCompactionLevel)
            throws IOException, InterruptedException {
        if (filesToCompact.size() < getMinNumberOfFilesToCompact()) {
            // nothing to do, we have already merged since the last data update
            logger.debug(MERKLE_DB.getMarker(), "No files were available for merging [{}]", storeName);
            return Collections.emptyList();
        }

        // create a merge timestamp; this timestamp is the newest creation time of the set of files we are
        // merging
        final Instant startTime = filesToCompact.stream()
                .map(file -> file.getMetadata().getCreationDate())
                .max(Instant::compareTo)
                .orElseGet(Instant::now);
        snapshotCompactionLock.acquire();
        try {
            currentCompactionStartTime.set(startTime);
            newCompactedFiles.clear();
            startNewCompactionFile(targetCompactionLevel);
        } finally {
            snapshotCompactionLock.release();
        }

        // We need a map to find readers by file index below. It doesn't have to be synchronized,
        // as it will be accessed in this thread only, so it could be a simple HashMap or the like.
        // However, standard Java maps can only work with Integer, not int (yet), so auto-boxing
        // would put significant load on the GC. Let's do something different:
        int minFileIndex = Integer.MAX_VALUE;
        int maxFileIndex = 0;
        for (final DataFileReader r : filesToCompact) {
            minFileIndex = Math.min(minFileIndex, r.getIndex());
            maxFileIndex = Math.max(maxFileIndex, r.getIndex());
        }
        final int firstIndexInc = minFileIndex;
        final int lastIndexExc = maxFileIndex + 1;
        final DataFileReader[] readers = new DataFileReader[lastIndexExc - firstIndexInc];
        for (DataFileReader r : filesToCompact) {
            readers[r.getIndex() - firstIndexInc] = r;
        }
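
        // Worked example of the dense-array lookup above (illustrative numbers only, not from the
        // original source): if the files to compact have indexes 100, 101, and 104, then
        // firstIndexInc = 100, lastIndexExc = 105, and readers has 5 slots. Slots 2 and 3 stay
        // null and are skipped below, while readers[fileIndex - firstIndexInc] resolves the
        // remaining files without any Integer boxing.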

        boolean allDataItemsProcessed = false;
        try {
            final KeyRange keyRange = dataFileCollection.getValidKeyRange();
            index.forEach((path, dataLocation) -> {
                if (!keyRange.withinRange(path)) {
                    return;
                }
                final int fileIndex = DataFileCommon.fileIndexFromDataLocation(dataLocation);
                if ((fileIndex < firstIndexInc) || (fileIndex >= lastIndexExc)) {
                    return;
                }
                final DataFileReader reader = readers[fileIndex - firstIndexInc];
                if (reader == null) {
                    return;
                }
                final long fileOffset = DataFileCommon.byteOffsetFromDataLocation(dataLocation);
                // Take the lock. If a snapshot is started in a different thread, this call
                // will block until the snapshot is done. The current file will be flushed,
                // and current data file writer and reader will point to a new file
                snapshotCompactionLock.acquire();
                try {
                    final DataFileWriter newFileWriter = currentWriter.get();
                    final BufferedData itemBytes = reader.readDataItem(fileOffset);
                    assert itemBytes != null;
                    long newLocation = newFileWriter.storeDataItem(itemBytes);
                    // update the index
                    index.putIfEqual(path, dataLocation, newLocation);
                } catch (final ClosedByInterruptException e) {
                    logger.info(
                            MERKLE_DB.getMarker(),
                            "Failed to copy data item {} / {} due to thread interruption",
                            fileIndex,
                            fileOffset,
                            e);
                    throw e;
                } catch (final IOException z) {
                    logger.error(EXCEPTION.getMarker(), "Failed to copy data item {} / {}", fileIndex, fileOffset, z);
                    throw z;
                } finally {
                    snapshotCompactionLock.release();
                }
            });
            allDataItemsProcessed = true;
        } finally {
            // Even if the thread is interrupted, make sure the new compacted file is properly closed
            // and is included in future compactions
            snapshotCompactionLock.acquire();
            try {
                // Finish writing the last file. In rare cases, it may be an empty file
                finishCurrentCompactionFile();
                // Clear compaction start time
                currentCompactionStartTime.set(null);
                if (allDataItemsProcessed) {
                    // Close the readers and delete compacted files
                    dataFileCollection.deleteFiles(filesToCompact);
                }
            } finally {
                snapshotCompactionLock.release();
            }
        }

        return newCompactedFiles;
    }

    // visible for testing
    int getMinNumberOfFilesToCompact() {
        return dbConfig.minNumberOfFilesInCompaction();
    }

    /**
     * Opens a new file for writing during compaction. This method is called when compaction is
     * started. If compaction is interrupted and resumed by a data source snapshot using {@link
     * #pauseCompaction()} and {@link #resumeCompaction()}, a new file is created for writing using
     * this method before compaction is resumed.
     *
     * <p>This method must be called under snapshot/compaction lock.
     *
     * @throws IOException If an I/O error occurs
     */
    private void startNewCompactionFile(int compactionLevel) throws IOException {
        final Instant startTime = currentCompactionStartTime.get();
        assert startTime != null;
        final DataFileWriter newFileWriter = dataFileCollection.newDataFile(startTime, compactionLevel);
        currentWriter.set(newFileWriter);
        final Path newFileCreated = newFileWriter.getPath();
        newCompactedFiles.add(newFileCreated);
        final DataFileMetadata newFileMetadata = newFileWriter.getMetadata();
        final DataFileReader newFileReader = dataFileCollection.addNewDataFileReader(newFileCreated, newFileMetadata);
        currentReader.set(newFileReader);
    }

    /**
     * Closes the current compaction file. This method is called at the end of the compaction process,
     * and also before a snapshot is taken to make sure the current file is fully written and safe
     * to include in snapshots.
     *
     * <p>This method must be called under snapshot/compaction lock.
     *
     * @throws IOException If an I/O error occurs
     */
    private void finishCurrentCompactionFile() throws IOException {
        currentWriter.get().finishWriting();
        currentWriter.set(null);
        // Now include the file in future compactions
        currentReader.get().setFileCompleted();
        currentReader.set(null);
    }

    /**
     * Puts file compaction on hold, if it's currently in progress. If not in progress, it will
     * prevent compaction from starting until {@link #resumeCompaction()} is called. The most
     * important thing this method does is make data files consistent and read-only, so they can
     * be included in snapshots as easily as by creating hard links. In particular, if compaction
     * is in progress and a new data file is being written to, this file is flushed to disk, and
     * no files are created and no index entries are updated until compaction is resumed.
     *
     * <p>This method should not be called on the compaction thread.
     *
     * <p>This method must always be balanced with, and called before, {@link DataFileCompactor#resumeCompaction()}.
     * If there are more or fewer calls to resume compactions than to pause, or if they are called
     * in the wrong order, it will result in deadlocks.
     *
     * @throws IOException If an I/O error occurs
     * @see #resumeCompaction()
     */
    public void pauseCompaction() throws IOException {
        snapshotCompactionLock.acquireUninterruptibly();
        // Check if compaction is currently in progress. If so, flush and close the current file, so
        // it's included in the snapshot
        final DataFileWriter compactionWriter = currentWriter.get();
        if (compactionWriter != null) {
            compactionWasInProgress.set(true);
            compactionLevelInProgress.set(compactionWriter.getMetadata().getCompactionLevel());
            finishCurrentCompactionFile();
            // Don't start a new compaction file here, as it would be included in snapshots, but
            // it shouldn't be, as it isn't fully written yet. Instead, a new file will be started
            // right after the snapshot is taken, in resumeCompaction()
        }
        // Don't release the lock here, it will be done later in resumeCompaction(). If there is no
        // compaction currently running, the lock will prevent starting a new one until the snapshot
        // is done
    }

    /**
     * Resumes compaction previously put on hold with {@link #pauseCompaction()}. If there was no
     * compaction running at that moment, but a new compaction was started (and blocked) since {@link
     * #pauseCompaction()}, this new compaction is resumed.
     *
     * <p>This method must always be balanced with, and called after, {@link #pauseCompaction()}. If
     * there are more or fewer calls to resume compactions than to pause, or if they are called in
     * the wrong order, it will result in deadlocks.
     *
     * @throws IOException If an I/O error occurs
     */
    public void resumeCompaction() throws IOException {
        try {
            if (compactionWasInProgress.getAndSet(false)) {
                assert currentWriter.get() == null;
                assert currentReader.get() == null;
                startNewCompactionFile(compactionLevelInProgress.getAndSet(0));
            }
        } finally {
            snapshotCompactionLock.release();
        }
    }

    /**
     * Compacts data files in the collection according to the compaction algorithm.
     *
     * @throws IOException if there was a problem merging
     * @throws InterruptedException if the merge thread was interrupted
     * @return true if compaction was performed, false otherwise
     */
    public boolean compact() throws IOException, InterruptedException {
        final List<DataFileReader> completedFiles = dataFileCollection.getAllCompletedFiles();
        reportFileSizeByLevel(completedFiles);

        final List<DataFileReader> filesToCompact =
                compactionPlan(completedFiles, getMinNumberOfFilesToCompact(), dbConfig.maxCompactionLevel());
        if (filesToCompact.isEmpty()) {
            logger.debug(MERKLE_DB.getMarker(), "[{}] No need to compact, as the compaction plan is empty", storeName);
            return false;
        }

        final int filesCount = filesToCompact.size();
        logger.info(MERKLE_DB.getMarker(), "[{}] Starting compaction", storeName);
        final int targetCompactionLevel = getTargetCompactionLevel(filesToCompact, filesCount);

        final long start = System.currentTimeMillis();
        final long filesToCompactSize = getSizeOfFiles(filesToCompact);
        logger.debug(
                MERKLE_DB.getMarker(),
                "[{}] Starting merging {} files / {}",
                storeName,
                filesCount,
                formatSizeBytes(filesToCompactSize));

        final List<Path> newFilesCreated = compactFiles(index, filesToCompact, targetCompactionLevel);

        final long end = System.currentTimeMillis();
        final long tookMillis = end - start;
        if (reportDurationMetricFunction != null) {
            reportDurationMetricFunction.accept(targetCompactionLevel, tookMillis);
        }

        final long compactedFilesSize = getSizeOfFilesByPath(newFilesCreated);
        if (reportSavedSpaceMetricFunction != null) {
            reportSavedSpaceMetricFunction.accept(
                    targetCompactionLevel,
                    (filesToCompactSize - compactedFilesSize) * UnitConstants.BYTES_TO_MEBIBYTES);
        }
        reportFileSizeByLevel(dataFileCollection.getAllCompletedFiles());

        logCompactStats(
                storeName,
                tookMillis,
                filesToCompact,
                filesToCompactSize,
                newFilesCreated,
                targetCompactionLevel,
                dataFileCollection);
        logger.info(
                MERKLE_DB.getMarker(),
                "[{}] Finished compaction {} files / {} in {} ms",
                storeName,
                filesCount,
                formatSizeBytes(filesToCompactSize),
                tookMillis);
        if (updateTotalStatsFunction != null) {
            updateTotalStatsFunction.run();
        }
        return true;
    }

    private void reportFileSizeByLevel(List<DataFileReader> allCompletedFiles) {
        if (reportFileSizeByLevelMetricFunction != null) {
            final Map<Integer, List<DataFileReader>> readersByLevel = getReadersByLevel(allCompletedFiles);
            for (int i = 0; i < readersByLevel.size(); i++) {
                final List<DataFileReader> readers = readersByLevel.get(i);
                if (readers != null) {
                    reportFileSizeByLevelMetricFunction.accept(
                            i, getSizeOfFiles(readers) * UnitConstants.BYTES_TO_MEBIBYTES);
                }
            }
        }
    }

    /**
     * The target compaction level should not exceed the maxCompactionLevel configuration parameter.
     * We need a limit on compaction levels for two reasons:
     * - To ensure a reasonably predictable frequency for full compactions, even for data that changes infrequently.
     * - We maintain metrics for each level, and there should be a cap on the number of these metrics.
     */
    private int getTargetCompactionLevel(List<DataFileReader> filesToCompact, int filesCount) {
        int highestExistingCompactionLevel =
                filesToCompact.get(filesCount - 1).getMetadata().getCompactionLevel();
        return Math.min(highestExistingCompactionLevel + 1, dbConfig.maxCompactionLevel());
    }

    /**
     * This method creates a compaction plan (a set of files to be compacted). The plan is organized
     * by compaction levels in ascending order. If there are not enough files to compact, then no
     * files are compacted and the plan will be empty. If the current level doesn't reach the
     * minNumberOfFilesToCompact threshold, then this level and the levels above it are not included
     * in the plan.
     *
     * @return a list of files to compact (the compaction plan)
     */
    static List<DataFileReader> compactionPlan(
            List<DataFileReader> dataFileReaders, int minNumberOfFilesToCompact, int maxCompactionLevel) {
        if (dataFileReaders.isEmpty()) {
            return dataFileReaders;
        }

        final Map<Integer, List<DataFileReader>> readersByLevel = getReadersByLevel(dataFileReaders);
        final List<DataFileReader> nonCompactedReaders = readersByLevel.get(INITIAL_COMPACTION_LEVEL);
        if (nonCompactedReaders == null || nonCompactedReaders.size() < minNumberOfFilesToCompact) {
            return Collections.emptyList();
        }

        // we always compact files from level 0 if we have enough files
        final List<DataFileReader> readersToCompact = new ArrayList<>(nonCompactedReaders);
        for (int i = 1; i <= maxCompactionLevel; i++) {
            final List<DataFileReader> readers = readersByLevel.get(i);
            // Presumably, one file comes from the compaction of the previous level.
            // If, counting this file in, the level still doesn't have enough files, stop collecting.
            if (readers == null || readers.size() < minNumberOfFilesToCompact - 1) {
                break;
            }
            readersToCompact.addAll(readers);
        }
        return readersToCompact;
    }

    private static Map<Integer, List<DataFileReader>> getReadersByLevel(final List<DataFileReader> dataFileReaders) {
        return dataFileReaders.stream()
                .collect(Collectors.groupingBy(r -> r.getMetadata().getCompactionLevel()));
    }
}
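
// Worked example of compactionPlan() above (illustrative numbers only, not part of the original
// source). Assume minNumberOfFilesToCompact = 3, maxCompactionLevel = 3, and completed files at
// level 0 (4 files), level 1 (2 files), and level 2 (1 file):
//   - level 0 has 4 >= 3 files, so all four go into the plan;
//   - level 1 has 2 >= (3 - 1) files, so both are added;
//   - level 2 has 1 < (3 - 1) files, so collection stops there;
//   - the plan holds 6 files, and getTargetCompactionLevel() returns min(1 + 1, 3) = 2.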




