/*
* Copyright (C) 2021-2024 Hedera Hashgraph, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.swirlds.merkledb.files.hashmap;
import static com.swirlds.common.threading.manager.AdHocThreadManager.getStaticThreadManager;
import static com.swirlds.logging.legacy.LogMarker.EXCEPTION;
import static com.swirlds.logging.legacy.LogMarker.MERKLE_DB;
import static com.swirlds.merkledb.MerkleDb.MERKLEDB_COMPONENT;
import com.hedera.pbj.runtime.io.buffer.BufferedData;
import com.hedera.pbj.runtime.io.buffer.Bytes;
import com.swirlds.common.config.singleton.ConfigurationHolder;
import com.swirlds.common.threading.framework.config.ThreadConfiguration;
import com.swirlds.merkledb.FileStatisticAware;
import com.swirlds.merkledb.Snapshotable;
import com.swirlds.merkledb.collections.CASableLongIndex;
import com.swirlds.merkledb.collections.LongList;
import com.swirlds.merkledb.collections.LongListDisk;
import com.swirlds.merkledb.collections.LongListOffHeap;
import com.swirlds.merkledb.collections.OffHeapUser;
import com.swirlds.merkledb.config.MerkleDbConfig;
import com.swirlds.merkledb.files.DataFileCollection;
import com.swirlds.merkledb.files.DataFileCollection.LoadedDataCallback;
import com.swirlds.merkledb.files.DataFileReader;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.LongSummaryStatistics;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.eclipse.collections.api.tuple.primitive.IntObjectPair;
import org.eclipse.collections.impl.map.mutable.primitive.IntObjectHashMap;
/**
* This is a hash map implementation where the bucket index is in RAM and the buckets are on disk.
* It maps a VirtualKey to a long value. This allows very large maps with minimal RAM usage and a
* strong performance profile, because the in-memory index avoids the need for random disk writes,
* which performed very poorly in our testing.
*
* <p>This implementation depends on a good {@code hashCode()} implementation for the keys; if
* there are too many hash collisions, performance can degrade.
*
* <p><b>IMPORTANT:</b> This implementation assumes a single writing thread. There can be multiple
* readers while writing is happening.
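*
* <p>A minimal usage sketch; {@code hdhm}, {@code keyBytes}, and {@code keyHashCode} are
* illustrative names, and in practice the hash code comes from the corresponding virtual key
* (IOException handling omitted):
* <pre>{@code
* hdhm.startWriting();                                // begin a batched writing session
* hdhm.put(keyBytes, keyHashCode, 42L);               // stage a key/value mapping
* hdhm.endWriting();                                  // flush the session to a new data file
* long value = hdhm.get(keyBytes, keyHashCode, -1L);  // read it back, -1 if the key is absent
* }</pre>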
*/
public class HalfDiskHashMap implements AutoCloseable, Snapshotable, FileStatisticAware, OffHeapUser {
private static final Logger logger = LogManager.getLogger(HalfDiskHashMap.class);
/** The format version number for the metadata file */
private static final int METADATA_FILE_FORMAT_VERSION = 1;
/** Metadata file name suffix with extension. */
private static final String METADATA_FILENAME_SUFFIX = "_metadata.hdhm";
/** Bucket index file name suffix with extension */
private static final String BUCKET_INDEX_FILENAME_SUFFIX = "_bucket_index.ll";
/**
* A marker to indicate that a value should be deleted from the map, or that there is
* no old value to compare against in putIfEqual/deleteIfEqual
*/
protected static final long INVALID_VALUE = Long.MIN_VALUE;
/**
* This is the average number of entries per bucket we aim for when filled to mapSize. It is a
* heuristic used when calculating how many buckets to create. The larger this number, the slower
* lookups will be, but the more evenly entries will be distributed across buckets. So it is a
* matter of balance.
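*
* <p>For example (numbers illustrative): with a {@code mapSize} of 1,000,000 the constructor
* computes minimumBuckets = 1,000,000 / 32 = 31,250, which is then rounded up to the next power
* of two, giving 32,768 buckets and roughly 30 entries per bucket when the map is full.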
*/
private static final long GOOD_AVERAGE_BUCKET_ENTRY_COUNT = 32;
/** The limit on the number of concurrent read tasks in {@code endWriting()} */
private static final int MAX_IN_FLIGHT = 64;
/**
* Long list used for mapping bucketIndex(index into list) to disk location for latest copy of
* bucket
*/
private final LongList bucketIndexToBucketLocation;
/** DataFileCollection manages the files storing the buckets on disk */
private final DataFileCollection fileCollection;
/**
* This is the next power of 2 bigger than minimumBuckets. It needs to be a power of two, so
* that we can optimize and avoid the cost of doing a % to find the bucket index from hash code.
*/
private final int numOfBuckets;
/**
* The requested max size for the map, that is, the maximum number of key/value entries expected
* to be stored in this map.
*/
private final long mapSize;
/** The name to use for the files prefix on disk */
private final String storeName;
/** Bucket pool used by this HDHM */
private final ReusableBucketPool bucketPool;
/** Store for session data during a writing transaction */
private IntObjectHashMap<BucketMutation> oneTransactionsData = null;
/**
* The thread that called startWriting. We use it to check that other writing calls are done on
* same thread
*/
private Thread writingThread;
/** Executor for parallel bucket reads/updates in {@link #endWriting()} */
private static volatile ExecutorService flushExecutor = null;
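/**
* Returns the shared flush executor, lazily creating it on first use. Uses double-checked
* locking on the volatile {@link #flushExecutor} field so the thread pool is created at most
* once; the pool size comes from the MerkleDb configuration.
*/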
private static ExecutorService getFlushExecutor() {
ExecutorService exec = flushExecutor;
if (exec == null) {
synchronized (HalfDiskHashMap.class) {
exec = flushExecutor;
if (exec == null) {
final MerkleDbConfig config = ConfigurationHolder.getConfigData(MerkleDbConfig.class);
exec = Executors.newFixedThreadPool(
config.getNumHalfDiskHashMapFlushThreads(),
new ThreadConfiguration(getStaticThreadManager())
.setComponent(MERKLEDB_COMPONENT)
.setThreadName("HalfDiskHashMap Flushing")
.setExceptionHandler((t, ex) -> logger.error(
EXCEPTION.getMarker(), "Uncaught exception during HDHM flushing", ex))
.buildFactory());
flushExecutor = exec;
}
}
}
return exec;
}
/**
* Construct a new HalfDiskHashMap
*
* @param config MerkleDb config
* @param mapSize The maximum number of entries in the map. This should be more than big enough
* to avoid too many key collisions.
* @param storeDir The directory to use for storing data files.
* @param storeName The name for the data store, this allows more than one data store in a
* single directory.
* @param legacyStoreName Base name for the data store. If not null, the store will process
* files with this prefix at startup. New files in the store will be prefixed with {@code
* storeName}
* @param preferDiskBasedIndex When true, a disk-based index is used rather than a RAM-based one
* where possible. This comes with a significant performance cost, especially for writing. It
* is possible to load a data source that was written with a memory index using a disk-based
* index, and vice versa.
* @throws IOException If there was a problem creating or opening a set of data files.
*/
public HalfDiskHashMap(
final MerkleDbConfig config,
final long mapSize,
final Path storeDir,
final String storeName,
final String legacyStoreName,
final boolean preferDiskBasedIndex)
throws IOException {
this.mapSize = mapSize;
this.storeName = storeName;
Path indexFile = storeDir.resolve(storeName + BUCKET_INDEX_FILENAME_SUFFIX);
// create bucket pool
this.bucketPool = new ReusableBucketPool(Bucket::new);
// load or create new
LoadedDataCallback loadedDataCallback;
if (Files.exists(storeDir)) {
// load metadata
Path metaDataFile = storeDir.resolve(storeName + METADATA_FILENAME_SUFFIX);
boolean loadedLegacyMetadata = false;
if (!Files.exists(metaDataFile)) {
metaDataFile = storeDir.resolve(legacyStoreName + METADATA_FILENAME_SUFFIX);
indexFile = storeDir.resolve(legacyStoreName + BUCKET_INDEX_FILENAME_SUFFIX);
loadedLegacyMetadata = true;
}
if (Files.exists(metaDataFile)) {
try (DataInputStream metaIn = new DataInputStream(Files.newInputStream(metaDataFile))) {
final int fileVersion = metaIn.readInt();
if (fileVersion != METADATA_FILE_FORMAT_VERSION) {
throw new IOException("Tried to read a file with incompatible file format version ["
+ fileVersion
+ "], expected ["
+ METADATA_FILE_FORMAT_VERSION
+ "].");
}
metaIn.readInt(); // backwards compatibility, was: minimumBuckets
numOfBuckets = metaIn.readInt();
}
if (loadedLegacyMetadata) {
Files.delete(metaDataFile);
}
} else {
logger.error(
EXCEPTION.getMarker(),
"Loading existing set of data files but no metadata file was found in [{}]",
storeDir.toAbsolutePath());
throw new IOException("Can not load an existing HalfDiskHashMap from ["
+ storeDir.toAbsolutePath()
+ "] because metadata file is missing");
}
// load or rebuild index
final boolean forceIndexRebuilding = config.indexRebuildingEnforced();
if (Files.exists(indexFile) && !forceIndexRebuilding) {
bucketIndexToBucketLocation =
preferDiskBasedIndex ? new LongListDisk(indexFile) : new LongListOffHeap(indexFile);
loadedDataCallback = null;
} else {
// create new index and setup call back to rebuild
bucketIndexToBucketLocation =
preferDiskBasedIndex ? new LongListDisk(indexFile) : new LongListOffHeap();
loadedDataCallback = (dataLocation, bucketData) -> {
final Bucket bucket = bucketPool.getBucket();
bucket.readFrom(bucketData);
bucketIndexToBucketLocation.put(bucket.getBucketIndex(), dataLocation);
};
}
} else {
// create store dir
Files.createDirectories(storeDir);
// create new index
bucketIndexToBucketLocation = preferDiskBasedIndex ? new LongListDisk(indexFile) : new LongListOffHeap();
// calculate number of entries we can store in a disk page
final int minimumBuckets = (int) (mapSize / GOOD_AVERAGE_BUCKET_ENTRY_COUNT);
// numOfBuckets is the nearest power of two greater than minimumBuckets with a min of 2
numOfBuckets = Math.max(Integer.highestOneBit(minimumBuckets) * 2, 2);
// we are new so no need for a loadedDataCallback
loadedDataCallback = null;
// write metadata
writeMetadata(storeDir);
logger.info(
MERKLE_DB.getMarker(),
"HalfDiskHashMap [{}] created with minimumBuckets={} and numOfBuckets={}",
storeName,
minimumBuckets,
numOfBuckets);
}
bucketIndexToBucketLocation.updateValidRange(0, numOfBuckets - 1);
// create file collection
fileCollection = new DataFileCollection(
// Need: propagate MerkleDb config from the database
config, storeDir, storeName, legacyStoreName, loadedDataCallback);
}
private void writeMetadata(final Path dir) throws IOException {
try (DataOutputStream metaOut =
new DataOutputStream(Files.newOutputStream(dir.resolve(storeName + METADATA_FILENAME_SUFFIX)))) {
metaOut.writeInt(METADATA_FILE_FORMAT_VERSION);
metaOut.writeInt(0); // backwards compatibility, was: minimumBuckets
metaOut.writeInt(numOfBuckets);
metaOut.flush();
}
}
/** {@inheritDoc} */
@Override
public void snapshot(final Path snapshotDirectory) throws IOException {
// create snapshot directory if needed
Files.createDirectories(snapshotDirectory);
// write index to file
bucketIndexToBucketLocation.writeToFile(snapshotDirectory.resolve(storeName + BUCKET_INDEX_FILENAME_SUFFIX));
// snapshot files
fileCollection.snapshot(snapshotDirectory);
// write metadata
writeMetadata(snapshotDirectory);
}
/**
* {@inheritDoc}
*/
@Override
public long getOffHeapConsumption() {
if (bucketIndexToBucketLocation instanceof LongListOffHeap offheapIndex) {
return offheapIndex.getOffHeapConsumption();
}
return 0;
}
/**
* {@inheritDoc}
*/
@Override
public LongSummaryStatistics getFilesSizeStatistics() {
return fileCollection.getAllCompletedFilesSizeStatistics();
}
/**
* Close this HalfDiskHashMap's data files. Once closed, this HalfDiskHashMap cannot be reused.
* Make sure to call close before system exit, otherwise any files being written might not be
* in a good state.
*
* @throws IOException If there was a problem closing the data files.
*/
@Override
public void close() throws IOException {
// Close the files first, then the index. If done in a different order, there may be
// file operations still running, but the index is already closed
fileCollection.close();
bucketIndexToBucketLocation.close();
}
// =================================================================================================================
// Writing API - Single thread safe
/**
* Start a writing session to the map. Each new writing session results in a new data file on
* disk, so you should ideally batch up map writes.
*/
public void startWriting() {
oneTransactionsData = new IntObjectHashMap<>();
writingThread = Thread.currentThread();
}
private BucketMutation findBucketForUpdate(
final Bytes keyBytes, final int keyHashCode, final long oldValue, final long value) {
if (keyBytes == null) {
throw new IllegalArgumentException("Can not write a null key");
}
if (oneTransactionsData == null) {
throw new IllegalStateException(
"Trying to write to a HalfDiskHashMap when you have not called startWriting().");
}
if (Thread.currentThread() != writingThread) {
throw new IllegalStateException("Tried to write with different thread to startWriting()");
}
// store key and value in transaction cache
final int bucketIndex = computeBucketIndex(keyHashCode);
return oneTransactionsData.getIfAbsentPut(
bucketIndex, () -> new BucketMutation(keyBytes, keyHashCode, oldValue, value));
}
/**
* Put a key/value during the current writing session. The value will not be retrievable until
* it is committed in the {@link #endWriting()} call.
*
* <p>This method may be called multiple times for the same key in a single writing
* session. The value from the last call will be stored in this map after the session is
* ended.
*
* @param keyBytes the key to store the value for
* @param keyHashCode the key hash code
* @param value the value to store for given key
*/
public void put(final Bytes keyBytes, final int keyHashCode, final long value) {
final BucketMutation bucketMap = findBucketForUpdate(keyBytes, keyHashCode, INVALID_VALUE, value);
bucketMap.put(keyBytes, keyHashCode, value);
}
/**
* Put a key/value during the current writing session. This method is similar to {@link
* #put(Bytes, int, long)}, but the new value is set only if the current value is equal to
* the given {@code oldValue}.
*
*
* <p>This method may be called multiple times for the same key in a single writing
* session. If the new value from the first call is equal to the old value in the second
* call, the new value from the second call will be stored in this map after the session
* is ended, otherwise the value from the second call will be ignored.
*
*
* <p>If the value for {@code oldValue} is {@link #INVALID_VALUE}, it is ignored, and this
* method is identical to {@link #put(Bytes, int, long)}.
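*
* <p>An illustrative sequence, assuming a key currently mapped to 5: {@code putIfEqual(key, hash, 5, 6)}
* followed by {@code putIfEqual(key, hash, 6, 7)} results in 7 being stored once the session ends,
* while {@code putIfEqual(key, hash, 9, 8)} on its own would be ignored because the current value
* is not 9.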
*
* @param keyBytes the key to store the value for
* @param keyHashCode the key hash code
* @param oldValue the value to check the current value against, or {@link #INVALID_VALUE}
* if no current value check is needed
* @param value the value to store for the given key
*/
public void putIfEqual(final Bytes keyBytes, final int keyHashCode, final long oldValue, final long value) {
final BucketMutation bucketMap = findBucketForUpdate(keyBytes, keyHashCode, oldValue, value);
bucketMap.putIfEqual(keyBytes, keyHashCode, oldValue, value);
}
/**
* Delete a key entry from the map.
*
* @param keyBytes the key to delete the entry for
* @param keyHashCode the key hash code
*/
public void delete(final Bytes keyBytes, final int keyHashCode) {
put(keyBytes, keyHashCode, INVALID_VALUE);
}
/**
* Delete a key entry from the map, if the current value is equal to the given {@code oldValue}.
* If {@code oldValue} is {@link #INVALID_VALUE}, no current value check is performed, and this
* method is identical to {@link #delete(Bytes, int)}.
*
* @param keyBytes the key to delete the entry for
* @param keyHashCode the key hash code
* @param oldValue the value to check the current value against, or {@link #INVALID_VALUE}
* if no current value check is needed
*/
public void deleteIfEqual(final Bytes keyBytes, final int keyHashCode, final long oldValue) {
putIfEqual(keyBytes, keyHashCode, oldValue, INVALID_VALUE);
}
/**
* End the current writing session, committing all puts to the data store.
*
* @return Data file reader for the file written, or {@code null} if nothing was written
* @throws IOException If there was a problem committing data to store
*/
@Nullable
public DataFileReader endWriting() throws IOException {
/* FUTURE WORK - https://github.com/swirlds/swirlds-platform/issues/3943 */
if (Thread.currentThread() != writingThread) {
throw new IllegalStateException("Tried calling endWriting with different thread to startWriting()");
}
writingThread = null;
final int size = oneTransactionsData.size();
logger.info(
MERKLE_DB.getMarker(),
"Finishing writing to {}, num of changed bins = {}, num of changed keys = {}",
storeName,
size,
oneTransactionsData.stream().mapToLong(BucketMutation::size).sum());
final ExecutorService flushExecutor = getFlushExecutor();
final DataFileReader dataFileReader;
if (size > 0) {
final Queue<ReadBucketResult> queue = new ConcurrentLinkedQueue<>();
final Iterator<IntObjectPair<BucketMutation>> iterator =
oneTransactionsData.keyValuesView().iterator();
// read and update all buckets in parallel, write sequentially in random order
fileCollection.startWriting();
int processed = 0;
int inFlight = 0;
while (processed < size) {
// submit read tasks
while (inFlight < MAX_IN_FLIGHT && iterator.hasNext()) {
final IntObjectPair<BucketMutation> keyValue = iterator.next();
final int bucketIndex = keyValue.getOne();
final BucketMutation bucketMap = keyValue.getTwo();
flushExecutor.execute(() -> readUpdateQueueBucket(bucketIndex, bucketMap, queue));
++inFlight;
}
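// drain buckets that have been read/updated and write them to the data file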
ReadBucketResult res;
while ((res = queue.poll()) != null) {
--inFlight;
if (res.error != null) {
throw new RuntimeException(res.error);
}
try (final Bucket bucket = res.bucket) {
final int bucketIndex = bucket.getBucketIndex();
if (bucket.isEmpty()) {
// bucket is missing or empty, remove it from the index
bucketIndexToBucketLocation.remove(bucketIndex);
} else {
// save bucket
final long bucketLocation =
fileCollection.storeDataItem(bucket::writeTo, bucket.sizeInBytes());
// update bucketIndexToBucketLocation
bucketIndexToBucketLocation.put(bucketIndex, bucketLocation);
}
} finally {
++processed;
}
}
}
// close files session
dataFileReader = fileCollection.endWriting(0, numOfBuckets);
// we have updated all indexes so the data file can now be included in merges
dataFileReader.setFileCompleted();
} else {
dataFileReader = null;
}
// clear put cache
oneTransactionsData = null;
return dataFileReader;
}
/**
* Reads a bucket with a given index from disk, updates given keys in it, and puts the bucket to
* a queue. If an exception is thrown, it's put to the queue instead, so the number of {@code
* ReadBucketResult} objects in the queue is consistent.
*
* @param bucketIndex The bucket index
* @param keyUpdates Key/value updates to apply to the bucket
* @param queue The queue to put the bucket or exception to
*/
private void readUpdateQueueBucket(
final int bucketIndex, final BucketMutation keyUpdates, final Queue<ReadBucketResult> queue) {
try {
// The bucket will be closed on the lifecycle thread
final Bucket bucket;
BufferedData bucketData = fileCollection.readDataItemUsingIndex(bucketIndexToBucketLocation, bucketIndex);
if (bucketData == null) {
// create a new bucket
bucket = bucketPool.getBucket();
bucket.setBucketIndex(bucketIndex);
} else {
bucket = bucketPool.getBucket();
bucket.readFrom(bucketData);
}
// for each changed key in bucket, update bucket
keyUpdates.forEachKeyValue(bucket::putValue);
queue.offer(new ReadBucketResult(bucket, null));
} catch (final Exception e) {
logger.error(EXCEPTION.getMarker(), "Failed to read / update bucket", e);
queue.offer(new ReadBucketResult(null, e));
}
}
// =================================================================================================================
// Reading API - Multi thread safe
/**
* Get a value from this map
*
* @param keyBytes the key to get value for
* @param keyHashCode the key hash code
* @param notFoundValue the value to return if the key was not found
* @return the value retrieved from the map, or {@code notFoundValue} if no value was stored for
* the given key
* @throws IOException If there was a problem reading from the map
*/
public long get(final Bytes keyBytes, final int keyHashCode, final long notFoundValue) throws IOException {
if (keyBytes == null) {
throw new IllegalArgumentException("Can not get a null key");
}
final int bucketIndex = computeBucketIndex(keyHashCode);
try (final Bucket bucket = readBucket(bucketIndex)) {
if (bucket != null) {
return bucket.findValue(keyHashCode, keyBytes, notFoundValue);
}
}
return notFoundValue;
}
private Bucket readBucket(final int bucketIndex) throws IOException {
final BufferedData bucketData = fileCollection.readDataItemUsingIndex(bucketIndexToBucketLocation, bucketIndex);
if (bucketData == null) {
return null;
}
final Bucket bucket = bucketPool.getBucket();
bucket.readFrom(bucketData);
return bucket;
}
// =================================================================================================================
// Debugging Print API
/** Debug dump stats for this map */
public void printStats() {
logger.info(
MERKLE_DB.getMarker(),
"""
HalfDiskHashMap Stats {
mapSize = {}
numOfBuckets = {}
GOOD_AVERAGE_BUCKET_ENTRY_COUNT = {}
}""",
mapSize,
numOfBuckets,
GOOD_AVERAGE_BUCKET_ENTRY_COUNT);
}
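/** Returns the data file collection used by this map to store buckets on disk. */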
public DataFileCollection getFileCollection() {
return fileCollection;
}
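/** Returns the index that maps bucket index to the disk location of the latest bucket copy. */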
public CASableLongIndex getBucketIndexToBucketLocation() {
return bucketIndexToBucketLocation;
}
// =================================================================================================================
// Private API
/**
* Computes which bucket a key with the given hash falls into. Depends on the fact that
* numOfBuckets is a power of two. Based on the same calculation used in java.util.HashMap.
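*
* <p>For example, with {@code numOfBuckets} of 32,768 this is simply
* {@code (32768 - 1) & keyHash}, i.e. the lowest 15 bits of the hash.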
*
* @param keyHash the int hash for key
* @return the index of the bucket that key falls in
*/
private int computeBucketIndex(final int keyHash) {
return (numOfBuckets - 1) & keyHash;
}
private record ReadBucketResult(Bucket bucket, Throwable error) {
public ReadBucketResult {
assert (bucket != null) ^ (error != null);
}
}
}