com.swirlds.merkledb.MerkleDbDataSource Maven / Gradle / Ivy

Swirlds is a software platform designed to build fully-distributed applications that harness the power of the cloud without servers. Now you can develop applications with fairness in decision making, speed, trust and reliability, at a fraction of the cost of traditional server-based platforms.

/*
 * Copyright (C) 2022-2024 Hedera Hashgraph, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.swirlds.merkledb;

import static com.hedera.pbj.runtime.ProtoParserTools.TAG_FIELD_OFFSET;
import static com.swirlds.common.threading.manager.AdHocThreadManager.getStaticThreadManager;
import static com.swirlds.logging.legacy.LogMarker.EXCEPTION;
import static com.swirlds.logging.legacy.LogMarker.MERKLE_DB;
import static com.swirlds.merkledb.KeyRange.INVALID_KEY_RANGE;
import static com.swirlds.merkledb.MerkleDb.MERKLEDB_COMPONENT;
import static java.util.Objects.requireNonNull;

import com.hedera.pbj.runtime.FieldDefinition;
import com.hedera.pbj.runtime.FieldType;
import com.hedera.pbj.runtime.ProtoWriterTools;
import com.hedera.pbj.runtime.io.WritableSequentialData;
import com.hedera.pbj.runtime.io.buffer.BufferedData;
import com.hedera.pbj.runtime.io.buffer.Bytes;
import com.hedera.pbj.runtime.io.stream.ReadableStreamingData;
import com.hedera.pbj.runtime.io.stream.WritableStreamingData;
import com.swirlds.base.units.UnitConstants;
import com.swirlds.base.utility.ToStringBuilder;
import com.swirlds.common.crypto.Hash;
import com.swirlds.common.io.streams.SerializableDataOutputStream;
import com.swirlds.common.threading.framework.config.ThreadConfiguration;
import com.swirlds.merkledb.collections.HashListByteBuffer;
import com.swirlds.merkledb.collections.LongList;
import com.swirlds.merkledb.collections.LongListDisk;
import com.swirlds.merkledb.collections.LongListOffHeap;
import com.swirlds.merkledb.collections.OffHeapUser;
import com.swirlds.merkledb.files.DataFileCollection.LoadedDataCallback;
import com.swirlds.merkledb.files.DataFileCompactor;
import com.swirlds.merkledb.files.DataFileReader;
import com.swirlds.merkledb.files.MemoryIndexDiskKeyValueStore;
import com.swirlds.merkledb.files.hashmap.HalfDiskHashMap;
import com.swirlds.metrics.api.Metrics;
import com.swirlds.virtualmap.datasource.VirtualDataSource;
import com.swirlds.virtualmap.datasource.VirtualHashRecord;
import com.swirlds.virtualmap.datasource.VirtualLeafBytes;
import com.swirlds.virtualmap.serialize.KeySerializer;
import com.swirlds.virtualmap.serialize.ValueSerializer;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Objects;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

public final class MerkleDbDataSource implements VirtualDataSource {

    private static final Logger logger = LogManager.getLogger(MerkleDbDataSource.class);

    /** Count of open database instances */
    private static final LongAdder COUNT_OF_OPEN_DATABASES = new LongAdder();

    /** Data source metadata fields */
    private static final FieldDefinition FIELD_DSMETADATA_MINVALIDKEY =
            new FieldDefinition("minValidKey", FieldType.UINT64, false, true, false, 1);

    private static final FieldDefinition FIELD_DSMETADATA_MAXVALIDKEY =
            new FieldDefinition("maxValidKey", FieldType.UINT64, false, true, false, 2);

    /** Virtual database instance that hosts this data source. */
    private final MerkleDb database;

    /** Table name. Used as a subdir name in the database directory */
    private final String tableName;

    /** Table ID used by this data source in its database instance. */
    private final int tableId;

    /**
     * Table config, includes key and value serializers as well as a few other non-global params.
     */
    private final MerkleDbTableConfig tableConfig;

    /**
     * In-memory off-heap store for path-to-disk-location mappings, used by the internal hash store.
     */
    private final LongList pathToDiskLocationInternalNodes;

    /** In-memory off-heap store for path-to-disk-location mappings, used by the leaf store. */
    private final LongList pathToDiskLocationLeafNodes;

    /**
     * In-memory off-heap store for node hashes. This list is persisted only as part of a
     * snapshot; if no saved file is present when the data source is loaded, it starts empty,
     * and all internal node hashes have to be recomputed on the first round, which is
     * expensive. Stores {@link Hash} objects as bytes.
     */
    private final HashListByteBuffer hashStoreRam;

    /**
     * On disk store for node hashes. Can be null if all hashes are being stored in ram by setting
     * tableConfig.hashesRamToDiskThreshold to Long.MAX_VALUE. Stores {@link VirtualHashRecord}
     * objects as bytes.
     */
    private final MemoryIndexDiskKeyValueStore hashStoreDisk;

    /** True when hashesRamToDiskThreshold is less than Long.MAX_VALUE */
    private final boolean hasDiskStoreForHashes;

    /** Mixed disk and off-heap memory store for key to path map */
    private final HalfDiskHashMap keyToPath;

    /**
     * Mixed disk and off-heap memory store for path to leaf key and value. Stores {@link
     * VirtualLeafBytes} objects as bytes.
     */
    private final MemoryIndexDiskKeyValueStore pathToKeyValue;

    /**
     * Cache size for reading virtual leaf records. Initialized at data source creation time from
     * MerkleDb settings. If the value is zero, the leaf record cache isn't used.
     */
    private final int leafRecordCacheSize;

    /**
     * Virtual leaf record cache. It's a simple array indexed by leaf key hash code % cache size.
     * Cache eviction is not needed, as the array size is fixed and can be configured in MerkleDb
     * settings. Index conflicts are resolved in a very straightforward way: whatever entry is
     * read last wins and is put into the cache.
     */
    private final VirtualLeafBytes[] leafRecordCache;
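
    // For example, with leafRecordCacheSize = 1024, a key whose hash code is -1301 lands in slot
    // Math.abs(-1301 % 1024) = 277. When two keys collide on a slot, whichever is read last
    // overwrites the other; readers always compare the cached entry's key before using it.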

    /** Thread pool for storing virtual node hashes */
    private final ExecutorService storeHashesExecutor;

    /** Thread pool for storing virtual leaf nodes */
    private final ExecutorService storeLeavesExecutor;

    /** Thread pool for creating snapshots; it is unbounded in threads, but at most 7 are used */
    private final ExecutorService snapshotExecutor;

    /** Flag indicating whether a snapshot is in progress */
    private final AtomicBoolean snapshotInProgress = new AtomicBoolean(false);

    /** The range of valid leaf paths for data currently stored by this data source. */
    private volatile KeyRange validLeafPathRange = INVALID_KEY_RANGE;

    /** Paths to all database files and directories */
    private final MerkleDbPaths dbPaths;

    private final AtomicBoolean closed = new AtomicBoolean(false);

    /** Runs compactions for the stores of this data source */
    final MerkleDbCompactionCoordinator compactionCoordinator;

    private MerkleDbStatisticsUpdater statisticsUpdater;

    public MerkleDbDataSource(
            final MerkleDb database,
            final String tableName,
            final int tableId,
            final MerkleDbTableConfig tableConfig,
            final boolean compactionEnabled)
            throws IOException {
        this.database = database;
        this.tableName = tableName;
        this.tableId = tableId;
        this.tableConfig = tableConfig;

        // create thread group with label
        final ThreadGroup threadGroup = new ThreadGroup("MerkleDb-" + tableName);
        // create thread pool storing virtual node hashes
        storeHashesExecutor = Executors.newSingleThreadExecutor(new ThreadConfiguration(getStaticThreadManager())
                .setComponent(MERKLEDB_COMPONENT)
                .setThreadGroup(threadGroup)
                .setThreadName("Store hashes")
                .setExceptionHandler((t, ex) -> logger.error(
                        EXCEPTION.getMarker(), "[{}] Uncaught exception during storing hashes", tableName, ex))
                .buildFactory());
        // create thread pool storing virtual leaf nodes
        storeLeavesExecutor = Executors.newSingleThreadExecutor(new ThreadConfiguration(getStaticThreadManager())
                .setComponent(MERKLEDB_COMPONENT)
                .setThreadGroup(threadGroup)
                .setThreadName("Store leaves")
                .setExceptionHandler((t, ex) -> logger.error(
                        EXCEPTION.getMarker(), "[{}] Uncaught exception during storing leaves", tableName, ex))
                .buildFactory());
        // thread pool for creating snapshots; unbounded in threads, but at most 7 are used
        snapshotExecutor = Executors.newCachedThreadPool(new ThreadConfiguration(getStaticThreadManager())
                .setComponent(MERKLEDB_COMPONENT)
                .setThreadGroup(threadGroup)
                .setThreadName("Snapshot")
                .setExceptionHandler(
                        (t, ex) -> logger.error(EXCEPTION.getMarker(), "Uncaught exception during snapshots", ex))
                .buildFactory());
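        // The "at most 7" above matches the seven snapshot tasks submitted in snapshot(): the two
        // path-to-disk-location indices, the RAM and disk hash stores, keyToPath, pathToKeyValue,
        // and the metadata file. A cached pool lets these tasks run concurrently and reclaims the
        // threads once a snapshot completes.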

        final Path storageDir = database.getTableDir(tableName, tableId);
        dbPaths = new MerkleDbPaths(storageDir);

        // check if we are loading an existing database or creating a new one
        if (Files.exists(storageDir)) {
            // read metadata
            if (!loadMetadata(dbPaths)) {
                logger.info(
                        MERKLE_DB.getMarker(),
                        "[{}] Loading existing set of data files but no metadata file was found in" + " [{}]",
                        tableName,
                        storageDir.toAbsolutePath());
                throw new IOException("Can not load an existing MerkleDbDataSource from ["
                        + storageDir.toAbsolutePath()
                        + "] because metadata file is missing");
            }
        } else {
            Files.createDirectories(storageDir);
        }
        saveMetadata(dbPaths);
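        // Note: loadMetadata() deletes the metadata file after reading it, so the metadata is
        // written back here both for newly created and for loaded databases.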

        // create path to disk location index, internal nodes
        final boolean forceIndexRebuilding = database.getConfig().indexRebuildingEnforced();
        if (tableConfig.isPreferDiskBasedIndices()) {
            pathToDiskLocationInternalNodes = new LongListDisk(dbPaths.pathToDiskLocationInternalNodesFile);
        } else if (Files.exists(dbPaths.pathToDiskLocationInternalNodesFile) && !forceIndexRebuilding) {
            pathToDiskLocationInternalNodes = new LongListOffHeap(dbPaths.pathToDiskLocationInternalNodesFile);
        } else {
            pathToDiskLocationInternalNodes = new LongListOffHeap();
        }
        // path to disk location index, leaf nodes
        if (tableConfig.isPreferDiskBasedIndices()) {
            pathToDiskLocationLeafNodes = new LongListDisk(dbPaths.pathToDiskLocationLeafNodesFile);
        } else if (Files.exists(dbPaths.pathToDiskLocationLeafNodesFile) && !forceIndexRebuilding) {
            pathToDiskLocationLeafNodes = new LongListOffHeap(dbPaths.pathToDiskLocationLeafNodesFile);
        } else {
            pathToDiskLocationLeafNodes =
                    new LongListOffHeap(database.getConfig().reservedBufferLengthForLeafList());
        }
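
        // Both index blocks above make the same three-way choice: a disk-based list if the table
        // prefers disk-based indices, an off-heap list loaded from the existing file when one is
        // present and index rebuilding is not enforced, or a fresh off-heap list otherwise, to be
        // repopulated from the data files via the loaded-data callbacks further below.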

        // internal node hashes store, RAM
        if (tableConfig.getHashesRamToDiskThreshold() > 0) {
            if (Files.exists(dbPaths.hashStoreRamFile)) {
                hashStoreRam = new HashListByteBuffer(dbPaths.hashStoreRamFile);
            } else {
                hashStoreRam = new HashListByteBuffer();
            }
        } else {
            hashStoreRam = null;
        }
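
        // hashesRamToDiskThreshold splits node hashes by path: paths below the threshold live in
        // hashStoreRam, paths at or above it go to hashStoreDisk (created below). A threshold of
        // 0 keeps all hashes on disk, while Long.MAX_VALUE keeps them all in RAM.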

        statisticsUpdater = new MerkleDbStatisticsUpdater(database.getConfig(), tableName);

        final Runnable updateTotalStatsFunction = () -> {
            statisticsUpdater.updateStoreFileStats(this);
            statisticsUpdater.updateOffHeapStats(this);
        };

        // internal node hashes store, on disk
        hasDiskStoreForHashes = tableConfig.getHashesRamToDiskThreshold() < Long.MAX_VALUE;
        final DataFileCompactor hashStoreDiskFileCompactor;
        if (hasDiskStoreForHashes) {
            final boolean hashIndexEmpty = pathToDiskLocationInternalNodes.size() == 0;
            final LoadedDataCallback hashRecordLoadedCallback;
            if (hashIndexEmpty) {
                if (validLeafPathRange.getMaxValidKey() >= 0) {
                    pathToDiskLocationInternalNodes.updateValidRange(0, validLeafPathRange.getMaxValidKey());
                }
                hashRecordLoadedCallback = (dataLocation, hashData) -> {
                    final VirtualHashRecord hashRecord = VirtualHashRecord.parseFrom(hashData);
                    pathToDiskLocationInternalNodes.put(hashRecord.path(), dataLocation);
                };
            } else {
                hashRecordLoadedCallback = null;
            }
            final String storeName = tableName + "_internalhashes";
            hashStoreDisk = new MemoryIndexDiskKeyValueStore(
                    database.getConfig(),
                    dbPaths.hashStoreDiskDirectory,
                    storeName,
                    tableName + ":internalHashes",
                    hashRecordLoadedCallback,
                    pathToDiskLocationInternalNodes);
            hashStoreDiskFileCompactor = new DataFileCompactor(
                    database.getConfig(),
                    storeName,
                    hashStoreDisk.getFileCollection(),
                    pathToDiskLocationInternalNodes,
                    statisticsUpdater::setHashesStoreCompactionTimeMs,
                    statisticsUpdater::setHashesStoreCompactionSavedSpaceMb,
                    statisticsUpdater::setHashesStoreFileSizeByLevelMb,
                    updateTotalStatsFunction);
        } else {
            hashStoreDisk = null;
            hashStoreDiskFileCompactor = null;
        }

        final DataFileCompactor keyToPathFileCompactor;
        // key to path store
        String keyToPathStoreName = tableName + "_objectkeytopath";
        keyToPath = new HalfDiskHashMap(
                database.getConfig(),
                tableConfig.getMaxNumberOfKeys(),
                dbPaths.keyToPathDirectory,
                keyToPathStoreName,
                tableName + ":objectKeyToPath",
                tableConfig.isPreferDiskBasedIndices());
        keyToPathFileCompactor = new DataFileCompactor(
                database.getConfig(),
                keyToPathStoreName,
                keyToPath.getFileCollection(),
                keyToPath.getBucketIndexToBucketLocation(),
                statisticsUpdater::setLeafKeysStoreCompactionTimeMs,
                statisticsUpdater::setLeafKeysStoreCompactionSavedSpaceMb,
                statisticsUpdater::setLeafKeysStoreFileSizeByLevelMb,
                updateTotalStatsFunction);
        keyToPath.printStats();

        final LoadedDataCallback leafRecordLoadedCallback;
        final boolean needRestorePathToDiskLocationLeafNodes = pathToDiskLocationLeafNodes.size() == 0;
        if (needRestorePathToDiskLocationLeafNodes) {
            if (validLeafPathRange.getMaxValidKey() >= 0) {
                pathToDiskLocationLeafNodes.updateValidRange(
                        validLeafPathRange.getMinValidKey(), validLeafPathRange.getMaxValidKey());
            }
            leafRecordLoadedCallback = (dataLocation, leafData) -> {
                final VirtualLeafBytes leafBytes = VirtualLeafBytes.parseFrom(leafData);
                pathToDiskLocationLeafNodes.put(leafBytes.path(), dataLocation);
            };
        } else {
            leafRecordLoadedCallback = null;
        }
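
        // As with the hash index above, a non-null callback means the leaf index was empty (a new
        // database or an enforced rebuild), so every stored leaf is replayed through the callback
        // at load time to rebuild the path-to-disk-location mapping.
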
        // Create the path to key/value store; this creates a new store or loads an existing one if files exist
        final String pathToKeyValueStoreName = tableName + "_pathtohashkeyvalue";
        pathToKeyValue = new MemoryIndexDiskKeyValueStore(
                database.getConfig(),
                dbPaths.pathToKeyValueDirectory,
                pathToKeyValueStoreName,
                tableName + ":pathToHashKeyValue",
                leafRecordLoadedCallback,
                pathToDiskLocationLeafNodes);
        final DataFileCompactor pathToKeyValueFileCompactor = new DataFileCompactor(
                database.getConfig(),
                pathToKeyValueStoreName,
                pathToKeyValue.getFileCollection(),
                pathToDiskLocationLeafNodes,
                statisticsUpdater::setLeavesStoreCompactionTimeMs,
                statisticsUpdater::setLeavesStoreCompactionSavedSpaceMb,
                statisticsUpdater::setLeavesStoreFileSizeByLevelMb,
                updateTotalStatsFunction);

        // Leaf records cache
        leafRecordCacheSize = database.getConfig().leafRecordCacheSize();
        leafRecordCache = (leafRecordCacheSize > 0) ? new VirtualLeafBytes[leafRecordCacheSize] : null;

        // Update count of open databases
        COUNT_OF_OPEN_DATABASES.increment();

        logger.info(
                MERKLE_DB.getMarker(),
                "Created MerkleDB [{}] with store path '{}', maxNumKeys = {}, hash RAM/disk cutoff" + " = {}",
                tableName,
                storageDir,
                tableConfig.getMaxNumberOfKeys(),
                tableConfig.getHashesRamToDiskThreshold());

        compactionCoordinator = new MerkleDbCompactionCoordinator(
                tableName, keyToPathFileCompactor, hashStoreDiskFileCompactor, pathToKeyValueFileCompactor);

        if (compactionEnabled) {
            enableBackgroundCompaction();
        }
    }

    /**
     * Enables background compaction process.
     */
    @Override
    public void enableBackgroundCompaction() {
        compactionCoordinator.enableBackgroundCompaction();
    }

    /**
     * Stop the background compaction process, interrupting the current compaction if one is
     * happening. This will not corrupt the database but will leave files around.
     */
    @Override
    public void stopAndDisableBackgroundCompaction() {
        compactionCoordinator.stopAndDisableBackgroundCompaction();
    }

    /**
     * Get the count of open database instances. That is, databases that have been opened but not
     * yet closed.
     *
     * @return Count of open databases.
     */
    public static long getCountOfOpenDatabases() {
        return COUNT_OF_OPEN_DATABASES.sum();
    }

    /** Get the most recent first leaf path */
    public long getFirstLeafPath() {
        return validLeafPathRange.getMinValidKey();
    }

    /** Get the most recent last leaf path */
    public long getLastLeafPath() {
        return validLeafPathRange.getMaxValidKey();
    }

    /**
     * Pauses compaction of all data file collections used by this data source. It may not stop
     * compaction immediately, but as soon as the compaction process needs to update data source
     * state that is critical for snapshots (e.g. an index), compaction will be blocked until
     * {@link #resumeCompaction()} is called.
     */
    void pauseCompaction() throws IOException {
        compactionCoordinator.pauseCompaction();
    }

    /** Resumes previously paused data file collection compaction. */
    void resumeCompaction() throws IOException {
        compactionCoordinator.resumeCompaction();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Deprecated
    @SuppressWarnings("rawtypes")
    public KeySerializer getKeySerializer() {
        return tableConfig.getKeySerializer();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    @Deprecated
    @SuppressWarnings("rawtypes")
    public ValueSerializer getValueSerializer() {
        return tableConfig.getValueSerializer();
    }

    /**
     * Save a batch of data to the data store.
     * <p>
     * If you call this method without providing all the data needed to cover the change in
     * firstLeafPath and lastLeafPath, then any reads after this call may return rubbish or throw
     * obscure exceptions for any internals or leaves that have not been written. For example, if
     * you were to grow the tree by more than 2x and then called this method in batches, be aware
     * that if you were to query for some record between batches that hadn't yet been saved, you
     * will encounter problems.
     *
     * @param firstLeafPath the tree path for the first leaf
     * @param lastLeafPath the tree path for the last leaf
     * @param hashRecordsToUpdate stream of records with hashes to update; it is assumed this is
     *     sorted by path and each path only appears once
     * @param leafRecordsToAddOrUpdate stream of new and updated leaf nodes
     * @param leafRecordsToDelete stream of leaf nodes to delete. The leaf record's key and path
     *     have to be populated; all other data can be null
     * @param isReconnectContext if true, the method is called in the context of reconnect
     * @throws IOException If there was a problem saving changes to the data source
     */
    @Override
    public void saveRecords(
            final long firstLeafPath,
            final long lastLeafPath,
            @NonNull final Stream<VirtualHashRecord> hashRecordsToUpdate,
            @NonNull final Stream<VirtualLeafBytes> leafRecordsToAddOrUpdate,
            @NonNull final Stream<VirtualLeafBytes> leafRecordsToDelete,
            final boolean isReconnectContext)
            throws IOException {
        try {
            validLeafPathRange = new KeyRange(firstLeafPath, lastLeafPath);
            final CountDownLatch countDownLatch = new CountDownLatch(lastLeafPath > 0 ? 2 : 1);
            if (lastLeafPath > 0) {
                // Use an executor to make sure the data source is not closed in parallel. See
                // the comment in close() for details
                storeHashesExecutor.execute(() -> {
                    try {
                        writeHashes(lastLeafPath, hashRecordsToUpdate);
                    } catch (final IOException e) {
                        logger.error(EXCEPTION.getMarker(), "[{}] Failed to store hashes", tableName, e);
                        throw new UncheckedIOException(e);
                    } finally {
                        countDownLatch.countDown();
                    }
                });
            }
            // Use an executor to make sure the data source is not closed in parallel. See
            // the comment in close() for details
            storeLeavesExecutor.execute(() -> {
                try {
                    // we might as well do this in the archive thread rather than leaving it waiting
                    writeLeavesToPathToKeyValue(
                            firstLeafPath,
                            lastLeafPath,
                            leafRecordsToAddOrUpdate,
                            leafRecordsToDelete,
                            isReconnectContext);
                } catch (final IOException e) {
                    logger.error(EXCEPTION.getMarker(), "[{}] Failed to store leaves", tableName, e);
                    throw new UncheckedIOException(e);
                } finally {
                    countDownLatch.countDown();
                }
            });
            // Wait for the other threads in the rare case they are not finished yet. We need to
            // have all writing done before we return, as when we return, the state version we are
            // writing is deleted from the cache and the flood gates are opened for reads through
            // to the data we have written here.
            try {
                countDownLatch.await();
            } catch (final InterruptedException e) {
                logger.warn(
                        EXCEPTION.getMarker(),
                        "[{}] Interrupted while waiting on internal record storage",
                        tableName,
                        e);
                Thread.currentThread().interrupt();
            }
        } finally {
            // Report total size on disk as the sum of all store files. All metadata and other
            // helper files are considered small enough to be ignored. If/when we decide to use
            // on-disk long lists for indices, they should be added here
            statisticsUpdater.updateStoreFileStats(this);
            // update off-heap stats
            statisticsUpdater.updateOffHeapStats(this);
        }
    }
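
    // Note on saveRecords() above: hash and leaf flushes run on the two dedicated single-thread
    // executors, and the caller blocks on a latch until both finish, so all writes are complete
    // before the method returns. For an empty tree (lastLeafPath <= 0) only the leaf task is
    // submitted, hence the latch count of 1.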

    /**
     * Load a leaf record by key.
     *
     * @param keyBytes the key of the leaf to load the record for
     * @return loaded record or null if not found
     * @throws IOException If there was a problem reading the record from db
     */
    @Nullable
    @Override
    public VirtualLeafBytes loadLeafRecord(final Bytes keyBytes, final int keyHashCode) throws IOException {
        requireNonNull(keyBytes);

        final long path;
        VirtualLeafBytes cached = null;
        int cacheIndex = -1;
        if (leafRecordCache != null) {
            cacheIndex = Math.abs(keyHashCode % leafRecordCacheSize);
            // No synchronization is needed here. Java guarantees (JLS 17.7) that reference writes
            // are atomic, so we will never get corrupted objects from the array. The object may
            // be overwritten in the cache in a different thread in parallel, but it isn't a
            // problem as the cached entry key is checked below anyway
            cached = leafRecordCache[cacheIndex];
        }
        // If an entry is found in the cache, and the entry key is the one requested
        if ((cached != null) && keyBytes.equals(cached.keyBytes())) {
            // Some cache entries contain just key and path, but no value. If the value is there,
            // just return the cached entry. If not, at least make use of the path
            if (cached.valueBytes() != null) {
                return cached;
            }
            // Note that the path may be INVALID_PATH here, this is perfectly legal
            path = cached.path();
        } else {
            // Cache miss
            cached = null;
            statisticsUpdater.countLeafKeyReads();
            path = keyToPath.get(keyBytes, keyHashCode, INVALID_PATH);
        }

        // If the key didn't map to anything, we just return null
        if (path == INVALID_PATH) {
            // Cache the result if not already cached
            if (leafRecordCache != null && cached == null) {
                leafRecordCache[cacheIndex] = new VirtualLeafBytes(path, keyBytes, 0, null);
            }
            return null;
        }

        // If the key returns a value from the map, but it lies outside the first/last
        // leaf path, then return null. This can happen if the map contains old keys
        // that haven't been removed.
        if (!validLeafPathRange.withinRange(path)) {
            return null;
        }

        statisticsUpdater.countLeafReads();
        // Go ahead and lookup the value.
        final VirtualLeafBytes leafBytes = VirtualLeafBytes.parseFrom(pathToKeyValue.get(path));
        assert leafBytes != null && leafBytes.keyBytes().equals(keyBytes);

        if (leafRecordCache != null) {
            // No synchronization is needed here, see the comment above
            leafRecordCache[cacheIndex] = leafBytes;
        }

        return leafBytes;
    }

    /**
     * Load a leaf record by path. This method returns {@code null}, if the path is outside the
     * valid path range.
     *
     * @param path the path for the leaf we are loading
     * @return loaded record or null if not found
     * @throws IOException If there was a problem reading the record from db
     */
    @Nullable
    @Override
    public VirtualLeafBytes loadLeafRecord(final long path) throws IOException {
        if (path < 0) {
            throw new IllegalArgumentException("Path (" + path + ") is not valid");
        }
        final KeyRange leafPathRange = validLeafPathRange;
        if (!leafPathRange.withinRange(path)) {
            return null;
        }
        statisticsUpdater.countLeafReads();
        return VirtualLeafBytes.parseFrom(pathToKeyValue.get(path));
    }

    /**
     * Find the path of the given key.
     *
     * @param keyBytes the key for a path
     * @return the path or INVALID_PATH if not stored
     * @throws IOException If there was a problem locating the key
     */
    @Override
    public long findKey(final Bytes keyBytes, final int keyHashCode) throws IOException {
        requireNonNull(keyBytes);

        // Check the cache first
        int cacheIndex = -1;
        if (leafRecordCache != null) {
            cacheIndex = Math.abs(keyHashCode % leafRecordCacheSize);
            // No synchronization is needed here. See the comment in loadLeafRecord(key) above
            final VirtualLeafBytes cached = leafRecordCache[cacheIndex];
            if (cached != null && keyBytes.equals(cached.keyBytes())) {
                // Cached path may be a valid path or INVALID_PATH, both are legal here
                return cached.path();
            }
        }

        statisticsUpdater.countLeafKeyReads();
        final long path = keyToPath.get(keyBytes, keyHashCode, INVALID_PATH);

        if (leafRecordCache != null) {
            // Path may be INVALID_PATH here. Still needs to be cached (negative result)
            leafRecordCache[cacheIndex] = new VirtualLeafBytes(path, keyBytes, keyHashCode, null);
        }

        return path;
    }

    /**
     * {@inheritDoc}
     */
    @Nullable
    @Override
    public Hash loadHash(final long path) throws IOException {
        if (path < 0) {
            throw new IllegalArgumentException("Path (" + path + ") is not valid");
        }
        // It is possible that the caller will ask for an internal node that the database doesn't
        // know about. This can happen if some leaves have been added to the tree, but we haven't
        // hashed yet, so the cache doesn't have any internal records for it, and somebody
        // tries to iterate over the nodes in the tree.
        final long lastLeaf = validLeafPathRange.getMaxValidKey();
        if (path > lastLeaf) {
            return null;
        }

        final Hash hash;
        if (path < tableConfig.getHashesRamToDiskThreshold()) {
            hash = hashStoreRam.get(path);
            // Should count hash reads here, too?
        } else {
            final VirtualHashRecord rec = VirtualHashRecord.parseFrom(hashStoreDisk.get(path));
            hash = (rec != null) ? rec.hash() : null;
            statisticsUpdater.countHashReads();
        }

        return hash;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean loadAndWriteHash(final long path, final SerializableDataOutputStream out) throws IOException {
        if (path < 0) {
            throw new IllegalArgumentException("path is less than 0");
        }
        final long lastLeaf = validLeafPathRange.getMaxValidKey();
        if (path > lastLeaf) {
            return false;
        }
        // This method must write hashes in the same binary format as Hash.(de)serialize(). If a
        // hash comes from hashStoreRam, it's enough to just serialize it to the output stream.
        // However, if a hash is stored in the files as a VirtualHashRecord, its bytes are
        // slightly different, so additional processing is required
        if (path < tableConfig.getHashesRamToDiskThreshold()) {
            final Hash hash = hashStoreRam.get(path);
            if (hash == null) {
                return false;
            }
            hash.serialize(out);
        } else {
            final BufferedData hashBytes = hashStoreDisk.get(path);
            if (hashBytes == null) {
                return false;
            }
            // Hash.serialize() format is: digest ID (4 bytes) + size (4 bytes) + hash (48 bytes)
            VirtualHashRecord.extractAndWriteHashBytes(hashBytes, out);
        }
        return true;
    }

    /** Wait for any merges to finish, then close all data stores and free all resources. */
    @Override
    public void close() throws IOException {
        if (!closed.getAndSet(true)) {
            try {
                // Stop merging and shutdown the datasource compactor
                compactionCoordinator.stopAndDisableBackgroundCompaction();
                // Shut down all executors. If a flush is currently in progress, it will be
                // interrupted. It's critical to make sure there are no disk read/write operations
                // before all indices and file collections are closed below
                shutdownThreadsAndWait(storeHashesExecutor, storeLeavesExecutor, snapshotExecutor);
            } finally {
                try {
                    // close all closable data stores
                    logger.info(MERKLE_DB.getMarker(), "Closing Data Source [{}]", tableName);
                    // Hashes store
                    if (hashStoreRam != null) {
                        hashStoreRam.close();
                    }
                    if (hashStoreDisk != null) {
                        hashStoreDisk.close();
                    }
                    // Then hashes index
                    pathToDiskLocationInternalNodes.close();
                    // Key to paths, both store and index
                    keyToPath.close();
                    // Leaves store
                    pathToKeyValue.close();
                    // Then leaves index
                    pathToDiskLocationLeafNodes.close();
                } catch (final Exception e) {
                    logger.warn(EXCEPTION.getMarker(), "Exception while closing Data Source [{}]", tableName, e);
                } catch (final Error t) {
                    logger.error(EXCEPTION.getMarker(), "Error while closing Data Source [{}]", tableName, t);
                    throw t;
                } finally {
                    // update the count of open databases
                    COUNT_OF_OPEN_DATABASES.decrement();
                    // Notify the database
                    database.closeDataSource(this);
                }
            }
        }
    }

    /**
     * Write a snapshot of the current state of the database at this moment in time. This will
     * block until the snapshot is completely created.
     * <p>
     * Only one snapshot can happen at a time; this will throw an IllegalStateException if
     * another snapshot is currently happening.
     * <p>
     * IMPORTANT: after this is completed, the caller owns the directory. It is responsible
     * for deleting it when it is no longer needed.
     *
     * @param snapshotDirectory Directory to put the snapshot into; it will be created if it
     *     doesn't exist
     * @throws IOException If there was a problem writing the current database out to the given
     *     directory
     * @throws IllegalStateException If there is already a snapshot happening
     */
    @SuppressWarnings("ConstantConditions")
    @Override
    public void snapshot(final Path snapshotDirectory) throws IOException, IllegalStateException {
        // check if another snapshot was running
        final boolean aSnapshotWasInProgress = snapshotInProgress.getAndSet(true);
        if (aSnapshotWasInProgress) {
            throw new IllegalStateException("Tried to start a snapshot when one was already in progress");
        }
        try {
            // start timing the snapshot
            final long START = System.currentTimeMillis();
            // create the snapshot dir if it doesn't exist
            Files.createDirectories(snapshotDirectory);
            final MerkleDbPaths snapshotDbPaths = new MerkleDbPaths(snapshotDirectory);
            // main snapshotting process in multiple threads
            try {
                final CountDownLatch countDownLatch = new CountDownLatch(7);
                // write all data stores
                runWithSnapshotExecutor(true, countDownLatch, "pathToDiskLocationInternalNodes", () -> {
                    pathToDiskLocationInternalNodes.writeToFile(snapshotDbPaths.pathToDiskLocationInternalNodesFile);
                    return true;
                });
                runWithSnapshotExecutor(true, countDownLatch, "pathToDiskLocationLeafNodes", () -> {
                    pathToDiskLocationLeafNodes.writeToFile(snapshotDbPaths.pathToDiskLocationLeafNodesFile);
                    return true;
                });
                runWithSnapshotExecutor(hashStoreRam != null, countDownLatch, "internalHashStoreRam", () -> {
                    hashStoreRam.writeToFile(snapshotDbPaths.hashStoreRamFile);
                    return true;
                });
                runWithSnapshotExecutor(hashStoreDisk != null, countDownLatch, "internalHashStoreDisk", () -> {
                    hashStoreDisk.snapshot(snapshotDbPaths.hashStoreDiskDirectory);
                    return true;
                });
                runWithSnapshotExecutor(keyToPath != null, countDownLatch, "keyToPath", () -> {
                    keyToPath.snapshot(snapshotDbPaths.keyToPathDirectory);
                    return true;
                });
                runWithSnapshotExecutor(true, countDownLatch, "pathToKeyValue", () -> {
                    pathToKeyValue.snapshot(snapshotDbPaths.pathToKeyValueDirectory);
                    return true;
                });
                runWithSnapshotExecutor(true, countDownLatch, "metadata", () -> {
                    saveMetadata(snapshotDbPaths);
                    return true;
                });
                // wait for the tasks to finish
                countDownLatch.await();
            } catch (final InterruptedException e) {
                logger.error(
                        EXCEPTION.getMarker(),
                        "[{}] InterruptedException from waiting for countDownLatch in snapshot",
                        tableName,
                        e);
                Thread.currentThread().interrupt();
            }
            logger.info(
                    MERKLE_DB.getMarker(),
                    "[{}] Snapshot all finished in {} seconds",
                    tableName,
                    (System.currentTimeMillis() - START) * UnitConstants.MILLISECONDS_TO_SECONDS);
        } finally {
            snapshotInProgress.set(false);
        }
    }

    /** toString for debugging */
    @Override
    public String toString() {
        return new ToStringBuilder(this)
                .append("maxNumberOfKeys", tableConfig.getMaxNumberOfKeys())
                .append("preferDiskBasedIndexes", tableConfig.isPreferDiskBasedIndices())
                .append("pathToDiskLocationInternalNodes.size", pathToDiskLocationInternalNodes.size())
                .append("pathToDiskLocationLeafNodes.size", pathToDiskLocationLeafNodes.size())
                .append("hashesRamToDiskThreshold", tableConfig.getHashesRamToDiskThreshold())
                .append("hashStoreRam.size", hashStoreRam == null ? null : hashStoreRam.size())
                .append("hashStoreDisk", hashStoreDisk)
                .append("hasDiskStoreForHashes", hasDiskStoreForHashes)
                .append("keyToPath", keyToPath)
                .append("pathToKeyValue", pathToKeyValue)
                .append("snapshotInProgress", snapshotInProgress.get())
                .toString();
    }

    /**
     * Database instance that hosts this data source.
     *
     * @return Virtual database instance
     */
    public MerkleDb getDatabase() {
        return database;
    }

    /**
     * Table ID for this data source in its virtual database instance.
     *
     * @return Table ID
     */
    public int getTableId() {
        return tableId;
    }

    /**
     * Table name for this data source in its virtual database instance.
     *
     * @return Table name
     */
    public String getTableName() {
        return tableName;
    }

    /**
     * Table config for this data source. Includes key and value serializers, internal nodes
     * RAM/disk threshold, etc.
     *
     * @return Table config
     */
    public MerkleDbTableConfig getTableConfig() {
        return tableConfig;
    }

    // For testing purposes
    Path getStorageDir() {
        return dbPaths.storageDir;
    }

    // For testing purposes
    long getMaxNumberOfKeys() {
        return tableConfig.getMaxNumberOfKeys();
    }

    // For testing purposes
    long getHashesRamToDiskThreshold() {
        return tableConfig.getHashesRamToDiskThreshold();
    }

    // For testing purposes
    boolean isPreferDiskBasedIndexes() {
        return tableConfig.isPreferDiskBasedIndices();
    }

    // For testing purposes
    boolean isCompactionEnabled() {
        return compactionCoordinator.isCompactionEnabled();
    }

    private void saveMetadata(final MerkleDbPaths targetDir) throws IOException {
        final KeyRange leafRange = validLeafPathRange;
        final Path targetFile = targetDir.metadataFile;
        try (final OutputStream fileOut =
                Files.newOutputStream(targetFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
            final WritableSequentialData out = new WritableStreamingData(fileOut);
            if (leafRange.getMinValidKey() != 0) {
                ProtoWriterTools.writeTag(out, FIELD_DSMETADATA_MINVALIDKEY);
                out.writeVarLong(leafRange.getMinValidKey(), false);
            }
            if (leafRange.getMaxValidKey() != 0) {
                ProtoWriterTools.writeTag(out, FIELD_DSMETADATA_MAXVALIDKEY);
                out.writeVarLong(leafRange.getMaxValidKey(), false);
            }
            fileOut.flush();
        }
    }

    private boolean loadMetadata(final MerkleDbPaths sourceDir) throws IOException {
        if (Files.exists(sourceDir.metadataFile)) {
            final Path sourceFile = sourceDir.metadataFile;
            long minValidKey = 0;
            long maxValidKey = 0;
            try (final ReadableStreamingData in = new ReadableStreamingData(sourceFile)) {
                while (in.hasRemaining()) {
                    final int tag = in.readVarInt(false);
                    final int fieldNum = tag >> TAG_FIELD_OFFSET;
                    if (fieldNum == FIELD_DSMETADATA_MINVALIDKEY.number()) {
                        minValidKey = in.readVarLong(false);
                    } else if (fieldNum == FIELD_DSMETADATA_MAXVALIDKEY.number()) {
                        maxValidKey = in.readVarLong(false);
                    } else {
                        throw new IllegalArgumentException("Unknown data source metadata field: " + fieldNum);
                    }
                }
                validLeafPathRange = new KeyRange(minValidKey, maxValidKey);
            }
            Files.delete(sourceFile);
            return true;
        }
        return false;
    }

    /** {@inheritDoc} */
    @Override
    public void registerMetrics(final Metrics metrics) {
        statisticsUpdater.registerMetrics(metrics);
    }

    /** {@inheritDoc} */
    @Override
    public void copyStatisticsFrom(final VirtualDataSource that) {
        if (!(that instanceof MerkleDbDataSource thatDataSource)) {
            logger.warn(MERKLE_DB.getMarker(), "Can only copy statistics from MerkleDbDataSource");
            return;
        }
        statisticsUpdater = thatDataSource.statisticsUpdater;
    }

    // ==================================================================================================================
    // private methods

    /**
     * Shutdown threads if they are running and wait for them to finish.
     *
     * @param executors array of executors to shut down
     * @throws IOException if there was a problem or timeout shutting down threads
     */
    private void shutdownThreadsAndWait(final ExecutorService... executors) throws IOException {
        try {
            // shutdown threads
            for (final ExecutorService executor : executors) {
                if (!executor.isShutdown()) {
                    executor.shutdown();
                    final boolean finishedWithoutTimeout = executor.awaitTermination(5, TimeUnit.MINUTES);
                    if (!finishedWithoutTimeout) {
                        throw new IOException("Timeout while waiting for executor service to finish.");
                    }
                }
            }
        } catch (final InterruptedException e) {
            logger.warn(EXCEPTION.getMarker(), "[{}] Interrupted while waiting on executors to shutdown", tableName, e);
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for shutdown to finish.", e);
        }
    }

    /**
     * Run a runnable on a background thread using the snapshot ExecutorService, counting down the
     * latch when done.
     *
     * @param shouldRun when true, run the runnable, otherwise just count down the latch
     * @param countDownLatch latch to count down when done
     * @param taskName the name of the task for logging
     * @param runnable the code to run
     */
    private void runWithSnapshotExecutor(
            final boolean shouldRun,
            final CountDownLatch countDownLatch,
            final String taskName,
            final Callable<Boolean> runnable) {
        if (shouldRun) {
            snapshotExecutor.submit(() -> {
                final long START = System.currentTimeMillis();
                try {
                    runnable.call();
                    logger.trace(
                            MERKLE_DB.getMarker(),
                            "[{}] Snapshot {} complete in {} seconds",
                            tableName,
                            taskName,
                            (System.currentTimeMillis() - START) * UnitConstants.MILLISECONDS_TO_SECONDS);
                    return true; // turns this into a callable, so it can throw checked exceptions
                } finally {
                    countDownLatch.countDown();
                }
            });
        } else {
            countDownLatch.countDown();
        }
    }

    /** Write all hashes to hashStore */
    private void writeHashes(final long maxValidPath, final Stream<VirtualHashRecord> dirtyHashes) throws IOException {
        if (hasDiskStoreForHashes) {
            if (maxValidPath < 0) {
                // Empty store
                hashStoreDisk.updateValidKeyRange(-1, -1);
            } else {
                hashStoreDisk.updateValidKeyRange(0, maxValidPath);
            }
        }
        if ((dirtyHashes == null) || (maxValidPath < 0)) {
            // nothing to do
            return;
        }
        if (hasDiskStoreForHashes) {
            hashStoreDisk.startWriting();
        }
        dirtyHashes.forEach(rec -> {
            statisticsUpdater.countFlushHashesWritten();
            if (rec.path() < tableConfig.getHashesRamToDiskThreshold()) {
                hashStoreRam.put(rec.path(), rec.hash());
            } else {
                try {
                    hashStoreDisk.put(rec.path(), rec::writeTo, rec.getSizeInBytes());
                } catch (final IOException e) {
                    logger.error(EXCEPTION.getMarker(), "[{}] IOException writing internal records", tableName, e);
                    throw new UncheckedIOException(e);
                }
            }
        });
        if (hasDiskStoreForHashes) {
            final DataFileReader newHashesFile = hashStoreDisk.endWriting();
            statisticsUpdater.setFlushHashesStoreFileSize(newHashesFile);
            compactionCoordinator.compactDiskStoreForHashesAsync();
        }
    }

    /** Write all the given leaf records to pathToKeyValue */
    private void writeLeavesToPathToKeyValue(
            final long firstLeafPath,
            final long lastLeafPath,
            @NonNull final Stream<VirtualLeafBytes> dirtyLeaves,
            @NonNull final Stream<VirtualLeafBytes> deletedLeaves,
            final boolean isReconnect)
            throws IOException {
        // If both streams are empty, no new data files should be created. One simple way to
        // check emptiness is to use iterators. The streams aren't processed in parallel anyway
        final Iterator<VirtualLeafBytes> dirtyIterator = dirtyLeaves
                .sorted(Comparator.comparingLong(VirtualLeafBytes::path))
                .iterator();
        final Iterator<VirtualLeafBytes> deletedIterator = deletedLeaves.iterator();

        if (lastLeafPath < 0) {
            // Empty store
            pathToKeyValue.updateValidKeyRange(-1, -1);
        } else {
            pathToKeyValue.updateValidKeyRange(firstLeafPath, lastLeafPath);
        }

        if (!dirtyIterator.hasNext() && !deletedIterator.hasNext()) {
            // Nothing to do
            return;
        }

        pathToKeyValue.startWriting();
        keyToPath.startWriting();

        // Iterate over dirty leaf records
        while (dirtyIterator.hasNext()) {
            final VirtualLeafBytes leafBytes = dirtyIterator.next();
            final long path = leafBytes.path();
            // Update the key to path index
            keyToPath.put(leafBytes.keyBytes(), leafBytes.keyHashCode(), path);
            statisticsUpdater.countFlushLeafKeysWritten();
            // Update the path to K/V store
            try {
                pathToKeyValue.put(leafBytes.path(), leafBytes::writeTo, leafBytes.getSizeInBytes());
            } catch (final IOException e) {
                logger.error(EXCEPTION.getMarker(), "[{}] IOException writing to pathToKeyValue", tableName, e);
                throw new UncheckedIOException(e);
            }
            statisticsUpdater.countFlushLeavesWritten();
            // Invalidate the cached entry for this key, if any
            invalidateReadCache(leafBytes.keyBytes(), leafBytes.keyHashCode());
        }

        // Iterate over leaf records to delete
        while (deletedIterator.hasNext()) {
            final VirtualLeafBytes leafBytes = deletedIterator.next();
            final long path = leafBytes.path();
            // Update the key to path index. In some cases (e.g. during reconnect), some leaves in
            // the deletedLeaves stream have been moved to different paths in the tree. This is a
            // good indication that these leaves should not be deleted. This is why deleteIfEqual()
            // is used below rather than an unconditional delete() as for the dirtyLeaves stream
            // above
            if (isReconnect) {
                keyToPath.deleteIfEqual(leafBytes.keyBytes(), leafBytes.keyHashCode(), path);
            } else {
                keyToPath.delete(leafBytes.keyBytes(), leafBytes.keyHashCode());
            }
            statisticsUpdater.countFlushLeavesDeleted();
            // We don't need to explicitly delete leaves from pathToKeyValue, as they will be
            // deleted on the next merge based on the range of valid leaf paths. If a leaf at
            // path X is deleted and then a new leaf is inserted at path X, the record is just
            // updated to the new leaf's data.
            // Delete the record from the cache
            invalidateReadCache(leafBytes.keyBytes(), leafBytes.keyHashCode());
        }

        // end writing
        final DataFileReader pathToKeyValueReader = pathToKeyValue.endWriting();
        statisticsUpdater.setFlushLeavesStoreFileSize(pathToKeyValueReader);
        compactionCoordinator.compactPathToKeyValueAsync();
        final DataFileReader keyToPathReader = keyToPath.endWriting();
        statisticsUpdater.setFlushLeafKeysStoreFileSize(keyToPathReader);
        compactionCoordinator.compactDiskStoreForKeyToPathAsync();
    }

    /**
     * Invalidates the given key in the virtual leaf record cache, if the cache is enabled.
     * <p>
     * If the key is deleted, it's still updated in the cache. It means no record with the given
     * key exists in the data source, so further lookups for the key are skipped.
     * <p>
     * The cache index is calculated as the key's hash code % cache size. The cache is only
     * updated if the current record at this index has the given key. If the key is different,
     * no update is performed.
     *
     * @param keyBytes virtual key
     * @param keyHashCode virtual key hash code
     */
    private void invalidateReadCache(final Bytes keyBytes, final int keyHashCode) {
        if (leafRecordCache == null) {
            return;
        }
        final int cacheIndex = Math.abs(keyHashCode % leafRecordCacheSize);
        final VirtualLeafBytes cached = leafRecordCache[cacheIndex];
        if ((cached != null) && keyBytes.equals(cached.keyBytes())) {
            leafRecordCache[cacheIndex] = null;
        }
    }

    FileStatisticAware getHashStoreDisk() {
        return hashStoreDisk;
    }

    FileStatisticAware getKeyToPath() {
        return keyToPath;
    }

    FileStatisticAware getPathToKeyValue() {
        return pathToKeyValue;
    }

    MerkleDbCompactionCoordinator getCompactionCoordinator() {
        return compactionCoordinator;
    }

    OffHeapUser getHashStoreRam() {
        return hashStoreRam;
    }

    LongList getPathToDiskLocationInternalNodes() {
        return pathToDiskLocationInternalNodes;
    }

    LongList getPathToDiskLocationLeafNodes() {
        return pathToDiskLocationLeafNodes;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int hashCode() {
        return Objects.hash(database, tableId);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean equals(final Object o) {
        if (!(o instanceof MerkleDbDataSource other)) {
            return false;
        }
        return Objects.equals(database, other.database) && Objects.equals(tableId, other.tableId);
    }
}
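
For orientation, the following is a minimal lifecycle sketch of how this data source might be used. It is illustrative only: the MerkleDb instance and MerkleDbTableConfig are assumed to be obtained through APIs outside this file, and the table name "accounts", table ID 1, and all parameter values are placeholders.

static void lifecycleSketch(
        final MerkleDb database,
        final MerkleDbTableConfig tableConfig,
        final long firstLeafPath,
        final long lastLeafPath,
        final Stream<VirtualHashRecord> hashes,
        final Stream<VirtualLeafBytes> dirtyLeaves,
        final Bytes aKey,
        final int aKeyHashCode,
        final Path snapshotDir)
        throws IOException {
    // Open (or load) the table; `true` enables background compaction
    final MerkleDbDataSource dataSource =
            new MerkleDbDataSource(database, "accounts", 1, tableConfig, true);
    try {
        // Flush one batch: hashes sorted by path, dirty leaves, no deletions,
        // not in a reconnect context
        dataSource.saveRecords(
                firstLeafPath, lastLeafPath, hashes, dirtyLeaves, Stream.empty(), false);
        // Point reads by key and by path
        final VirtualLeafBytes leaf = dataSource.loadLeafRecord(aKey, aKeyHashCode);
        final Hash hash = dataSource.loadHash(0);
        // Snapshot: after this returns, the caller owns snapshotDir
        dataSource.snapshot(snapshotDir);
    } finally {
        dataSource.close();
    }
}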