/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.evictor;

import static com.sleepycat.je.EnvironmentFailureException.unexpectedState;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.ALLOC_FAILURE;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.ALLOC_OVERFLOW;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.BINS_LOADED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.BINS_STORED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.CACHED_BINS;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.CACHED_BIN_DELTAS;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.CACHED_LNS;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.CRITICAL_NODES_TARGETED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.DIRTY_NODES_EVICTED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.GROUP_DESC;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.GROUP_NAME;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.LNS_EVICTED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.LNS_LOADED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.LNS_STORED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.LRU_SIZE;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.NODES_EVICTED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.NODES_MUTATED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.NODES_SKIPPED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.NODES_STRIPPED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.NODES_TARGETED;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.THREAD_UNAVAILABLE;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.TOTAL_BLOCKS;
import static com.sleepycat.je.evictor.OffHeapStatDefinition.TOTAL_BYTES;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.Checksum;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.dbi.MemoryBudget;
import com.sleepycat.je.evictor.Evictor.EvictionSource;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.Provisional;
import com.sleepycat.je.log.entry.BINDeltaLogEntry;
import com.sleepycat.je.log.entry.INLogEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.utilint.Adler32;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.IntStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.Pair;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.StoppableThreadFactory;
import com.sleepycat.je.utilint.VLSN;

/**
 * Off-heap cache and evictor.
 *
 * Overview
 * --------
 * When an LN or BIN is evicted from the main cache it is moved off-heap. The
 * off-heap evictor (this class) will apply the same LRU algorithm and
 * CacheMode logic that is used by the main evictor. When an off-heap cache is
 * used, the main evictor will not place dirty INs on a priority 2 LRU list,
 * and will not perform LN stripping or BIN delta mutation; instead, these
 * actions become the responsibility of the off-heap evictor.
 *
 * UINs are not stored off-heap because the complexity this would add is not
 * worth the benefits. An extremely large data set can be represented by the
 * UINs that fit in a 10GB main cache, so the lack of off-heap UINs is not
 * considered a deficiency.
 *
 * Movement of LNs and BINs between the two caches is performed as follows.
 * Note that LNs and BINs are not moved off-heap if they are deleted or if
 * they belong to an internal database. And of course, embedded and duplicate
 * DB LNs are not stored separately off-heap.
 *
 * When an LN or a BIN is evicted from main, it is stored off-heap.
 *
 * If the off-heap memory block cannot be allocated, the object is not stored
 * and no exception is thrown. This prevents allocation failures from causing
 * CRUD operation failures. Stats about allocation failures are maintained and
 * a SEVERE message is logged when the failure is because no more system
 * memory is available. See the OffHeapAllocator interface for details.
 *
 * For an off-heap LN with a parent BIN in main cache, the LN's memory ID is
 * maintained in the BIN. The BIN is assigned an off-heap LRU index so the
 * off-heap evictor can perform off-heap LN stripping. In this case, the BIN
 * is in both a main and off-heap LRU list. For now at least (TODO), because
 * deferred write DBs are not supported, a priority-1 off-heap LRU list is
 * used, since the LNs will never be logged.
 *
 * An off-heap BIN is assigned an off-heap LRU index, which is stored in its
 * parent UIN slot in main cache. The slot also has an "off-heap dirty" flag
 * that allows the checkpointer to discover dirty off-heap BINs, and a
 * "priority 2" flag that indicates whether the BIN is in a priority 1 or
 * priority 2 LRU list.
 *
 * When a BIN moves off-heap and the BIN currently has off-heap LNs, the
 * references (memory IDs) of the off-heap LNs are stored with the serialized
 * BIN. When the off-heap evictor processes the BIN, it will free the
 * off-heap LNs and modify or replace the off-heap BIN so that it no longer
 * references them. This is the equivalent of the LN stripping performed by the
 * main cache evictor.
 *
 * An off-heap BIN will be mutated to a BIN-delta using the same rules used by
 * the main evictor.
 *
 * The use of separate priority 1 and priority 2 LRU lists also copies the
 * approach used in the main evictor (when no off-heap cache is configured).
 *
 *   - Eviction of nodes on the priority 2 lists occurs only after emptying the
 *     priority 1 lists.
 *
 *   - A BIN is moved from a priority 1 list to a priority 2 list when it is
 *     dirty, its LNs have been stripped and BIN-delta mutation has been
 *     attempted.
 *
 *   - Therefore, only dirty BINs with no resident LNs, and which have been
 *     mutated to BIN-deltas (if possible), appear in the priority 2 lists.
 *
 * However, in the off-heap cache, all off-heap BINs appear in an LRU list,
 * unlike the main cache where some INs do not appear because they have
 * resident children and therefore are not evictable.
 *
 * Nodes in both caches at once
 * ----------------------------
 * There are advantages and disadvantages to allowing duplication of a node
 * (LN or BIN) in the off-heap and main cache. The advantage is when a node is
 * loaded from off-heap into main, and we know (because CacheMode.EVICT_LN,
 * EVICT_BIN or UNCHANGED is used) that when the operation is complete the node
 * will be evicted from main and stored off-heap again. In this case it is more
 * efficient to leave it off-heap and tolerate the duplication for the duration
 * of the operation. However, the drawbacks of doing this are:
 *
 * 1. We cannot assume in code that a node is in only one cache at a time. When
 *    it appears in both caches, we must always use the object in the main
 *    cache, since the off-heap object may be stale.
 *
 * 2. If for some reason the node is NOT evicted from the main cache, we must
 *    remove it from off-heap. This can happen when the node is accessed with
 *    a different CacheMode (by the original thread or a different thread)
 *    prior to completing the operation. Removal from the off-heap cache
 *    should be done ASAP, so the duplication does not cause unnecessary
 *    eviction.
 *
 * 3. If the node in the main cache is modified, this invalidates the node in
 *    the off-heap cache and we must be sure not to use the off-heap version
 *    and to remove it ASAP. This is very complex for BINs in particular,
 *    because they can be modified in so many different ways. For LNs, on the
 *    other hand, the types of modifications are fairly limited.
 *
 * Because of the complexity issue in item 3, we do not allow duplication of
 * BINs. We do allow duplication of LNs, and this is handled as follows:
 *
 *  - freeRedundantLN is called when an LN is accessed via IN.fetchLN or getLN.
 *    If a CacheMode is used that will not evict the LN, the LN is removed
 *    from off-heap.
 *
 *  - freeLN is called (via BIN.freeOffHeapLN) during any operation that will
 *    modify an LN.
 *
 * If for some reason these mechanisms fail to prevent unwanted duplication,
 * eviction will eventually remove the redundant nodes.
 *
 * LRU data structures and concurrency control
 * -------------------------------------------
 * LRU entries form linked lists. Like in the main cache, there are two sets of
 * LRU lists for priority 1 and 2 BINs, and multiple lists in each set to
 * reduce thread contention on the linked lists.
 *
 * LRU information is allocated using arrays to minimize per-entry object
 * overhead. There is a single pool of allocated entries that are used for all
 * LRULists. The entries are uniquely identified by an int ID.
 *
 * The arrays are allocated in Chunks and a Chunk is never de-allocated. This
 * is for two reasons:
 *
 *   - Chunks can be referenced without any locking (concurrency control is
 *     discussed below).
 *
 *   - Using Chunks avoids having to pre-allocate all LRU entries, while still
 *     minimizing Object overhead (see CHUNK_SIZE).
 *
 * The 'chunks' array contains all allocated Chunks. In each Chunk there is an
 * array for each field in an LRU entry.
 *
 * LRU entries are assigned sequential int IDs starting from zero. The chunk
 * for a given entry ID is:
 *    chunks[entry / CHUNK_SIZE]
 * and the array index within the chunk is:
 *    entry % CHUNK_SIZE
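 *
 * For example, with CHUNK_SIZE = 100 * 1024 (102,400), entry ID 250,000 maps
 * to chunks[2], at array index 45,200 within that chunk.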
 *
 * The chunks array can be read (indexed to obtain a Chunk object) without any
 * locking because a copy-on-write technique is used. When a new Chunk must be
 * allocated, the addRemoveEntryMutex protects the assignment of the chunks
 * array. This mutex also protects the free entry list (the firstFreeListEntry
 * field and the next/prev indexes of the entries on the free list). This mutex
 * is global per Environment, but is not frequently locked -- only when an LRU
 * entry is added or removed.
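 *
 * (See allocateEntry below: the new Chunk[] is fully built and populated
 * before being assigned to the volatile 'chunks' field, so readers always
 * see a consistent array.)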
 *
 * The fields of an LRU entry -- the array slots -- are protected as follows.
 *
 *   - The linked list fields -- prev and next slots -- are protected by the
 *     LRUList mutex, for entries in an LRUList. For entries on the free list,
 *     these are protected by the addRemoveEntryMutex.
 *
 *   - Other fields -- owners and memIds slots, for example -- are protected by
 *     the IN latch. The IN "owns" these fields for its associated LRU entry
 *     (in the case of a BIN) or entries (in the case of an IN).
 *
 *     Of course the IN latch also protects the fields in the IN related to the
 *     LRU entry: the BIN's lruIdx field, and the arrays of child LN memId
 *     (for a BIN) and child IN lruIdx (for a UIN).
 *
 * When multiple locks are taken, the order is:
 *    IN latch, LRUList mutex
 *    -or-
 *    IN latch, addRemoveEntryMutex
 *
 * The LRUList mutex and addRemoveEntryMutex are never both locked.
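 *
 * A minimal sketch of the first ordering (hypothetical caller code; the
 * latch/release methods are the ones used throughout this class):
 *
 *    in.latchNoUpdateLRU();          // 1. acquire the IN latch
 *    try {
 *        synchronized (lruList) {    // 2. then the LRUList mutex
 *            // update prev/next slots for the IN's LRU entry
 *        }
 *    } finally {
 *        in.releaseLatch();
 *    }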
 *
 * An LRU entry is in a special state when it is removed from the LRU list and
 * is being processed by the evictor. In this case the IN is latched, but there
 * is a window after it is removed and before it is latched where anything can
 * happen. Before processing, several checks are made to ensure that the entry
 * still belongs to the IN, the IN has not been evicted, and the entry has not
 * been put back on the LRUList. This last check requires synchronizing on the
 * LRUList, so unfortunately we must synchronize twice on the LRU list: once to
 * remove the entry, and again after latching the IN to ensure that it has not
 * been put back on the LRUList by another thread.
 *
 * TODO:
 * - Test allocation failures using an allocator that simulates random
 *   failures. Currently, allocation failures never happen in the real world,
 *   because Linux kills the process before memory is exhausted. But when we
 *   allow alternate allocators, failures may occur if a memory pool is filled.
 */
public class OffHeapCache implements EnvConfigObserver {

    private static final int VLSN_SIZE = 8;
    private static final int CHECKSUM_SIZE = 4;
    private static final int MAX_UNUSED_BIN_BYTES = 100;

    private static final int BIN_FLAG_DELTA = 0x1;
    private static final int BIN_FLAG_CAN_MUTATE = 0x2;
    private static final int BIN_FLAG_PROHIBIT_NEXT_DELTA = 0x4;
    private static final int BIN_FLAG_LOGGED_FULL_VERSION = 0x8;

    private static final boolean DEBUG_DOUBLE_FREE = false;
    private static final int DEBUG_FREE_BLOCKS_PER_MAP = 250000; // about 0.5 G

    private static final boolean DEBUG_TRACE = false;
    private static final boolean DEBUG_TRACE_STACK = false;
    private static final boolean DEBUG_TRACE_AND_LOG = false;

    /*
     * Number of LRU entries to allocate at a time, i.e., per chunk.
     * The goals are:
     *
     * 1. Create arrays large enough to make the object overhead insignificant.
     * Even the smallest arrays here (the int[]s, 400KB each) dwarf their
     * object overhead (16 bytes max).
     *
     * 2. Create arrays less than 1MB in size to prevent GC issues.
     * "Humongous" objects, which are expensive from a GC viewpoint, are 1MB or
     * larger. The long[], the largest array, is 800KB with a 100K chunk size.
     *
     * 3. Create chunks small enough that we don't use a big percentage of a
     * smallish heap to allocate one chunk. The total chunk size is a little
     * over 2MB, easily small enough to meet this requirement.
     *
     * 4. Create chunks large enough so that we don't frequently grow the chunk
     * list, which requires holding the free list mutex. 100K entries per chunk
     * is easily enough.
     *
     * 5. Create chunks small enough so that we don't hold the free list mutex
     * for too long while adding all the entries in a new chunk to the free
     * list. 100K may be too large in this respect, and it could be reduced if
     * this is a noticeable issue. Even better, rather than add a new chunk's
     * entries to the free list, treat those entries as a "free stack" and pop
     * them off separately.
     */
    private static final int CHUNK_SIZE = 100 * 1024;

    private static final long CHUNK_MEMORY_SIZE =
        MemoryBudget.OBJECT_OVERHEAD +
        16 + // For four array references -- accuracy is unimportant.
        MemoryBudget.longArraySize(CHUNK_SIZE) +
        MemoryBudget.objectArraySize(CHUNK_SIZE) +
        MemoryBudget.intArraySize(CHUNK_SIZE) * 2;
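
    /*
     * Back-of-envelope check (assumed reference sizes): the long[] is 800KB,
     * the two int[]s are 400KB each, and the IN[] is 400-800KB depending on
     * whether references are compressed, i.e., the "little over 2MB" per
     * chunk cited above.
     */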

    /*
     * Amount that tests should add to a minimal main cache configuration,
     * when an off-heap cache is used.
     *
     * TODO: For now this is not budgeted.
     */
    public static final long MIN_MAIN_CACHE_OVERHEAD = 0;//CHUNK_MEMORY_SIZE;

    private static class Chunk {

        /*
         * If the IN is a UIN, the memId is the block containing the BIN.
         *
         * If the IN is a BIN, the memId is currently unused. It may be used in
         * the future for the off-heap full BIN for a BIN-delta in main.
         */
        final long[] memIds;

        /*
         * The IN that owns this entry.
         *   . Is null if the entry is not used, i.e., on the free list.
         *   . Is a UIN if the entry is for an off-heap BIN.
         *   . Is a BIN if the entry is for a BIN in the main cache.
         */
        final IN[] owners;

        /*
         * IDs of the prev/next entries in the LRU linked list. For entries on
         * the free list, only the next entry is used (it is singly-linked).
         *
         * The prev and next entry ID are -1 to mean the end of the list.
         *    . If prev == -1, then entry ID == LRUList.back.
         *    . If next == -1, then entry ID == LRUList.front.
         *
         * If next == -2, the entry is not in an LRUList nor is it on the free
         * list. When next == -2 and the owner is non-null, this means the
         * entry has been removed from the LRU list to be processed by the
         * evictor; the evictor may decide to add it back to an LRU list or
         * place it on the free list.
         *
         * When an entry is on the free list, next is the next ID on the free
         * list, and the owner is null.
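         *
         * Summary of the 'next' encoding:
         *    next >= 0 : ID of the next entry (in an LRUList or free list)
         *    next == -1: no next entry (LRUList front, or free-list tail)
         *    next == -2: detached -- in neither an LRUList nor the free list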
         */
        final int[] prev;
        final int[] next;

        Chunk() {
            memIds = new long[CHUNK_SIZE];
            owners = new IN[CHUNK_SIZE];
            prev = new int[CHUNK_SIZE];
            next = new int[CHUNK_SIZE];
        }
    }

    private class LRUList {

        /*
         * The front field is the entry ID of the cold end, and back is the ID
         * of the hot end. Both fields are -1 if the list is empty. If there is
         * only one entry, both fields have the same value.
         */
        private int front = -1;
        private int back = -1;
        private int size = 0;

        void addBack(final int entry, final IN owner, final long memId) {

            final Chunk chunk = chunks[entry / CHUNK_SIZE];
            final int chunkIdx = entry % CHUNK_SIZE;

            /*
             * Must set owner before adding to LRU list, since an entry that is
             * on the LRU list with a null owner would be considered as a free
             * entry (by other threads).
             */
            chunk.owners[chunkIdx] = owner;
            chunk.memIds[chunkIdx] = memId;

            synchronized (this) {
                addBackInternal(entry, chunk, chunkIdx);
            }
        }

        void addFront(final int entry) {

            final Chunk chunk = chunks[entry / CHUNK_SIZE];
            final int chunkIdx = entry % CHUNK_SIZE;

            synchronized (this) {
                addFrontInternal(entry, chunk, chunkIdx);
            }
        }

        void moveBack(final int entry) {

            final Chunk chunk = chunks[entry / CHUNK_SIZE];
            final int chunkIdx = entry % CHUNK_SIZE;

            synchronized (this) {

                if (back == entry) {
                    return;
                }

                removeInternal(entry, chunk, chunkIdx);
                addBackInternal(entry, chunk, chunkIdx);
            }
        }

        void moveFront(final int entry) {

            final Chunk chunk = chunks[entry / CHUNK_SIZE];
            final int chunkIdx = entry % CHUNK_SIZE;

            synchronized (this) {

                if (front == entry) {
                    return;
                }

                removeInternal(entry, chunk, chunkIdx);
                addFrontInternal(entry, chunk, chunkIdx);
            }
        }

        int removeFront() {

            synchronized (this) {

                int entry = front;
                if (entry < 0) {
                    return -1;
                }

                final Chunk chunk = chunks[entry / CHUNK_SIZE];
                final int chunkIdx = entry % CHUNK_SIZE;

                removeInternal(entry, chunk, chunkIdx);

                return entry;
            }
        }

        void remove(final int entry) {

            final Chunk chunk = chunks[entry / CHUNK_SIZE];
            final int chunkIdx = entry % CHUNK_SIZE;

            synchronized (this) {
                removeInternal(entry, chunk, chunkIdx);
            }
        }

        private void addBackInternal(final int entry,
                                     final Chunk chunk,
                                     final int chunkIdx) {

            assert chunk.owners[chunkIdx] != null;
            assert chunk.next[chunkIdx] == -2;

            if (back < 0) {
                assert back == -1;
                assert front == -1;

                chunk.prev[chunkIdx] = -1;
                chunk.next[chunkIdx] = -1;

                back = entry;
                front = entry;
            } else {
                assert front >= 0;

                final Chunk nextChunk = chunks[back / CHUNK_SIZE];
                final int nextIdx = back % CHUNK_SIZE;

                assert nextChunk.prev[nextIdx] < 0;

                nextChunk.prev[nextIdx] = entry;

                chunk.next[chunkIdx] = back;
                chunk.prev[chunkIdx] = -1;

                back = entry;
            }

            size += 1;
        }

        private void addFrontInternal(final int entry,
                                      final Chunk chunk,
                                      final int chunkIdx) {

            assert chunk.owners[chunkIdx] != null;
            assert chunk.next[chunkIdx] == -2;

            if (front < 0) {
                assert back == -1;
                assert front == -1;

                chunk.prev[chunkIdx] = -1;
                chunk.next[chunkIdx] = -1;

                front = entry;
                back = entry;
            } else {
                assert back >= 0;

                final Chunk prevChunk = chunks[front / CHUNK_SIZE];
                final int prevIdx = front % CHUNK_SIZE;

                assert prevChunk.next[prevIdx] < 0;

                prevChunk.next[prevIdx] = entry;

                chunk.prev[chunkIdx] = front;
                chunk.next[chunkIdx] = -1;

                front = entry;
            }

            size += 1;
        }

        private void removeInternal(final int entry,
                                    final Chunk chunk,
                                    final int chunkIdx) {

            assert chunk.owners[chunkIdx] != null;

            if (chunk.next[chunkIdx] == -2) {
                return;
            }

            assert front >= 0;
            assert back >= 0;

            final int prev = chunk.prev[chunkIdx];
            final int next = chunk.next[chunkIdx];

            if (prev < 0) {
                assert prev == -1;
                assert back == entry;

                back = next;
            } else {
                assert back != entry;

                final Chunk prevChunk = chunks[prev / CHUNK_SIZE];
                final int prevIdx = prev % CHUNK_SIZE;

                assert prevChunk.next[prevIdx] == entry;

                prevChunk.next[prevIdx] = next;
            }

            if (next < 0) {
                assert next == -1;
                assert front == entry;

                front = prev;
            } else {
                assert front != entry;

                final Chunk nextChunk = chunks[next / CHUNK_SIZE];
                final int nextIdx = next % CHUNK_SIZE;

                assert nextChunk.prev[nextIdx] == entry;

                nextChunk.prev[nextIdx] = prev;
            }

            chunk.next[chunkIdx] = -2;

            size -= 1;
        }

        boolean contains(final Chunk chunk, final int chunkIdx) {

            synchronized (this) {
                assert chunk.next[chunkIdx] >= -2;

                return chunk.next[chunkIdx] != -2 &&
                       chunk.owners[chunkIdx] != null;
            }
        }

        int getSize() {
            return size;
        }
    }

    private final Logger logger;
    private final OffHeapAllocator allocator;
    private boolean runEvictorThreads;
    private int maxPoolThreads;
    private final AtomicInteger activePoolThreads = new AtomicInteger(0);
    private final AtomicBoolean shutdownRequested = new AtomicBoolean(false);
    private final ThreadPoolExecutor evictionPool;
    private int terminateMillis;
    private long maxMemory;
    private long memoryLimit;
    private final long evictBytes;
    /* Keyed by block memId; the value type is unimportant (assumed Object). */
    private volatile Map<Long, Object> freedBlocks;
    private volatile Map<Long, Object> prevFreedBlocks;

    private volatile Chunk[] chunks;
    private int firstFreeListEntry = -1;
    private final Object addRemoveEntryMutex = new Object();

    private final int numLRULists;
    private final LRUList[] pri1LRUSet;
    private final LRUList[] pri2LRUSet;
    private int nextPri1LRUList = 0;
    private int nextPri2LRUList = 0;

    private final AtomicLong nAllocFailure = new AtomicLong(0);
    private final AtomicLong nAllocOverflow = new AtomicLong(0);
    private final AtomicLong nThreadUnavailable = new AtomicLong(0);
    private final AtomicLong nCriticalNodesTargeted = new AtomicLong(0);
    private final AtomicLong nNodesTargeted = new AtomicLong(0);
    private final AtomicLong nNodesEvicted = new AtomicLong(0);
    private final AtomicLong nDirtyNodesEvicted = new AtomicLong(0);
    private final AtomicLong nNodesStripped = new AtomicLong(0);
    private final AtomicLong nNodesMutated = new AtomicLong(0);
    private final AtomicLong nNodesSkipped = new AtomicLong(0);
    private final AtomicLong nLNsEvicted = new AtomicLong(0);
    private final AtomicLong nLNsLoaded = new AtomicLong(0);
    private final AtomicLong nLNsStored = new AtomicLong(0);
    private final AtomicLong nBINsLoaded = new AtomicLong(0);
    private final AtomicLong nBINsStored = new AtomicLong(0);
    private final AtomicInteger cachedLNs = new AtomicInteger(0);
    private final AtomicInteger cachedBINs = new AtomicInteger(0);
    private final AtomicInteger cachedBINDeltas = new AtomicInteger(0);
    private final AtomicInteger totalBlocks = new AtomicInteger(0);
    private final AtomicInteger lruSize = new AtomicInteger(0);

    public OffHeapCache(final EnvironmentImpl envImpl) {

        logger = LoggerUtils.getLogger(getClass());

        final DbConfigManager configManager = envImpl.getConfigManager();

        maxMemory = configManager.getLong(
            EnvironmentParams.MAX_OFF_HEAP_MEMORY);

        if (maxMemory == 0) {
            allocator = DummyAllocator.INSTANCE;
        } else {
            try {
                final OffHeapAllocatorFactory factory =
                    new OffHeapAllocatorFactory();
                allocator = factory.getDefaultAllocator();
            } catch (Throwable e) {
                // TODO: allow continuing without an off-heap cache?
                throw new IllegalStateException(
                    "Unable to create default allocator for off-heap cache", e);
            }
        }

        evictBytes = configManager.getLong(
            EnvironmentParams.OFFHEAP_EVICT_BYTES);

        numLRULists = configManager.getInt(
            EnvironmentParams.OFFHEAP_N_LRU_LISTS);

        allocator.setMaxBytes(maxMemory);
        memoryLimit = maxMemory;

        pri1LRUSet = new LRUList[numLRULists];
        pri2LRUSet = new LRUList[numLRULists];

        for (int i = 0; i < numLRULists; i += 1) {
            pri1LRUSet[i] = new LRUList();
            pri2LRUSet[i] = new LRUList();
        }

        if (DEBUG_DOUBLE_FREE) {
            freedBlocks = new ConcurrentHashMap<>();
            prevFreedBlocks = new ConcurrentHashMap<>();
        } else {
            freedBlocks = null;
            prevFreedBlocks = null;
        }

        terminateMillis = configManager.getDuration(
            EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);

        final int corePoolSize = configManager.getInt(
            EnvironmentParams.OFFHEAP_CORE_THREADS);

        maxPoolThreads = configManager.getInt(
            EnvironmentParams.OFFHEAP_MAX_THREADS);

        final long keepAliveTime = configManager.getDuration(
            EnvironmentParams.OFFHEAP_KEEP_ALIVE);

        final boolean isShared = envImpl.getSharedCache();

        evictionPool = new ThreadPoolExecutor(
            corePoolSize, maxPoolThreads,
            keepAliveTime, TimeUnit.MILLISECONDS,
            new ArrayBlockingQueue<Runnable>(1),
            new StoppableThreadFactory(
                isShared ? null : envImpl, "JEOffHeapEvictor", logger),
            new RejectedExecutionHandler() {
                @Override
                public void rejectedExecution(
                    Runnable r, ThreadPoolExecutor executor) {
                    nThreadUnavailable.incrementAndGet();
                }
            });

        runEvictorThreads = configManager.getBoolean(
            EnvironmentParams.ENV_RUN_OFFHEAP_EVICTOR);

        envImpl.addConfigObserver(this);
    }

    @Override
    public void envConfigUpdate(
        final DbConfigManager configManager,
        final EnvironmentMutableConfig ignore) {

        terminateMillis = configManager.getDuration(
            EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);

        final int corePoolSize = configManager.getInt(
            EnvironmentParams.OFFHEAP_CORE_THREADS);

        maxPoolThreads = configManager.getInt(
            EnvironmentParams.OFFHEAP_MAX_THREADS);

        final long keepAliveTime = configManager.getDuration(
            EnvironmentParams.OFFHEAP_KEEP_ALIVE);

        evictionPool.setCorePoolSize(corePoolSize);
        evictionPool.setMaximumPoolSize(maxPoolThreads);
        evictionPool.setKeepAliveTime(keepAliveTime, TimeUnit.MILLISECONDS);

        runEvictorThreads = configManager.getBoolean(
            EnvironmentParams.ENV_RUN_OFFHEAP_EVICTOR);

        final long newMaxMemory = configManager.getLong(
            EnvironmentParams.MAX_OFF_HEAP_MEMORY);

        if ((newMaxMemory > 0) != (maxMemory > 0)) {
            // TODO detect this error earlier?
            throw new IllegalArgumentException(
                "Cannot change off-heap cache size between zero and non-zero");
        }

        maxMemory = newMaxMemory;
        allocator.setMaxBytes(newMaxMemory);
        memoryLimit = newMaxMemory;
    }

    public void requestShutdown() {
        shutdownRequested.set(true);
        evictionPool.shutdown();
    }

    public void shutdown() {

        shutdownRequested.set(true);
        evictionPool.shutdown();

        boolean shutdownFinished = false;
        try {
            shutdownFinished = evictionPool.awaitTermination(
                terminateMillis, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            /* We've been interrupted, just give up and end. */
        } finally {
            if (!shutdownFinished) {
                evictionPool.shutdownNow();
            }

            clearCache(null);

//            envImpl.getMemoryBudget().updateAdminMemoryUsage(
//                0 - (chunks.length * CHUNK_MEMORY_SIZE));

            chunks = null;
        }
    }

    public boolean isEnabled() {
        return allocator != DummyAllocator.INSTANCE;
    }

    public long clearCache(final EnvironmentImpl matchEnv) {

        /*
         * Use local var because when matchEnv is non-null, other threads (for
         * other envs in the shared pool) are running and may replace the
         * array. However, all entries for matchEnv will remain in the local
         * array.
         */
        final Chunk[] myChunks = chunks;

        if (myChunks == null) {
            return 0;
        }

        long size = 0;

        for (Chunk chunk : myChunks) {

            for (int chunkIdx = 0; chunkIdx < CHUNK_SIZE; chunkIdx += 1) {

                final IN owner = chunk.owners[chunkIdx];
                if (owner == null) {
                    continue;
                }
                if (matchEnv != null && owner.getEnv() != matchEnv) {
                    continue;
                }

                owner.latchNoUpdateLRU();
                try {
                    size += removeINFromMain(owner);
                } finally {
                    owner.releaseLatch();
                }
            }
        }

        return size;
    }

    public StatGroup loadStats(StatsConfig config) {

        StatGroup stats = new StatGroup(GROUP_NAME, GROUP_DESC);

        new LongStat(stats, ALLOC_FAILURE, nAllocFailure.get());
        new LongStat(stats, ALLOC_OVERFLOW, nAllocOverflow.get());
        new LongStat(stats, THREAD_UNAVAILABLE, nThreadUnavailable.get());
        new LongStat(stats, CRITICAL_NODES_TARGETED, nCriticalNodesTargeted.get());
        new LongStat(stats, NODES_TARGETED, nNodesTargeted.get());
        new LongStat(stats, NODES_EVICTED, nNodesEvicted.get());
        new LongStat(stats, DIRTY_NODES_EVICTED, nDirtyNodesEvicted.get());
        new LongStat(stats, NODES_STRIPPED, nNodesStripped.get());
        new LongStat(stats, NODES_MUTATED, nNodesMutated.get());
        new LongStat(stats, NODES_SKIPPED, nNodesSkipped.get());
        new LongStat(stats, LNS_EVICTED, nLNsEvicted.get());
        new LongStat(stats, LNS_LOADED, nLNsLoaded.get());
        new LongStat(stats, LNS_STORED, nLNsStored.get());
        new LongStat(stats, BINS_LOADED, nBINsLoaded.get());
        new LongStat(stats, BINS_STORED, nBINsStored.get());
        new IntStat(stats, CACHED_LNS, cachedLNs.get());
        new IntStat(stats, CACHED_BINS, cachedBINs.get());
        new IntStat(stats, CACHED_BIN_DELTAS, cachedBINDeltas.get());
        new LongStat(stats, TOTAL_BYTES, allocator.getUsedBytes());
        new IntStat(stats, TOTAL_BLOCKS, totalBlocks.get());
        new IntStat(stats, LRU_SIZE, lruSize.get());

        if (config.getClear()) {
            nAllocFailure.set(0);
            nAllocOverflow.set(0);
            nThreadUnavailable.set(0);
            nCriticalNodesTargeted.set(0);
            nNodesTargeted.set(0);
            nNodesEvicted.set(0);
            nDirtyNodesEvicted.set(0);
            nNodesStripped.set(0);
            nNodesMutated.set(0);
            nNodesSkipped.set(0);
            nLNsEvicted.set(0);
            nLNsLoaded.set(0);
            nLNsStored.set(0);
            nBINsLoaded.set(0);
            nBINsStored.set(0);
        }

        return stats;
    }

    public long getMaxMemory() {
        return maxMemory;
    }

    public long getUsedMemory() {
        return allocator.getUsedBytes();
    }

    /**
     * Forces allocation of the first chunk of entries. Used by tests that need
     * to more precisely control cache behavior.
     */
    public void preallocateLRUEntries() {

        if (chunks == null) {
            freeEntry(allocateEntry());
        }
    }

    public OffHeapAllocator getAllocator() {
        return allocator;
    }

    private void debug(final EnvironmentImpl envImpl, String msg) {

        assert DEBUG_TRACE;

        if (DEBUG_TRACE_STACK) {
            msg += " " + LoggerUtils.getStackTrace();
        }

        if (DEBUG_TRACE_AND_LOG) {
            LoggerUtils.traceAndLog(logger, envImpl, Level.INFO, msg);
        } else {
            LoggerUtils.logMsg(logger, envImpl, Level.INFO, msg);
        }
    }

    private int addBack(final boolean pri2, IN owner, long memId) {

        assert owner.isLatchExclusiveOwner();

        final int entry = allocateEntry();

        final int lruIdx = entry % numLRULists;
        final LRUList lru =
            pri2 ? pri2LRUSet[lruIdx] : pri1LRUSet[lruIdx];

        lru.addBack(entry, owner, memId);

        return entry;
    }

    public int moveBack(final int entry, final boolean pri2) {

        final int lruIdx = entry % numLRULists;
        final LRUList lru =
            pri2 ? pri2LRUSet[lruIdx] : pri1LRUSet[lruIdx];

        lru.moveBack(entry);

        return entry;
    }

    private int moveFront(final int entry, final boolean pri2) {

        final int lruIdx = entry % numLRULists;
        final LRUList lru =
            pri2 ? pri2LRUSet[lruIdx] : pri1LRUSet[lruIdx];

        lru.moveFront(entry);

        return entry;
    }

    private void remove(final int entry, final boolean pri2) {

        final int lruIdx = entry % numLRULists;
        final LRUList lru =
            pri2 ? pri2LRUSet[lruIdx] : pri1LRUSet[lruIdx];

        lru.remove(entry);
        freeEntry(entry);
    }

    /**
     * Takes an entry from the free list. If the free list is empty, allocates
     * a new chunk and adds its entries to the free list.
     */
    private int allocateEntry() {

        synchronized (addRemoveEntryMutex) {

            if (firstFreeListEntry >= 0) {

                final int entry = firstFreeListEntry;
                final Chunk chunk = chunks[entry / CHUNK_SIZE];
                final int chunkIdx = entry % CHUNK_SIZE;

                firstFreeListEntry = chunk.next[chunkIdx];
                chunk.next[chunkIdx] = -2;

                lruSize.incrementAndGet();

                return entry;
            }

            final Chunk newChunk = new Chunk();
            final int[] next = newChunk.next;
            final int nOldChunks = (chunks != null) ? chunks.length : 0;

            /* Entry 0 in the new chunk will be returned. */
            int nextFree = nOldChunks * CHUNK_SIZE;
            final int entry = nextFree++;
            next[0] = -2;

            /* Entry 1 is the tail of the free list. */
            next[1] = -1;

            /*
             * Entry 2 and above are added to the free list.
             *
             * This loop needs to be as fast as possible, which is why we're
             * using local vars for next and nextFree.
             *
             * In the loop, nextFree starts out as entry 1 (tail of free
             * list) and ends up as the last free entry (head of free list).
             */
            for (int i = 2; i < CHUNK_SIZE; i += 1) {
                next[i] = nextFree++;
            }

            /* The last entry is the head of the free list. */
            firstFreeListEntry = nextFree;

            final Chunk[] newChunks = new Chunk[nOldChunks + 1];
            if (nOldChunks > 0) {
                System.arraycopy(chunks, 0, newChunks, 0, nOldChunks);
            }
            newChunks[nOldChunks] = newChunk;

            /* Assign to volatile chunks field as the very last step. */
            chunks = newChunks;

            lruSize.incrementAndGet();

//            envImpl.getMemoryBudget().updateAdminMemoryUsage(
//                CHUNK_MEMORY_SIZE);

            return entry;
        }
    }

    /**
     * Removes the entry from its LRU and adds it to the free list.
     */
    private void freeEntry(final int entry) {

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        synchronized (addRemoveEntryMutex) {

            if (chunk.owners[chunkIdx] == null) {
                return; // Already on free list
            }

            chunk.owners[chunkIdx] = null;
            chunk.next[chunkIdx] = firstFreeListEntry;
            firstFreeListEntry = entry;

            lruSize.decrementAndGet();
        }
    }

    public long getMemId(final int entry) {

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        return chunk.memIds[chunkIdx];
    }

    private IN getOwner(final int entry) {

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        return chunk.owners[chunkIdx];
    }

    public void setOwner(final int entry, final IN owner) {

        assert owner.isLatchExclusiveOwner();

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        assert chunk.owners[chunkIdx] != null;
        assert chunk.owners[chunkIdx].isLatchExclusiveOwner();

        chunk.owners[chunkIdx] = owner;
    }

    private void setOwnerAndMemId(final int entry,
                                  final IN owner,
                                  final long memId) {

        assert owner.isLatchExclusiveOwner();

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        assert chunk.owners[chunkIdx] != null;
        assert chunk.owners[chunkIdx].isLatchExclusiveOwner();

        chunk.owners[chunkIdx] = owner;
        chunk.memIds[chunkIdx] = memId;
    }

    /**
     * Called before eviction of an LN from main cache to provide an
     * opportunity to store the LN off-heap.
     */
    public boolean storeEvictedLN(final BIN bin,
                                  final int index,
                                  final LN ln) {
        assert !ln.isDirty();
        assert bin.isLatchExclusiveOwner();
        assert bin.getInListResident();

        final DatabaseImpl dbImpl = bin.getDatabase();

        long memId = bin.getOffHeapLNId(index);
        if (memId != 0) {
            assert bin.getOffHeapLruId() >= 0;

            /*
             * If already stored off-heap, make the entry hot when
             * CacheMode.UNCHANGED does not apply (getFetchedCold is false).
             */
            if (!bin.getFetchedCold()) {
                moveBack(bin.getOffHeapLruId(), false);
            }

            if (DEBUG_TRACE) {
                debug(
                    bin.getEnv(),
                    "Evicted LN already store LSN=" +
                        DbLsn.getNoFormatString(bin.getLsn(index)));
            }

            return true;
        }

        /*
         * Do not store off-heap:
         *  - When CacheMode.UNCHANGED applies (getFetchedCold is true). This
         *    is when the node was originally fetched from disk into main.
         *  - Deleted LNs are no longer needed.
         *  - For embedded LNs and dup DBs, there is no separate LN.
         *  - Off-heap caching for internal DBs is not currently supported.
         */
        if (ln.getFetchedCold() ||
            ln.isDeleted() ||
            bin.isEmbeddedLN(index) ||
            dbImpl.getSortedDuplicates() ||
            dbImpl.isDeferredWriteMode() || // TODO remove
            dbImpl.getDbType().isInternal()) {
            return false;
        }

        memId = serializeLN(dbImpl.getEnv(), ln);
        if (memId == 0) {
            return false;
        }

        bin.setOffHeapLNId(index, memId);

        /* Add to LRU at hot end, or make hot if already in LRU. */
        int entry = bin.getOffHeapLruId();
        if (entry < 0) {
            entry = addBack(false, bin, 0);
            bin.setOffHeapLruId(entry);
        } else {
            moveBack(entry, false);
        }

        if (DEBUG_TRACE) {
            debug(
                bin.getEnv(),
                "Stored evicted LN LSN=" +
                    DbLsn.getNoFormatString(bin.getLsn(index)));
        }

        return true;
    }

    /**
     * Called when an LN has been fetched from disk and should be stored
     * off-heap.
     */
    public boolean storePreloadedLN(final BIN bin,
                                    final int index,
                                    final LN ln) {
        final DatabaseImpl dbImpl = bin.getDatabase();

        assert !ln.isDirty();
        assert !ln.isDeleted();
        assert bin.isLatchExclusiveOwner();
        assert !bin.isEmbeddedLN(index);
        assert bin.getTarget(index) == null;
        assert !dbImpl.getSortedDuplicates();
        assert !dbImpl.isDeferredWriteMode(); // TODO remove
        assert !dbImpl.getDbType().isInternal();

        if (bin.getOffHeapLNId(index) != 0) {
            assert bin.getInListResident();
            return true;
        }

        final long memId = serializeLN(dbImpl.getEnv(), ln);
        if (memId == 0) {
            return false;
        }

        bin.setOffHeapLNId(index, memId);

        if (!bin.getInListResident()) {
            /* Preloading into a temporary BIN, not in the Btree. */
            return true;
        }

        /* Add to LRU at hot end, or make hot if already in LRU. */
        int entry = bin.getOffHeapLruId();
        if (entry < 0) {
            entry = addBack(false, bin, 0);
            bin.setOffHeapLruId(entry);
        } else {
            moveBack(entry, false);
        }

        return true;
    }

    public boolean ensureOffHeapLNsInLRU(final BIN bin) {

        assert bin.isLatchExclusiveOwner();

        if (bin.getOffHeapLruId() >= 0) {
            return true;
        }

        if (!bin.hasOffHeapLNs()) {
            return false;
        }

        final int entry = addBack(false, bin, 0);
        bin.setOffHeapLruId(entry);
        return true;
    }

    public LN loadLN(final BIN bin,
                     final int index,
                     final CacheMode cacheMode) {

        assert bin.isLatchExclusiveOwner();

        final long memId = bin.getOffHeapLNId(index);
        if (memId == 0) {
            return null;
        }

        final LN ln = materializeLN(bin.getEnv(), memId);

        switch (cacheMode) {
        case UNCHANGED:
        case MAKE_COLD:
            /* Will be evicted from main. Leave off-heap. */
            break;
        case EVICT_LN:
        case EVICT_BIN:
            /* Will be evicted from main. Leave off-heap and make hot. */
            assert bin.getOffHeapLruId() >= 0;
            moveBack(bin.getOffHeapLruId(), false);
            break;
        case DEFAULT:
        case KEEP_HOT:
            /* Will remain in main. Remove from off-heap. */
            bin.setOffHeapLNId(index, 0);
            freeLN(memId);
            break;
        default:
            assert false;
        }

        if (DEBUG_TRACE) {
            debug(
                bin.getEnv(),
                "Loaded LN LSN=" +
                    DbLsn.getNoFormatString(bin.getLsn(index)));
        }

        return ln;
    }

    public void freeRedundantLN(final BIN bin,
                                final int index,
                                final LN ln,
                                final CacheMode cacheMode) {

        assert bin.isLatchExclusiveOwner();

        final long memId = bin.getOffHeapLNId(index);
        if (memId == 0) {
            return;
        }

        switch (cacheMode) {
        case UNCHANGED:
        case MAKE_COLD:
            if (ln.getFetchedCold()) {
                /* Will be evicted from main. Leave off-heap. */
                return;
            }
            /* Will remain in main. Remove from off-heap. */
            break;
        case EVICT_BIN:
        case EVICT_LN:
            /* Will be evicted from main. Leave off-heap. */
            return;
        case DEFAULT:
        case KEEP_HOT:
            /* Will remain in main. Remove from off-heap. */
            break;
        default:
            assert false;
        }

        bin.setOffHeapLNId(index, 0);
        freeLN(memId);
    }

    public long loadVLSN(final BIN bin, final int index) {

        if (!bin.getEnv().getCacheVLSN()) {
            return VLSN.NULL_VLSN_SEQUENCE;
        }

        final long memId = bin.getOffHeapLNId(index);
        if (memId == 0) {
            return VLSN.NULL_VLSN_SEQUENCE;
        }

        return getLong(memId, 0, new byte[8]);
    }

    public int freeLN(final BIN bin, final int index) {

        assert bin.isLatchExclusiveOwner();

        final long memId = bin.getOffHeapLNId(index);
        if (memId == 0) {
            return 0;
        }

        /*
         * Since the LN was off-heap, set fetched-cold to false. Otherwise
         * the fetched-cold flag will prevent the LN from being stored
         * off-heap when it is evicted later.
         */
        final LN ln = (LN) bin.getTarget(index);
        if (ln != null) {
            ln.setFetchedCold(false);
        }

        bin.setOffHeapLNId(index, 0);
        return freeLN(memId);
    }

    private int freeLN(final long memId) {

        cachedLNs.decrementAndGet();
        return freeMemory(memId);
    }

    private long serializeLN(final EnvironmentImpl envImpl, final LN ln) {

        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;
        final int vlsnSize = envImpl.getCacheVLSN() ? VLSN_SIZE : 0;
        final int lnDataOffset = vlsnSize + checksumSize;
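
        /*
         * Block layout implied by the offsets used below:
         *   [0, vlsnSize)             VLSN, when VLSN caching is enabled
         *   [vlsnSize, lnDataOffset)  Adler32 checksum, when enabled
         *   [lnDataOffset, end)       LN data
         */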

        /*
         * We make 3 calls to allocator.copy (one explicit and two via putLong
         * and putInt) rather than just one because:
         *  - This avoids an extra copy and buffer allocation for the LN data.
         *  - The LN data is potentially large.
         *  - The checksum is normally off in production, so there is at most
         *    one extra allocator.copy for the VLSN.
         */
        final byte[] data = ln.getData();
        assert data != null;

        final long memId = allocateMemory(envImpl, lnDataOffset + data.length);
        if (memId == 0) {
            return 0;
        }

        final byte[] tempBuf =
            (vlsnSize > 0 || useChecksums) ? new byte[8] : null;

        if (vlsnSize > 0) {
            putLong(ln.getVLSNSequence(), memId, 0, tempBuf);
        }

        if (useChecksums) {
            final Checksum checksum = Adler32.makeChecksum();
            checksum.update(data, 0, data.length);
            final int checksumValue = (int) checksum.getValue();
            putInt(checksumValue, memId, vlsnSize, tempBuf);
        }

        allocator.copy(data, 0, memId, lnDataOffset, data.length);

        nLNsStored.incrementAndGet();
        cachedLNs.incrementAndGet();

        return memId;
    }

    private LN materializeLN(final EnvironmentImpl envImpl,
                             final long memId) {

        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;
        final int vlsnSize = envImpl.getCacheVLSN() ? VLSN_SIZE : 0;
        final int lnDataOffset = vlsnSize + checksumSize;

        final byte[] data = new byte[allocator.size(memId) - lnDataOffset];
        allocator.copy(memId, lnDataOffset, data, 0, data.length);

        final byte[] tempBuf =
            (vlsnSize > 0 || useChecksums) ? new byte[8] : null;

        if (useChecksums) {
            final int storedChecksum = getInt(memId, vlsnSize, tempBuf);
            if (storedChecksum != 0) {

                final Checksum checksum = Adler32.makeChecksum();
                checksum.update(data, 0, data.length);
                final int checksumValue = (int) checksum.getValue();

                if (storedChecksum != checksumValue) {
                    throw unexpectedState(
                        envImpl,
                        "Off-heap cache checksum error. Expected " +
                        storedChecksum + " but got " + checksumValue);
                }
            }
        }

        nLNsLoaded.incrementAndGet();

        final LN ln = LN.makeLN(envImpl, data);
        ln.clearDirty(); // New LNs are initially dirty.

        if (vlsnSize > 0) {
            ln.setVLSNSequence(getLong(memId, 0, tempBuf));
        }

        return ln;
    }

    /**
     * Called before eviction of a BIN from main cache to provide an
     * opportunity to store the BIN off-heap.
     *
     * removeINFromMain is called after this method by the main evictor. It
     * is removeINFromMain that removes the main BIN's off-heap LRU entry, if
     * it has one. The bin and parent latches are held across the calls to
     * storeEvictedBIN and removeINFromMain.
     *
     * removeINFromMain will also free any off-heap LN IDs in the main BIN,
     * and therefore this method must clear those IDs in the main BIN. When
     * the BIN is stored off-heap by this method, the LN IDs will be stored
     * along with the off-heap BIN.
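     *
     * <pre>{@code
     * // Hypothetical sketch of the expected call sequence in the main
     * // evictor, with the bin and parent latches held across both calls:
     * offHeapCache.storeEvictedBIN(bin, parent, index);
     * offHeapCache.removeINFromMain(bin);
     * }</pre>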
     */
    public boolean storeEvictedBIN(final BIN bin,
                                   final IN parent,
                                   final int index) {

        assert bin.isLatchExclusiveOwner();
        assert bin.getInListResident();
        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();
        assert bin == parent.getTarget(index);
        assert parent.getOffHeapBINId(index) < 0;

        final DatabaseImpl dbImpl = bin.getDatabase();

        /*
         * Do not store off-heap:
         *  - When CacheMode.UNCHANGED applies, the BIN was not loaded from
         *    off-heap, and the BIN is not dirty.
         *  - Off-heap caching for internal DBs is not currently supported.
         */
        if ((bin.getFetchedCold() &&
            !bin.getFetchedColdOffHeap() &&
            !bin.getDirty()) ||
            dbImpl.isDeferredWriteMode() || // TODO remove
            dbImpl.getDbType().isInternal()) {
            return false;
        }

        /* Serialize the BIN and add it to the off-heap LRU. */

        final long memId = serializeBIN(bin, bin.isBINDelta());
        if (memId == 0) {
            return false;
        }

        /*
         * Reuse LRU entry if one exists for the BIN, in order not to change
         * the effective LRU position of its off-heap LNs. When off-heap LNs
         * are present, we want to preserve the off-heap LRU position to allow
         * the LNs to be stripped sooner.
         */
        int entry = bin.getOffHeapLruId();
        if (entry >= 0) {
            setOwnerAndMemId(entry, parent, memId);
            bin.clearOffHeapLNIds();
            bin.setOffHeapLruId(-1);
        } else {
            entry = addBack(false /*pri2*/, parent, memId);
        }

        parent.setOffHeapBINId(index, entry, false /*pri2*/, bin.getDirty());

        if (DEBUG_TRACE) {
            debug(
                bin.getEnv(),
                "Stored BIN LSN=" +
                    DbLsn.getNoFormatString(parent.getLsn(index)) +
                    " Node=" + bin.getNodeId() +
                    " dirty=" + bin.getDirty());
        }

        return true;
    }

    /**
     * Called when a BIN has been fetched from disk and should be stored
     * off-heap.
     */
    public boolean storePreloadedBIN(final BIN bin,
                                     final IN parent,
                                     final int index) {

        assert bin != null;
        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();
        assert parent.getTarget(index) == null;

        final DatabaseImpl dbImpl = bin.getDatabase();

        assert !dbImpl.isDeferredWriteMode(); // TODO remove
        assert !dbImpl.getDbType().isInternal();

        /* Pass non-null 'bin' so that off-heap LNs are not freed. */
        freeBIN(bin, parent, index);

        final long memId = serializeBIN(bin, bin.isBINDelta());
        if (memId == 0) {
            return false;
        }

        final int entry = addBack(false /*pri2*/, parent, memId);
        parent.setOffHeapBINId(index, entry, false /*pri2*/, bin.getDirty());

        return true;
    }

    /**
     * Called before eviction of a level 2 IN from main cache. Any off-heap
     * BIN children are first logged, if dirty, and then discarded.
     *
     * @return true if all BINs could be discarded, or false if a dirty BIN
     * could not be logged due to a read-only env or disk limit violation.
     */
    boolean flushAndDiscardBINChildren(final IN in,
                                       final boolean backgroundIO) {
        assert in.isLatchExclusiveOwner();
        assert in.getInListResident();
        assert in.getNormalizedLevel() == 2;

        if (!in.hasOffHeapBINIds()) {
            return true;
        }

        boolean allDiscarded = true;

        for (int i = 0; i < in.getNEntries(); i += 1) {

            final int entry = in.getOffHeapBINId(i);
            if (entry < 0) {
                continue;
            }

            if (flushAndDiscardBIN(
                entry, in.isOffHeapBINPri2(i), in.isOffHeapBINDirty(i),
                getMemId(entry), in, i, backgroundIO, true /*freeLNs*/) == 0) {
                allDiscarded = false;
            }
        }

        return allDiscarded;
    }

    /**
     * Called:
     *  - after eviction of an IN from main cache, in which case
     *    storeEvictedBIN was called and the eviction was completed.
     *  - when an IN is removed from the main cache for another reason,
     *    such as a reverse split or Database removal.
     *  - for all INs in an Environment being removed from the shared cache.
     */
    public long removeINFromMain(final IN in) {

        assert in.isLatchExclusiveOwner();

        final int level = in.getNormalizedLevel();

        if (level > 2) {
            return 0;
        }

        if (level == 2) {

            if (!in.hasOffHeapBINIds()) {
                return 0;
            }

            long size = 0;

            for (int i = 0; i < in.getNEntries(); i += 1) {

                final BIN bin = (BIN) in.getTarget(i);

                if (bin != null) {
                    bin.latchNoUpdateLRU();
                }
                try {
                    size += freeBIN(bin, in, i);
                } finally {
                    if (bin != null) {
                        bin.releaseLatch();
                    }
                }
            }

            return size;
        }

        assert level == 1 && in.isBIN();

        final BIN bin = (BIN) in;

        final int entry = bin.getOffHeapLruId();
        if (entry < 0) {
            assert !bin.hasOffHeapLNs();
            return 0;
        }

        long size = 0;

        if (bin.hasOffHeapLNs()) {
            for (int i = 0; i < bin.getNEntries(); i += 1) {
                size += freeLN(bin, i);
            }
        }

        bin.setOffHeapLruId(-1);
        remove(entry, false);
        return size;
    }

    public BIN loadBIN(final EnvironmentImpl envImpl, final int entry) {

        assert entry >= 0;

        return materializeBIN(envImpl, getMemBytes(getMemId(entry)));
    }

    /**
     * Loads a BIN for the given entry, if its last logged LSN is the given
     * LSN. This allows a caller to record the entry for a BIN (the off-heap
     * BIN ID) without holding its parent IN latch, and later find out
     * whether that entry still refers to the same BIN. If the BIN was split,
     * the LSN will have changed and null is returned. If the BIN is no
     * longer off-heap, or was moved off-heap and back on, null is also
     * returned.
     *
     * If the BIN is redundantly resident in the main and off-heap caches, the
     * main cache "live" version is returned. Otherwise the BIN is deserialized
     * from the off-heap version and is not "live". When non-null is returned,
     * the returned BIN is latched.
     */
    public BIN loadBINIfLsnMatches(final EnvironmentImpl envImpl,
                                   final int entry,
                                   final long lsn) {

        final Pair<IN, Integer> result =
            findBINIfLsnMatches(envImpl, entry, lsn);

        if (result == null) {
            return null;
        }

        final IN in = result.first();
        final int index = result.second();

        try {
            BIN bin = (BIN) in.getTarget(index);
            if (bin != null) {
                bin.latchNoUpdateLRU();
                return bin;
            }

            final long memId = getMemId(entry);
            bin = materializeBIN(envImpl, getMemBytes(memId));
            bin.latchNoUpdateLRU(in.getDatabase());

            return bin;

        } finally {
            in.releaseLatch();
        }
    }

    public void evictBINIfLsnMatch(final EnvironmentImpl envImpl,
                                   final int entry,
                                   final long lsn) {

        final Pair<IN, Integer> result =
            findBINIfLsnMatches(envImpl, entry, lsn);

        if (result == null) {
            return;
        }

        final IN in = result.first();
        final int index = result.second();

        try {
            assert in.getTarget(index) == null;
            freeBIN(null, in, index);
        } finally {
            in.releaseLatch();
        }
    }

    /**
     * If non-null is returned, the returned IN will be EX latched.
     */
    private Pair<IN, Integer> findBINIfLsnMatches(
        final EnvironmentImpl envImpl,
        final int entry,
        final long lsn) {

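        /* An entry id maps to a chunk and an index within that chunk. */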
        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;
        final IN in = chunk.owners[chunkIdx];

        if (in == null) {
            return null;
        }

        /*
         * The validation process here is very similar to in evictOne. See the
         * comments in that method.
         */
        in.latchNoUpdateLRU();

        if (in != chunk.owners[chunkIdx] ||
            !in.getInListResident() ||
            in.getEnv() != envImpl ||
            in.isBIN()) {

            in.releaseLatch();
            return null;
        }

        int index = -1;
        for (int i = 0; i < in.getNEntries(); i += 1) {
            if (in.getOffHeapBINId(i) == entry) {
                index = i;
                break;
            }
        }

        if (index < 0) {
            in.releaseLatch();
            return null;
        }

        if (in.getLsn(index) != lsn) {
            in.releaseLatch();
            return null;
        }

        return new Pair<>(in, index);
    }

    public byte[] getBINBytes(final IN parent, final int index) {

        assert parent.isLatchOwner();

        final int entry = parent.getOffHeapBINId(index);
        if (entry < 0) {
            return null;
        }

        assert parent == getOwner(entry);

        return getMemBytes(getMemId(entry));
    }

    /**
     * Called when a BIN's bytes were obtained while holding a shared latch,
     * and then the latch was released and acquired again. We need to
     * determine whether the BIN was changed and moved off-heap again while
     * unlatched.
     *
     * Currently we just get the bytes again and compare.
     *
     * Possible optimization: Maintain a generation count in the serialized
     * BIN, whose value comes from a global counter that is incremented
     * whenever a BIN is serialized. But would the range of such a counter be
     * large enough to guarantee that wrapping is never a problem? The odds
     * are certainly low, but there is no obvious way to guarantee it. Another
     * approach is to maintain the counter in the main cache BIN, so it is a
     * per-BIN value.
     */
    public boolean haveBINBytesChanged(final IN parent,
                                       final int index,
                                       final byte[] bytes) {
        assert parent.isLatchOwner();

        return !Arrays.equals(bytes, getBINBytes(parent, index));
    }

    public void postBINLoad(final IN parent, final int index, final BIN bin) {

        assert bin.isLatchExclusiveOwner();
        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();
        assert parent.getTarget(index) == null;

        final int entry = parent.getOffHeapBINId(index);
        assert entry >= 0;
        assert parent == getOwner(entry);

        bin.setDirty(parent.isOffHeapBINDirty(index));

        final long freed = freeBIN(bin, parent, index);
        assert freed > 0;

        ensureOffHeapLNsInLRU(bin);

        if (DEBUG_TRACE) {
            debug(
                parent.getEnv(),
                "Loaded BIN LSN=" +
                    DbLsn.getNoFormatString(parent.getLsn(index)) +
                    " Node=" + bin.getNodeId() +
                    " dirty=" + bin.getDirty());
        }
    }

    public long freeBIN(final BIN bin, final IN parent, final int index) {

        assert parent.isLatchExclusiveOwner();

        assert bin == null || bin.isLatchExclusiveOwner();

        final int entry = parent.getOffHeapBINId(index);

        if (entry < 0) {
            return 0;
        }

        assert parent == getOwner(entry);

        final boolean pri2 = parent.isOffHeapBINPri2(index);
        final long memId = getMemId(entry);

        parent.clearOffHeapBINId(index);
        remove(entry, pri2);

        /*
         * Only free the LNs referenced by the off-heap BIN if the BIN is not
         * resident in main (bin == null). When the off-heap BIN is stale, its
         * LN Ids are also stale.
         */
        return freeBIN(parent.getEnv(), memId, bin == null /*freeLNs*/);
    }

    private long freeBIN(final EnvironmentImpl envImpl,
                         final long memId,
                         final boolean freeLNs) {

        long size = 0;
        final int flags;

        if (freeLNs) {
            final ParsedBIN pb = parseBINBytes(
                envImpl, getMemBytes(memId),
                false /*partialBuf*/, true /*parseLNIds*/);

            if (pb.lnMemIds != null) {
                for (final long lnMemId : pb.lnMemIds) {
                    if (lnMemId == 0) {
                        continue;
                    }
                    size += freeLN(lnMemId);
                }
            }

            flags = pb.flags;
        } else {
            final boolean useChecksums = envImpl.useOffHeapChecksums();
            final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;
            flags = getByte(memId, checksumSize, new byte[1]);
        }

        cachedBINs.decrementAndGet();
        if ((flags & BIN_FLAG_DELTA) != 0) {
            cachedBINDeltas.decrementAndGet();
        }

        return size + freeMemory(memId);
    }

    long serializeBIN(final BIN bin, final boolean asDelta) {

        assert !bin.hasCachedChildren();
        assert !(bin.isBINDelta() && !asDelta);

        final EnvironmentImpl envImpl = bin.getEnv();
        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;
        final boolean canMutate = !asDelta && bin.canMutateToBINDelta();

        int flags = 0;

        if (asDelta) {
            flags |= BIN_FLAG_DELTA;
        }
        if (canMutate) {
            flags |= BIN_FLAG_CAN_MUTATE;
        }
        if (bin.getProhibitNextDelta()) {
            flags |= BIN_FLAG_PROHIBIT_NEXT_DELTA;
        }

        final short lnIdSize = getPackedLnMemIdSize(bin);

        /*
         * If there are any LNs, then we should not be mutating from a full BIN
         * to a BIN delta -- this isn't handled and should not happen.
         */
        assert !(asDelta && !bin.isBINDelta() && lnIdSize != 0);

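        /*
         * Off-heap block layout, in order (sizes in bytes): checksum (0 or
         * 4), flags (1), lastFullLsn (8), lastDeltaLsn (8), minExpiration
         * (4), lnIdSize (2), packed LN memIds (lnIdSize), serialized BIN.
         */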
        final int memSize =
            checksumSize + 1 + 8 + 8 + 4 + 2 +
            lnIdSize + bin.getLogSize(asDelta);

        final long memId = allocateMemory(envImpl, memSize);

        if (memId == 0) {
            return 0;
        }

        final byte[] buf = new byte[memSize];
        int bufOffset = checksumSize;

        buf[bufOffset] = (byte) flags;
        bufOffset += 1;

        putLong(bin.getLastFullLsn(), buf, bufOffset);
        bufOffset += 8;

        putLong(bin.getLastDeltaLsn(), buf, bufOffset);
        bufOffset += 8;

        putInt(getMinExpiration(bin), buf, bufOffset);
        bufOffset += 4;

        putShort(lnIdSize, buf, bufOffset);
        bufOffset += 2;

        if (lnIdSize > 0) {
            packLnMemIds(bin, buf, bufOffset);
            bufOffset += lnIdSize;
        }

        final ByteBuffer byteBuf =
            ByteBuffer.wrap(buf, bufOffset, buf.length - bufOffset);

        bin.serialize(byteBuf, asDelta, false /*clearDirtyBits*/);

        if (useChecksums) {
            final Checksum checksum = Adler32.makeChecksum();
            checksum.update(buf, checksumSize, buf.length - checksumSize);
            final int checksumValue = (int) checksum.getValue();
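            /*
             * putInt also writes the checksum into buf[0..3], since buf
             * doubles as the temp buffer, so the full copy below includes it.
             */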
            putInt(checksumValue, memId, 0, buf);
        }

        allocator.copy(buf, 0, memId, 0, buf.length);

        nBINsStored.incrementAndGet();
        cachedBINs.incrementAndGet();
        if (asDelta) {
            cachedBINDeltas.incrementAndGet();
        }

        return memId;
    }

    public BIN materializeBIN(final EnvironmentImpl envImpl,
                              final byte[] buf) {

        final ParsedBIN pb = parseBINBytes(
            envImpl, buf, false /*partialBuf*/, true /*parseLNIds*/);

        final BIN bin = materializeBIN(pb, (pb.flags & BIN_FLAG_DELTA) != 0);

        nBINsLoaded.incrementAndGet();

        return bin;
    }

    private BIN materializeBIN(final ParsedBIN pb, final boolean asDelta) {

        final BIN bin = new BIN();

        bin.materialize(
            pb.binBytes, LogEntryType.LOG_VERSION, asDelta /*deltasOnly*/,
            (pb.flags & BIN_FLAG_LOGGED_FULL_VERSION) != 0 /*clearDirtyBits*/);

        bin.setLastFullLsn(pb.lastFullLsn);
        bin.setLastDeltaLsn(pb.lastDeltaLsn);

        bin.setProhibitNextDelta(
            (pb.flags & BIN_FLAG_PROHIBIT_NEXT_DELTA) != 0);

        if (pb.lnMemIds != null) {
            for (int i = 0; i < pb.lnMemIds.length; i += 1) {
                final long lnMemId = pb.lnMemIds[i];
                if (lnMemId == 0) {
                    continue;
                }
                bin.setOffHeapLNId(i, lnMemId);
            }
        }

        return bin;
    }

    public INLogEntry<BIN> createBINLogEntryForCheckpoint(final IN parent,
                                                          final int index) {
        final int entry = parent.getOffHeapBINId(index);

        if (entry < 0 || !parent.isOffHeapBINDirty(index)) {
            return null;
        }

        assert parent == getOwner(entry);

        final long memId = getMemId(entry);

        return createBINLogEntry(
            memId, entry, parent, true /*preserveBINInCache*/);
    }

    public void postBINLog(final IN parent,
                           final int index,
                           final INLogEntry<BIN> logEntry,
                           final long newLsn) {

        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();

        final EnvironmentImpl envImpl = parent.getEnv();
        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;

        final boolean isDelta = logEntry.isBINDelta();
        final int entry = parent.getOffHeapBINId(index);

        assert entry >= 0;
        assert parent.isOffHeapBINDirty(index);

        final BIN bin =
            logEntry.isPreSerialized() ? null : logEntry.getMainItem();

        /*
         * Update checksum, flags and last full/delta LSNs.
         */
        final long memId = getMemId(entry);
        final byte[] buf = new byte[checksumSize + 1 + 8 + 8];
        allocator.copy(memId, 0, buf, 0, buf.length);
        int bufOffset = 0;

        /* The checksum is now invalid. */
        if (useChecksums) {
            putInt(0, buf, 0);
            bufOffset += checksumSize;
        }

        /* Update flags. */
        int flags = buf[bufOffset];
        if (!isDelta) {
            flags |= BIN_FLAG_LOGGED_FULL_VERSION;
        }
        flags &= ~BIN_FLAG_PROHIBIT_NEXT_DELTA;
        buf[bufOffset] = (byte) flags;
        bufOffset += 1;

        /* Update lastFullLsn. */
        if (!isDelta) {
            putLong(newLsn, buf, bufOffset);
        }
        bufOffset += 8;

        /* Update lastDeltaLsn. */
        putLong(isDelta ? newLsn : DbLsn.NULL_LSN, buf, bufOffset);
        bufOffset += 8;
        assert bufOffset == buf.length;

        allocator.copy(buf, 0, memId, 0, buf.length);

        /* Move from pri2 LRU list to back of pri1 LRU list. */
        if (parent.isOffHeapBINPri2(index)) {
            pri2LRUSet[entry % numLRULists].remove(entry);
            moveBack(entry, false /*pri2*/);
        }

        parent.setOffHeapBINId(
            index, entry, false /*pri2*/, false /*dirty*/);

        if (bin != null) {
            bin.releaseLatch();
        }
    }

    private INLogEntry<BIN> createBINLogEntry(
        final long memId,
        final int entry,
        final IN parent,
        final boolean preserveBINInCache) {

        final EnvironmentImpl envImpl = parent.getEnv();

        final byte[] buf = getMemBytes(memId);

        final ParsedBIN pb = parseBINBytes(
            envImpl, buf, false /*partialBuf*/, false /*parseLNIds*/);

        final boolean isDelta = (pb.flags & BIN_FLAG_DELTA) != 0;
        final boolean canMutateToDelta = (pb.flags & BIN_FLAG_CAN_MUTATE) != 0;

        /*
         * If the BIN is a delta, we must log a delta. In that case we cannot
         * compress expired slots, and we can log the pre-serialized BIN.
         */
        if (isDelta) {
            return new BINDeltaLogEntry(
                pb.binBytes, pb.lastFullLsn, pb.lastDeltaLsn,
                LogEntryType.LOG_BIN_DELTA, parent);
        }

        /*
         * For a full BIN, normally we log a delta iff it can be mutated to a
         * delta. However, if any slots are expired, we attempt to compress
         * them and then determine whether to log a delta.
         *
         * This mimics the logic in IN.logInternal, for BINs in main cache.
         *
         * TODO: Use materialized BIN in the log entry when any slot has an
         * expiration time, even if none are currently expired. Such BINs must
         * be materialized during logging for expiration tracking anyway by
         * INLogEntry.getBINWithExpiration. Then the getBINWithExpiration and
         * BIN.mayHaveExpirationValues methods will no longer be needed.
         */
        final boolean hasExpiredSlot =
            envImpl.isExpired(pb.minExpiration, true /*hours*/);

        /*
         * If we do not need to log a full BIN as a delta, or to compress its
         * expired slots, then we can log the pre-serialized BIN.
         */
        if (!hasExpiredSlot && !canMutateToDelta) {
            return new INLogEntry<>(
                pb.binBytes, pb.lastFullLsn, pb.lastDeltaLsn,
                LogEntryType.LOG_BIN, parent);
        }

        /*
         * We must materialize the full BIN in order to log it as a delta or to
         * compress its expired slots.
         */
        final BIN bin = materializeBIN(pb, false /*asDelta*/);

        /*
         * Latch the BIN to avoid assertions during BIN.writeToLog. A side
         * effect is setting the Database, which is also needed.
         */
        bin.latchNoUpdateLRU(parent.getDatabase());

        final boolean logDelta;

        if (hasExpiredSlot) {
            final int origNSlots = bin.getNEntries();

            /* Compress non-dirty slots before determining delta status. */
            bin.compress(false /*compressDirtySlots*/, null);
            logDelta = bin.shouldLogDelta();

            /* Also compress dirty slots, if we will not log a delta. */
            if (!logDelta) {
                bin.compress(true /*compressDirtySlots*/, null);
            }

            /* If we compressed an expired slot, re-serialize the BIN. */
            if (preserveBINInCache && origNSlots != bin.getNEntries()) {
                final long newMemId = serializeBIN(bin, bin.isBINDelta());
                if (newMemId == 0) {
                    /*
                     * TODO: Is invalid if compressed slot had off-heap LN.
                     * Should discard off-heap BIN and install 'bin' in main.
                     */
                    return new INLogEntry<>(
                        pb.binBytes, pb.lastFullLsn, pb.lastDeltaLsn,
                        LogEntryType.LOG_BIN, parent);
                }
                freeMemory(memId);
                setOwnerAndMemId(entry, parent, newMemId);
            }
        } else {
            assert canMutateToDelta;
            logDelta = true;
        }

        return logDelta ?
            (new BINDeltaLogEntry(bin)) :
            (new INLogEntry<>(bin));
    }

    public static class BINInfo {

        public final boolean isBINDelta;
        public final long fullBINLsn;

        private BINInfo(final ParsedBIN pb) {
            isBINDelta = (pb.flags & BIN_FLAG_DELTA) != 0;
            fullBINLsn = pb.lastFullLsn;
        }
    }

    public BINInfo getBINInfo(final EnvironmentImpl envImpl, final int entry) {

        assert entry >= 0;

        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;

        final long memId = getMemId(entry);
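        /* Copy only the fixed-size header, through the minExpiration field. */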
        final byte[] buf = new byte[checksumSize + 1 + 8 + 8 + 4];
        allocator.copy(memId, 0, buf, 0, buf.length);

        final ParsedBIN pb = parseBINBytes(
            envImpl, buf, true /*partialBuf*/, false /*parseLNIds*/);

        return new BINInfo(pb);
    }

    long getINSize(final IN in) {

        if (in.isBIN()) {
            final BIN bin = (BIN) in;

            if (!bin.hasOffHeapLNs()) {
                return 0;
            }

            long size = 0;

            for (int i = 0; i < in.getNEntries(); i += 1) {
                final long memId = bin.getOffHeapLNId(i);
                if (memId == 0) {
                    continue;
                }
                size += allocator.totalSize(memId);
            }

            return size;
        }

        if (in.getNormalizedLevel() != 2) {
            return 0;
        }

        if (!in.hasOffHeapBINIds()) {
            return 0;
        }

        final EnvironmentImpl envImpl = in.getEnv();
        long size = 0;

        for (int i = 0; i < in.getNEntries(); i += 1) {

            final int entry = in.getOffHeapBINId(i);
            if (entry < 0) {
                continue;
            }

            final long memId = getMemId(entry);
            size += allocator.totalSize(memId);

            if (in.getTarget(i) != null) {
                /* Redundant BIN, do not count off-heap LNs here. */
                continue;
            }

            final ParsedBIN pb = parseBINBytes(
                envImpl, getMemBytes(memId),
                false /*partialBuf*/, true /*parseLNIds*/);

            if (pb.lnMemIds == null) {
                continue;
            }

            for (final long lnMemId : pb.lnMemIds) {
                if (lnMemId == 0) {
                    continue;
                }
                size += allocator.totalSize(lnMemId);
            }
        }

        return size;
    }

    private static class ParsedBIN {
        final int flags;
        final long[] lnMemIds;
        final long lastFullLsn;
        final long lastDeltaLsn;
        final int minExpiration;
        final ByteBuffer binBytes;

        ParsedBIN(final int flags,
                  final long[] lnMemIds,
                  final long lastFullLsn,
                  final long lastDeltaLsn,
                  final int minExpiration,
                  final ByteBuffer binBytes) {

            this.flags = flags;
            this.lnMemIds = lnMemIds;
            this.lastFullLsn = lastFullLsn;
            this.lastDeltaLsn = lastDeltaLsn;
            this.minExpiration = minExpiration;
            this.binBytes = binBytes;
        }
    }

    private ParsedBIN parseBINBytes(final EnvironmentImpl envImpl,
                                    final byte[] buf,
                                    final boolean partialBuf,
                                    final boolean parseLNIds) {

        assert !(partialBuf && parseLNIds);

        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;

        if (useChecksums && !partialBuf) {
            final int storedChecksum = getInt(buf, 0);
            if (storedChecksum != 0) {

                final Checksum checksum = Adler32.makeChecksum();
                checksum.update(buf, checksumSize, buf.length - checksumSize);
                final int checksumValue = (int) checksum.getValue();

                if (storedChecksum != checksumValue) {
                    throw unexpectedState(
                        envImpl,
                        "Off-heap cache checksum error. Expected " +
                            storedChecksum + " but got " + checksumValue);
                }
            }
        }

        int bufOffset = checksumSize;

        final int flags = buf[bufOffset];
        bufOffset += 1;

        final long lastFullLsn = getLong(buf, bufOffset);
        bufOffset += 8;

        final long lastDeltaLsn = getLong(buf, bufOffset);
        bufOffset += 8;

        final int minExpiration = getInt(buf, bufOffset);
        bufOffset += 4;

        if (partialBuf) {
            return new ParsedBIN(
                flags, null, lastFullLsn, lastDeltaLsn, minExpiration, null);
        }

        final short lnIdsSize = getShort(buf, bufOffset);
        bufOffset += 2;

        /* lnIdsSize was negated if LNs were stripped by eviction. */
        final long[] lnMemIds;

        if (lnIdsSize > 0 && parseLNIds) {
            lnMemIds = unpackLnMemIds(buf, bufOffset, lnIdsSize);
        } else {
            lnMemIds = null;
        }

        bufOffset += Math.abs(lnIdsSize);

        final ByteBuffer byteBuf =
            ByteBuffer.wrap(buf, bufOffset, buf.length - bufOffset);

        return new ParsedBIN(
            flags, lnMemIds, lastFullLsn, lastDeltaLsn, minExpiration,
            byteBuf);
    }

    /**
     * Returns the minimum expiration time in hours, or zero if no slots
     * have an expiration time.
     */
    private int getMinExpiration(final BIN bin) {

        int minExpire = 0;

        for (int i = 0; i < bin.getNEntries(); i += 1) {
            int expire = bin.getExpiration(i);
            if (expire == 0) {
                continue;
            }
            if (minExpire > expire || minExpire == 0) {
                minExpire = expire;
            }
        }

        if (minExpire == 0) {
            return 0;
        }

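        /* Expiration is stored in days unless the BIN uses hours. */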
        return bin.isExpirationInHours() ? minExpire : (minExpire * 24);
    }

    /**
     * Adds LN memIds to the buffer using an RLE approach to save space:
     *
     *  - The memIds are packed in slot index order. All slots are represented.
     *  - A positive byte indicates the number of 8-byte memIds that follow.
     *  - A negative byte indicates the number of slots that have no memId.
     *  - When a run exceeds 127 slots, another run is added. So there is no
     *    effective limit on the number of slots, although we know the
     *    maximum will fit in a short integer.
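     *
     * A worked example (illustrative values): a BIN whose six slots have
     * memIds [A, B, none, none, none, C] packs as the byte sequence
     * {+2, A, B, -3, +1, C}: three count bytes plus three 8-byte ids
     * = 27 bytes, versus 6 * 8 = 48 bytes for an unpacked array.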
     */
    private static void packLnMemIds(final BIN bin,
                                     final byte[] buf,
                                     int off) {
        int nOff = off;
        off += 1;
        byte n = 0;

        for (int i = 0; i < bin.getNEntries(); i += 1) {

            final long memId = bin.getOffHeapLNId(i);

            if (memId != 0) {

                if (n < 0 || n == 127) {
                    buf[nOff] = n;
                    nOff = off;
                    off += 1;
                    n = 0;
                }

                putLong(memId, buf, off);
                off += 8;
                n += 1;

            } else {

                if (n > 0 || n == -127) {
                    buf[nOff] = n;
                    nOff = off;
                    off += 1;
                    n = 0;
                }

                n -= 1;
            }
        }

        buf[nOff] = n;
    }

    private static short getPackedLnMemIdSize(final BIN bin) {

        if (!bin.hasOffHeapLNs()) {
            return 0;
        }

        int off = 1;
        byte n = 0;

        for (int i = 0; i < bin.getNEntries(); i += 1) {

            if (bin.getOffHeapLNId(i) != 0) {

                if (n < 0 || n == 127) {
                    off += 1;
                    n = 0;
                }

                off += 8;
                n += 1;

            } else {

                if (n > 0 || n == -127) {
                    off += 1;
                    n = 0;
                }

                n -= 1;
            }
        }

        if (off > Short.MAX_VALUE) {
            throw unexpectedState();
        }

        return (short) off;
    }

    private static long[] unpackLnMemIds(final byte[] buf,
                                         final int startOff,
                                         final int len) {
        assert len > 0;
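
        /*
         * First pass: walk the run-length counts to total the slots, so the
         * result array can be sized up front. Second pass: fill in the
         * memIds, leaving zero in slots that have none.
         */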

        final int endOff = startOff + len;
        int off = startOff;
        int i = 0;

        while (off < endOff) {

            final int n = buf[off];
            off += 1;

            if (n > 0) {
                off += n * 8;
                i += n;
            } else {
                assert n < 0;
                i -= n;
            }
        }

        final long[] ids = new long[i + 1];
        off = startOff;
        i = 0;

        while (off < endOff) {

            int n = buf[off];
            off += 1;

            if (n > 0) {
                while (n > 0) {
                    ids[i] = getLong(buf, off);
                    off += 8;
                    i += 1;
                    n -= 1;
                }
            } else {
                assert n < 0;
                i -= n;
            }
        }

        return ids;
    }

    private long allocateMemory(final EnvironmentImpl envImpl,
                                final int size) {

        /*
         * Only enable the off-heap cache after recovery. This ensures
         * that off-heap memory is available to recovery as file system
         * cache, which is important when performing multiple passes over
         * the recovery interval.
         */
        if (!envImpl.isValid()) {
            return 0;
        }

        long memId = 0;

        try {
            memId = allocator.allocate(size);
            totalBlocks.incrementAndGet();

            if (DEBUG_DOUBLE_FREE) {
                final Long key = memId;
                freedBlocks.remove(key);
                prevFreedBlocks.remove(key);
            }

        } catch (OutOfMemoryError e) {

            LoggerUtils.envLogMsg(
                Level.SEVERE, envImpl,
                "OutOfMemoryError trying to allocate in the off-heap cache. " +
                "Continuing, but more problems are likely. Allocator error: " +
                e.getMessage());

            nAllocFailure.incrementAndGet();

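            /*
             * Lower the effective limit so the evictor drives usage below
             * the level where allocation failed.
             */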
            memoryLimit = allocator.getUsedBytes() - evictBytes;

        } catch (OffHeapAllocator.OffHeapOverflowException e) {

            nAllocOverflow.incrementAndGet();

            memoryLimit = allocator.getUsedBytes();
        }

        if (needEviction()) {
            wakeUpEvictionThreads();
        }

        return memId;
    }

    private int freeMemory(final long memId) {

        if (DEBUG_DOUBLE_FREE) {

            final Long key = memId;
            boolean added = false;
            Exception e = null;
            Map<Long, Exception> curr = freedBlocks;
            Map<Long, Exception> prev = prevFreedBlocks;

            if (freedBlocks.size() >= DEBUG_FREE_BLOCKS_PER_MAP) {

                synchronized (this) {

                    if (freedBlocks.size() >= DEBUG_FREE_BLOCKS_PER_MAP) {

                        prevFreedBlocks = freedBlocks;
                        freedBlocks = new ConcurrentHashMap<>();

                        e = freedBlocks.put(
                            key, new Exception("Freed: " + memId));

                        added = true;
                        curr = freedBlocks;
                        prev = prevFreedBlocks;
                    }
                }
            }

            if (!added) {
                e = curr.put(key, new Exception("Freed: " + memId));
            }

            if (e != null) {
                new Exception(
                    "Double-freed: " + memId + "\n" +
                    LoggerUtils.getStackTrace(e)).printStackTrace();
            }

            if (curr != prev) {
                e = prev.get(key);
                if (e != null) {
                    new Exception(
                        "Double-freed: " + memId + "\n" +
                        LoggerUtils.getStackTrace(e)).printStackTrace();
                }
            }
        }

        totalBlocks.decrementAndGet();
        return allocator.free(memId);
    }

    private byte[] getMemBytes(final long memId) {

        final byte[] bytes = new byte[allocator.size(memId)];
        allocator.copy(memId, 0, bytes, 0, bytes.length);
        return bytes;
    }

    private byte getByte(final long memId,
                         final int offset,
                         final byte[] tempBuf) {
        allocator.copy(memId, offset, tempBuf, 0, 1);
        return tempBuf[0];
    }

    private void putShort(final short val,
                          final long memId,
                          final int offset,
                          final byte[] tempBuf) {
        putShort(val, tempBuf, 0);
        allocator.copy(tempBuf, 0, memId, offset, 2);
    }

    private short getShort(final long memId,
                           final int offset,
                           final byte[] tempBuf) {
        allocator.copy(memId, offset, tempBuf, 0, 2);
        return getShort(tempBuf, 0);
    }

    private void putInt(final int val,
                        final long memId,
                        final int offset,
                        final byte[] tempBuf) {
        putInt(val, tempBuf, 0);
        allocator.copy(tempBuf, 0, memId, offset, 4);
    }

    private int getInt(final long memId,
                       final int offset,
                       final byte[] tempBuf) {
        allocator.copy(memId, offset, tempBuf, 0, 4);
        return getInt(tempBuf, 0);
    }

    private void putLong(final long val,
                         final long memId,
                         final int offset,
                         final byte[] tempBuf) {
        putLong(val, tempBuf, 0);
        allocator.copy(tempBuf, 0, memId, offset, 8);
    }

    private long getLong(final long memId,
                         final int offset,
                         final byte[] tempBuf) {
        allocator.copy(memId, offset, tempBuf, 0, 8);
        return getLong(tempBuf, 0);
    }

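    /*
     * The static helpers below read and write fixed-size values in
     * big-endian byte order.
     */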
    private static void putShort(final short val,
                                 final byte[] buf,
                                 final int offset) {
        buf[offset]     = (byte) (val >> 8);
        buf[offset + 1] = (byte) val;
    }

    private static short getShort(final byte[] buf,
                                  final int offset) {
        return (short)
            ((buf[offset] << 8) |
             (buf[offset + 1] & 0xff));
    }

    private static void putInt(final int val,
                               final byte[] buf,
                               final int offset) {
        buf[offset]     = (byte) (val >> 24);
        buf[offset + 1] = (byte) (val >> 16);
        buf[offset + 2] = (byte) (val >>  8);
        buf[offset + 3] = (byte) val;
    }

    private static int getInt(final byte[] buf,
                              final int offset) {
        return
            ((buf[offset]             << 24) |
            ((buf[offset + 1] & 0xff) << 16) |
            ((buf[offset + 2] & 0xff) <<  8) |
             (buf[offset + 3] & 0xff));
    }

    private static void putLong(final long val,
                                final byte[] buf,
                                final int offset) {
        buf[offset]     = (byte) (val >> 56);
        buf[offset + 1] = (byte) (val >> 48);
        buf[offset + 2] = (byte) (val >> 40);
        buf[offset + 3] = (byte) (val >> 32);
        buf[offset + 4] = (byte) (val >> 24);
        buf[offset + 5] = (byte) (val >> 16);
        buf[offset + 6] = (byte) (val >>  8);
        buf[offset + 7] = (byte) val;
    }

    private static long getLong(final byte[] buf,
                                final int offset) {
        return
            (((long)buf[offset]             << 56) |
            (((long)buf[offset + 1] & 0xff) << 48) |
            (((long)buf[offset + 2] & 0xff) << 40) |
            (((long)buf[offset + 3] & 0xff) << 32) |
            (((long)buf[offset + 4] & 0xff) << 24) |
            (((long)buf[offset + 5] & 0xff) << 16) |
            (((long)buf[offset + 6] & 0xff) <<  8) |
             ((long)buf[offset + 7] & 0xff));
    }

    public void doCriticalEviction(boolean backgroundIO) {

        if (needEviction()) {
            wakeUpEvictionThreads();

            if (needCriticalEviction()) {
                evictBatch(EvictionSource.CRITICAL, backgroundIO);
            }
        }
    }

    public void doDaemonEviction(boolean backgroundIO) {

        if (needEviction()) {
            wakeUpEvictionThreads();

            if (needCriticalEviction()) {
                evictBatch(EvictionSource.DAEMON, backgroundIO);
            }
        }
    }

    public void doManualEvict() {

        if (!isEnabled()) {
            return;
        }

        evictBatch(EvictionSource.MANUAL, true /*backgroundIO*/);
    }

    private void wakeUpEvictionThreads() {

        if (!runEvictorThreads || !isEnabled()) {
            return;
        }

        /*
         * This check avoids the lock taken by ArrayBlockingQueue.offer()
         * when adding a task would be futile. The lock reduces concurrency
         * because this method is called so frequently.
         */
        if (activePoolThreads.get() >= maxPoolThreads) {
            return;
        }

        evictionPool.execute(new Runnable() {
            @Override
            public void run() {
                activePoolThreads.incrementAndGet();
                try {
                    evictBatch(
                        EvictionSource.EVICTORTHREAD, true /*backgroundIO*/);
                } finally {
                    activePoolThreads.decrementAndGet();
                }
            }
        });
    }

    private boolean needEviction() {

        if (!isEnabled()) {
            return false;
        }

        /*
         * When off-heap cache size is set to zero after being non-zero, we
         * perform eviction only until the cache becomes empty.
         */
        if (maxMemory == 0) {
            return allocator.getUsedBytes() > 0;
        }

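        /*
         * Begin evicting when usage comes within evictBytes of the limit, so
         * that normal allocations do not have to wait for critical eviction.
         */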
        return allocator.getUsedBytes() + evictBytes >= memoryLimit;
    }

    private boolean needCriticalEviction() {

        if (!isEnabled()) {
            return false;
        }

        /*
         * When off-heap cache size is set to zero after being non-zero, we
         * perform only non-critical eviction.
         */
        if (maxMemory == 0) {
            return false;
        }

        return allocator.getUsedBytes() >= memoryLimit;
    }

    private int getLRUSize(final LRUList[] listSet) {
        int size = 0;
        for (final LRUList l : listSet) {
            size += l.getSize();
        }
        return size;
    }

    private void evictBatch(final EvictionSource source,
                            final boolean backgroundIO) {

        final long maxBytesToEvict =
            evictBytes + (allocator.getUsedBytes() - memoryLimit);

        long bytesEvicted = 0;

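        /*
         * Process the pri1 LRU set first. If it is exhausted before enough
         * memory is freed, move on to the pri2 set, which holds dirty BINs
         * that evictOne gave a second chance.
         */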
        boolean pri2 = false;
        int maxLruEntries = getLRUSize(pri1LRUSet);
        int nLruEntries = 0;

        while (bytesEvicted < maxBytesToEvict &&
               needEviction() &&
               !shutdownRequested.get()) {

            if (nLruEntries >= maxLruEntries) {
                if (pri2) {
                    break;
                }
                pri2 = true;
                maxLruEntries = getLRUSize(pri2LRUSet);
                nLruEntries = 0;
            }

            final LRUList lru;

            if (pri2) {
                final int lruIdx =
                    Math.abs(nextPri2LRUList++) % numLRULists;

                lru = pri2LRUSet[lruIdx];

            } else {
                final int lruIdx =
                    Math.abs(nextPri1LRUList++) % numLRULists;

                lru = pri1LRUSet[lruIdx];
            }

            final int entry = lru.removeFront();
            nLruEntries += 1;

            if (entry < 0) {
                continue;
            }

            bytesEvicted += evictOne(source, backgroundIO, entry, lru, pri2);
        }
    }

    private long evictOne(final EvictionSource source,
                          final boolean backgroundIO,
                          final int entry,
                          final LRUList lru,
                          final boolean pri2) {

        nNodesTargeted.incrementAndGet();

        if (source == EvictionSource.CRITICAL) {
            nCriticalNodesTargeted.incrementAndGet();
        }

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        /*
         * Note that almost anything could have happened in other threads
         * after removing the entry from the LRU and prior to latching the
         * owner IN. We account for these possibilities below.
         *
         * When we decide to "skip" an entry, it is not added back to the LRU
         * and it is not freed. The assumption is that another thread is
         * processing the entry and will add it to the LRU or free it.
         */
        final IN in = chunk.owners[chunkIdx];

        /*
         * If the IN is null, skip the entry. The IN may have been evicted.
         */
        if (in == null) {
            nNodesSkipped.incrementAndGet();
            return 0;
        }

        final EnvironmentImpl envImpl = in.getEnv();

        in.latchNoUpdateLRU();
        try {

            /*
             * If the owner has changed or the IN was evicted, skip it.
             */
            if (in != chunk.owners[chunkIdx] ||
                !in.getInListResident()) {
                nNodesSkipped.incrementAndGet();
                return 0;
            }

            /*
             * If owner is a BIN, it is in the main cache but may have
             * off-heap LNs.
             */
            if (in.isBIN()) {
                final BIN bin = (BIN) in;

                /*
                 * If entry is no longer associated with this BIN, skip it.
                 */
                if (bin.getOffHeapLruId() != entry) {
                    nNodesSkipped.incrementAndGet();
                    return 0;
                }

                /*
                 * If the entry was added back to the LRU, skip it. This check
                 * requires synchronizing on the LRUList after latching the IN.
                 * We know we're checking the correct LRUList because an entry
                 * with a BIN owner can never be in the priority 2 LRU set.
                 */
                if (lru.contains(chunk, chunkIdx)) {
                    nNodesSkipped.incrementAndGet();
                    return 0;
                }

                return stripLNsFromMainBIN(bin, entry, pri2);
            }

            /*
             * The owner has a child BIN that is off-heap.
             */
            int index = -1;
            for (int i = 0; i < in.getNEntries(); i += 1) {
                if (in.getOffHeapBINId(i) == entry) {
                    index = i;
                    break;
                }
            }

            /*
             * If entry is no longer associated with this IN, skip it.
             */
            if (index < 0) {
                nNodesSkipped.incrementAndGet();
                return 0;
            }

            /*
             * If the entry was moved between a pri1 and pri2 LRU list, skip
             * it. This means that the LRUList from which we removed the entry
             * is not the list it belongs to.
             */
            if (pri2 != in.isOffHeapBINPri2(index)) {
                nNodesSkipped.incrementAndGet();
                return 0;
            }

            /*
             * If the entry was added back to the LRU, skip it. This check
             * requires synchronizing on the LRUList after latching the IN, and
             * it requires that we're using the correct LRU list (the check
             * above).
             */
            if (lru.contains(chunk, chunkIdx)) {
                nNodesSkipped.incrementAndGet();
                return 0;
            }

            /*
             * The BIN should never be resident in main.
             */
            final BIN residentBIN = (BIN) in.getTarget(index);
            if (residentBIN != null) {
                throw EnvironmentFailureException.unexpectedState(
                    envImpl, "BIN is resident in both caches, id=" +
                    residentBIN.getNodeId());
            }

            final boolean useChecksums = envImpl.useOffHeapChecksums();
            final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;

            final long memId = chunk.memIds[chunkIdx];
            final int flags = getByte(memId, checksumSize, new byte[1]);
            final boolean dirty = in.isOffHeapBINDirty(index);

            /*
             * First try stripping LNs.
             */
            final long nLNBytesEvicted = stripLNs(
                entry, pri2, dirty, memId, chunk, chunkIdx, in, index,
                backgroundIO);

            if (nLNBytesEvicted > 0) {
                return nLNBytesEvicted;
            }

            /*
             * Next try mutating a full BIN to a BIN-delta.
             */
            if ((flags & BIN_FLAG_CAN_MUTATE) != 0) {

                final long nBytesEvicted = mutateToBINDelta(
                    envImpl, in.getDatabase(), entry, pri2,
                    chunk, chunkIdx);

                if (nBytesEvicted > 0) {
                    return nBytesEvicted;
                }
            }

            /*
             * If it is in the pri1 list and is dirty with no resident LNs,
             * move it to the pri2 list. We currently have no stat for this.
             */
            if (!pri2 && dirty) {
                moveBack(entry, true /*pri2*/);
                in.setOffHeapBINId(
                    index, entry, true /*pri2*/, true /*dirty*/);
                return 0;
            }

            /*
             * Log the BIN if it is dirty and finally just get rid of it.
             */
            return flushAndDiscardBIN(
                entry, pri2, dirty, memId, in, index, backgroundIO,
                false /*freeLNs*/);

        } finally {
            in.releaseLatch();
        }
    }

    /**
     * Strip off-heap LNs referenced by a main cache BIN. If there are any
     * off-heap expired LNs, strip only them. Otherwise, strip all LNs.
     */
    private long stripLNsFromMainBIN(final BIN bin,
                                     final int entry,
                                     final boolean pri2) {
        /*
         * Strip expired LNs first.
         */
        int nEvicted = 0;
        long nBytesEvicted = 0;
        boolean anyNonExpired = false;

        for (int i = 0; i < bin.getNEntries(); i += 1) {
            if (bin.getOffHeapLNId(i) == 0) {
                continue;
            }
            if (bin.isExpired(i)) {
                nBytesEvicted += freeLN(bin, i);
                nEvicted += 1;
            } else {
                anyNonExpired = true;
            }
        }

        /*
         * If any were expired, return. If any non-expired LNs remain, put back
         * the entry on the LRU list, leaving the non-expired LNs resident.
         * Also compress the BIN to free the expired slots in the main cache.
         */
        if (nEvicted > 0) {
            if (anyNonExpired) {
                moveBack(entry, pri2);
            } else {
                bin.setOffHeapLruId(-1);
                freeEntry(entry);
            }

            bin.getEnv().lazyCompress(bin);

            nLNsEvicted.addAndGet(nEvicted);
            nNodesStripped.incrementAndGet();
            return nBytesEvicted;
        }

        /*
         * No expired LNs are present. Strip the non-expired LNs.
         */
        for (int i = 0; i < bin.getNEntries(); i += 1) {
            final int lnBytes = freeLN(bin, i);
            if (lnBytes == 0) {
                continue;
            }
            nBytesEvicted += lnBytes;
            nEvicted += 1;
        }

        if (nEvicted > 0) {
            nLNsEvicted.addAndGet(nEvicted);
            nNodesStripped.incrementAndGet();
        } else {
            nNodesSkipped.incrementAndGet();
        }

        /*
         * No LNs are off-heap now, so remove the entry.
         */
        bin.setOffHeapLruId(-1);
        freeEntry(entry);

        return nBytesEvicted;
    }

    public long stripLNs(final IN parent, final int index) {

        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();

        final int entry = parent.getOffHeapBINId(index);
        assert entry >= 0;

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        final boolean pri2 = parent.isOffHeapBINPri2(index);
        final boolean dirty = parent.isOffHeapBINDirty(index);
        final long memId = chunk.memIds[chunkIdx];

        return stripLNs(
            entry, pri2, dirty, memId, chunk, chunkIdx, parent, index, false);
    }

    private long stripLNs(final int entry,
                          final boolean pri2,
                          final boolean dirty,
                          final long memId,
                          final Chunk chunk,
                          final int chunkIdx,
                          final IN parent,
                          final int index,
                          final boolean backgroundIO) {

        final EnvironmentImpl envImpl = parent.getEnv();
        final boolean useChecksums = envImpl.useOffHeapChecksums();
        final int checksumSize = useChecksums ? CHECKSUM_SIZE : 0;

        /*
         * Contents of headBuf:
         *  flags, fullLsn, deltaLsn, minExpiration, lnIdsSize.
         * Note that headBuf does not contain the checksum.
         * Contents of memId following headBuf fields: LN mem Ids, BIN.
         */
        final byte[] headBuf = new byte[1 + 8 + 8 + 4 + 2];
        allocator.copy(memId, checksumSize, headBuf, 0, headBuf.length);
        final int memHeadLen = checksumSize + headBuf.length;
        final byte flags = headBuf[0];
        int bufOffset = 1 + 8 + 8;
        final int minExpiration = getInt(headBuf, bufOffset);
        bufOffset += 4;
        final short lnIdsSize = getShort(headBuf, bufOffset);
        bufOffset += 2;
        assert bufOffset == headBuf.length;

        int nEvicted = 0;
        long nBytesEvicted = 0;

        /*
         * If this is a full BIN and any slot may be expired, then materialize
         * the BIN, evict expired off-heap LNs, compress expired slots, and
         * re-serialize the BIN.
         */
        if ((flags & BIN_FLAG_DELTA) == 0 &&
            envImpl.isExpired(minExpiration, true /*hours*/)) {

            final BIN bin = materializeBIN(envImpl, getMemBytes(memId));

            bin.latchNoUpdateLRU(parent.getDatabase());
            try {
                for (int i = 0; i < bin.getNEntries(); i += 1) {
                    if (bin.getOffHeapLNId(i) == 0 ||
                        !bin.isExpired(i)) {
                        continue;
                    }
                    nBytesEvicted += freeLN(bin, i);
                    nEvicted += 1;
                }

                /*
                 * TODO: Compression is expensive because we must re-serialize
                 * the BIN. It may be more efficient to only proceed to
                 * compression if no LNs were freed above, although we would
                 * first need to clear the LN memIds.
                 */
                final int origNSlots = bin.getNEntries();

                bin.compress(
                    !bin.shouldLogDelta() /*compressDirtySlots*/, null);

                /*
                 * If we compressed any expired slots, re-serialize the BIN.
                 * Also re-serialize in the rare case that an LN was freed but
                 * no slot was compressed due to record locks; if we did not do
                 * this, the invalid/freed LN memId would not be cleared.
                 */
                if (origNSlots != bin.getNEntries() || nEvicted > 0) {
                    final long newMemId = serializeBIN(bin, bin.isBINDelta());
                    if (newMemId == 0) {
                        /*
                         * When allocations are failing, freeing the BIN is the
                         * simplest and most productive thing to do.
                         */
                        nBytesEvicted += flushAndDiscardBIN(
                            entry, pri2, dirty, memId, parent, index,
                            backgroundIO, true /*freeLNs*/);

                        return nBytesEvicted;
                    }

                    nBytesEvicted += freeMemory(memId);
                    nBytesEvicted -= allocator.totalSize(newMemId);
                    chunk.memIds[chunkIdx] = newMemId;
                }
            } finally {
                bin.releaseLatch();
            }

            /* Return if we freed any memory by LN eviction or compression. */
            if (nBytesEvicted > 0) {
                nLNsEvicted.addAndGet(nEvicted);
                nNodesStripped.incrementAndGet();
                moveBack(entry, pri2);
                return nBytesEvicted;
            }
        }

        if (lnIdsSize <= 0) {
            return 0;
        }

        final byte[] lnBuf = new byte[lnIdsSize];
        allocator.copy(memId, memHeadLen, lnBuf, 0, lnBuf.length);
        final long[] lnMemIds = unpackLnMemIds(lnBuf, 0, lnIdsSize);

        for (final long lnMemId : lnMemIds) {
            if (lnMemId == 0) {
                continue;
            }
            nBytesEvicted += freeLN(lnMemId);
            nEvicted += 1;
        }

        assert nEvicted > 0;

        if (lnIdsSize <= MAX_UNUSED_BIN_BYTES) {
            /*
             * When there are only a small number of LN memIds, we can tolerate
             * the wasted space in the BIN so we just negate the size.
             */
            final byte[] tempBuf = new byte[8];

            putShort((short) (-lnIdsSize), memId, memHeadLen - 2, tempBuf);

            /* However, the checksum is now invalid. */
            if (useChecksums) {
                putInt(0, memId, 0, tempBuf);
            }
        } else {
            /*
             * When there are many LN memIds, we reclaim the space they use by
             * copying the BIN to a smaller block and freeing the old block.
             */
            final int newSize = allocator.size(memId) - lnIdsSize;
            final long newMemId = allocateMemory(envImpl, newSize);

            if (newMemId == 0) {
                /*
                 * When allocations are failing, freeing the BIN is the
                 * simplest and most productive thing to do.
                 */
                nBytesEvicted += flushAndDiscardBIN(
                    entry, pri2, dirty, memId, parent, index, backgroundIO,
                    true /*freeLNs*/);

                return nBytesEvicted;
            }

            nBytesEvicted -= allocator.totalSize(newMemId);

            /*
             * Copy all parts of the old BIN to the new, except for the
             * checksum, lnIdsSize and the LN memIds. We don't need to set
             * the checksum or lnIdsSize to zero in the new block because it
             * was zero-filled when it was allocated. Instead we omit these
             * fields when copying.
             *
             * The first copy includes all headBuf fields except for the
             * lnIdsSize at the end of the buffer. The second copy includes
             * the serialized BIN alone.
             */
            allocator.copy(
                headBuf, 0,
                newMemId, checksumSize, headBuf.length - 2);

            allocator.copy(
                memId, memHeadLen + lnIdsSize,
                newMemId, memHeadLen, newSize - memHeadLen);

            nBytesEvicted += freeMemory(memId);
            chunk.memIds[chunkIdx] = newMemId;
        }

        nLNsEvicted.addAndGet(nEvicted);
        nNodesStripped.incrementAndGet();
        moveBack(entry, pri2);
        return nBytesEvicted;
    }

    public long mutateToBINDelta(final IN parent,
                                 final int index) {

        assert parent.isLatchExclusiveOwner();
        assert parent.getInListResident();

        final int entry = parent.getOffHeapBINId(index);
        if (entry < 0) {
            return 0;
        }

        final Chunk chunk = chunks[entry / CHUNK_SIZE];
        final int chunkIdx = entry % CHUNK_SIZE;

        return mutateToBINDelta(
            parent.getEnv(), parent.getDatabase(), entry,
            parent.isOffHeapBINPri2(index), chunk, chunkIdx);
    }

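    /**
     * Implements mutateToBINDelta: materializes the off-heap BIN,
     * re-serializes it as a delta while holding its latch, frees the old
     * block, installs the new block in the chunk, and moves the entry to
     * the back of its LRU list.
     *
     * @return the net number of off-heap bytes reclaimed, or zero if the
     * delta could not be serialized (e.g., due to an allocation failure).
     */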
    private long mutateToBINDelta(final EnvironmentImpl envImpl,
                                  final DatabaseImpl dbImpl,
                                  final int entry,
                                  final boolean pri2,
                                  final Chunk chunk,
                                  final int chunkIdx) {

        final long memId = chunk.memIds[chunkIdx];

        final BIN bin = materializeBIN(envImpl, getMemBytes(memId));
        assert bin.getLastFullLsn() != DbLsn.NULL_LSN;

        final long newMemId;
        bin.latchNoUpdateLRU(dbImpl);
        try {
            newMemId = serializeBIN(bin, true /*asDelta*/);
        } finally {
            bin.releaseLatch();
        }

        if (newMemId == 0) {
            return 0;
        }

        long nBytesEvicted = freeBIN(envImpl, memId, false /*freeLNs*/);
        nBytesEvicted -= allocator.totalSize(newMemId);
        chunk.memIds[chunkIdx] = newMemId;

        nNodesMutated.incrementAndGet();
        moveBack(entry, pri2);
        return nBytesEvicted;
    }

    /**
     * Logs the BIN child if it is dirty, and then discards it.
     *
     * @return the number of bytes freed, or zero if a dirty BIN could not be
     * logged due to a read-only environment or a disk limit violation.
     */
    private long flushAndDiscardBIN(final int entry,
                                    final boolean pri2,
                                    final boolean dirty,
                                    final long memId,
                                    final IN parent,
                                    final int index,
                                    final boolean backgroundIO,
                                    final boolean freeLNs) {

        assert parent.isLatchExclusiveOwner();

        final EnvironmentImpl envImpl = parent.getEnv();

        if (DEBUG_TRACE) {
            debug(
                envImpl,
                "Discard BIN LSN=" +
                    DbLsn.getNoFormatString(parent.getLsn(index)) +
                    " pri2=" + pri2 + " dirty=" + dirty);
        }

        if (dirty) {
            /*
             * Cannot evict dirty nodes in a read-only environment, or when a
             * disk limit has been exceeded. We can assume that the cache will
             * not overflow with dirty nodes because writes are prohibited.
             */
            if (envImpl.isReadOnly() ||
                envImpl.getDiskLimitViolation() != null) {
                nNodesSkipped.incrementAndGet();
                return 0;
            }

            final INLogEntry<BIN> logEntry = createBINLogEntry(
                memId, entry, parent, false /*preserveBINInCache*/);

            final Provisional provisional =
                envImpl.coordinateWithCheckpoint(
                    parent.getDatabase(), IN.BIN_LEVEL, parent);

            final long lsn = IN.logEntry(
                logEntry, provisional, backgroundIO, parent);

            parent.updateEntry(
                index, lsn, VLSN.NULL_VLSN_SEQUENCE, 0 /*lastLoggedSize*/);

            nDirtyNodesEvicted.incrementAndGet();

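            /*
             * If the entry was not pre-serialized, its main item is a
             * materialized BIN that is still latched; release that latch now
             * that logging is complete.
             */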
            if (!logEntry.isPreSerialized()) {
                logEntry.getMainItem().releaseLatch();
            }
        }

        nNodesEvicted.incrementAndGet();
        parent.clearOffHeapBINId(index);
        remove(entry, pri2);
        return freeBIN(envImpl, memId, freeLNs);
    }

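    /**
     * For testing only: removes the given main-cache BIN's entry from its
     * pri1 LRU list and runs a single manual eviction on it.
     *
     * @return the number of off-heap bytes freed.
     */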
    long testEvictMainBIN(final BIN bin) {

        final int entry = bin.getOffHeapLruId();
        assert entry >= 0;

        final LRUList lru = pri1LRUSet[entry % numLRULists];
        lru.remove(entry);

        return evictOne(
            EvictionSource.MANUAL, false /*backgroundIO*/, entry, lru,
            false /*pri2*/);
    }

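    /**
     * For testing only: evicts the off-heap BIN child at the given slot,
     * removing its entry from whichever pri1 or pri2 LRU list it currently
     * occupies.
     *
     * @return the number of off-heap bytes freed.
     */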
    long testEvictOffHeapBIN(final IN in, final int index) {

        final int entry = in.getOffHeapBINId(index);
        assert entry >= 0;

        final boolean pri2 = in.isOffHeapBINPri2(index);
        final int lruIdx = entry % numLRULists;

        final LRUList lru =
            pri2 ? pri2LRUSet[lruIdx] : pri1LRUSet[lruIdx];

        lru.remove(entry);

        return evictOne(
            EvictionSource.MANUAL, false /*backgroundIO*/, entry, lru, pri2);
    }
}