
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.cache.aggcache;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.BufferOverflowException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
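// Illustrative sketch of the extendible hashing scheme used below (a walkthrough
// of the code in this file, not an additional contract): with globalDepth = 2 the
// directory has 1 << 2 = 4 slots and a key is routed via hashCode & 0b11. Several
// slots may alias one physical page (n:1). When a full bucket has
// localDepth == globalDepth, the directory doubles before the bucket splits;
// buckets with localDepth < globalDepth split in place and only the aliasing
// slots are re-pointed.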
/**
 * Class implements an active spilled partition. Serialized tuples are first written into an in-memory data
 * structure that represents a single page. As the page fills up, it is written to the current spill file or spill
 * partition. For fast tuple discovery, the class maintains a per-page bloom filter and never de-serializes
 * elements. The element spilling employs an extendible hashing technique.
 */
public class SpillMap extends AbstractMap<ImmutableBytesPtr, byte[]> implements Iterable<byte[]> {
// Threshold is typically the page size
private final int thresholdBytes;
private final int pageInserts;
// Global directory depth
private int globalDepth;
private int curMapBufferIndex;
private SpillFile spillFile;
// Directory of hash buckets --> extendible hashing implementation
private FileMap[] directory;
private final SpillableGroupByCache.QueryCache cache;
public SpillMap(SpillFile file, int thresholdBytes, int estValueSize, SpillableGroupByCache.QueryCache cache)
throws IOException {
this.thresholdBytes = thresholdBytes - Bytes.SIZEOF_INT;
this.pageInserts = thresholdBytes / estValueSize;
this.spillFile = file;
this.cache = cache;
// Init the e-hashing directory structure
globalDepth = 1;
directory = new FileMap[(1 << globalDepth)];
for (int i = 0; i < directory.length; i++) {
// Create an empty bucket list
directory[i] = new FileMap(i, this.thresholdBytes, pageInserts, file);
directory[i].flushBuffer();
}
directory[0].pageIn();
curMapBufferIndex = 0;
}
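// After construction the directory has 1 << 1 = 2 slots, each backed by an empty,
// flushed page, and slot 0 is paged in as the current in-memory buffer.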
// Get the directory index for a specific key
private int getBucketIndex(ImmutableBytesPtr key) {
// Get key hash
int hashCode = key.hashCode();
// Mask all but globalDepth low n bits
return hashCode & ((1 << globalDepth) - 1);
}
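// Example: with globalDepth = 3 the mask is (1 << 3) - 1 = 0b111, so a key
// whose hashCode ends in ...10110 lands in directory slot 0b110 = 6.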
// Function redistributes the elements in the current index
// to two new buckets, based on the bit at localDepth + 1 position.
// Optionally this function also doubles the directory to allow
// for bucket splits
private void redistribute(int index, ImmutableBytesPtr keyNew, byte[] valueNew) {
// Get the respective bucket
FileMap byteMap = directory[index];
// Get the actual bucket index that the directory index points to
int mappedIdx = byteMap.pageIndex;
int localDepth = byteMap.localDepth;
ArrayList<Integer> buckets = Lists.newArrayList();
// Get all directory entries that point to the same bucket.
// TODO: can be made faster!
for (int i = 0; i < directory.length; i++) {
if (directory[i].pageIndex == mappedIdx) {
buckets.add(i);
}
}
// Assuming no directory doubling for now
// compute the two new bucket Ids for splitting
// SpillFile adds new files dynamically in case the directory points to pageIDs
// that exceed the size limit of a single file.
// TODO verify if some sort of de-fragmentation might be helpful
int tmpIndex = index ^ ((1 << localDepth));
int b1Index = Math.min(index, tmpIndex);
int b2Index = Math.max(index, tmpIndex);
// Create two new split buckets
FileMap b1 = new FileMap(b1Index, thresholdBytes, pageInserts, spillFile);
FileMap b2 = new FileMap(b2Index, thresholdBytes, pageInserts, spillFile);
// redistribute old elements into b1 and b2
for (Entry<ImmutableBytesPtr, byte[]> element : byteMap.pageMap.entrySet()) {
ImmutableBytesPtr key = element.getKey();
byte[] value = element.getValue();
// Only add key during redistribution if it's not in the cache
// Otherwise this is a good point to reduce the number of spilled elements
if (!cache.isKeyContained(key)) {
// Re-distribute element onto the new 2 split buckets
if ((key.hashCode() & ((1 << localDepth))) != 0) {
b2.addElement(null, key, value);
} else {
b1.addElement(null, key, value);
}
}
}
// Clear and GC the old now redistributed bucket
byteMap.pageMap.clear();
byteMap = null;
// Increase local bucket depths
b1.localDepth = localDepth + 1;
b2.localDepth = localDepth + 1;
boolean doubleDir = false;
if (globalDepth < (localDepth + 1)) {
// Double directory structure and re-adjust pointers
doubleDir = true;
b2Index = doubleDirectory(b2Index, keyNew);
}
if (!doubleDir) {
// This is a bit more tricky: we have to cover scenarios where
// globalDepth - localDepth > 1
// Here even after bucket splitting, multiple directory entries point to
// the new buckets
for (int i = 0; i < buckets.size(); i++) {
if ((buckets.get(i) & (1 << (localDepth))) != 0) {
directory[buckets.get(i)] = b2;
} else {
directory[buckets.get(i)] = b1;
}
}
} else {
// Update the directory indexes in case of directory doubling
directory[b1Index] = b1;
directory[b2Index] = b2;
}
}
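// Split example (illustrative): a bucket with localDepth = 1 reachable from
// slots 0b01 and 0b11 (globalDepth = 2) splits on bit (1 << 1) = 0b10; keys
// with that bit clear stay in b1 (slot 0b01), keys with it set move to b2
// (slot 0b11), and both new buckets carry localDepth = 2.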
// Doubles the directory and readjusts pointers.
private int doubleDirectory(int b2Index, ImmutableBytesPtr keyNew) {
// Double the directory in size; the second half points to the original first half
int newDirSize = 1 << (globalDepth + 1);
// Ensure that the new directory size does not exceed size limits
Preconditions.checkArgument(newDirSize < Integer.MAX_VALUE);
// Double it!
FileMap[] newDirectory = new FileMap[newDirSize];
for (int i = 0; i < directory.length; i++) {
newDirectory[i] = directory[i];
newDirectory[i + directory.length] = directory[i];
}
directory = newDirectory;
newDirectory = null;
// Adjust the index for new split bucket, according to the directory double
b2Index = (keyNew.hashCode() & ((1 << globalDepth) - 1)) | (1 << globalDepth);
// Increment global depth
globalDepth++;
return b2Index;
}
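// Doubling example (illustrative): growing from globalDepth = 2 to 3 copies
// directory[0..3] into newDirectory[4..7], so slots i and i + 4 alias the
// same page until a later split separates them; only the two freshly split
// slots (b1Index, b2Index) are re-pointed by the caller.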
/**
 * Get a key from the spillable data structures. The page is determined via hash partitioning, and a bloom filter
 * check is used to determine if it's worth paging in the data.
 */
@Override
public byte[] get(Object key) {
if (!(key instanceof ImmutableBytesPtr)) {
// TODO ... work on type safety
}
ImmutableBytesPtr ikey = (ImmutableBytesPtr)key;
byte[] value = null;
int bucketIndex = getBucketIndex(ikey);
FileMap byteMap = directory[bucketIndex];
// Decision based on bucket ID, not the directory ID due to the n:1 relationship
if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
// map not paged in
FileMap curByteMap = directory[curMapBufferIndex];
// Use bloomFilter to check if key was spilled before
if (byteMap.containsKey(ikey.copyBytesIfNecessary())) {
// ensure consistency and flush current memory page to disk
// fflush current buffer
curByteMap.flushBuffer();
// page in new buffer
byteMap.pageIn();
// update index
curMapBufferIndex = bucketIndex;
}
}
// get KV from current map
value = byteMap.getPagedInElement(ikey);
return value;
}
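// Note: mightContain() may yield false positives, so a page can occasionally
// be paged in for a key that was never spilled; it never yields false
// negatives, so a key spilled to this page is always found.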
// Similar to the get(Object key) function; however, it
// always pages in the page a key is spilled to, with no bloom filter decision
private byte[] getAlways(ImmutableBytesPtr key) {
byte[] value = null;
int bucketIndex = getBucketIndex(key);
FileMap byteMap = directory[bucketIndex];
if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
FileMap curByteMap = directory[curMapBufferIndex];
// ensure consistency and flush current memory page to disk
curByteMap.flushBuffer();
byteMap.pageIn();
curMapBufferIndex = bucketIndex;
}
// get KV from current queue
value = byteMap.getPagedInElement(key);
return value;
}
/**
 * Spill a key. First we discover if the key has been spilled before and load it into memory: #ref get(). If it was
 * loaded before, just replace the old value in the memory page; if it was not loaded before, try to store it in
 * the current page, or alternatively, if not enough memory is available, request a new page.
 */
@Override
public byte[] put(ImmutableBytesPtr key, byte[] value) {
boolean redistributed = false;
// page in element and replace if present
byte[] spilledValue = getAlways(key);
FileMap byteMap = directory[curMapBufferIndex];
int index = curMapBufferIndex;
// TODO: We split buckets until the new element fits onto
// one of the new buckets. Might consider the use of an overflow
// bucket, especially in case the directory runs out of page IDs.
while (!byteMap.canFit(spilledValue, value)) {
// Element does not fit... Split the bucket!
redistribute(index, key, value);
redistributed = true;
index = getBucketIndex(key);
byteMap = directory[index];
}
// Ensure that all pages that were paged in during redistribution are flushed back out
// to disk to keep memory footprint small.
if (redistributed) {
for (int i = 0; i < directory.length; i++) {
if (directory[i].pageIndex != byteMap.pageIndex) {
directory[i].flushBuffer();
}
}
// Ensure the page that receives the new key is in memory
spilledValue = getAlways(key);
}
byteMap.addElement(spilledValue, key, value);
return value;
}
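// Usage sketch (hypothetical caller; in practice SpillableGroupByCache wires
// this up with its own page size and value-size estimates):
//   SpillMap map = new SpillMap(spillFile, pageSizeBytes, estValueSize, cache);
//   map.put(keyPtr, serializedAggs); // spills, splitting buckets as needed
//   byte[] v = map.get(keyPtr);      // pages the owning bucket back in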
/**
* Function returns the current spill file
*/
public SpillFile getSpillFile() {
return spillFile;
}
/**
 * This inner class represents the currently mapped file region. It uses a Map to represent the current in-memory
 * page for easy get() and update() calls on an individual key. The class keeps track of the current size of the
 * in-memory page and handles flushing and paging in respectively.
 */
private static class FileMap {
private final SpillFile spillFile;
private final int pageIndex;
private final int thresholdBytes;
private long totalResultSize;
private boolean pagedIn;
private int localDepth;
// dirtyPage flag tracks if a paged in page was modified
// if not, no need to flush it back out to disk
private boolean dirtyPage;
// Use a map for in memory page representation
Map<ImmutableBytesPtr, byte[]> pageMap = Maps.newHashMap();
// Used to determine if an element was written to this page before or not
BloomFilter<byte[]> bFilter;
public FileMap(int id, int thresholdBytes, int pageInserts, SpillFile spillFile) {
this.spillFile = spillFile;
// size threshold of a page
this.thresholdBytes = thresholdBytes;
this.pageIndex = id;
pageMap.clear();
bFilter = BloomFilter.create(Funnels.byteArrayFunnel(), pageInserts);
pagedIn = true;
totalResultSize = 0;
localDepth = 1;
dirtyPage = true;
}
private boolean containsKey(byte[] key) {
return bFilter.mightContain(key);
}
private boolean canFit(byte[] curValue, byte[] newValue) {
if (thresholdBytes < newValue.length) {
// TODO resize page size if single element is too big,
// Can this ever happen?
throw new RuntimeException("page size too small to store a single KV element");
}
int resultSize = newValue.length + Bytes.SIZEOF_INT;
if (curValue != null) {
// Key existed before
// Ensure to compensate for potential larger byte[] for agg
resultSize = Math.max(0, resultSize - (curValue.length + Bytes.SIZEOF_INT));
}
if ((thresholdBytes - totalResultSize) <= (resultSize)) {
// KV does not fit
return false;
}
// KV fits
return true;
}
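// Sizing example (illustrative): with thresholdBytes = 4096 and
// totalResultSize = 4020, a new 80-byte value needs 80 + 4 length-prefix
// bytes, and 4096 - 4020 = 76 <= 84 means it does not fit; replacing an
// existing 80-byte value with a 90-byte one only accounts for the 10-byte delta.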
// Flush the current page out to its backing page in the spill file
private void flushBuffer() {
if (pagedIn) {
// Only flush if page was changed
if (dirtyPage) {
Collection<byte[]> values = pageMap.values();
RandomAccessFile file = spillFile.getPage(pageIndex);
// number of elements
try {
file.writeInt(values.size());
int written = Bytes.SIZEOF_INT;
for (byte[] value : values) {
written += Bytes.SIZEOF_INT + value.length;
// safety check
if (written > SpillFile.DEFAULT_PAGE_SIZE) {
throw new BufferOverflowException();
}
// element length
file.writeInt(value.length);
// element
file.write(value, 0, value.length);
}
} catch (IOException ioe) {
// Error during key access on spilled resource
// TODO rework error handling
throw new RuntimeException(ioe);
}
}
// Reset page stats
pageMap.clear();
totalResultSize = 0;
}
pagedIn = false;
dirtyPage = false;
}
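// On-disk page layout produced by flushBuffer() and consumed by pageIn():
//   [int numElements][int len_0][byte[len_0]][int len_1][byte[len_1]] ...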
// load a page into a map for fast element access
private void pageIn() {
if (!pagedIn) {
RandomAccessFile file = spillFile.getPage(pageIndex);
try {
int numElements = file.readInt();
for (int i = 0; i < numElements; i++) {
int kvSize = file.readInt();
byte[] data = new byte[kvSize];
file.readFully(data);
pageMap.put(SpillManager.getKey(data), data);
totalResultSize += (data.length + Bytes.SIZEOF_INT);
}
} catch (IOException ioe) {
// Error during key access on spilled resource
// TODO rework error handling
throw new RuntimeException(ioe);
}
pagedIn = true;
dirtyPage = false;
}
}
/**
 * Return a cache element currently paged into memory. Direct access via the mapped page map.
 *
 * @param key the key to look up
 * @return the stored element, or null if the key is not in the currently loaded page
 */
public byte[] getPagedInElement(ImmutableBytesPtr key) {
return pageMap.get(key);
}
/**
 * Inserts / replaces a cache element in the currently loaded page. Direct access via the mapped page map.
 *
 * @param spilledValue the previously spilled value for this key, or null if the key is new to this page
 * @param key
 * @param value
 */
public void addElement(byte[] spilledValue, ImmutableBytesPtr key, byte[] value) {
// put Element into map
pageMap.put(key, value);
// Update bloom filter
bFilter.put(key.copyBytesIfNecessary());
// track current Map size to prevent Buffer overflows
if (spilledValue != null) {
// if previous key was present, just add the size difference
totalResultSize += Math.max(0, value.length - (spilledValue.length));
} else {
// Add new size information
totalResultSize += (value.length + Bytes.SIZEOF_INT);
}
dirtyPage = true;
}
/**
* Returns a value iterator over the pageMap
*/
public Iterator<byte[]> getPageMapEntries() {
pageIn();
return pageMap.values().iterator();
}
}
/**
* Iterate over all spilled elements, including the ones that are currently paged into memory
*/
@Override
public Iterator<byte[]> iterator() {
directory[curMapBufferIndex].flushBuffer();
return new Iterator<byte[]>() {
int pageIndex = 0;
Iterator<byte[]> entriesIter = directory[pageIndex].getPageMapEntries();
HashSet<Integer> dups = new HashSet<Integer>();
@Override
public boolean hasNext() {
if (!entriesIter.hasNext()) {
boolean found = false;
// Clear in memory map
while (!found) {
pageIndex++;
if (pageIndex >= directory.length) { return false; }
directory[pageIndex - 1].pageMap.clear();
// get keys from all spilled pages
if (!dups.contains(directory[pageIndex].pageIndex)) {
dups.add(directory[pageIndex].pageIndex);
entriesIter = directory[pageIndex].getPageMapEntries();
if (entriesIter.hasNext()) {
found = true;
}
}
}
}
dups.add(directory[pageIndex].pageIndex);
return true;
}
@Override
public byte[] next() {
// get elements from in memory map first
return entriesIter.next();
}
@Override
public void remove() {
throw new IllegalAccessError("Iterator does not support removal operation");
}
};
}
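// Note: several directory slots may alias the same physical page (n:1), so the
// dups set above keys on pageIndex to visit each page's elements only once.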
// TODO implement this method to make the SpillMap a true Map implementation
@Override
public Set<Map.Entry<ImmutableBytesPtr, byte[]>> entrySet() {
throw new IllegalAccessError("entrySet is not supported for this type of cache");
}
}