
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.cache.aggcache;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.BufferOverflowException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
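// Illustrative sketch of the extendible hashing scheme used below (a walkthrough
// of the code in this file, not an additional contract): with globalDepth = 2 the
// directory has 1 << 2 = 4 slots and a key is routed via hashCode & 0b11. Several
// slots may alias one physical page (n:1). When a full bucket has
// localDepth == globalDepth, the directory doubles before the bucket splits;
// buckets with localDepth < globalDepth split in place and only the aliasing
// slots are re-pointed.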
/**
 * Class implements an active spilled partition. Serialized tuples are first written into an in-memory data
 * structure that represents a single page. As the page fills up, it is written to the current spill file or spill
 * partition. For fast tuple discovery, the class maintains a per-page bloom filter and never de-serializes
 * elements. The element spilling employs an extendible hashing technique.
 */
public class SpillMap extends AbstractMap<ImmutableBytesPtr, byte[]> implements Iterable<byte[]> {
// Threshold is typically the page size
private final int thresholdBytes;
private final int pageInserts;
// Global directory depth
private int globalDepth;
private int curMapBufferIndex;
private SpillFile spillFile;
// Directory of hash buckets --> extendible hashing implementation
private FileMap[] directory;
private final SpillableGroupByCache.QueryCache cache;
public SpillMap(SpillFile file, int thresholdBytes, int estValueSize, SpillableGroupByCache.QueryCache cache)
throws IOException {
this.thresholdBytes = thresholdBytes - Bytes.SIZEOF_INT;
this.pageInserts = thresholdBytes / estValueSize;
this.spillFile = file;
this.cache = cache;
// Init the e-hashing directory structure
globalDepth = 1;
directory = new FileMap[(1 << globalDepth)];
for (int i = 0; i < directory.length; i++) {
// Create an empty bucket list
directory[i] = new FileMap(i, this.thresholdBytes, pageInserts, file);
directory[i].flushBuffer();
}
directory[0].pageIn();
curMapBufferIndex = 0;
}
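// After construction the directory has 1 << 1 = 2 slots, each backed by an empty,
// flushed page, and slot 0 is paged in as the current in-memory buffer.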
// Get the directory index for a specific key
private int getBucketIndex(ImmutableBytesPtr key) {
// Get key hash
int hashCode = key.hashCode();
// Mask all but globalDepth low n bits
return hashCode & ((1 << globalDepth) - 1);
}
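// Example: with globalDepth = 3 the mask is (1 << 3) - 1 = 0b111, so a key
// whose hashCode ends in ...10110 lands in directory slot 0b110 = 6.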
// Function redistributes the elements in the current index
// to two new buckets, based on the bit at localDepth + 1 position.
// Optionally this function also doubles the directory to allow
// for bucket splits
private void redistribute(int index, ImmutableBytesPtr keyNew, byte[] valueNew) {
// Get the respective bucket
FileMap byteMap = directory[index];
// Get the actual bucket index that the directory index points to
int mappedIdx = byteMap.pageIndex;
int localDepth = byteMap.localDepth;
ArrayList<Integer> buckets = Lists.newArrayList();
// Get all directory entries that point to the same bucket.
// TODO: can be made faster!
for (int i = 0; i < directory.length; i++) {
if (directory[i].pageIndex == mappedIdx) {
buckets.add(i);
}
}
// Assuming no directory doubling for now
// compute the two new bucket Ids for splitting
// SpillFile adds new files dynamically in case the directory points to pageIDs
// that exceed the size limit of a single file.
// TODO verify if some sort of de-fragmentation might be helpful
int tmpIndex = index ^ ((1 << localDepth));
int b1Index = Math.min(index, tmpIndex);
int b2Index = Math.max(index, tmpIndex);
// Create two new split buckets
FileMap b1 = new FileMap(b1Index, thresholdBytes, pageInserts, spillFile);
FileMap b2 = new FileMap(b2Index, thresholdBytes, pageInserts, spillFile);
// redistribute old elements into b1 and b2
for (Entry<ImmutableBytesPtr, byte[]> element : byteMap.pageMap.entrySet()) {
ImmutableBytesPtr key = element.getKey();
byte[] value = element.getValue();
// Only add key during redistribution if it's not in the cache
// Otherwise this is a good point to reduce the number of spilled elements
if (!cache.isKeyContained(key)) {
// Re-distribute element onto the new 2 split buckets
if ((key.hashCode() & ((1 << localDepth))) != 0) {
b2.addElement(null, key, value);
} else {
b1.addElement(null, key, value);
}
}
}
// Clear and GC the old now redistributed bucket
byteMap.pageMap.clear();
byteMap = null;
// Increase local bucket depths
b1.localDepth = localDepth + 1;
b2.localDepth = localDepth + 1;
boolean doubleDir = false;
if (globalDepth < (localDepth + 1)) {
// Double directory structure and re-adjust pointers
doubleDir = true;
b2Index = doubleDirectory(b2Index, keyNew);
}
if (!doubleDir) {
// This is a bit more tricky: we have to cover scenarios where
// globalDepth - localDepth > 1
// Here even after bucket splitting, multiple directory entries point to
// the new buckets
for (int i = 0; i < buckets.size(); i++) {
if ((buckets.get(i) & (1 << (localDepth))) != 0) {
directory[buckets.get(i)] = b2;
} else {
directory[buckets.get(i)] = b1;
}
}
} else {
// Update the directory indexes in case of directory doubling
directory[b1Index] = b1;
directory[b2Index] = b2;
}
}
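// Split example (illustrative): a bucket with localDepth = 1 reachable from
// slots 0b01 and 0b11 (globalDepth = 2) splits on bit (1 << 1) = 0b10; keys
// with that bit clear stay in b1 (slot 0b01), keys with it set move to b2
// (slot 0b11), and both new buckets carry localDepth = 2.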
// Doubles the directory and readjusts pointers.
private int doubleDirectory(int b2Index, ImmutableBytesPtr keyNew) {
// Double the directory in size; the second half points to the original first half
int newDirSize = 1 << (globalDepth + 1);
// Ensure that the new directory size does not exceed size limits
Preconditions.checkArgument(newDirSize < Integer.MAX_VALUE);
// Double it!
FileMap[] newDirectory = new FileMap[newDirSize];
for (int i = 0; i < directory.length; i++) {
newDirectory[i] = directory[i];
newDirectory[i + directory.length] = directory[i];
}
directory = newDirectory;
newDirectory = null;
// Adjust the index for new split bucket, according to the directory double
b2Index = (keyNew.hashCode() & ((1 << globalDepth) - 1)) | (1 << globalDepth);
// Increment global depth
globalDepth++;
return b2Index;
}
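// Doubling example (illustrative): growing from globalDepth = 2 to 3 copies
// directory[0..3] into newDirectory[4..7], so slots i and i + 4 alias the
// same page until a later split separates them; only the two freshly split
// slots (b1Index, b2Index) are re-pointed by the caller.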
/**
 * Get a key from the spillable data structures. The page is determined via hash partitioning, and a bloom filter
 * check is used to determine if it's worth paging in the data.
 */
@Override
public byte[] get(Object key) {
if (!(key instanceof ImmutableBytesPtr)) {
// TODO ... work on type safety
}
ImmutableBytesPtr ikey = (ImmutableBytesPtr)key;
byte[] value = null;
int bucketIndex = getBucketIndex(ikey);
FileMap byteMap = directory[bucketIndex];
// Decision based on bucket ID, not the directory ID due to the n:1 relationship
if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
// map not paged in
FileMap curByteMap = directory[curMapBufferIndex];
// Use bloomFilter to check if key was spilled before
if (byteMap.containsKey(ikey.copyBytesIfNecessary())) {
// ensure consistency and flush current memory page to disk
// fflush current buffer
curByteMap.flushBuffer();
// page in new buffer
byteMap.pageIn();
// update index
curMapBufferIndex = bucketIndex;
}
}
// get KV from current map
value = byteMap.getPagedInElement(ikey);
return value;
}
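// Note: mightContain() may yield false positives, so a page can occasionally
// be paged in for a key that was never spilled; it never yields false
// negatives, so a key spilled to this page is always found.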
// Similar to the get(Object key) function; however, it
// always pages in the page a key is spilled to, with no bloom filter decision
private byte[] getAlways(ImmutableBytesPtr key) {
byte[] value = null;
int bucketIndex = getBucketIndex(key);
FileMap byteMap = directory[bucketIndex];
if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
FileMap curByteMap = directory[curMapBufferIndex];
// ensure consistency and flush current memory page to disk
curByteMap.flushBuffer();
byteMap.pageIn();
curMapBufferIndex = bucketIndex;
}
// get KV from current queue
value = byteMap.getPagedInElement(key);
return value;
}
/**
 * Spill a key. First we discover if the key has been spilled before and load it into memory: #ref get(). If it was
 * loaded before, just replace the old value in the memory page; if it was not loaded before, try to store it in
 * the current page, or alternatively, if not enough memory is available, request a new page.
 */
@Override
public byte[] put(ImmutableBytesPtr key, byte[] value) {
boolean redistributed = false;
// page in element and replace if present
byte[] spilledValue = getAlways(key);
FileMap byteMap = directory[curMapBufferIndex];
int index = curMapBufferIndex;
// TODO: We split buckets until the new element fits onto
// one of the new buckets. Might consider the use of an overflow
// bucket, especially in case the directory runs out of page IDs.
while (!byteMap.canFit(spilledValue, value)) {
// Element does not fit... Split the bucket!
redistribute(index, key, value);
redistributed = true;
index = getBucketIndex(key);
byteMap = directory[index];
}
// Ensure that all pages that were paged in during redistribution are flushed back out
// to disk to keep memory footprint small.
if (redistributed) {
for (int i = 0; i < directory.length; i++) {
if (directory[i].pageIndex != byteMap.pageIndex) {
directory[i].flushBuffer();
}
}
// Ensure the page that receives the new key is in memory
spilledValue = getAlways(key);
}
byteMap.addElement(spilledValue, key, value);
return value;
}
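// Usage sketch (hypothetical caller; in practice SpillableGroupByCache wires
// this up with its own page size and value-size estimates):
//   SpillMap map = new SpillMap(spillFile, pageSizeBytes, estValueSize, cache);
//   map.put(keyPtr, serializedAggs); // spills, splitting buckets as needed
//   byte[] v = map.get(keyPtr);      // pages the owning bucket back in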
/**
* Function returns the current spill file
*/
public SpillFile getSpillFile() {
return spillFile;
}
/**
 * This inner class represents the currently mapped file region. It uses a Map to represent the current in-memory
 * page for easy get() and update() calls on an individual key. The class keeps track of the current size of the
 * in-memory page and handles flushing and paging in respectively.
 */
private static class FileMap {
private final SpillFile spillFile;
private final int pageIndex;
private final int thresholdBytes;
private long totalResultSize;
private boolean pagedIn;
private int localDepth;
// dirtyPage flag tracks if a paged in page was modified
// if not, no need to flush it back out to disk
private boolean dirtyPage;
// Use a map for in memory page representation
Map<ImmutableBytesPtr, byte[]> pageMap = Maps.newHashMap();
// Used to determine if an element was written to this page before or not
BloomFilter<byte[]> bFilter;
public FileMap(int id, int thresholdBytes, int pageInserts, SpillFile spillFile) {
this.spillFile = spillFile;
// size threshold of a page
this.thresholdBytes = thresholdBytes;
this.pageIndex = id;
pageMap.clear();
bFilter = BloomFilter.create(Funnels.byteArrayFunnel(), pageInserts);
pagedIn = true;
totalResultSize = 0;
localDepth = 1;
dirtyPage = true;
}
private boolean containsKey(byte[] key) {
return bFilter.mightContain(key);
}
private boolean canFit(byte[] curValue, byte[] newValue) {
if (thresholdBytes < newValue.length) {
// TODO resize page size if single element is too big,
// Can this ever happen?
throw new RuntimeException("page size too small to store a single KV element");
}
int resultSize = newValue.length + Bytes.SIZEOF_INT;
if (curValue != null) {
// Key existed before
// Ensure to compensate for potential larger byte[] for agg
resultSize = Math.max(0, resultSize - (curValue.length + Bytes.SIZEOF_INT));
}
if ((thresholdBytes - totalResultSize) <= (resultSize)) {
// KV does not fit
return false;
}
// KV fits
return true;
}
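// Sizing example (illustrative): with thresholdBytes = 4096 and
// totalResultSize = 4020, a new 80-byte value needs 80 + 4 length-prefix
// bytes, and 4096 - 4020 = 76 <= 84 means it does not fit; replacing an
// existing 80-byte value with a 90-byte one only accounts for the 10-byte delta.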
// Flush the current page out to its backing page in the spill file
private void flushBuffer() {
if (pagedIn) {
// Only flush if page was changed
if (dirtyPage) {
Collection<byte[]> values = pageMap.values();
RandomAccessFile file = spillFile.getPage(pageIndex);
// number of elements
try {
file.writeInt(values.size());
int written = Bytes.SIZEOF_INT;
for (byte[] value : values) {
written += Bytes.SIZEOF_INT + value.length;
// safety check
if (written > SpillFile.DEFAULT_PAGE_SIZE) {
throw new BufferOverflowException();
}
// element length
file.writeInt(value.length);
// element
file.write(value, 0, value.length);
}
} catch (IOException ioe) {
// Error during key access on spilled resource
// TODO rework error handling
throw new RuntimeException(ioe);
}
}
// Reset page stats
pageMap.clear();
totalResultSize = 0;
}
pagedIn = false;
dirtyPage = false;
}
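// On-disk page layout produced by flushBuffer() and consumed by pageIn():
//   [int numElements][int len_0][byte[len_0]][int len_1][byte[len_1]] ...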
// load a page into a map for fast element access
private void pageIn() {
if (!pagedIn) {
RandomAccessFile file = spillFile.getPage(pageIndex);
try {
int numElements = file.readInt();
for (int i = 0; i < numElements; i++) {
int kvSize = file.readInt();
byte[] data = new byte[kvSize];
file.readFully(data);
pageMap.put(SpillManager.getKey(data), data);
totalResultSize += (data.length + Bytes.SIZEOF_INT);
}
} catch (IOException ioe) {
// Error during key access on spilled resource
// TODO rework error handling
throw new RuntimeException(ioe);
}
pagedIn = true;
dirtyPage = false;
}
}
/**
 * Return a cache element currently paged into memory. Direct access via the mapped page map.
 *
 * @param key the key to look up
 * @return the stored element, or null if the key is not in the currently loaded page
 */
public byte[] getPagedInElement(ImmutableBytesPtr key) {
return pageMap.get(key);
}
/**
 * Inserts / replaces a cache element in the currently loaded page. Direct access via the mapped page map.
 *
 * @param spilledValue the previously spilled value for this key, or null if the key is new to this page
 * @param key
 * @param value
 */
public void addElement(byte[] spilledValue, ImmutableBytesPtr key, byte[] value) {
// put Element into map
pageMap.put(key, value);
// Update bloom filter
bFilter.put(key.copyBytesIfNecessary());
// track current Map size to prevent Buffer overflows
if (spilledValue != null) {
// if previous key was present, just add the size difference
totalResultSize += Math.max(0, value.length - (spilledValue.length));
} else {
// Add new size information
totalResultSize += (value.length + Bytes.SIZEOF_INT);
}
dirtyPage = true;
}
/**
* Returns a value iterator over the pageMap
*/
public Iterator<byte[]> getPageMapEntries() {
pageIn();
return pageMap.values().iterator();
}
}
/**
* Iterate over all spilled elements, including the ones that are currently paged into memory
*/
@Override
public Iterator<byte[]> iterator() {
directory[curMapBufferIndex].flushBuffer();
return new Iterator<byte[]>() {
int pageIndex = 0;
Iterator<byte[]> entriesIter = directory[pageIndex].getPageMapEntries();
HashSet<Integer> dups = new HashSet<Integer>();
@Override
public boolean hasNext() {
if (!entriesIter.hasNext()) {
boolean found = false;
// Clear in memory map
while (!found) {
pageIndex++;
if (pageIndex >= directory.length) { return false; }
directory[pageIndex - 1].pageMap.clear();
// get keys from all spilled pages
if (!dups.contains(directory[pageIndex].pageIndex)) {
dups.add(directory[pageIndex].pageIndex);
entriesIter = directory[pageIndex].getPageMapEntries();
if (entriesIter.hasNext()) {
found = true;
}
}
}
}
dups.add(directory[pageIndex].pageIndex);
return true;
}
@Override
public byte[] next() {
// get elements from in memory map first
return entriesIter.next();
}
@Override
public void remove() {
throw new IllegalAccessError("Iterator does not support removal operation");
}
};
}
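// Note: several directory slots may alias the same physical page (n:1), so the
// dups set above keys on pageIndex to visit each page's elements only once.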
// TODO implement this method to make the SpillMap a true Map implementation
@Override
public Set<Map.Entry<ImmutableBytesPtr, byte[]>> entrySet() {
throw new IllegalAccessError("entrySet is not supported for this type of cache");
}
}