/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.cache.aggcache;
import static org.apache.phoenix.query.QueryConstants.AGG_TIMESTAMP;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN;
import static org.apache.phoenix.query.QueryConstants.SINGLE_COLUMN_FAMILY;
import static org.apache.phoenix.query.QueryServices.GROUPBY_MAX_CACHE_SIZE_ATTRIB;
import static org.apache.phoenix.query.QueryServices.GROUPBY_SPILL_FILES_ATTRIB;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_MAX_CACHE_MAX;
import static org.apache.phoenix.query.QueryServicesOptions.DEFAULT_GROUPBY_SPILL_FILES;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.cache.GlobalCache;
import org.apache.phoenix.cache.TenantCache;
import org.apache.phoenix.cache.aggcache.SpillManager.CacheEntry;
import org.apache.phoenix.coprocessor.BaseRegionScanner;
import org.apache.phoenix.coprocessor.GroupByCache;
import org.apache.phoenix.coprocessor.GroupedAggregateRegionObserver;
import org.apache.phoenix.expression.aggregator.Aggregator;
import org.apache.phoenix.expression.aggregator.ServerAggregators;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.memory.InsufficientMemoryException;
import org.apache.phoenix.memory.MemoryManager.MemoryChunk;
import org.apache.phoenix.util.Closeables;
import org.apache.phoenix.util.KeyValueUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The main entry point is in GroupedAggregateRegionObserver. It instantiates a SpillableGroupByCache and invokes the
 * cache() method on it. There is no explicit "if key does not exist -> put into map" step, since the cache behaves
 * like a loading cache and handles the put under the covers. The final cache element accesses (the RegionScanner
 * below) are implemented in a streaming fashion, i.e. there is just an iterator over the cache and no up-front
 * materialization of the result.
 *
 * SpillableGroupByCache implements an LRU cache using a LinkedHashMap in access order. A configurable upper size
 * limit and a fixed lower size limit, both in bytes, are used to compute the initial cache size in number of
 * elements: Max(lowerBoundElements, Min(upperBoundElements, estimatedCacheSize)). Once the number of cached elements
 * exceeds this number, the cache size is increased by a factor of 1.5. This continues until the additional memory
 * required to grow the cache can no longer be requested, at which point the cache starts spilling elements. As long
 * as no eviction happens, no spillable data structures are allocated; they are created only when the first element is
 * evicted from the cache. We cannot really make any assumptions about which keys arrive at the map, but the LRU
 * policy should at least cover the cases where some keys have a slight skew and keep those memory resident.
 *
 * Once a key is evicted, the SpillManager is instantiated. It takes care of spilling an element to disk and handles
 * all the SERDE work. It pre-allocates a configurable number of SpillFiles (spill partitions), which are
 * memory-mapped temp files. The SpillManager keeps a list of these and hash-distributes the keys over the list. Once
 * an element is spilled, it is serialized and only deserialized again when it is requested by the client, i.e. loaded
 * back into the LRU cache. The SpillManager holds a single SpillMap object in memory for every spill partition
 * (SpillFile). A SpillMap is an in-memory map representation of a single page of spilled, serialized key/value pairs.
 * To achieve fast key lookups, keys are hash-partitioned into pages of the current spill file. The code implements an
 * extendible hashing approach that dynamically adjusts the hash function in order to adapt to a growing number of
 * storage pages and to avoid long chains of overflow buckets. For an excellent discussion of the algorithm please
 * refer to the following online resource:
 * http://db.inf.uni-tuebingen.de/files/teaching/ws1011/db2/db2-hash-indexes.pdf
 *
 * Each SpillFile keeps a directory of pointers to up to Integer.MAX_VALUE 4K pages in memory, which allows a
 * directory to address more pages than a single memory-mapped temp file could theoretically store. In case directory
 * doubling requests a page index that exceeds the limits of the initial temp file, the implementation dynamically
 * allocates additional temp files to the SpillFile. The directory starts with a global depth of 1 and therefore a
 * directory size of 2 buckets. Only during a bucket split and directory doubling is more than one page temporarily
 * kept in memory, until all elements have been redistributed. The current implementation conducts bucket splits as
 * long as an element does not fit onto a page; no overflow chain is created, which might be an alternative. For get
 * requests, each directory entry maintains a bloom filter to prevent page-in operations in case an element has never
 * been spilled before. Deserialization is only triggered when a key is loaded back into the LRU cache. The
 * aggregators are returned from the LRU cache and the next value is computed. In case the key is not found on any
 * page, the loader creates new aggregators for it.
 */
public class SpillableGroupByCache implements GroupByCache {
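/*
 * Hedged usage sketch: the class comment above describes how GroupedAggregateRegionObserver drives this cache. The
 * snippet below only illustrates that lifecycle; it is not a copy of the observer code, and the variable names
 * (rowKey, rowTuple, estimatedDistinctRows, innerScanner) are hypothetical.
 *
 *   SpillableGroupByCache groupByCache =
 *       new SpillableGroupByCache(env, tenantId, aggregators, estimatedDistinctRows);
 *   boolean success = false;
 *   try {
 *       // per input row: look up (or implicitly create) the per-group aggregators and fold the row in
 *       Aggregator[] rowAggregators = groupByCache.cache(rowKey);
 *       aggregators.aggregate(rowAggregators, rowTuple);
 *       // after all rows are consumed, expose the aggregated groups as a RegionScanner
 *       RegionScanner aggScanner = groupByCache.getScanner(innerScanner);
 *       success = true;
 *   } finally {
 *       if (!success) {
 *           Closeables.closeQuietly(groupByCache); // on the happy path, closing the returned scanner closes the cache
 *       }
 *   }
 */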
private static final Logger LOGGER = LoggerFactory.getLogger(SpillableGroupByCache.class);
// Min size of 1st level main memory cache in bytes --> lower bound
private static final int SPGBY_CACHE_MIN_SIZE = 4096; // 4K
// TODO Generally better to use Collection API with generics instead of
// array types
private final LinkedHashMap<ImmutableBytesWritable, Aggregator[]> cache;
private SpillManager spillManager = null;
private long totalNumElements;
private final ServerAggregators aggregators;
private final RegionCoprocessorEnvironment env;
private final MemoryChunk chunk;
/*
 * Inner class that exposes a limited, queryable view of the cache to other classes (such as the SpillManager) that
 * should not get access to the full cache instance.
 */
public class QueryCache {
public boolean isKeyContained(ImmutableBytesPtr key) {
return cache.containsKey(key);
}
}
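/*
 * Hedged sketch of how this view is meant to be consulted. The QueryCache instance is handed to the SpillManager in
 * the constructor below; the call shown here is illustrative only and does not reproduce the actual SpillManager /
 * SpillMap code, which lives outside this file.
 *
 *   QueryCache cacheView = ...;           // obtained from SpillableGroupByCache
 *   if (cacheView.isKeyContained(key)) {
 *       // the key is memory resident in the LRU cache, so no spill lookup / page-in is required
 *   } else {
 *       // fall back to the spill files (bloom filter check, then page-in if needed)
 *   }
 */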
/**
 * Instantiates a loading LRU cache that stores key / Aggregator[] tuples used for group by queries.
 *
 * @param env the region coprocessor environment
 * @param tenantId the tenant id used to look up the tenant cache and its memory manager
 * @param aggs the server aggregators used to size and create cache entries
 * @param estSizeNum the estimated number of distinct elements to cache
 */
public SpillableGroupByCache(final RegionCoprocessorEnvironment env, ImmutableBytesPtr tenantId,
ServerAggregators aggs, final int estSizeNum) {
totalNumElements = 0;
this.aggregators = aggs;
this.env = env;
final int estValueSize = aggregators.getEstimatedByteSize();
final TenantCache tenantCache = GlobalCache.getTenantCache(env, tenantId);
// Compute the initial map size from configuration
final Configuration conf = env.getConfiguration();
final long maxCacheSizeConf = conf.getLong(GROUPBY_MAX_CACHE_SIZE_ATTRIB, DEFAULT_GROUPBY_MAX_CACHE_MAX);
final int numSpillFilesConf = conf.getInt(GROUPBY_SPILL_FILES_ATTRIB, DEFAULT_GROUPBY_SPILL_FILES);
final int maxSizeNum = (int)(maxCacheSizeConf / estValueSize);
final int minSizeNum = (SPGBY_CACHE_MIN_SIZE / estValueSize);
// use upper and lower bounds for the cache size
final int maxCacheSize = Math.max(minSizeNum, Math.min(maxSizeNum, estSizeNum));
final long estSize = GroupedAggregateRegionObserver.sizeOfUnorderedGroupByMap(maxCacheSize, estValueSize);
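/*
 * Worked example of the sizing formula above, using hypothetical values for estValueSize and maxCacheSizeConf (they
 * are not the shipped defaults; SPGBY_CACHE_MIN_SIZE is the constant 4096 defined above): with estValueSize = 100
 * bytes and maxCacheSizeConf = 1,000,000 bytes, maxSizeNum = 10,000 elements and minSizeNum = 40 elements. An
 * estSizeNum of 500 then yields maxCacheSize = Math.max(40, Math.min(10000, 500)) = 500, an estSizeNum of 5 is
 * clamped up to 40, and an estSizeNum of 50,000 is clamped down to 10,000.
 */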
try {
this.chunk = tenantCache.getMemoryManager().allocate(estSize);
} catch (InsufficientMemoryException ime) {
LOGGER.error("Requested Map size exceeds memory limit, " +
"please decrease max size via config paramter: "
+ GROUPBY_MAX_CACHE_SIZE_ATTRIB);
throw ime;
}
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Instantiating LRU groupby cache of element size: " + maxCacheSize);
}
// LRU cache implemented as LinkedHashMap with access order
cache = new LinkedHashMap<ImmutableBytesWritable, Aggregator[]>(maxCacheSize, 0.75f, true) {
boolean spill = false;
int cacheSize = maxCacheSize;
@Override
protected boolean removeEldestEntry(Map.Entry<ImmutableBytesWritable, Aggregator[]> eldest) {
if (!spill && size() > cacheSize) { // increase allocation
cacheSize *= 1.5f;
long estSize = GroupedAggregateRegionObserver.sizeOfUnorderedGroupByMap(cacheSize, estValueSize);
try {
chunk.resize(estSize);
} catch (InsufficientMemoryException im) {
// Cannot extend Map anymore, start spilling
spill = true;
}
}
if (spill) {
try {
if (spillManager == null) {
// Lazy instantiation of spillable data
// structures
//
// Only create spill data structs if LRU
// cache is too small
spillManager = new SpillManager(numSpillFilesConf, aggregators, env.getConfiguration(),
new QueryCache());
}
spillManager.spill(eldest.getKey(), eldest.getValue());
} catch (IOException ioe) {
// Ensure that we always close and delete the temp files
try {
throw new RuntimeException(ioe);
} finally {
Closeables.closeQuietly(SpillableGroupByCache.this);
}
}
return true;
}
return false;
}
};
}
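/*
 * Background sketch of the extendible-hashing addressing described in the class comment. This shows the textbook
 * scheme only and is not a copy of the SpillMap implementation (which lives outside this file): with a global depth
 * d the directory has 2^d slots and a key is routed via the low-order bits of its hash; when a bucket overflows and
 * its local depth equals the global depth, the directory is doubled and only the split bucket's entries are
 * redistributed.
 *
 *   int globalDepth = 1;                            // the directory starts with 2 buckets, as noted above
 *   int slot = keyHash & ((1 << globalDepth) - 1);  // directory slot for a key hash
 *   // on overflow of a bucket whose local depth == globalDepth:
 *   //   globalDepth++;                             // directory size doubles
 *   //   re-point the new slots and redistribute only the entries of the bucket that split
 */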
/**
* Size function returns the current number of cached elements
*/
@Override
public long size() {
return totalNumElements;
}
/**
 * Extracts an element from the cache. If the element is not present in the in-memory cache or in the spill files,
 * the cache performs an implicit put() of a new key/value tuple and loads it into the cache.
 */
@Override
public Aggregator[] cache(ImmutableBytesPtr cacheKey) {
ImmutableBytesPtr key = new ImmutableBytesPtr(cacheKey);
Aggregator[] rowAggregators = cache.get(key);
if (rowAggregators == null) {
// If Aggregators not found for this distinct
// value, clone our original one (we need one
// per distinct value)
if (spillManager != null) {
// Spill manager present, check if key has been
// spilled before
try {
rowAggregators = spillManager.loadEntry(key);
} catch (IOException ioe) {
// Ensure that we always close and delete the temp files
try {
throw new RuntimeException(ioe);
} finally {
Closeables.closeQuietly(SpillableGroupByCache.this);
}
}
}
if (rowAggregators == null) {
// No, key never spilled before, create a new tuple
rowAggregators = aggregators.newAggregators(env.getConfiguration());
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Adding new aggregate bucket for row key "
+ Bytes.toStringBinary(key.get(), key.getOffset(), key.getLength()));
}
}
if (cache.put(key, rowAggregators) == null) {
totalNumElements++;
}
}
return rowAggregators;
}
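/*
 * Hedged usage note: as described in the method comment above, cache() behaves like a loading cache. The snippet
 * below is illustrative only; rowKey and tuple are hypothetical names, and the aggregate() call refers to the
 * aggregate(Aggregator[], Tuple) method that ServerAggregators inherits.
 *
 *   Aggregator[] first = groupByCache.cache(rowKey);   // key absent -> new aggregators created and put implicitly
 *   Aggregator[] second = groupByCache.cache(rowKey);  // same key -> existing aggregators returned (reloaded from
 *                                                      // the spill files if the key was evicted in between)
 *   aggregators.aggregate(first, tuple);               // fold the current row into the group's aggregators
 */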
/**
 * Iterator over the cache and the spilled data structures, returning CacheEntries. CacheEntries are either
 * extracted from the LRU cache or from the spillable data structures. The key/value tuples are returned in
 * non-deterministic order.
 */
private final class EntryIterator implements Iterator<Map.Entry<ImmutableBytesWritable, Aggregator[]>> {
final Iterator<Map.Entry<ImmutableBytesWritable, Aggregator[]>> cacheIter;
final Iterator<byte[]> spilledCacheIter;
private EntryIterator() {
cacheIter = cache.entrySet().iterator();
if (spillManager != null) {
spilledCacheIter = spillManager.newDataIterator();
} else {
spilledCacheIter = null;
}
}
@Override
public boolean hasNext() {
return cacheIter.hasNext();
}
@Override
public Map.Entry<ImmutableBytesWritable, Aggregator[]> next() {
if (spilledCacheIter != null && spilledCacheIter.hasNext()) {
try {
byte[] value = spilledCacheIter.next();
// Deserialize into a CacheEntry
Map.Entry<ImmutableBytesWritable, Aggregator[]> spilledEntry = spillManager.toCacheEntry(value);
boolean notFound = false;
// check against map and return only if not present
while (cache.containsKey(spilledEntry.getKey())) {
// LRU Cache entries always take precedence,
// since they are more up to date
if (spilledCacheIter.hasNext()) {
value = spilledCacheIter.next();
spilledEntry = spillManager.toCacheEntry(value);
} else {
notFound = true;
break;
}
}
if (!notFound) {
// Return a spilled entry, this only happens if the
// entry was not
// found in the LRU cache
return spilledEntry;
}
} catch (IOException ioe) {
// TODO rework error handling
throw new RuntimeException(ioe);
}
}
// Spilled elements exhausted
// Finally return all elements from LRU cache
Map.Entry<ImmutableBytesWritable, Aggregator[]> entry = cacheIter.next();
return new CacheEntry<ImmutableBytesWritable>(entry.getKey(), entry.getValue());
}
/**
 * Remove is not supported for this iterator.
 */
@Override
public void remove() {
throw new UnsupportedOperationException("Remove is not supported for this type of iterator");
}
}
/**
* Closes cache and releases spill resources
*
* @throws IOException
*/
@Override
public void close() throws IOException {
// Close spillable resources
Closeables.closeQuietly(spillManager);
Closeables.closeQuietly(chunk);
}
@Override
public RegionScanner getScanner(final RegionScanner s) {
final Iterator<Map.Entry<ImmutableBytesWritable, Aggregator[]>> cacheIter = new EntryIterator();
// scanner using the spillable implementation
return new BaseRegionScanner(s) {
@Override
public void close() throws IOException {
try {
s.close();
} finally {
// Always close gbCache and swallow possible Exceptions
Closeables.closeQuietly(SpillableGroupByCache.this);
}
}
@Override
public boolean next(List<Cell> results) throws IOException {
if (!cacheIter.hasNext()) {
return false;
}
Map.Entry<ImmutableBytesWritable, Aggregator[]> ce = cacheIter.next();
ImmutableBytesWritable key = ce.getKey();
Aggregator[] aggs = ce.getValue();
byte[] value = aggregators.toBytes(aggs);
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Adding new distinct group: "
+ Bytes.toStringBinary(key.get(), key.getOffset(), key.getLength()) +
" with aggregators " + aggs.toString() + " value = " +
Bytes.toStringBinary(value));
}
results.add(KeyValueUtil.newKeyValue(key.get(), key.getOffset(), key.getLength(), SINGLE_COLUMN_FAMILY,
SINGLE_COLUMN, AGG_TIMESTAMP, value, 0, value.length));
return cacheIter.hasNext();
}
};
}
}