// org.apache.hudi.common.util.collection.ExternalSpillableMap (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util.collection;
import org.apache.hudi.common.util.SizeEstimator;
import org.apache.hudi.exception.HoodieIOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.concurrent.NotThreadSafe;
import java.io.Closeable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
/**
* An external map that spills content to disk when there is insufficient space for it to grow.
*
* This map holds 2 types of data structures :
*
* (1) Key-Value pairs in a in-memory map (2) Key-ValueMetadata pairs in an in-memory map which keeps a marker to the
* values spilled to disk
*
* NOTE : Values are only appended to disk. If a remove() is called, the entry is marked removed from the in-memory
* key-valueMetadata map but it's values will be lying around in the temp file on disk until the file is cleaned.
*
* The setting of the spill threshold faces the following trade-off: If the spill threshold is too high, the in-memory
* map may occupy more memory than is available, resulting in OOM. However, if the spill threshold is too low, we spill
* frequently and incur unnecessary disk writes.
*/
@NotThreadSafe
public class ExternalSpillableMap implements Map, Serializable, Closeable {
// Find the actual estimated payload size after inserting N records
private static final int NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE = 100;
private static final Logger LOG = LoggerFactory.getLogger(ExternalSpillableMap.class);
// maximum space allowed in-memory for this map
private final long maxInMemorySizeInBytes;
// Map to store key-values in memory until it hits maxInMemorySizeInBytes
private final Map inMemoryMap;
// Map to store key-values on disk or db after it spilled over the memory
private transient volatile DiskMap diskBasedMap;
// TODO(na) : a dynamic sizing factor to ensure we have space for other objects in memory and
// incorrect payload estimation
private static final double SIZING_FACTOR_FOR_IN_MEMORY_MAP = 0.8;
// Size Estimator for key type
private final SizeEstimator keySizeEstimator;
// Size Estimator for key types
private final SizeEstimator valueSizeEstimator;
// Type of the disk map
private final DiskMapType diskMapType;
// Enables compression of values stored in disc
private final boolean isCompressionEnabled;
// current space occupied by this map in-memory
private long currentInMemoryMapSize;
// An estimate of the size of each payload written to this map
private volatile long estimatedPayloadSize = 0;
// Base File Path
private final String baseFilePath;
public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator,
SizeEstimator valueSizeEstimator) throws IOException {
this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, DiskMapType.BITCASK);
}
public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator,
SizeEstimator valueSizeEstimator, DiskMapType diskMapType) throws IOException {
this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, diskMapType, false);
}
public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator,
SizeEstimator valueSizeEstimator, DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException {
this.inMemoryMap = new HashMap<>();
this.baseFilePath = baseFilePath;
this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * SIZING_FACTOR_FOR_IN_MEMORY_MAP);
this.currentInMemoryMapSize = 0L;
this.keySizeEstimator = keySizeEstimator;
this.valueSizeEstimator = valueSizeEstimator;
this.diskMapType = diskMapType;
this.isCompressionEnabled = isCompressionEnabled;
}
private void initDiskBasedMap() {
if (null == diskBasedMap) {
synchronized (this) {
if (null == diskBasedMap) {
try {
switch (diskMapType) {
case ROCKS_DB:
diskBasedMap = new RocksDbDiskMap<>(baseFilePath);
break;
case BITCASK:
default:
diskBasedMap = new BitCaskDiskMap<>(baseFilePath, isCompressionEnabled);
}
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
}
}
}
/**
* A custom iterator to wrap over iterating in-memory + disk spilled data.
*/
public Iterator iterator() {
return diskBasedMap == null ? inMemoryMap.values().iterator() : new IteratorWrapper<>(inMemoryMap.values().iterator(), diskBasedMap.iterator());
}
/**
* Number of entries in BitCaskDiskMap.
*/
public int getDiskBasedMapNumEntries() {
return diskBasedMap == null ? 0 : diskBasedMap.size();
}
/**
* Number of bytes spilled to disk.
*/
public long getSizeOfFileOnDiskInBytes() {
return diskBasedMap == null ? 0 : diskBasedMap.sizeOfFileOnDiskInBytes();
}
/**
* Number of entries in InMemoryMap.
*/
public int getInMemoryMapNumEntries() {
return inMemoryMap.size();
}
/**
* Approximate memory footprint of the in-memory map.
*/
public long getCurrentInMemoryMapSize() {
return currentInMemoryMapSize;
}
@Override
public int size() {
return inMemoryMap.size() + getDiskBasedMapNumEntries();
}
@Override
public boolean isEmpty() {
return inMemoryMap.isEmpty() && getDiskBasedMapNumEntries() == 0;
}
@Override
public boolean containsKey(Object key) {
return inMemoryMap.containsKey(key) || inDiskContainsKey(key);
}
@Override
public boolean containsValue(Object value) {
return inMemoryMap.containsValue(value) || (diskBasedMap != null && diskBasedMap.containsValue(value));
}
private boolean inMemoryContainsKey(Object key) {
return inMemoryMap.containsKey(key);
}
private boolean inDiskContainsKey(Object key) {
return diskBasedMap != null && diskBasedMap.containsKey(key);
}
@Override
public R get(Object key) {
if (inMemoryMap.containsKey(key)) {
return inMemoryMap.get(key);
} else if (inDiskContainsKey(key)) {
return diskBasedMap.get(key);
}
return null;
}
@Override
public R put(T key, R value) {
if (this.estimatedPayloadSize == 0) {
// At first, use the sizeEstimate of a record being inserted into the spillable map.
// Note, the converter may over-estimate the size of a record in the JVM
this.estimatedPayloadSize = keySizeEstimator.sizeEstimate(key) + valueSizeEstimator.sizeEstimate(value);
} else if (this.inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0) {
this.estimatedPayloadSize = (long) (this.estimatedPayloadSize * 0.9 + (keySizeEstimator.sizeEstimate(key) + valueSizeEstimator.sizeEstimate(value)) * 0.1);
this.currentInMemoryMapSize = this.inMemoryMap.size() * this.estimatedPayloadSize;
}
if (this.inMemoryMap.containsKey(key)) {
this.inMemoryMap.put(key, value);
} else if (this.currentInMemoryMapSize < this.maxInMemorySizeInBytes) {
this.currentInMemoryMapSize += this.estimatedPayloadSize;
// Remove the old version of the record from disk first to avoid data duplication.
if (inDiskContainsKey(key)) {
diskBasedMap.remove(key);
}
this.inMemoryMap.put(key, value);
} else {
if (diskBasedMap == null) {
initDiskBasedMap();
}
diskBasedMap.put(key, value);
}
return value;
}
@Override
public R remove(Object key) {
// NOTE : getDiskBasedMap().remove does not delete the data from disk
if (inMemoryMap.containsKey(key)) {
currentInMemoryMapSize -= estimatedPayloadSize;
return inMemoryMap.remove(key);
} else if (inDiskContainsKey(key)) {
return diskBasedMap.remove(key);
}
return null;
}
@Override
public void putAll(Map extends T, ? extends R> m) {
for (Map.Entry extends T, ? extends R> entry : m.entrySet()) {
put(entry.getKey(), entry.getValue());
}
}
@Override
public void clear() {
inMemoryMap.clear();
if (diskBasedMap != null) {
diskBasedMap.clear();
}
currentInMemoryMapSize = 0L;
}
public void close() {
inMemoryMap.clear();
if (diskBasedMap != null) {
diskBasedMap.close();
}
currentInMemoryMapSize = 0L;
}
@Override
public Set keySet() {
if (diskBasedMap == null) {
return inMemoryMap.keySet();
}
Set keySet = new HashSet<>(inMemoryMap.size() + diskBasedMap.size());
keySet.addAll(inMemoryMap.keySet());
keySet.addAll(diskBasedMap.keySet());
return keySet;
}
@Override
public Collection values() {
if (diskBasedMap == null) {
return inMemoryMap.values();
}
List result = new ArrayList<>(inMemoryMap.size() + diskBasedMap.size());
result.addAll(inMemoryMap.values());
Iterator iterator = diskBasedMap.iterator();
while (iterator.hasNext()) {
result.add(iterator.next());
}
return result;
}
public Stream valueStream() {
if (diskBasedMap == null) {
return inMemoryMap.values().stream();
}
return Stream.concat(inMemoryMap.values().stream(), diskBasedMap.valueStream());
}
@Override
public Set> entrySet() {
if (diskBasedMap == null) {
return inMemoryMap.entrySet();
}
Set> inMemory = inMemoryMap.entrySet();
Set> onDisk = diskBasedMap.entrySet();
Set> entrySet = new HashSet<>(inMemory.size() + onDisk.size());
entrySet.addAll(inMemory);
entrySet.addAll(onDisk);
return entrySet;
}
/**
* The type of map to use for storing the Key, values on disk after it spills
* from memory in the {@link ExternalSpillableMap}.
*/
public enum DiskMapType {
BITCASK,
ROCKS_DB,
UNKNOWN
}
/**
* Iterator that wraps iterating over all the values for this map 1) inMemoryIterator - Iterates over all the data
* in-memory map 2) diskLazyFileIterator - Iterates over all the data spilled to disk.
*/
private class IteratorWrapper implements Iterator {
private final Iterator inMemoryIterator;
private final Iterator diskLazyFileIterator;
public IteratorWrapper(Iterator inMemoryIterator, Iterator diskLazyFileIterator) {
this.inMemoryIterator = inMemoryIterator;
this.diskLazyFileIterator = diskLazyFileIterator;
}
@Override
public boolean hasNext() {
if (inMemoryIterator.hasNext()) {
return true;
}
return diskLazyFileIterator.hasNext();
}
@Override
public R next() {
if (inMemoryIterator.hasNext()) {
return inMemoryIterator.next();
}
return diskLazyFileIterator.next();
}
}
}