All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.util.collection.ExternalSpillableMap Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.util.collection;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import org.apache.hudi.common.util.ObjectSizeCalculator;
import org.apache.hudi.common.util.SizeEstimator;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

/**
 * An external map that spills content to disk when there is insufficient space for it to grow.
 * 

* This map holds 2 types of data structures : *

* (1) Key-Value pairs in a in-memory map (2) Key-ValueMetadata pairs in an in-memory map which keeps a marker to the * values spilled to disk *

* NOTE : Values are only appended to disk. If a remove() is called, the entry is marked removed from the in-memory * key-valueMetadata map but it's values will be lying around in the temp file on disk until the file is cleaned. *

* The setting of the spill threshold faces the following trade-off: If the spill threshold is too high, the in-memory * map may occupy more memory than is available, resulting in OOM. However, if the spill threshold is too low, we spill * frequently and incur unnecessary disk writes. */ public class ExternalSpillableMap implements Map { // Find the actual estimated payload size after inserting N records private static final int NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE = 100; private static final Logger log = LogManager.getLogger(ExternalSpillableMap.class); // maximum space allowed in-memory for this map private final long maxInMemorySizeInBytes; // Map to store key-values in memory until it hits maxInMemorySizeInBytes private final Map inMemoryMap; // Map to store key-valuemetadata important to find the values spilled to disk private transient volatile DiskBasedMap diskBasedMap; // TODO(na) : a dynamic sizing factor to ensure we have space for other objects in memory and // incorrect payload estimation private final Double sizingFactorForInMemoryMap = 0.8; // Size Estimator for key type private final SizeEstimator keySizeEstimator; // Size Estimator for key types private final SizeEstimator valueSizeEstimator; // current space occupied by this map in-memory private Long currentInMemoryMapSize; // An estimate of the size of each payload written to this map private volatile long estimatedPayloadSize = 0; // Flag to determine whether to stop re-estimating payload size private boolean shouldEstimatePayloadSize = true; // Base File Path private final String baseFilePath; public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator keySizeEstimator, SizeEstimator valueSizeEstimator) throws IOException { this.inMemoryMap = new HashMap<>(); this.baseFilePath = baseFilePath; this.diskBasedMap = new DiskBasedMap<>(baseFilePath); this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); this.currentInMemoryMapSize = 0L; this.keySizeEstimator = keySizeEstimator; this.valueSizeEstimator = valueSizeEstimator; } private DiskBasedMap getDiskBasedMap() { if (null == diskBasedMap) { synchronized (this) { if (null == diskBasedMap) { try { diskBasedMap = new DiskBasedMap<>(baseFilePath); } catch (IOException e) { throw new HoodieIOException(e.getMessage(), e); } } } } return diskBasedMap; } /** * A custom iterator to wrap over iterating in-memory + disk spilled data */ public Iterator iterator() { return new IteratorWrapper<>(inMemoryMap.values().iterator(), getDiskBasedMap().iterator()); } /** * Number of entries in DiskBasedMap */ public int getDiskBasedMapNumEntries() { return getDiskBasedMap().size(); } /** * Number of bytes spilled to disk */ public long getSizeOfFileOnDiskInBytes() { return getDiskBasedMap().sizeOfFileOnDiskInBytes(); } /** * Number of entries in InMemoryMap */ public int getInMemoryMapNumEntries() { return inMemoryMap.size(); } /** * Approximate memory footprint of the in-memory map */ public long getCurrentInMemoryMapSize() { return currentInMemoryMapSize; } @Override public int size() { return inMemoryMap.size() + getDiskBasedMap().size(); } @Override public boolean isEmpty() { return inMemoryMap.isEmpty() && getDiskBasedMap().isEmpty(); } @Override public boolean containsKey(Object key) { return inMemoryMap.containsKey(key) || getDiskBasedMap().containsKey(key); } @Override public boolean containsValue(Object value) { return inMemoryMap.containsValue(value) || getDiskBasedMap().containsValue(value); } @Override public R get(Object key) { if (inMemoryMap.containsKey(key)) { return inMemoryMap.get(key); } else if (getDiskBasedMap().containsKey(key)) { return getDiskBasedMap().get(key); } return null; } @Override public R put(T key, R value) { if (this.currentInMemoryMapSize < maxInMemorySizeInBytes || inMemoryMap.containsKey(key)) { if (shouldEstimatePayloadSize && estimatedPayloadSize == 0) { // At first, use the sizeEstimate of a record being inserted into the spillable map. // Note, the converter may over estimate the size of a record in the JVM this.estimatedPayloadSize = keySizeEstimator.sizeEstimate(key) + valueSizeEstimator.sizeEstimate(value); log.info("Estimated Payload size => " + estimatedPayloadSize); } else if (shouldEstimatePayloadSize && inMemoryMap.size() % NUMBER_OF_RECORDS_TO_ESTIMATE_PAYLOAD_SIZE == 0) { // Re-estimate the size of a record by calculating the size of the entire map containing // N entries and then dividing by the number of entries present (N). This helps to get a // correct estimation of the size of each record in the JVM. long totalMapSize = ObjectSizeCalculator.getObjectSize(inMemoryMap); this.currentInMemoryMapSize = totalMapSize; this.estimatedPayloadSize = totalMapSize / inMemoryMap.size(); shouldEstimatePayloadSize = false; log.info("New Estimated Payload size => " + this.estimatedPayloadSize); } if (!inMemoryMap.containsKey(key)) { // TODO : Add support for adjusting payloadSize for updates to the same key currentInMemoryMapSize += this.estimatedPayloadSize; } inMemoryMap.put(key, value); } else { getDiskBasedMap().put(key, value); } return value; } @Override public R remove(Object key) { // NOTE : getDiskBasedMap().remove does not delete the data from disk if (inMemoryMap.containsKey(key)) { currentInMemoryMapSize -= estimatedPayloadSize; return inMemoryMap.remove(key); } else if (getDiskBasedMap().containsKey(key)) { return getDiskBasedMap().remove(key); } return null; } @Override public void putAll(Map m) { for (Map.Entry entry : m.entrySet()) { put(entry.getKey(), entry.getValue()); } } @Override public void clear() { inMemoryMap.clear(); getDiskBasedMap().clear(); currentInMemoryMapSize = 0L; } @Override public Set keySet() { Set keySet = new HashSet(); keySet.addAll(inMemoryMap.keySet()); keySet.addAll(getDiskBasedMap().keySet()); return keySet; } @Override public Collection values() { if (getDiskBasedMap().isEmpty()) { return inMemoryMap.values(); } List result = new ArrayList<>(inMemoryMap.values()); result.addAll(getDiskBasedMap().values()); return result; } public Stream valueStream() { return Stream.concat(inMemoryMap.values().stream(), getDiskBasedMap().valueStream()); } @Override public Set> entrySet() { Set> entrySet = new HashSet<>(); entrySet.addAll(inMemoryMap.entrySet()); entrySet.addAll(getDiskBasedMap().entrySet()); return entrySet; } /** * Iterator that wraps iterating over all the values for this map 1) inMemoryIterator - Iterates over all the data * in-memory map 2) diskLazyFileIterator - Iterates over all the data spilled to disk */ private class IteratorWrapper implements Iterator { private Iterator inMemoryIterator; private Iterator diskLazyFileIterator; public IteratorWrapper(Iterator inMemoryIterator, Iterator diskLazyFileIterator) { this.inMemoryIterator = inMemoryIterator; this.diskLazyFileIterator = diskLazyFileIterator; } @Override public boolean hasNext() { if (inMemoryIterator.hasNext()) { return true; } return diskLazyFileIterator.hasNext(); } @Override public R next() { if (inMemoryIterator.hasNext()) { return inMemoryIterator.next(); } return diskLazyFileIterator.next(); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy