// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// com.uber.hoodie.common.util.collection.ExternalSpillableMap Maven / Gradle / Ivy
//
// There is a newer version: 0.4.7
// Show newest version
/*
 *  Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.uber.hoodie.common.util.collection;

import com.uber.hoodie.common.util.SpillableMapUtils;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.HoodieNotSupportedException;
import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

/**
 * An external map that spills content to disk when there is insufficient space for it
 * to grow.
 *
 * This map holds 2 types of data structures :
 *
 *   (1) Key-Value pairs in a in-memory map
 *   (2) Key-ValueMetadata pairs in an in-memory map which keeps a marker to the values spilled to disk
 *
 * NOTE : Values are only appended to disk. If a remove() is called, the entry is marked removed from the in-memory
 * key-valueMetadata map but it's values will be lying around in the temp file on disk until the file is cleaned.
 *
 * The setting of the spill threshold faces the following trade-off: If the spill threshold is
 * too high, the in-memory map may occupy more memory than is available, resulting in OOM.
 * However, if the spill threshold is too low, we spill frequently and incur unnecessary disk
 * writes.
 * @param 
 * @param 
 */
public class ExternalSpillableMap implements Map {

  // maximum space allowed in-memory for this map
  final private long maxInMemorySizeInBytes;
  // current space occupied by this map in-memory
  private Long currentInMemoryMapSize;
  // Map to store key-values in memory until it hits maxInMemorySizeInBytes
  final private Map inMemoryMap;
  // Map to store key-valuemetadata important to find the values spilled to disk
  final private DiskBasedMap diskBasedMap;
  // Schema used to de-serialize and readFromDisk the records written to disk
  final private Schema schema;
  // An estimate of the size of each payload written to this map
  private volatile long estimatedPayloadSize = 0;
  // TODO(na) : a dynamic sizing factor to ensure we have space for other objects in memory and incorrect payload estimation
  final private Double sizingFactorForInMemoryMap = 0.8;

  private static Logger log = LogManager.getLogger(ExternalSpillableMap.class);


  public ExternalSpillableMap(Long maxInMemorySizeInBytes, Schema schema,
                              String payloadClazz, Optional baseFilePath) throws IOException {
    this.inMemoryMap = new HashMap<>();
    this.diskBasedMap = new DiskBasedMap<>(schema, payloadClazz, baseFilePath);
    this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes*sizingFactorForInMemoryMap);
    this.schema = schema;
    this.currentInMemoryMapSize = 0L;
  }

  /**
   * A custom iterator to wrap over iterating in-memory + disk spilled data
   * @return
   */
  public Iterator iterator() {
    return new IteratorWrapper<>(inMemoryMap.values().iterator(), diskBasedMap.iterator());
  }

  /**
   * Number of entries in DiskBasedMap
   * @return
   */
  public int getDiskBasedMapNumEntries() {
    return diskBasedMap.size();
  }

  /**
   * Number of bytes spilled to disk
   * @return
   */
  public long getSizeOfFileOnDiskInBytes() {
    return diskBasedMap.sizeOfFileOnDiskInBytes();
  }

  /**
   * Number of entries in InMemoryMap
   * @return
   */
  public int getInMemoryMapNumEntries() {
    return inMemoryMap.size();
  }

  /**
   * Approximate memory footprint of the in-memory map
   * @return
   */
  public long getCurrentInMemoryMapSize() {
    return currentInMemoryMapSize;
  }

  @Override
  public int size() {
    return inMemoryMap.size() + diskBasedMap.size();
  }

  @Override
  public boolean isEmpty() {
    return inMemoryMap.isEmpty() && diskBasedMap.isEmpty();
  }

  @Override
  public boolean containsKey(Object key) {
    return inMemoryMap.containsKey(key) || diskBasedMap.containsKey(key);
  }

  @Override
  public boolean containsValue(Object value) {
    return inMemoryMap.containsValue(value) || diskBasedMap.containsValue(value);
  }

  @Override
  public R get(Object key) {
    if(inMemoryMap.containsKey(key)) {
      return inMemoryMap.get(key);
    } else if(diskBasedMap.containsKey(key)) {
      return diskBasedMap.get(key);
    }
    return null;
  }

  @Override
  public R put(T key, R value) {
    try {
      if (this.currentInMemoryMapSize < maxInMemorySizeInBytes || inMemoryMap.containsKey(key)) {
        // Naive approach for now
        if (estimatedPayloadSize == 0) {
          this.estimatedPayloadSize = SpillableMapUtils.computePayloadSize(value, schema);
          log.info("Estimated Payload size => " + estimatedPayloadSize);
        }
        if(!inMemoryMap.containsKey(key)) {
          currentInMemoryMapSize += this.estimatedPayloadSize;
        }
        inMemoryMap.put(key, value);
      } else {
        diskBasedMap.put(key, value);
      }
      return value;
    } catch(IOException io) {
      throw new HoodieIOException("Unable to estimate size of payload", io);
    }
  }

  @Override
  public R remove(Object key) {
    // NOTE : diskBasedMap.remove does not delete the data from disk
    if(inMemoryMap.containsKey(key)) {
      currentInMemoryMapSize -= estimatedPayloadSize;
      return inMemoryMap.remove(key);
    } else if(diskBasedMap.containsKey(key)) {
      return diskBasedMap.remove(key);
    }
    return null;
  }

  @Override
  public void putAll(Map m) {
    for(Map.Entry entry: m.entrySet()) {
      put(entry.getKey(), entry.getValue());
    }
  }

  @Override
  public void clear() {
    inMemoryMap.clear();
    diskBasedMap.clear();
    currentInMemoryMapSize = 0L;
  }

  @Override
  public Set keySet() {
    Set keySet = new HashSet();
    keySet.addAll(inMemoryMap.keySet());
    keySet.addAll(diskBasedMap.keySet());
    return keySet;
  }

  @Override
  public Collection values() {
    if(diskBasedMap.isEmpty()) {
      return inMemoryMap.values();
    }
    throw new HoodieNotSupportedException("Cannot return all values in memory");
  }

  @Override
  public Set> entrySet() {
    Set> entrySet = new HashSet<>();
    entrySet.addAll(inMemoryMap.entrySet());
    entrySet.addAll(diskBasedMap.entrySet());
    return entrySet;
  }

  /**
   * Iterator that wraps iterating over all the values for this map
   * 1) inMemoryIterator - Iterates over all the data in-memory map
   * 2) diskLazyFileIterator - Iterates over all the data spilled to disk
   * @param 
   */
  private class IteratorWrapper implements Iterator {

    private Iterator inMemoryIterator;
    private Iterator diskLazyFileIterator;

    public IteratorWrapper(Iterator inMemoryIterator, Iterator diskLazyFileIterator) {
      this.inMemoryIterator = inMemoryIterator;
      this.diskLazyFileIterator = diskLazyFileIterator;
    }
    @Override
    public boolean hasNext() {
      if(inMemoryIterator.hasNext()) {
        return true;
      }
      return diskLazyFileIterator.hasNext();
    }

    @Override
    public R next() {
      if(inMemoryIterator.hasNext()) {
        return inMemoryIterator.next();
      }
      return diskLazyFileIterator.next();
    }
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy