org.apache.spark.unsafe.map.BytesToBytesMap Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of spark-unsafe_2.11 Show documentation
There is a newer version: 2.4.8
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.unsafe.map;

import java.lang.Override;
import java.lang.UnsupportedOperationException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;

import org.apache.spark.unsafe.*;
import org.apache.spark.unsafe.array.ByteArrayMethods;
import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.bitset.BitSet;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.memory.*;

/**
 * An append-only hash map where keys and values are contiguous regions of bytes.
 * <p>
 * This is backed by a power-of-2-sized hash table, using quadratic probing with triangular numbers,
 * which is guaranteed to exhaust the space.
 * <p>
 * The map can support up to 2^29 keys. If the key cardinality is higher than this, you should
 * probably be using sorting instead of hashing for better cache locality.
 * <p>
 * This class is not thread safe.
 */
public final class BytesToBytesMap {

  /**
   * Hash function applied to the key bytes in {@code lookup()}. It is constructed with the
   * argument 0 (presumably the seed -- confirm against Murmur3_x86_32).
   */
  private static final Murmur3_x86_32 HASHER = new Murmur3_x86_32(0);

  /**
   * Policy consulted by {@code growAndRehash()} to choose the next hash table capacity.
   */
  private static final HashMapGrowthStrategy growthStrategy = HashMapGrowthStrategy.DOUBLING;

  /**
   * Special record length that is placed after the last record in a data page.
   */
  private static final int END_OF_PAGE_MARKER = -1;

  /**
   * Used to allocate and free the hash table array and the data pages, and to encode and decode
   * the record addresses stored in the hash table.
   */
  private final TaskMemoryManager memoryManager;

  /**
   * A linked list for tracking all allocated data pages so that we can free all of our memory.
   * Pages are appended in allocation order, which is also the order in which the map's iterator
   * visits them.
   */
  private final List<MemoryBlock> dataPages = new LinkedList<MemoryBlock>();

  /**
   * The data page that will be used to store keys and values for new hashtable entries. When this
   * page becomes full, a new page will be allocated and this pointer will change to point to that
   * new page. It is {@code null} until the first key is inserted.
   */
  private MemoryBlock currentDataPage = null;

  /**
   * Offset into `currentDataPage` that points to the location where new data can be inserted into
   * the page. This does not incorporate the page's base offset.
   */
  private long pageCursor = 0;

  /**
   * The size of the data pages that hold key and value data. Map entries cannot span multiple
   * pages, so this limits the maximum entry size. The last 8 bytes of each page are reserved for
   * the end-of-page marker.
   */
  private static final long PAGE_SIZE_BYTES = 1L << 26; // 64 megabytes

  /**
   * The maximum number of keys that BytesToBytesMap supports. The hash table has to be
   * power-of-2-sized and its backing Java array can contain at most (1 << 30) elements, since
   * that's the largest power-of-2 that's less than Integer.MAX_VALUE. We need two long array
   * entries per key, giving us a maximum capacity of (1 << 29).
   */
  @VisibleForTesting
  static final int MAX_CAPACITY = (1 << 29);

  // This choice of page table size and page size means that we can address up to 500 gigabytes
  // of memory.

  /**
   * A single array holding two longs per hash table slot.
   *
   * Position {@code 2 * i} in the array holds the encoded address of the record (starting at its
   * key-length word) stored at index {@code i}, while position {@code 2 * i + 1} in the array
   * holds key's full 32-bit hashcode.
   */
  private LongArray longArray;
  // TODO: we're wasting 32 bits of space here; we can probably store fewer bits of the hashcode
  // and exploit word-alignment to use fewer bits to hold the address.  This might let us store
  // only one long per map entry, increasing the chance that this array will fit in cache at the
  // expense of maybe performing more lookups if we have hash collisions.  Say that we stored only
  // 27 bits of the hashcode and 37 bits of the address.  37 bits is enough to address 1 terabyte
  // of RAM given word-alignment.  If we use 13 bits of this for our page table, that gives us a
  // maximum page size of 2^24 * 8 = ~134 megabytes per page. This change will require us to store
  // full base addresses in the page table for off-heap mode so that we can reconstruct the full
  // absolute memory addresses.

  /**
   * A {@link BitSet} used to track location of the map where the key is set.
   * Size of the bitset should be half of the size of the long array.
   */
  private BitSet bitset;

  /**
   * Fraction of the table capacity used to compute {@code growthThreshold} when the table is
   * allocated.
   */
  private final double loadFactor;

  /**
   * Number of keys defined in the map.
   */
  private int size;

  /**
   * The map will be expanded once the number of keys exceeds this threshold.
   */
  private int growthThreshold;

  /**
   * Mask for truncating hashcodes so that they do not exceed the long array's size.
   * This is a strength reduction optimization; we're essentially performing a modulus operation,
   * but doing so with a bitmask because this is a power-of-2-sized hash map.
   */
  private int mask;

  /**
   * Return value of {@link BytesToBytesMap#lookup(Object, long, int)}. A single instance is
   * created in the constructor and reused by every lookup and by the iterator.
   */
  private final Location loc;

  /**
   * When true, the counters below are maintained and the metric getters may be called; when
   * false, the metric getters throw {@link IllegalStateException}.
   */
  private final boolean enablePerfMetrics;

  /** Accumulated wall-clock time spent in {@code growAndRehash()}, in nanoseconds. */
  private long timeSpentResizingNs = 0;

  /** Number of hash table slots inspected across all calls to {@code lookup()}. */
  private long numProbes = 0;

  /** Number of calls to {@code lookup()}. */
  private long numKeyLookups = 0;

  /**
   * Number of probes where the stored hashcode and key length matched but the key bytes differed.
   */
  private long numHashCollisions = 0;

  public BytesToBytesMap(
      TaskMemoryManager memoryManager,
      int initialCapacity,
      double loadFactor,
      boolean enablePerfMetrics) {
    this.memoryManager = memoryManager;
    this.loadFactor = loadFactor;
    this.loc = new Location();
    this.enablePerfMetrics = enablePerfMetrics;
    if (initialCapacity <= 0) {
      throw new IllegalArgumentException("Initial capacity must be greater than 0");
    }
    if (initialCapacity > MAX_CAPACITY) {
      throw new IllegalArgumentException(
        "Initial capacity " + initialCapacity + " exceeds maximum capacity of " + MAX_CAPACITY);
    }
    allocate(initialCapacity);
  }

  /**
   * Creates a map with a load factor of 0.70 and performance metrics disabled.
   *
   * @param memoryManager manager used for the hash table array and the data pages
   * @param initialCapacity requested initial number of slots
   */
  public BytesToBytesMap(TaskMemoryManager memoryManager, int initialCapacity) {
    this(memoryManager, initialCapacity, 0.70, false);
  }

  /**
   * Creates a map with a load factor of 0.70.
   *
   * @param memoryManager manager used for the hash table array and the data pages
   * @param initialCapacity requested initial number of slots
   * @param enablePerfMetrics whether to collect the probe, collision, and resize metrics
   */
  public BytesToBytesMap(
      TaskMemoryManager memoryManager,
      int initialCapacity,
      boolean enablePerfMetrics) {
    this(memoryManager, initialCapacity, 0.70, enablePerfMetrics);
  }

  /**
   * Reports how many keys have been stored in this map so far.
   *
   * @return the current number of keys
   */
  public int size() {
    return size;
  }

  /**
   * Iterator that walks the records stored in the data pages, in the order the supplied page
   * iterator yields the pages and in storage order within each page.
   *
   * Each record is laid out as (8 byte key length) (key) (8 byte value length) (value); a key
   * length equal to {@code END_OF_PAGE_MARKER} means the remaining records are in the next page.
   * Every call to {@link #next()} returns the same {@link Location} instance, re-pointed at the
   * current record.
   */
  private static final class BytesToBytesMapIterator implements Iterator<Location> {

    /** Total number of records this iterator will return. */
    private final int numRecords;
    private final Iterator<MemoryBlock> dataPagesIterator;
    /** Shared handle that is re-pointed at each record and returned from {@link #next()}. */
    private final Location loc;

    /** Number of records returned so far. */
    private int currentRecordNumber = 0;
    private Object pageBaseObject;
    /** Absolute offset (page base offset included) of the next record to read. */
    private long offsetInPage;

    BytesToBytesMapIterator(
        int numRecords, Iterator<MemoryBlock> dataPagesIterator, Location loc) {
      this.numRecords = numRecords;
      this.dataPagesIterator = dataPagesIterator;
      this.loc = loc;
      if (dataPagesIterator.hasNext()) {
        advanceToNextPage();
      }
    }

    /** Moves the read position to the first record of the next data page. */
    private void advanceToNextPage() {
      final MemoryBlock currentPage = dataPagesIterator.next();
      pageBaseObject = currentPage.getBaseObject();
      offsetInPage = currentPage.getBaseOffset();
    }

    @Override
    public boolean hasNext() {
      return currentRecordNumber != numRecords;
    }

    /**
     * Returns the shared {@link Location}, pointed at the next record.
     *
     * @throws java.util.NoSuchElementException if all records have already been returned
     */
    @Override
    public Location next() {
      // Without this guard an exhausted (or empty) iterator would read memory beyond the last
      // record through Unsafe instead of failing cleanly.
      if (!hasNext()) {
        throw new java.util.NoSuchElementException();
      }
      int keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
      if (keyLength == END_OF_PAGE_MARKER) {
        advanceToNextPage();
        keyLength = (int) PlatformDependent.UNSAFE.getLong(pageBaseObject, offsetInPage);
      }
      loc.with(pageBaseObject, offsetInPage);
      // Skip the two 8-byte length words plus the key and value payloads.
      offsetInPage += 8 + 8 + keyLength + loc.getValueLength();
      currentRecordNumber++;
      return loc;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /**
   * Returns an iterator for iterating over the entries of this map.
   *
   * For efficiency, all calls to `next()` will return the same {@link Location} object.
   *
   * If any other lookups or operations are performed on this map while iterating over it, including
   * `lookup()`, the behavior of the returned iterator is undefined.
   *
   * @return an iterator that yields one {@link Location} per stored key
   */
  public Iterator<Location> iterator() {
    return new BytesToBytesMapIterator(size, dataPages.iterator(), loc);
  }

  /**
   * Finds the slot for a key and returns a {@link Location} handle describing it. The handle
   * reports whether the key is already present and gives access to the stored key and value.
   *
   * The same {@link Location} instance is returned by every call, so no object is allocated.
   *
   * @param keyBaseObject base object of the key's memory region
   * @param keyBaseOffset offset of the key's first byte relative to {@code keyBaseObject}
   * @param keyRowLengthBytes length of the key in bytes
   * @return the shared handle, defined if the key was found and undefined otherwise
   */
  public Location lookup(
      Object keyBaseObject,
      long keyBaseOffset,
      int keyRowLengthBytes) {
    if (enablePerfMetrics) {
      numKeyLookups++;
    }
    final int hashcode = HASHER.hashUnsafeWords(keyBaseObject, keyBaseOffset, keyRowLengthBytes);
    int pos = hashcode & mask;
    // Probe with increasing step sizes (1, 2, 3, ...) until an empty slot or the key is found.
    for (int step = 1; ; pos = (pos + step) & mask, step++) {
      if (enablePerfMetrics) {
        numProbes++;
      }
      if (!bitset.isSet(pos)) {
        // Empty slot: the key is not in the map.
        return loc.with(pos, hashcode, false);
      }
      if ((int) longArray.get(pos * 2 + 1) != hashcode) {
        // Different stored hashcode, so this slot cannot hold our key.
        continue;
      }
      // Full hash code matches; compare lengths and then the key bytes themselves.
      loc.with(pos, hashcode, true);
      if (loc.getKeyLength() != keyRowLengthBytes) {
        continue;
      }
      final MemoryLocation storedKey = loc.getKeyAddress();
      final boolean sameKey = ByteArrayMethods.wordAlignedArrayEquals(
        keyBaseObject,
        keyBaseOffset,
        storedKey.getBaseObject(),
        storedKey.getBaseOffset(),
        keyRowLengthBytes
      );
      if (sameKey) {
        return loc;
      }
      if (enablePerfMetrics) {
        numHashCollisions++;
      }
    }
  }

  /**
   * Handle returned by {@link BytesToBytesMap#lookup(Object, long, int)} function.
   *
   * A handle either points at a stored record (it is "defined"), in which case the key and value
   * addresses and lengths can be read, or at an empty hash table slot where a new key may be
   * stored with {@code putNewKey}.
   */
  public final class Location {
    /** An index into the hash map's Long array */
    private int pos;
    /** True if this location points to a position where a key is defined, false otherwise */
    private boolean isDefined;
    /**
     * The hashcode of the most recent key passed to
     * {@link BytesToBytesMap#lookup(Object, long, int)}. Caching this hashcode here allows us to
     * avoid re-hashing the key when storing a value for that key.
     */
    private int keyHashcode;
    private final MemoryLocation keyMemoryLocation = new MemoryLocation();
    private final MemoryLocation valueMemoryLocation = new MemoryLocation();
    private int keyLength;
    private int valueLength;

    /** Decodes an address stored in the hash table and points this handle at that record. */
    private void updateAddressesAndSizes(long fullKeyAddress) {
      updateAddressesAndSizes(
        memoryManager.getPage(fullKeyAddress), memoryManager.getOffsetInPage(fullKeyAddress));
    }

    /**
     * Reads the record that starts at {@code keyOffsetInPage} and caches its key/value lengths and
     * addresses. The record layout is (8 byte key length) (key) (8 byte value length) (value).
     */
    private void updateAddressesAndSizes(Object page, long keyOffsetInPage) {
      long position = keyOffsetInPage;
      keyLength = (int) PlatformDependent.UNSAFE.getLong(page, position);
      position += 8; // word used to store the key size
      keyMemoryLocation.setObjAndOffset(page, position);
      position += keyLength;
      valueLength = (int) PlatformDependent.UNSAFE.getLong(page, position);
      position += 8; // word used to store the value size
      valueMemoryLocation.setObjAndOffset(page, position);
    }

    /**
     * Points this handle at hash table slot {@code pos}. When {@code isDefined} is true the
     * record referenced by that slot is loaded as well.
     */
    Location with(int pos, int keyHashcode, boolean isDefined) {
      this.pos = pos;
      this.isDefined = isDefined;
      this.keyHashcode = keyHashcode;
      if (isDefined) {
        final long fullKeyAddress = longArray.get(pos * 2);
        updateAddressesAndSizes(fullKeyAddress);
      }
      return this;
    }

    /**
     * Points this handle directly at a record inside a data page. The slot index and cached
     * hashcode are left untouched by this overload.
     */
    Location with(Object page, long keyOffsetInPage) {
      this.isDefined = true;
      updateAddressesAndSizes(page, keyOffsetInPage);
      return this;
    }

    /**
     * Returns true if the key is defined at this position, and false otherwise.
     */
    public boolean isDefined() {
      return isDefined;
    }

    /**
     * Returns the address of the key defined at this position.
     * This points to the first byte of the key data.
     * Unspecified behavior if the key is not defined.
     * For efficiency reasons, calls to this method always returns the same MemoryLocation object.
     */
    public MemoryLocation getKeyAddress() {
      assert (isDefined);
      return keyMemoryLocation;
    }

    /**
     * Returns the length of the key defined at this position.
     * Unspecified behavior if the key is not defined.
     */
    public int getKeyLength() {
      assert (isDefined);
      return keyLength;
    }

    /**
     * Returns the address of the value defined at this position.
     * This points to the first byte of the value data.
     * Unspecified behavior if the key is not defined.
     * For efficiency reasons, calls to this method always returns the same MemoryLocation object.
     */
    public MemoryLocation getValueAddress() {
      assert (isDefined);
      return valueMemoryLocation;
    }

    /**
     * Returns the length of the value defined at this position.
     * Unspecified behavior if the key is not defined.
     */
    public int getValueLength() {
      assert (isDefined);
      return valueLength;
    }

    /**
     * Store a new key and value. This method may only be called once for a given key; if you want
     * to update the value associated with a key, then you can directly manipulate the bytes stored
     * at the value address.
     * <p>
     * It is only valid to call this method immediately after calling `lookup()` using the same key.
     * <p>
     * The key and value must be word-aligned (that is, their sizes must be multiples of 8).
     * <p>
     * After calling this method, calls to `get[Key|Value]Address()` and `get[Key|Value]Length`
     * will return information on the data stored by this `putNewKey` call.
     * <p>
     * As an example usage, here's the proper way to store a new key:
     * <pre>
     *   Location loc = map.lookup(keyBaseObject, keyBaseOffset, keyLengthInBytes);
     *   if (!loc.isDefined()) {
     *     loc.putNewKey(keyBaseObject, keyBaseOffset, keyLengthInBytes, ...)
     *   }
     * </pre>
     * Unspecified behavior if the key is already defined; that precondition, the word-alignment
     * requirement, and the maximum record size are only checked by assertions.
     *
     * @param keyBaseObject base object of the key's memory region
     * @param keyBaseOffset offset of the key's first byte relative to {@code keyBaseObject}
     * @param keyLengthBytes length of the key in bytes
     * @param valueBaseObject base object of the value's memory region
     * @param valueBaseOffset offset of the value's first byte relative to {@code valueBaseObject}
     * @param valueLengthBytes length of the value in bytes
     * @throws IllegalStateException if the map already holds {@code MAX_CAPACITY} keys
     */
    public void putNewKey(
        Object keyBaseObject,
        long keyBaseOffset,
        int keyLengthBytes,
        Object valueBaseObject,
        long valueBaseOffset,
        int valueLengthBytes) {
      assert (!isDefined) : "Can only set value once for a key";
      assert (keyLengthBytes % 8 == 0);
      assert (valueLengthBytes % 8 == 0);
      if (size == MAX_CAPACITY) {
        throw new IllegalStateException("BytesToBytesMap has reached maximum capacity");
      }
      // Here, we'll copy the data into our data pages. Because we only store a relative offset from
      // the key address instead of storing the absolute address of the value, the key and value
      // must be stored in the same memory page.
      // (8 byte key length) (key) (8 byte value length) (value)
      // The sum is computed in long arithmetic: with int operands, two large lengths would wrap
      // around to a negative number and slip past the size check below.
      final long requiredSize = 8L + keyLengthBytes + 8L + valueLengthBytes;
      assert (requiredSize <= PAGE_SIZE_BYTES - 8); // Reserve 8 bytes for the end-of-page marker.
      size++;
      bitset.set(pos);

      // If there's not enough space in the current page, allocate a new page (8 bytes are reserved
      // for the end-of-page marker).
      if (currentDataPage == null || PAGE_SIZE_BYTES - 8 - pageCursor < requiredSize) {
        if (currentDataPage != null) {
          // There wasn't enough space in the current page, so write an end-of-page marker:
          final Object pageBaseObject = currentDataPage.getBaseObject();
          final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor;
          PlatformDependent.UNSAFE.putLong(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER);
        }
        MemoryBlock newPage = memoryManager.allocatePage(PAGE_SIZE_BYTES);
        dataPages.add(newPage);
        pageCursor = 0;
        currentDataPage = newPage;
      }

      // Compute all of our offsets up-front:
      final Object pageBaseObject = currentDataPage.getBaseObject();
      final long pageBaseOffset = currentDataPage.getBaseOffset();
      final long keySizeOffsetInPage = pageBaseOffset + pageCursor;
      pageCursor += 8; // word used to store the key size
      final long keyDataOffsetInPage = pageBaseOffset + pageCursor;
      pageCursor += keyLengthBytes;
      final long valueSizeOffsetInPage = pageBaseOffset + pageCursor;
      pageCursor += 8; // word used to store the value size
      final long valueDataOffsetInPage = pageBaseOffset + pageCursor;
      pageCursor += valueLengthBytes;

      // Copy the key
      PlatformDependent.UNSAFE.putLong(pageBaseObject, keySizeOffsetInPage, keyLengthBytes);
      PlatformDependent.copyMemory(
        keyBaseObject, keyBaseOffset, pageBaseObject, keyDataOffsetInPage, keyLengthBytes);
      // Copy the value
      PlatformDependent.UNSAFE.putLong(pageBaseObject, valueSizeOffsetInPage, valueLengthBytes);
      PlatformDependent.copyMemory(
        valueBaseObject, valueBaseOffset, pageBaseObject, valueDataOffsetInPage, valueLengthBytes);

      // Record the new entry in the hash table: encoded record address, then the key's hashcode.
      final long storedKeyAddress = memoryManager.encodePageNumberAndOffset(
        currentDataPage, keySizeOffsetInPage);
      longArray.set(pos * 2, storedKeyAddress);
      longArray.set(pos * 2 + 1, keyHashcode);
      updateAddressesAndSizes(storedKeyAddress);
      isDefined = true;
      if (size > growthThreshold && longArray.size() < MAX_CAPACITY) {
        growAndRehash();
      }
    }
  }

  /**
   * Creates a fresh hash table array and bitset for this map and recomputes the growth threshold
   * and hash mask. Callers other than the constructor must hold on to the previous structures
   * first so that they can still be read and freed.
   *
   * @param capacity the requested map capacity; it is rounded up to a power of 2 and clamped to
   *                 the range [64, MAX_CAPACITY]
   */
  private void allocate(int capacity) {
    assert (capacity >= 0);
    // At least 64 slots are needed so that the bitset can be backed by whole 64-bit words.
    final long roundedUp = nextPowerOf2(capacity);
    final int cappedAtMax = (int) Math.min(MAX_CAPACITY, roundedUp);
    final int tableCapacity = Math.max(cappedAtMax, 64);
    assert (tableCapacity <= MAX_CAPACITY);

    // Two 8-byte longs per slot.
    longArray = new LongArray(memoryManager.allocate(tableCapacity * 8L * 2));
    bitset = new BitSet(MemoryBlock.fromLongArray(new long[tableCapacity / 64]));

    growthThreshold = (int) (tableCapacity * loadFactor);
    mask = tableCapacity - 1;
  }

  /**
   * Free all allocated memory associated with this map, including the storage for keys and values
   * as well as the hash map array itself.
   *
   * This method is idempotent.
   */
  public void free() {
    if (longArray != null) {
      memoryManager.free(longArray.memoryBlock());
      longArray = null;
    }
    if (bitset != null) {
      // The bitset's heap memory isn't managed by a memory manager, so no need to free it here.
      bitset = null;
    }
    // A typed iterator is required so that next() yields a MemoryBlock for freePage(); removing
    // each page as it is freed keeps a second call to free() from freeing it again.
    final Iterator<MemoryBlock> dataPagesIterator = dataPages.iterator();
    while (dataPagesIterator.hasNext()) {
      memoryManager.freePage(dataPagesIterator.next());
      dataPagesIterator.remove();
    }
    assert(dataPages.isEmpty());
  }

  /**
   * Returns the total amount of memory, in bytes, consumed by this map's structures: the data
   * pages, the bitset, and the hash table array.
   *
   * After {@code free()} has been called the bitset and hash table array no longer exist; they
   * are then counted as zero bytes instead of causing a {@link NullPointerException}.
   *
   * @return the number of bytes currently held by this map
   */
  public long getTotalMemoryConsumption() {
    final long dataPageBytes = dataPages.size() * PAGE_SIZE_BYTES;
    final long bitsetBytes = (bitset == null) ? 0L : bitset.memoryBlock().size();
    final long longArrayBytes = (longArray == null) ? 0L : longArray.memoryBlock().size();
    return dataPageBytes + bitsetBytes + longArrayBytes;
  }

  /**
   * Reports the cumulative time this map has spent growing its hash table.
   *
   * @return the total resize time in nanoseconds
   * @throws IllegalStateException if performance metrics were not enabled for this map
   */
  public long getTimeSpentResizingNs() {
    if (enablePerfMetrics) {
      return timeSpentResizingNs;
    }
    throw new IllegalStateException();
  }


  /**
   * Reports how many hash table slots were inspected per call to {@code lookup()}, on average.
   *
   * @return the number of probes divided by the number of key lookups
   * @throws IllegalStateException if performance metrics were not enabled for this map
   */
  public double getAverageProbesPerLookup() {
    if (enablePerfMetrics) {
      final double totalProbes = numProbes;
      return totalProbes / numKeyLookups;
    }
    throw new IllegalStateException();
  }

  /**
   * Reports how often a lookup met a stored key with the same hashcode and length as the probe
   * key but different bytes.
   *
   * @return the number of hash collisions observed so far
   * @throws IllegalStateException if performance metrics were not enabled for this map
   */
  public long getNumHashCollisions() {
    if (enablePerfMetrics) {
      return numHashCollisions;
    }
    throw new IllegalStateException();
  }

  /**
   * Returns the number of data pages currently tracked by this map.
   */
  @VisibleForTesting
  int getNumDataPages() {
    return dataPages.size();
  }

  /**
   * Replaces the hash table with a larger one and re-inserts every existing entry into it. The
   * key and value bytes in the data pages are not moved; only the (address, hashcode) pairs are
   * copied across.
   */
  @VisibleForTesting
  void growAndRehash() {
    final long resizeStartTime = enablePerfMetrics ? System.nanoTime() : -1;

    // Hold on to the current structures; allocate() overwrites the fields.
    final LongArray previousArray = longArray;
    final BitSet previousBits = bitset;
    final int previousCapacity = (int) previousBits.capacity();

    allocate(Math.min(growthStrategy.nextCapacity(previousCapacity), MAX_CAPACITY));

    // Re-mask each entry (the full 32-bit hashcode was stored, so nothing is re-hashed).
    int oldPos = previousBits.nextSetBit(0);
    while (oldPos >= 0) {
      final long keyPointer = previousArray.get(oldPos * 2);
      final int hashcode = (int) previousArray.get(oldPos * 2 + 1);

      // All existing keys are distinct, so unlike lookup() there is no equality check here:
      // just probe until a free slot turns up.
      int target = hashcode & mask;
      for (int step = 1; bitset.isSet(target); step++) {
        target = (target + step) & mask;
      }
      bitset.set(target);
      longArray.set(target * 2, keyPointer);
      longArray.set(target * 2 + 1, hashcode);

      oldPos = previousBits.nextSetBit(oldPos + 1);
    }

    // The old array is no longer referenced; hand its memory back.
    memoryManager.free(previousArray.memoryBlock());
    if (enablePerfMetrics) {
      timeSpentResizingNs += System.nanoTime() - resizeStartTime;
    }
  }

  /**
   * Rounds {@code num} up to a power of 2.
   *
   * @param num the value to round; an exact power of 2 (and 0) is returned unchanged
   * @return {@code num} itself if its highest set bit is its only set bit, otherwise twice its
   *         highest set bit
   */
  private static long nextPowerOf2(long num) {
    final long largestPowerNotAbove = Long.highestOneBit(num);
    if (largestPowerNotAbove == num) {
      return num;
    }
    return largestPowerNotAbove << 1;
  }
}