
/*
* Copyright 2016 Chris Purcell. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.alicep.collect;
import static com.google.common.base.MoreObjects.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import java.io.IOException;
import java.io.Serializable;
import java.io.StreamCorruptedException;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
/**
* Hash table and array implementation of the {@link Set} interface,
* with predictable iteration order. This implementation is similar to
* {@link LinkedHashSet}, but uses a more compact memory representation
* originally pioneered by PyPy, and subsequently
* adopted in Python 3.6.
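*
* <p>A minimal usage sketch (the {@code String} element type is illustrative only):
* <pre>{@code
*   Set<String> names = new CompactSet<>();
*   names.add("alice");
*   names.add("bob");
*   boolean hasAlice = names.contains("alice");  // true
*   names.remove("bob");                         // elements iterate in insertion order
* }</pre>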
*
* <p>This class provides all of the optional Set operations, and
* permits null elements. Like {@link HashSet}, it provides constant-time
* performance for the basic operations (add, contains and
* remove), assuming the hash function disperses elements
* properly among the buckets. Performance is typically within 5% of
* HashSet, with only around a third of the memory overhead
* (a quarter of LinkedHashSet, and close to a plain {@link ArrayList}).
*
* <p>Unlike HashSet and LinkedHashSet, this class does not cache the
* {@linkplain Object#hashCode() hash code value} of its elements, as this is typically
* redundant: numeric types are a trivial transformation, while Strings already cache
* their hash values. This may however result in a significant negative performance
* impact if element hashCode/equality checks are expensive.
*
* <p>Note that this implementation is not synchronized.
* If multiple threads access a compact set concurrently, and at least
* one of the threads modifies the set, it must be synchronized
* externally. This is typically accomplished by synchronizing on some
* object that naturally encapsulates the set.
*
* If no such object exists, the set should be "wrapped" using the
* {@link Collections#synchronizedSet Collections.synchronizedSet}
* method. This is best done at creation time, to prevent accidental
* unsynchronized access to the set:
* <pre>{@code   Set<...> s = Collections.synchronizedSet(new CompactSet<>(...));}</pre>
*
* <p>The iterators returned by this class's iterator method are
* fail-fast: if the set is modified at any time after the iterator
* is created, in any way except through the iterator's own remove
* method, the iterator will throw a {@link ConcurrentModificationException}.
* Thus, in the face of concurrent modification, the iterator fails quickly
* and cleanly, rather than risking arbitrary, non-deterministic behavior at
* an undetermined time in the future.
*
* <p>Note that the fail-fast behavior of an iterator cannot be guaranteed
* as it is, generally speaking, impossible to make any hard guarantees in the
* presence of unsynchronized concurrent modification. Fail-fast iterators
* throw ConcurrentModificationException on a best-effort basis.
* Therefore, it would be wrong to write a program that depended on this
* exception for its correctness: the fail-fast behavior of iterators
* should be used only to detect bugs.
*
* <h2>Implementation Details</h2>
*
* <p>This section covers the current implementation; future releases may change
* some or all of the details.
*
* <p>This implementation stores all elements in an insertion-ordered
* array. Lookup is done via a compressed hashtable of indices, using
* double hashing
* to reduce collisions. Small sets achieve a higher compression rate,
* as lookup indexes require fewer bits; a newly-allocated instance
* needs only 4 bits per bucket to index the default 10-element array.
* As the set grows, the element array grows in the same manner as an
* ArrayList, and the index hashtable is regenerated.
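*
* <p>For example, a newly-allocated set has a 10-element element array, so each index
* entry needs only four bits; sixteen entries pack into each {@code long}, and the
* whole 32-bucket index fits in two {@code long}s.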
*
* <p>Iteration performance is similar to ArrayList, though if a lot
* of elements are deleted, the element array will not be shrunk, meaning
* iteration performance does not recover once a set has been large.
*
* <p>Memory overhead is a pointer and a half plus a handful of bits per
* element in the set. In contrast, HashSet allocates approximately
* five pointers plus 16 bytes per element, while LinkedHashSet
* allocates two more pointers on top of that; an ArrayList uses
* around a pointer and a half.
*
* @param <E> the type of elements maintained by this set
*
* @see Set
* @see HashSet
* @see LinkedHashSet
*/
public class CompactSet<E> extends AbstractSet<E> implements Serializable {
private enum Reserved { NULL }
private static final int NO_INDEX = -1;
private static final int DEFAULT_CAPACITY = 10;
private int size = 0;
private int modCount = 0;
private Object[] objects;
private int head = 0;
private long[] lookup;
/**
* Constructs an empty set with an initial capacity of ten.
*/
public CompactSet() {
this(DEFAULT_CAPACITY);
}
/**
* Constructs an empty set with the specified initial capacity.
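*
* <p>For example (illustrative only):
* <pre>{@code   CompactSet<String> ids = CompactSet.withInitialCapacity(1000);}</pre>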
*
* @param initialCapacity the initial capacity of the set
* @return an empty set
* @throws IllegalArgumentException if the specified initial capacity
* is negative
*
* @param <E> the type of elements maintained by the set
*/
public static <E> CompactSet<E> withInitialCapacity(int initialCapacity) {
return new CompactSet<>(initialCapacity);
}
/**
* Constructs a set containing the elements of the specified
* collection, in the order they are returned by the collection's
* iterator. (If an element is duplicated, only the first instance
* will be stored.)
*
* @param elements the collection whose elements are to be placed into this set
* @throws NullPointerException if the specified collection is null
*/
public CompactSet(Collection<? extends E> elements) {
this(elements.size());
addAll(elements);
}
private CompactSet(int initialCapacity) {
checkArgument(initialCapacity >= 0, "initialCapacity must be non-negative");
objects = new Object[Math.max(initialCapacity, DEFAULT_CAPACITY)];
lookup = newLookupArray();
}
@Override
public int size() {
return size;
}
@Override
public Iterator<E> iterator() {
return new IteratorImpl();
}
@Override
public Spliterator<E> spliterator() {
return Spliterators.spliterator(this, Spliterator.DISTINCT | Spliterator.ORDERED);
}
@Override
public boolean contains(Object o) {
Object comparisonObject = (o == null) ? Reserved.NULL : o;
long index = lookup(comparisonObject);
return (index >= 0);
}
@Override
public boolean add(E e) {
Object insertionObject = firstNonNull(e, Reserved.NULL);
// Ensure there is a free cell _before_ looking up index as rehashing invalidates the index.
ensureFreeCell();
long lookupIndex = lookup(insertionObject);
if (lookupIndex >= 0) {
return false;
}
int index = head++;
objects[index] = insertionObject;
addLookup((int) -(lookupIndex + 1), index);
size++;
modCount++;
return true;
}
@Override
public boolean remove(Object o) {
long index = lookup((o == null) ? Reserved.NULL : o);
if (index < 0) {
return false;
}
deleteObjectAtIndex((int) index);
return true;
}
/* Lookup methods */
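// The lookup table is a bit-packed, open-addressed hash of indices into the objects
// array: each bucket holds either an element index or the all-ones "empty" marker,
// using log2ceil(objects.length) bits per bucket.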
private static int log2ceil(int value) {
return 32 - Integer.numberOfLeadingZeros(value - 1);
}
private int lookupEntryBits() {
return log2ceil(objects.length);
}
private int lookupEntriesPerLong() {
return Long.SIZE / lookupEntryBits();
}
private long[] newLookupArray() {
// Aim for a power of two with 50% occupancy maximum
int numCells = 1 << (log2ceil(objects.length) + 1);
while (objects.length * 2 > numCells) {
numCells = numCells * 2;
}
int cellsPerLong = lookupEntriesPerLong();
long[] lookup = new long[1 + (numCells - 1) / cellsPerLong];
Arrays.fill(lookup, -1);
return lookup;
}
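// Records that the element stored at objects[index] lives in lookup bucket lookupIndex.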
private void addLookup(int lookupIndex, int index) {
assertState(index != NO_INDEX, "Invalid index");
if (lookupEntryBits() < Long.SIZE) {
addLookupNibble(lookupIndex, index);
} else {
lookup[lookupIndex] = index;
}
}
private long lookupMask() {
return (1 << lookupEntryBits()) - 1;
}
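// Several entries share each long, so clear the target slot's bits and OR in the new index.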
private void addLookupNibble(int lookupIndex, int index) {
long word = lookup[lookupIndex / lookupEntriesPerLong()];
int shift = lookupEntryBits() * (lookupIndex % lookupEntriesPerLong());
word &= ~(lookupMask() << shift);
word |= (index & lookupMask()) << shift;
lookup[lookupIndex / lookupEntriesPerLong()] = word;
}
/**
* If {@code obj} is in the {@code objects} array, returns its index; otherwise, returns
* {@code (-(probe insertion point) - 1)}, where "probe insertion point" is
* the index of first free cell in {@code lookup} along the probe sequence for {@code obj}.
*/
private long lookup(Object obj) {
int mask = numLookupCells() - 1;
int tombstoneIndex = -1;
int lookupIndex = obj.hashCode();
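// Double hashing: derive an odd stride from the bit-reversed hash. The table size is a
// power of two, so an odd stride visits every bucket before repeating.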
int stride = Integer.reverse(lookupIndex) * 2 + 1;
lookupIndex &= mask;
stride &= mask;
int index;
while ((index = getLookupAt(lookupIndex)) != NO_INDEX) {
Object other = objects[index];
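// A null slot in objects is a deleted element (tombstone); remember the first such
// bucket so an insertion can reuse it.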
if (other == null) {
if (tombstoneIndex == -1) {
tombstoneIndex = lookupIndex;
}
} else if (other.equals(obj)) {
return index;
}
lookupIndex += stride;
lookupIndex &= mask;
}
if (tombstoneIndex != -1) {
return -tombstoneIndex - 1;
} else {
return -lookupIndex - 1;
}
}
private int numLookupCells() {
return Integer.highestOneBit(lookup.length * lookupEntriesPerLong());
}
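// Reads the bit-packed entry at lookupIndex; an all-ones slot decodes to NO_INDEX (empty).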
private int getLookupAt(int lookupIndex) {
long word = lookup[lookupIndex / lookupEntriesPerLong()];
int shift = lookupEntryBits() * (lookupIndex % lookupEntriesPerLong());
int value = (int) ((word >> shift) & lookupMask());
return (value == (NO_INDEX & lookupMask())) ? -1 : value;
}
private void clearLookupArray() {
Arrays.fill(lookup, -1);
}
/* Other internal methods */
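// Ensures objects[head] is free to write: grows the array by 50% if it is mostly live
// elements, otherwise just compacts away tombstones in place.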
private void ensureFreeCell() {
if (objects.length == head) {
if (size >= minGrowthThreshold()) {
int newSize = objects.length + (objects.length >> 1);
objects = Arrays.copyOf(objects, newSize);
lookup = null;
}
compact();
}
}
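// Removal leaves a null tombstone in objects; its lookup bucket is reclaimed on the next compaction.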
private void deleteObjectAtIndex(int index) {
assertState(objects[index] != null, "Cannot delete empty cell");
assertState(size != 0, "Size is 0 but a cell is not empty");
objects[index] = null;
size--;
modCount++;
}
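// Slides live elements to the front of objects (preserving insertion order) and rebuilds
// the lookup table from scratch.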
private void compact() {
if (lookup == null) {
lookup = newLookupArray();
} else {
clearLookupArray();
}
int target = 0;
for (int source = 0; source < objects.length; source++) {
Object e = objects[source];
if (e == null) {
continue;
}
if (source != target) {
objects[target] = e;
}
long freeLookupCell = -(lookup(e) + 1);
checkState(freeLookupCell >= 0);
addLookup((int) freeLookupCell, target);
target++;
}
for (; target < objects.length; target++) {
objects[target] = null;
}
head = size;
}
private int minGrowthThreshold() {
// Grow the objects array if less than a quarter of it is DELETED tombstones when it fills up.
return objects.length * 3 / 4;
}
private static void assertState(boolean condition, String message, Object... args) {
if (!condition) {
throw new AssertionError(String.format(message, args));
}
}
/* Serialization */
private static final long serialVersionUID = 0;
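// Serialized form: the element count followed by the live elements in iteration order;
// the lookup table is rebuilt on deserialization.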
private void writeObject(java.io.ObjectOutputStream s) throws IOException {
s.writeInt(size);
for (int i = 0; i < head; ++i) {
Object o = objects[i];
if (o != null) {
s.writeObject(o == Reserved.NULL ? null : o);
}
}
}
private void readObject(java.io.ObjectInputStream s) throws IOException, ClassNotFoundException {
size = s.readInt();
objects = new Object[Math.max(size, DEFAULT_CAPACITY)];
lookup = newLookupArray();
clearLookupArray();
for (head = 0; head < size; head++) {
Object e = firstNonNull(s.readObject(), Reserved.NULL);
objects[head] = e;
long x = lookup(e);
long freeLookupCell = -(x + 1);
if (freeLookupCell < 0) {
throw new StreamCorruptedException("Duplicate data found in serialized set");
}
addLookup((int) freeLookupCell, head);
}
}
/* Iteration */
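// Iterates the objects array in insertion order, skipping null tombstones; fails fast
// on concurrent modification via modCount.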
private class IteratorImpl implements Iterator<E> {
private int expectedModCount;
private int index;
private int nextIndex;
IteratorImpl() {
expectedModCount = modCount;
index = -1;
nextIndex = 0;
while (nextIndex < head && objects[nextIndex] == null) {
nextIndex++;
}
}
@Override
public boolean hasNext() {
if (modCount != expectedModCount) {
throw new ConcurrentModificationException();
}
return nextIndex < head;
}
@Override
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
index = nextIndex;
do {
nextIndex++;
} while (nextIndex < head && objects[nextIndex] == null);
@SuppressWarnings("unchecked")
E o = (E) objects[index];
if (o == null) {
throw new ConcurrentModificationException();
}
return (o == Reserved.NULL) ? null : o;
}
@Override
public void remove() {
checkState(index != -1);
if (modCount != expectedModCount) {
throw new ConcurrentModificationException();
}
deleteObjectAtIndex(index);
index = -1;
expectedModCount = modCount;
}
}
}