
/*
* Copyright 2016 Chris Purcell. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.alicep.collect;
import static com.google.common.base.MoreObjects.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import java.io.IOException;
import java.io.Serializable;
import java.io.StreamCorruptedException;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
/**
* Hash table and array implementation of the {@link Set} interface,
* with predictable iteration order. This implementation is similar to
* {@link LinkedHashSet}, but uses a more compact memory representation
* originally pioneered by PyPy, and subsequently
* adopted in Python 3.6.
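*
* <p>A minimal usage sketch (the {@code String} element type is illustrative only):
* <pre>{@code
*   Set<String> names = new CompactSet<>();
*   names.add("alice");
*   names.add("bob");
*   boolean hasAlice = names.contains("alice");  // true
*   names.remove("bob");                         // elements iterate in insertion order
* }</pre>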
*
* <p>This class provides all of the optional Set operations, and
* permits null elements. Like {@link HashSet}, it provides constant-time
* performance for the basic operations (add, contains and
* remove), assuming the hash function disperses elements
* properly among the buckets. Performance is typically within 5% of
* HashSet, with only around a third of the memory overhead
* (a quarter of LinkedHashSet, and close to a plain {@link ArrayList}).
*
* <p>Unlike HashSet and LinkedHashSet, this class does not cache the
* {@linkplain Object#hashCode() hash code value} of its elements, as this is typically
* redundant: numeric types are a trivial transformation, while Strings already cache
* their hash values. This may however result in a significant negative performance
* impact if element hashCode/equality checks are expensive.
*
* <p>Note that this implementation is not synchronized.
* If multiple threads access a compact set concurrently, and at least
* one of the threads modifies the set, it must be synchronized
* externally. This is typically accomplished by synchronizing on some
* object that naturally encapsulates the set.
*
* If no such object exists, the set should be "wrapped" using the
* {@link Collections#synchronizedSet Collections.synchronizedSet}
* method. This is best done at creation time, to prevent accidental
* unsynchronized access to the set:
* <pre>{@code   Set<...> s = Collections.synchronizedSet(new CompactSet<>(...));}</pre>
*
* <p>The iterators returned by this class's iterator method are
* fail-fast: if the set is modified at any time after the iterator
* is created, in any way except through the iterator's own remove
* method, the iterator will throw a {@link ConcurrentModificationException}.
* Thus, in the face of concurrent modification, the iterator fails quickly
* and cleanly, rather than risking arbitrary, non-deterministic behavior at
* an undetermined time in the future.
*
* <p>Note that the fail-fast behavior of an iterator cannot be guaranteed
* as it is, generally speaking, impossible to make any hard guarantees in the
* presence of unsynchronized concurrent modification. Fail-fast iterators
* throw ConcurrentModificationException on a best-effort basis.
* Therefore, it would be wrong to write a program that depended on this
* exception for its correctness: the fail-fast behavior of iterators
* should be used only to detect bugs.
*
* <h2>Implementation Details</h2>
*
* <p>This section covers the current implementation; future releases may change
* some or all of the details.
*
* <p>This implementation stores all elements in an insertion-ordered
* array. Lookup is done via a compressed hashtable of indices, using
* double hashing
* to reduce collisions. Small sets achieve a higher compression rate,
* as lookup indexes require fewer bits; a newly-allocated instance
* needs only 4 bits per bucket to index the default 10-element array.
* As the set grows, the element array grows in the same manner as an
* ArrayList, and the index hashtable is regenerated.
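*
* <p>For example, a newly-allocated set has a 10-element element array, so each index
* entry needs only four bits; sixteen entries pack into each {@code long}, and the
* whole 32-bucket index fits in two {@code long}s.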
*
* <p>Iteration performance is similar to ArrayList, though if a lot
* of elements are deleted, the element array will not be shrunk, meaning
* iteration performance does not recover once a set has been large.
*
* <p>Memory overhead is a pointer and a half plus a handful of bits per
* element in the set. In contrast, HashSet allocates approximately
* five pointers plus 16 bytes per element, while LinkedHashSet
* allocates two more pointers on top of that; an ArrayList uses
* around a pointer and a half.
*
* @param <E> the type of elements maintained by this set
*
* @see Set
* @see HashSet
* @see LinkedHashSet
*/
public class CompactSet<E> extends AbstractSet<E> implements Serializable {
private enum Reserved { NULL }
private static final int NO_INDEX = -1;
private static final int DEFAULT_CAPACITY = 10;
private int size = 0;
private int modCount = 0;
private Object[] objects;
private int head = 0;
private long[] lookup;
/**
* Constructs an empty set with an initial capacity of ten.
*/
public CompactSet() {
this(DEFAULT_CAPACITY);
}
/**
* Constructs an empty set with the specified initial capacity.
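*
* <p>For example (illustrative only):
* <pre>{@code   CompactSet<String> ids = CompactSet.withInitialCapacity(1000);}</pre>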
*
* @param initialCapacity the initial capacity of the set
* @return an empty set
* @throws IllegalArgumentException if the specified initial capacity
* is negative
*
* @param <E> the type of elements maintained by the set
*/
public static <E> CompactSet<E> withInitialCapacity(int initialCapacity) {
return new CompactSet<>(initialCapacity);
}
/**
* Constructs a set containing the elements of the specified
* collection, in the order they are returned by the collection's
* iterator. (If an element is duplicated, only the first instance
* will be stored.)
*
* @param elements the collection whose elements are to be placed into this set
* @throws NullPointerException if the specified collection is null
*/
public CompactSet(Collection<? extends E> elements) {
this(elements.size());
addAll(elements);
}
private CompactSet(int initialCapacity) {
checkArgument(initialCapacity >= 0, "initialCapacity must be non-negative");
objects = new Object[Math.max(initialCapacity, DEFAULT_CAPACITY)];
lookup = newLookupArray();
}
@Override
public int size() {
return size;
}
@Override
public Iterator<E> iterator() {
return new IteratorImpl();
}
@Override
public Spliterator<E> spliterator() {
return Spliterators.spliterator(this, Spliterator.DISTINCT | Spliterator.ORDERED);
}
@Override
public boolean contains(Object o) {
Object comparisonObject = (o == null) ? Reserved.NULL : o;
long index = lookup(comparisonObject);
return (index >= 0);
}
@Override
public boolean add(E e) {
Object insertionObject = firstNonNull(e, Reserved.NULL);
// Ensure there is a free cell _before_ looking up index as rehashing invalidates the index.
ensureFreeCell();
long lookupIndex = lookup(insertionObject);
if (lookupIndex >= 0) {
return false;
}
int index = head++;
objects[index] = insertionObject;
addLookup((int) -(lookupIndex + 1), index);
size++;
modCount++;
return true;
}
@Override
public boolean remove(Object o) {
long index = lookup((o == null) ? Reserved.NULL : o);
if (index < 0) {
return false;
}
deleteObjectAtIndex((int) index);
return true;
}
/* Lookup methods */
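// The lookup table is a bit-packed, open-addressed hash of indices into the objects
// array: each bucket holds either an element index or the all-ones "empty" marker,
// using log2ceil(objects.length) bits per bucket.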
private static int log2ceil(int value) {
return 32 - Integer.numberOfLeadingZeros(value - 1);
}
private int lookupEntryBits() {
return log2ceil(objects.length);
}
private int lookupEntriesPerLong() {
return Long.SIZE / lookupEntryBits();
}
private long[] newLookupArray() {
// Aim for a power of two with 50% occupancy maximum
int numCells = 1 << (log2ceil(objects.length) + 1);
while (objects.length * 2 > numCells) {
numCells = numCells * 2;
}
int cellsPerLong = lookupEntriesPerLong();
long[] lookup = new long[1 + (numCells - 1) / cellsPerLong];
Arrays.fill(lookup, -1);
return lookup;
}
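// Records that the element stored at objects[index] lives in lookup bucket lookupIndex.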
private void addLookup(int lookupIndex, int index) {
assertState(index != NO_INDEX, "Invalid index");
if (lookupEntryBits() < Long.SIZE) {
addLookupNibble(lookupIndex, index);
} else {
lookup[lookupIndex] = index;
}
}
private long lookupMask() {
return (1 << lookupEntryBits()) - 1;
}
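// Several entries share each long, so clear the target slot's bits and OR in the new index.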
private void addLookupNibble(int lookupIndex, int index) {
long word = lookup[lookupIndex / lookupEntriesPerLong()];
int shift = lookupEntryBits() * (lookupIndex % lookupEntriesPerLong());
word &= ~(lookupMask() << shift);
word |= (index & lookupMask()) << shift;
lookup[lookupIndex / lookupEntriesPerLong()] = word;
}
/**
* If {@code obj} is in the {@code objects} array, returns its index; otherwise, returns
* {@code (-(probe insertion point) - 1)}, where "probe insertion point" is
* the index of first free cell in {@code lookup} along the probe sequence for {@code obj}.
*/
private long lookup(Object obj) {
int mask = numLookupCells() - 1;
int tombstoneIndex = -1;
int lookupIndex = obj.hashCode();
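// Double hashing: derive an odd stride from the bit-reversed hash. The table size is a
// power of two, so an odd stride visits every bucket before repeating.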
int stride = Integer.reverse(lookupIndex) * 2 + 1;
lookupIndex &= mask;
stride &= mask;
int index;
while ((index = getLookupAt(lookupIndex)) != NO_INDEX) {
Object other = objects[index];
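// A null slot in objects is a deleted element (tombstone); remember the first such
// bucket so an insertion can reuse it.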
if (other == null) {
if (tombstoneIndex == -1) {
tombstoneIndex = lookupIndex;
}
} else if (other.equals(obj)) {
return index;
}
lookupIndex += stride;
lookupIndex &= mask;
}
if (tombstoneIndex != -1) {
return -tombstoneIndex - 1;
} else {
return -lookupIndex - 1;
}
}
private int numLookupCells() {
return Integer.highestOneBit(lookup.length * lookupEntriesPerLong());
}
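// Reads the bit-packed entry at lookupIndex; an all-ones slot decodes to NO_INDEX (empty).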
private int getLookupAt(int lookupIndex) {
long word = lookup[lookupIndex / lookupEntriesPerLong()];
int shift = lookupEntryBits() * (lookupIndex % lookupEntriesPerLong());
int value = (int) ((word >> shift) & lookupMask());
return (value == (NO_INDEX & lookupMask())) ? -1 : value;
}
private void clearLookupArray() {
Arrays.fill(lookup, -1);
}
/* Other internal methods */
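// Ensures objects[head] is free to write: grows the array by 50% if it is mostly live
// elements, otherwise just compacts away tombstones in place.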
private void ensureFreeCell() {
if (objects.length == head) {
if (size >= minGrowthThreshold()) {
int newSize = objects.length + (objects.length >> 1);
objects = Arrays.copyOf(objects, newSize);
lookup = null;
}
compact();
}
}
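// Removal leaves a null tombstone in objects; its lookup bucket is reclaimed on the next compaction.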
private void deleteObjectAtIndex(int index) {
assertState(objects[index] != null, "Cannot delete empty cell");
assertState(size != 0, "Size is 0 but a cell is not empty");
objects[index] = null;
size--;
modCount++;
}
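// Slides live elements to the front of objects (preserving insertion order) and rebuilds
// the lookup table from scratch.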
private void compact() {
if (lookup == null) {
lookup = newLookupArray();
} else {
clearLookupArray();
}
int target = 0;
for (int source = 0; source < objects.length; source++) {
Object e = objects[source];
if (e == null) {
continue;
}
if (source != target) {
objects[target] = e;
}
long freeLookupCell = -(lookup(e) + 1);
checkState(freeLookupCell >= 0);
addLookup((int) freeLookupCell, target);
target++;
}
for (; target < objects.length; target++) {
objects[target] = null;
}
head = size;
}
private int minGrowthThreshold() {
// Grow the objects array if less than a quarter of it is DELETED tombstones when it fills up.
return objects.length * 3 / 4;
}
private static void assertState(boolean condition, String message, Object... args) {
if (!condition) {
throw new AssertionError(String.format(message, args));
}
}
/* Serialization */
private static final long serialVersionUID = 0;
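// Serialized form: the element count followed by the live elements in iteration order;
// the lookup table is rebuilt on deserialization.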
private void writeObject(java.io.ObjectOutputStream s) throws IOException {
s.writeInt(size);
for (int i = 0; i < head; ++i) {
Object o = objects[i];
if (o != null) {
s.writeObject(o == Reserved.NULL ? null : o);
}
}
}
private void readObject(java.io.ObjectInputStream s) throws IOException, ClassNotFoundException {
size = s.readInt();
objects = new Object[Math.max(size, DEFAULT_CAPACITY)];
lookup = newLookupArray();
clearLookupArray();
for (head = 0; head < size; head++) {
Object e = firstNonNull(s.readObject(), Reserved.NULL);
objects[head] = e;
long x = lookup(e);
long freeLookupCell = -(x + 1);
if (freeLookupCell < 0) {
throw new StreamCorruptedException("Duplicate data found in serialized set");
}
addLookup((int) freeLookupCell, head);
}
}
/* Iteration */
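// Iterates the objects array in insertion order, skipping null tombstones; fails fast
// on concurrent modification via modCount.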
private class IteratorImpl implements Iterator<E> {
private int expectedModCount;
private int index;
private int nextIndex;
IteratorImpl() {
expectedModCount = modCount;
index = -1;
nextIndex = 0;
while (nextIndex < head && objects[nextIndex] == null) {
nextIndex++;
}
}
@Override
public boolean hasNext() {
if (modCount != expectedModCount) {
throw new ConcurrentModificationException();
}
return nextIndex < head;
}
@Override
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
index = nextIndex;
do {
nextIndex++;
} while (nextIndex < head && objects[nextIndex] == null);
@SuppressWarnings("unchecked")
E o = (E) objects[index];
if (o == null) {
throw new ConcurrentModificationException();
}
return (o == Reserved.NULL) ? null : o;
}
@Override
public void remove() {
checkState(index != -1);
if (modCount != expectedModCount) {
throw new ConcurrentModificationException();
}
deleteObjectAtIndex(index);
index = -1;
expectedModCount = modCount;
}
}
}