// Source listing of org.apache.mahout.math.set.OpenHashSet (artifact page: Maven / Gradle / Ivy).
// Artifact: mahout-collections ("Show all versions" and "Show documentation" were navigation links on that page).
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.mahout.math.set;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.mahout.math.function.ObjectProcedure;
import org.apache.mahout.math.map.PrimeFinder;
/**
* Open hashing alternative to java.util.HashSet.
**/
public class OpenHashSet extends AbstractSet implements Set {
protected static final byte FREE = 0;
protected static final byte FULL = 1;
protected static final byte REMOVED = 2;
protected static final char NO_KEY_VALUE = 0;
/** The hash table keys. */
private Object[] table;
/** The state of each hash table entry (FREE, FULL, REMOVED). */
private byte[] state;
/** The number of table entries in state==FREE. */
private int freeEntries;
/** Constructs an empty map with default capacity and default load factors. */
public OpenHashSet() {
this(defaultCapacity);
}
/**
* Constructs an empty map with the specified initial capacity and default load factors.
*
* @param initialCapacity the initial capacity of the map.
* @throws IllegalArgumentException if the initial capacity is less than zero.
*/
public OpenHashSet(int initialCapacity) {
this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor);
}
/**
* Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
*
* @param initialCapacity the initial capacity.
* @param minLoadFactor the minimum load factor.
* @param maxLoadFactor the maximum load factor.
* @throws IllegalArgumentException if initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
* (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
* maxLoadFactor).
*/
public OpenHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
setUp(initialCapacity, minLoadFactor, maxLoadFactor);
}
/** Removes all values associations from the receiver. Implicitly calls trimToSize(). */
@Override
public void clear() {
Arrays.fill(this.state, 0, state.length - 1, FREE);
distinct = 0;
freeEntries = table.length; // delta
trimToSize();
}
/**
* Returns a deep copy of the receiver.
*
* @return a deep copy of the receiver.
*/
@SuppressWarnings("unchecked")
@Override
public Object clone() {
OpenHashSet copy = (OpenHashSet) super.clone();
copy.table = copy.table.clone();
copy.state = copy.state.clone();
return copy;
}
/**
* Returns true if the receiver contains the specified key.
*
* @return true if the receiver contains the specified key.
*/
@Override
@SuppressWarnings("unchecked")
public boolean contains(Object key) {
return indexOfKey((T)key) >= 0;
}
/**
* Ensures that the receiver can hold at least the specified number of associations without needing to allocate new
* internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. This
* method never need be called; it is for performance tuning only. Calling this method before add()ing a
* large number of associations boosts performance, because the receiver will grow only once instead of potentially
* many times and hash collisions get less probable.
*
* @param minCapacity the desired minimum capacity.
*/
@Override
public void ensureCapacity(int minCapacity) {
if (table.length < minCapacity) {
int newCapacity = nextPrime(minCapacity);
rehash(newCapacity);
}
}
/**
* Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no particular order.
* Subclasses can define a particular order, for example, "sorted by key". All methods which can be expressed
* in terms of this method (most methods can) must guarantee to use the same order defined by this
* method, even if it is no particular order. This is necessary so that, for example, methods keys and
* values will yield association pairs, not two uncorrelated lists.
*
* @param procedure the procedure to be applied. Stops iteration if the procedure returns false, otherwise
* continues.
* @return false if the procedure stopped before all keys where iterated over, true otherwise.
*/
@SuppressWarnings("unchecked")
public boolean forEachKey(ObjectProcedure procedure) {
for (int i = table.length; i-- > 0;) {
if (state[i] == FULL) {
if (!procedure.apply((T)table[i])) {
return false;
}
}
}
return true;
}
/**
* @param key the key to be added to the receiver.
* @return the index where the key would need to be inserted, if it is not already contained. Returns -index-1 if the
* key is already contained at slot index. Therefore, if the returned index < 0, then it is already contained
* at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
* slot index.
*/
protected int indexOfInsertion(T key) {
Object[] tab = table;
byte[] stat = state;
int length = tab.length;
int hash = key.hashCode() & 0x7FFFFFFF;
int i = hash % length;
int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
//int decrement = (hash / length) % length;
if (decrement == 0) {
decrement = 1;
}
// stop if we find a removed or free slot, or if we find the key itself
// do NOT skip over removed slots (yes, open addressing is like that...)
while (stat[i] == FULL && tab[i] != key) {
i -= decrement;
//hashCollisions++;
if (i < 0) {
i += length;
}
}
if (stat[i] == REMOVED) {
// stop if we find a free slot, or if we find the key itself.
// do skip over removed slots (yes, open addressing is like that...)
// assertion: there is at least one FREE slot.
int j = i;
while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) {
i -= decrement;
//hashCollisions++;
if (i < 0) {
i += length;
}
}
if (stat[i] == FREE) {
i = j;
}
}
if (stat[i] == FULL) {
// key already contained at slot i.
// return a negative number identifying the slot.
return -i - 1;
}
// not already contained, should be inserted at slot i.
// return a number >= 0 identifying the slot.
return i;
}
/**
* @param key the key to be searched in the receiver.
* @return the index where the key is contained in the receiver, returns -1 if the key was not found.
*/
protected int indexOfKey(T key) {
Object[] tab = table;
byte[] stat = state;
int length = tab.length;
int hash = key.hashCode() & 0x7FFFFFFF;
int i = hash % length;
int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
//int decrement = (hash / length) % length;
if (decrement == 0) {
decrement = 1;
}
// stop if we find a free slot, or if we find the key itself.
// do skip over removed slots (yes, open addressing is like that...)
while (stat[i] != FREE && (stat[i] == REMOVED || (!key.equals(tab[i])))) {
i -= decrement;
//hashCollisions++;
if (i < 0) {
i += length;
}
}
if (stat[i] == FREE) {
return -1;
} // not found
return i; //found, return index where key is contained
}
/**
* Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
* call returns the specified list has a new size that equals this.size().
* This method can be used
* to iterate over the keys of the receiver.
*
* @param list the list to be filled, can have any size.
*/
@SuppressWarnings("unchecked")
public void keys(List list) {
list.clear();
Object [] tab = table;
byte[] stat = state;
for (int i = tab.length; i-- > 0;) {
if (stat[i] == FULL) {
list.add((T)tab[i]);
}
}
}
@SuppressWarnings("unchecked")
@Override
public boolean add(Object key) {
int i = indexOfInsertion((T)key);
if (i < 0) { //already contained
return false;
}
if (this.distinct > this.highWaterMark) {
int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
rehash(newCapacity);
return add(key);
}
this.table[i] = key;
if (this.state[i] == FREE) {
this.freeEntries--;
}
this.state[i] = FULL;
this.distinct++;
if (this.freeEntries < 1) { //delta
int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
rehash(newCapacity);
return add(key);
}
return true;
}
/**
* Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
* automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
* mark.
*/
@SuppressWarnings("unchecked")
protected void rehash(int newCapacity) {
int oldCapacity = table.length;
//if (oldCapacity == newCapacity) return;
Object[] oldTable = table;
byte[] oldState = state;
Object[] newTable = new Object[newCapacity];
byte[] newState = new byte[newCapacity];
this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
this.table = newTable;
this.state = newState;
this.freeEntries = newCapacity - this.distinct; // delta
for (int i = oldCapacity; i-- > 0;) {
if (oldState[i] == FULL) {
Object element = oldTable[i];
int index = indexOfInsertion((T)element);
newTable[index] = element;
newState[index] = FULL;
}
}
}
/**
* Removes the given key with its associated element from the receiver, if present.
*
* @param key the key to be removed from the receiver.
* @return true if the receiver contained the specified key, false otherwise.
*/
@SuppressWarnings("unchecked")
@Override
public boolean remove(Object key) {
int i = indexOfKey((T)key);
if (i < 0) {
return false;
} // key not contained
this.state[i] = REMOVED;
this.distinct--;
if (this.distinct < this.lowWaterMark) {
int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
rehash(newCapacity);
}
return true;
}
/**
* Initializes the receiver.
*
* @param initialCapacity the initial capacity of the receiver.
* @param minLoadFactor the minLoadFactor of the receiver.
* @param maxLoadFactor the maxLoadFactor of the receiver.
* @throws IllegalArgumentException if initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
* (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
* maxLoadFactor).
*/
@Override
protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
int capacity = initialCapacity;
super.setUp(capacity, minLoadFactor, maxLoadFactor);
capacity = nextPrime(capacity);
if (capacity == 0) {
capacity = 1;
} // open addressing needs at least one FREE slot at any time.
this.table = new Object[capacity];
this.state = new byte[capacity];
// memory will be exhausted long before this pathological case happens, anyway.
this.minLoadFactor = minLoadFactor;
if (capacity == PrimeFinder.largestPrime) {
this.maxLoadFactor = 1.0;
} else {
this.maxLoadFactor = maxLoadFactor;
}
this.distinct = 0;
this.freeEntries = capacity; // delta
// lowWaterMark will be established upon first expansion.
// establishing it now (upon instance construction) would immediately make the table shrink upon first put(...).
// After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object is young.
// See ensureCapacity(...)
this.lowWaterMark = 0;
this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
}
/**
* Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
* application can use this operation to minimize the storage of the receiver.
*/
@Override
public void trimToSize() {
// * 1.2 because open addressing's performance exponentially degrades beyond that point
// so that even rehashing the table can take very long
int newCapacity = nextPrime((int) (1 + 1.2 * size()));
if (table.length > newCapacity) {
rehash(newCapacity);
}
}
/**
* Access for unit tests.
* @param capacity
* @param minLoadFactor
* @param maxLoadFactor
*/
void getInternalFactors(int[] capacity,
double[] minLoadFactor,
double[] maxLoadFactor) {
capacity[0] = table.length;
minLoadFactor[0] = this.minLoadFactor;
maxLoadFactor[0] = this.maxLoadFactor;
}
@Override
public boolean isEmpty() {
return size() == 0;
}
/**
* OpenHashSet instances are only equal to other OpenHashSet instances, not to
* any other collection. Hypothetically, we should check for and permit
* equals on other Sets.
*/
@SuppressWarnings("unchecked")
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (!(obj instanceof OpenHashSet)) {
return false;
}
final OpenHashSet other = (OpenHashSet) obj;
if (other.size() != size()) {
return false;
}
return
forEachKey(
new ObjectProcedure() {
@Override
public boolean apply(T key) {
return other.contains(key);
}
}
);
}
/**
* Implement the standard Java Collections iterator. Note that 'remove' is silently
* ineffectual here. This method is provided for convenience, only.
*/
@Override
public Iterator iterator() {
List keyList = new ArrayList();
keys(keyList);
return keyList.iterator();
}
@Override
public Object[] toArray() {
List keyList = new ArrayList();
keys(keyList);
return keyList.toArray();
}
@Override
public boolean addAll(Collection extends T> c) {
boolean anyAdded = false;
for(T o : c) {
boolean added = add(o);
anyAdded |= added;
}
return anyAdded;
}
@Override
public boolean containsAll(Collection> c) {
for (Object o : c) {
if (!contains(o)) {
return false;
}
}
return true;
}
@Override
public boolean removeAll(Collection> c) {
boolean anyRemoved = false;
for(Object o : c) {
boolean removed = remove(o);
anyRemoved |= removed;
}
return anyRemoved;
}
@Override
public boolean retainAll(Collection> c) {
final Collection> finalCollection = c;
final boolean[] modified = new boolean[1];
modified[0] = false;
forEachKey(new ObjectProcedure() {
@Override
public boolean apply(T element) {
if (!finalCollection.contains(element)) {
remove(element);
modified[0] = true;
}
return true;
}});
return modified[0];
}
@Override
public T2[] toArray(T2[] a) {
List keys = new ArrayList();
keys(keys);
return keys.toArray(a);
}
public List keys() {
List keys = new ArrayList();
keys(keys);
return keys;
}
}