org.apache.mahout.math.set.OpenByteHashSet Maven / Gradle / Ivy
Show all versions of mahout-collections Show documentation
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.mahout.math.set;
import java.util.Arrays;
import org.apache.mahout.math.function.ByteProcedure;
import org.apache.mahout.math.list.ByteArrayList;
import org.apache.mahout.math.map.HashFunctions;
import org.apache.mahout.math.map.PrimeFinder;
/**
 * Open-addressing hash set of primitive {@code byte} keys.
 *
 * <p>Keys are stored in one array and a parallel array records, for every slot, whether it is
 * {@code FREE}, {@code FULL} or {@code REMOVED}. Collisions are resolved by double hashing
 * (see {@link #indexOfInsertion(byte)} and {@link #indexOfKey(byte)}).
 */
public class OpenByteHashSet extends AbstractByteSet {

  /** Slot state: never used since the table was (re)built. */
  protected static final byte FREE = 0;
  /** Slot state: currently holds a key. */
  protected static final byte FULL = 1;
  /** Slot state: held a key that has since been removed. */
  protected static final byte REMOVED = 2;
  protected static final byte NO_KEY_VALUE = 0;

  /** The hash table keys. */
  private byte[] table;

  /** The state of each hash table entry (FREE, FULL, REMOVED). */
  private byte[] state;

  /** The number of table entries in state==FREE. */
  private int freeEntries;

  /** Constructs an empty set with default capacity and default load factors. */
  public OpenByteHashSet() {
    this(defaultCapacity);
  }

  /**
   * Constructs an empty set with the specified initial capacity and default load factors.
   *
   * @param initialCapacity the initial capacity of the set.
   * @throws IllegalArgumentException if the initial capacity is less than zero.
   */
  public OpenByteHashSet(int initialCapacity) {
    this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor);
  }

  /**
   * Constructs an empty set with the specified initial capacity and the specified minimum and
   * maximum load factor.
   *
   * @param initialCapacity the initial capacity.
   * @param minLoadFactor   the minimum load factor.
   * @param maxLoadFactor   the maximum load factor.
   * @throws IllegalArgumentException if {@code initialCapacity < 0 || (minLoadFactor < 0.0 ||
   *                                  minLoadFactor >= 1.0) || (maxLoadFactor <= 0.0 ||
   *                                  maxLoadFactor >= 1.0) || (minLoadFactor >= maxLoadFactor)}.
   */
  public OpenByteHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
    setUp(initialCapacity, minLoadFactor, maxLoadFactor);
  }

  /** Removes all keys from the receiver. Implicitly calls trimToSize(). */
  @Override
  public void clear() {
    // Reset every slot. The ranged Arrays.fill overload takes an exclusive end index, so the
    // whole-array overload is used to guarantee the last slot is reset too.
    Arrays.fill(this.state, FREE);
    distinct = 0;
    freeEntries = table.length; // delta
    trimToSize();
  }

  /**
   * Returns a deep copy of the receiver.
   *
   * @return a deep copy of the receiver; the key and state arrays are cloned so the copy does
   *         not share them with the receiver.
   */
  @Override
  public Object clone() {
    OpenByteHashSet copy = (OpenByteHashSet) super.clone();
    copy.table = copy.table.clone();
    copy.state = copy.state.clone();
    return copy;
  }

  /**
   * Returns true if the receiver contains the specified key.
   *
   * @param key the key to look up.
   * @return true if the receiver contains the specified key.
   */
  @Override
  public boolean contains(byte key) {
    return indexOfKey(key) >= 0;
  }

  /**
   * Ensures that the receiver can hold at least the specified number of keys without needing to
   * allocate new internal memory. If necessary, allocates new internal memory and increases the
   * capacity of the receiver. This method never need be called; it is for performance tuning
   * only. Calling this method before add()ing a large number of keys boosts performance, because
   * the receiver will grow only once instead of potentially many times and hash collisions get
   * less probable.
   *
   * @param minCapacity the desired minimum capacity.
   */
  @Override
  public void ensureCapacity(int minCapacity) {
    if (table.length < minCapacity) {
      int newCapacity = nextPrime(minCapacity);
      rehash(newCapacity);
    }
  }

  /**
   * Applies a procedure to each key of the receiver, if any. Note: Iterates over the keys in no
   * particular order. Subclasses can define a particular order, for example, "sorted by key".
   * All methods which can be expressed in terms of this method (most methods can) must guarantee
   * to use the same order defined by this method, even if it is no particular order.
   *
   * @param procedure the procedure to be applied. Stops iteration if the procedure returns
   *                  false, otherwise continues.
   * @return false if the procedure stopped before all keys were iterated over, true otherwise.
   */
  @Override
  public boolean forEachKey(ByteProcedure procedure) {
    // Walks the table from the last slot down to slot 0, visiting only occupied slots.
    for (int i = table.length; i-- > 0;) {
      if (state[i] == FULL) {
        if (!procedure.apply(table[i])) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Finds the slot at which the given key is stored or should be stored.
   *
   * @param key the key to be added to the receiver.
   * @return the index where the key would need to be inserted, if it is not already contained.
   *         Returns -index-1 if the key is already contained at slot index. Therefore, if the
   *         returned index &lt; 0, then it is already contained at slot -index-1. If the returned
   *         index &gt;= 0, then it is NOT already contained and should be inserted at slot index.
   */
  protected int indexOfInsertion(byte key) {
    final int length = table.length;

    final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
    int i = hash % length;
    // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
    int decrement = hash % (length - 2);
    if (decrement == 0) {
      decrement = 1;
    }

    // stop if we find a removed or free slot, or if we find the key itself
    // do NOT skip over removed slots (yes, open addressing is like that...)
    while (state[i] == FULL && table[i] != key) {
      i -= decrement;
      if (i < 0) {
        i += length;
      }
    }

    if (state[i] == REMOVED) {
      // stop if we find a free slot, or if we find the key itself.
      // do skip over removed slots (yes, open addressing is like that...)
      // assertion: there is at least one FREE slot.
      final int j = i; // remember the first reusable (REMOVED) slot on the probe path
      while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
        i -= decrement;
        if (i < 0) {
          i += length;
        }
      }
      if (state[i] == FREE) {
        // key is not present further along the probe path: reuse the removed slot.
        i = j;
      }
    }

    if (state[i] == FULL) {
      // key already contained at slot i.
      // return a negative number identifying the slot.
      return -i - 1;
    }
    // not already contained, should be inserted at slot i.
    // return a number >= 0 identifying the slot.
    return i;
  }

  /**
   * Finds the slot in which the given key is stored.
   *
   * @param key the key to be searched in the receiver.
   * @return the index where the key is contained in the receiver, returns -1 if the key was not
   *         found.
   */
  protected int indexOfKey(byte key) {
    final int length = table.length;

    final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
    int i = hash % length;
    // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
    int decrement = hash % (length - 2);
    if (decrement == 0) {
      decrement = 1;
    }

    // stop if we find a free slot, or if we find the key itself.
    // do skip over removed slots (yes, open addressing is like that...)
    while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
      i -= decrement;
      if (i < 0) {
        i += length;
      }
    }

    if (state[i] == FREE) {
      return -1; // not found
    }
    return i; // found, return index where key is contained
  }

  /**
   * Fills all keys contained in the receiver into the specified list. Fills the list, starting at
   * index 0. After this call returns the specified list has a new size that equals
   * this.size(). Iteration order is guaranteed to be identical to the order used by method
   * {@link #forEachKey(ByteProcedure)}.
   *
   * <p>This method can be used to iterate over the keys of the receiver.
   *
   * @param list the list to be filled, can have any size.
   */
  @Override
  public void keys(ByteArrayList list) {
    list.setSize(distinct);
    byte[] elements = list.elements();

    int j = 0;
    for (int i = table.length; i-- > 0;) {
      if (state[i] == FULL) {
        elements[j++] = table[i];
      }
    }
  }

  /**
   * Adds the given key to the receiver, if it is not already contained.
   *
   * @param key the key to be added to the receiver.
   * @return true if the receiver did not already contain such a key; false if the receiver did
   *         already contain such a key, in which case the receiver is left unchanged.
   */
  @Override
  public boolean add(byte key) {
    int i = indexOfInsertion(key);
    if (i < 0) { // already contained
      return false;
    }

    if (this.distinct > this.highWaterMark) {
      // Too full: grow first, then retry so the key is placed in the new table.
      int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
      rehash(newCapacity);
      return add(key);
    }

    this.table[i] = key;
    if (this.state[i] == FREE) {
      // Only a previously FREE slot reduces the free count; a reused REMOVED slot does not.
      this.freeEntries--;
    }
    this.state[i] = FULL;
    this.distinct++;

    if (this.freeEntries < 1) { // delta
      // The probe loops rely on at least one FREE slot, so rebuild the table when none is left.
      int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
      rehash(newCapacity);
    }

    return true;
  }

  /**
   * Rehashes the contents of the receiver into a new table with a smaller or larger capacity.
   * This method is called automatically when the number of keys in the receiver exceeds the high
   * water mark or falls below the low water mark.
   *
   * @param newCapacity the length of the new key and state arrays.
   */
  protected void rehash(int newCapacity) {
    int oldCapacity = table.length;

    byte[] oldTable = table;
    byte[] oldState = state;

    this.table = new byte[newCapacity];
    this.state = new byte[newCapacity];

    this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
    this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);

    this.freeEntries = newCapacity - this.distinct; // delta

    // Re-insert every occupied slot of the old table; REMOVED markers are dropped.
    for (int i = oldCapacity; i-- > 0;) {
      if (oldState[i] == FULL) {
        byte element = oldTable[i];
        int index = indexOfInsertion(element);
        this.table[index] = element;
        this.state[index] = FULL;
      }
    }
  }

  /**
   * Removes the given key from the receiver, if present.
   *
   * @param key the key to be removed from the receiver.
   * @return true if the receiver contained the specified key, false otherwise.
   */
  @Override
  public boolean remove(byte key) {
    int i = indexOfKey(key);
    if (i < 0) {
      return false; // key not contained
    }

    // Mark the slot REMOVED rather than FREE so probe paths running through it stay intact.
    this.state[i] = REMOVED;
    this.distinct--;

    if (this.distinct < this.lowWaterMark) {
      int newCapacity = chooseShrinkCapacity(this.distinct, this.minLoadFactor, this.maxLoadFactor);
      rehash(newCapacity);
    }

    return true;
  }

  /**
   * Initializes the receiver.
   *
   * @param initialCapacity the initial capacity of the receiver.
   * @param minLoadFactor   the minLoadFactor of the receiver.
   * @param maxLoadFactor   the maxLoadFactor of the receiver.
   * @throws IllegalArgumentException if {@code initialCapacity < 0 || (minLoadFactor < 0.0 ||
   *                                  minLoadFactor >= 1.0) || (maxLoadFactor <= 0.0 ||
   *                                  maxLoadFactor >= 1.0) || (minLoadFactor >= maxLoadFactor)}.
   */
  @Override
  protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
    int capacity = initialCapacity;
    super.setUp(capacity, minLoadFactor, maxLoadFactor);
    capacity = nextPrime(capacity);
    if (capacity == 0) {
      capacity = 1;
    } // open addressing needs at least one FREE slot at any time.

    this.table = new byte[capacity];
    this.state = new byte[capacity];

    this.minLoadFactor = minLoadFactor;
    // memory will be exhausted long before this pathological case happens, anyway.
    if (capacity == PrimeFinder.largestPrime) {
      this.maxLoadFactor = 1.0;
    } else {
      this.maxLoadFactor = maxLoadFactor;
    }

    this.distinct = 0;
    this.freeEntries = capacity; // delta

    // lowWaterMark will be established upon first expansion.
    // establishing it now (upon instance construction) would immediately make the table shrink
    // upon first put(...).
    // After all the idea of an "initialCapacity" implies violating lowWaterMarks when an object
    // is young.
    // See ensureCapacity(...)
    this.lowWaterMark = 0;
    this.highWaterMark = chooseHighWaterMark(capacity, this.maxLoadFactor);
  }

  /**
   * Trims the capacity of the receiver to be the receiver's current size. Releases any
   * superfluous internal memory. An application can use this operation to minimize the storage of
   * the receiver.
   */
  @Override
  public void trimToSize() {
    // * 1.2 because open addressing's performance exponentially degrades beyond that point
    // so that even rehashing the table can take very long
    int newCapacity = nextPrime((int) (1 + 1.2 * size()));
    if (table.length > newCapacity) {
      rehash(newCapacity);
    }
  }

  /**
   * Access for unit tests. Writes the current internal settings into element 0 of each array.
   *
   * @param capacity      receives the current table length in {@code capacity[0]}.
   * @param minLoadFactor receives the current minimum load factor in {@code minLoadFactor[0]}.
   * @param maxLoadFactor receives the current maximum load factor in {@code maxLoadFactor[0]}.
   */
  protected void getInternalFactors(int[] capacity,
                                    double[] minLoadFactor,
                                    double[] maxLoadFactor) {
    capacity[0] = table.length;
    minLoadFactor[0] = this.minLoadFactor;
    maxLoadFactor[0] = this.maxLoadFactor;
  }
}