// org.elasticsearch.common.util.BytesRefHash Maven / Gradle / Ivy
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.util;
import com.carrotsearch.hppc.BitMixer;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
/**
 * Specialized hash table implementation similar to Lucene's BytesRefHash that maps
 * BytesRef values to ids. Collisions are resolved with open addressing and linear
 * probing, growth is smooth thanks to {@link BigArrays}, hashes are cached for faster
 * re-hashing and capacity is always a multiple of 2 for faster identification of buckets.
 * Ids are handed out densely in insertion order: the first key added gets id 0, the
 * next one id 1, and so on.
 * This class is not thread-safe.
 */
public final class BytesRefHash extends AbstractHash {

    // startOffsets.get(id) .. startOffsets.get(id + 1) delimits the bytes of key `id` inside `bytes`
    private LongArray startOffsets;
    // concatenated bytes of every key, in id order
    private ByteArray bytes;
    private IntArray hashes; // we cache hashes for faster re-hashing
    // scratch instance reused by lookups so that comparing keys does not allocate
    private final BytesRef spare;

    /**
     * Constructor with configurable capacity and default maximum load factor.
     *
     * @param capacity  initial capacity, forwarded to the parent class and used to size the backing arrays
     * @param bigArrays allocator for the backing arrays
     */
    public BytesRefHash(long capacity, BigArrays bigArrays) {
        this(capacity, DEFAULT_MAX_LOAD_FACTOR, bigArrays);
    }

    /**
     * Constructor with configurable capacity and load factor.
     * <p>
     * If allocating one of the backing arrays fails, everything that was already allocated
     * (including what the parent constructor allocated) is released before the failure is
     * propagated, since the caller never receives an instance it could {@link #close()}.
     *
     * @param capacity      initial capacity, forwarded to the parent class and used to size the backing arrays
     * @param maxLoadFactor maximum load factor, forwarded to the parent class
     * @param bigArrays     allocator for the backing arrays
     */
    public BytesRefHash(long capacity, float maxLoadFactor, BigArrays bigArrays) {
        super(capacity, maxLoadFactor, bigArrays);
        spare = new BytesRef();
        boolean success = false;
        try {
            // one extra entry: key `id` ends where key `id + 1` starts
            startOffsets = bigArrays.newLongArray(capacity + 1, false);
            // arrays are requested without clearing, so the first offset must be written explicitly
            startOffsets.set(0, 0);
            bytes = bigArrays.newByteArray(capacity * 3, false);
            hashes = bigArrays.newIntArray(capacity, false);
            success = true;
        } finally {
            if (success == false) {
                // Arrays that were not allocated yet are still null here; close() hands them to
                // Releasables, which is expected to skip null entries.
                close();
            }
        }
    }

    // BytesRef has a weak hashCode function so we try to improve it by rehashing using Murmur3
    // Feel free to remove rehashing if BytesRef gets a better hash function
    private static int rehash(int hash) {
        return BitMixer.mix32(hash);
    }

    /**
     * Return the key that was assigned {@code id}, where {@code 0 <= id < n} and {@code n} is the
     * number of keys added so far. The result is undefined for any other {@code id}.
     * <p>
     * Beware that the content of the {@link BytesRef} may become invalid as soon as
     * {@link #close()} is called.
     *
     * @param id   id of the key to read
     * @param dest instance to fill with the key
     * @return {@code dest}
     */
    public BytesRef get(long id, BytesRef dest) {
        final long startOffset = startOffsets.get(id);
        // the key ends where the next key starts
        final int length = (int) (startOffsets.get(id + 1) - startOffset);
        bytes.get(startOffset, length, dest);
        return dest;
    }

    /**
     * Get the id associated with {@code key}.
     *
     * @param key  the key to look up
     * @param code the hash code of {@code key}; it is re-mixed internally before being used
     * @return the id of {@code key}, or {@code -1} if the key is not in the hash table
     */
    public long find(BytesRef key, int code) {
        final long slot = slot(rehash(code), mask);
        // linear probing: stop at the first unset slot or at the slot holding an equal key
        for (long index = slot; ; index = nextSlot(index, mask)) {
            final long id = id(index);
            if (id == -1L || key.bytesEquals(get(id, spare))) {
                return id;
            }
        }
    }

    /** Sugar for {@link #find(BytesRef, int) find(key, key.hashCode())}. */
    public long find(BytesRef key) {
        return find(key, key.hashCode());
    }

    /**
     * Store {@code key} under {@code id} unless an equal key is already present.
     *
     * @param key  the key to insert
     * @param code the already re-mixed hash of {@code key}
     * @param id   the id to assign if the key is new
     * @return {@code id} if the key was inserted, or {@code -1 - existingId} if it was already present
     */
    private long set(BytesRef key, int code, long id) {
        assert rehash(key.hashCode()) == code;
        assert size < maxSize;
        final long slot = slot(code, mask);
        for (long index = slot; ; index = nextSlot(index, mask)) {
            final long curId = id(index);
            if (curId == -1) { // means unset
                id(index, id);
                append(id, key, code);
                ++size;
                return id;
            } else if (key.bytesEquals(get(curId, spare))) {
                return -1 - curId;
            }
        }
    }

    /**
     * Append the bytes of {@code key} to the key storage, record where the key ends and cache its
     * hash. Must be called before {@code size} is incremented: the new id has to equal the current size.
     */
    private void append(long id, BytesRef key, int code) {
        assert size == id;
        final long startOffset = startOffsets.get(size);
        bytes = bigArrays.grow(bytes, startOffset + key.length);
        bytes.set(startOffset, key.bytes, key.offset, key.length);
        // size + 2 entries are needed so that index size + 1 (the end offset of the new key) is writable
        startOffsets = bigArrays.grow(startOffsets, size + 2);
        startOffsets.set(size + 1, startOffset + key.length);
        hashes = bigArrays.grow(hashes, id + 1);
        hashes.set(id, code);
    }

    /**
     * Assertion helper: checks that {@code code} is the re-mixed hash of the key stored under
     * {@code id}. Overwrites {@link #spare}.
     */
    private boolean assertConsistent(long id, int code) {
        get(id, spare);
        return rehash(spare.hashCode()) == code;
    }

    /**
     * Put an already stored {@code id} back into the first unset slot of its probe sequence.
     * Unlike {@link #set(BytesRef, int, long)} this compares no keys and leaves the key storage
     * and {@code size} untouched.
     */
    private void reset(int code, long id) {
        assert assertConsistent(id, code);
        final long slot = slot(code, mask);
        for (long index = slot; ; index = nextSlot(index, mask)) {
            final long curId = id(index);
            if (curId == -1) { // means unset
                id(index, id);
                break;
            }
        }
    }

    /**
     * Try to add {@code key}. Return its newly allocated id if it wasn't in the hash table yet,
     * or {@code -1-id} if it was already present in the hash table.
     *
     * @param key  the key to add
     * @param code the hash code of {@code key}; it is re-mixed internally before being used
     */
    public long add(BytesRef key, int code) {
        if (size >= maxSize) {
            assert size == maxSize;
            grow();
        }
        assert size < maxSize;
        // new keys always receive the next free id, i.e. the current size
        return set(key, rehash(code), size);
    }

    /** Sugar to {@link #add(BytesRef, int) add(key, key.hashCode())}. */
    public long add(BytesRef key) {
        return add(key, key.hashCode());
    }

    /**
     * Writes {@code -1} (unset) into slot {@code index} and re-inserts the id obtained from that
     * call using its cached hash, so no key bytes need to be re-hashed.
     */
    @Override
    protected void removeAndAdd(long index) {
        final long id = id(index, -1);
        assert id >= 0;
        final int code = hashes.get(id);
        reset(code, id);
    }

    /**
     * Releases the key storage, the cached hashes and the offsets in addition to whatever the
     * parent class releases. The try-with-resources guarantees the local arrays are released
     * even if {@code super.close()} throws.
     */
    @Override
    public void close() {
        try (Releasable releasable = Releasables.wrap(bytes, hashes, startOffsets)) {
            super.close();
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy