org.apache.lucene.util.BytesRefHash Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.util.Arrays;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.util.ByteBlockPool.DirectAllocator;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
/**
* {@link BytesRefHash} is a special purpose hash-map like data-structure
* optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
* byte arrays to ids (Map<BytesRef,int>) storing the hashed bytes
* efficiently in continuous storage. The mapping to the id is
* encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
* for each added {@link BytesRef}.
*
*
* Note: The maximum capacity {@link BytesRef} instance passed to
* {@link #add(BytesRef)} must not be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
* The internal storage is limited to 2GB total byte storage.
*
*
* @lucene.internal
*/
public final class BytesRefHash {
public static final int DEFAULT_CAPACITY = 16;
// the following fields are needed by comparator,
// so package private to prevent access$-methods:
final ByteBlockPool pool;
int[] bytesStart;
private final BytesRef scratch1 = new BytesRef();
private int hashSize;
private int hashHalfSize;
private int hashMask;
private int count;
private int lastCount = -1;
private int[] ids;
private final BytesStartArray bytesStartArray;
private Counter bytesUsed;
/**
* Creates a new {@link BytesRefHash} with a {@link ByteBlockPool} using a
* {@link DirectAllocator}.
*/
public BytesRefHash() {
this(new ByteBlockPool(new DirectAllocator()));
}
/**
* Creates a new {@link BytesRefHash}
*/
public BytesRefHash(ByteBlockPool pool) {
this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY));
}
/**
* Creates a new {@link BytesRefHash}
*/
public BytesRefHash(ByteBlockPool pool, int capacity, BytesStartArray bytesStartArray) {
hashSize = capacity;
hashHalfSize = hashSize >> 1;
hashMask = hashSize - 1;
this.pool = pool;
ids = new int[hashSize];
Arrays.fill(ids, -1);
this.bytesStartArray = bytesStartArray;
bytesStart = bytesStartArray.init();
bytesUsed = bytesStartArray.bytesUsed() == null? Counter.newCounter() : bytesStartArray.bytesUsed();
bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
}
/**
* Returns the number of {@link BytesRef} values in this {@link BytesRefHash}.
*
* @return the number of {@link BytesRef} values in this {@link BytesRefHash}.
*/
public int size() {
return count;
}
/**
* Populates and returns a {@link BytesRef} with the bytes for the given
* bytesID.
*
* Note: the given bytesID must be a positive integer less than the current
* size ({@link #size()})
*
* @param bytesID
* the id
* @param ref
* the {@link BytesRef} to populate
*
* @return the given BytesRef instance populated with the bytes for the given
* bytesID
*/
public BytesRef get(int bytesID, BytesRef ref) {
assert bytesStart != null : "bytesStart is null - not initialized";
assert bytesID < bytesStart.length: "bytesID exceeds byteStart len: " + bytesStart.length;
pool.setBytesRef(ref, bytesStart[bytesID]);
return ref;
}
/**
* Returns the ids array in arbitrary order. Valid ids start at offset of 0
* and end at a limit of {@link #size()} - 1
*
* Note: This is a destructive operation. {@link #clear()} must be called in
* order to reuse this {@link BytesRefHash} instance.
*
*/
int[] compact() {
assert bytesStart != null : "bytesStart is null - not initialized";
int upto = 0;
for (int i = 0; i < hashSize; i++) {
if (ids[i] != -1) {
if (upto < i) {
ids[upto] = ids[i];
ids[i] = -1;
}
upto++;
}
}
assert upto == count;
lastCount = count;
return ids;
}
/**
* Returns the values array sorted by the referenced byte values.
*
* Note: This is a destructive operation. {@link #clear()} must be called in
* order to reuse this {@link BytesRefHash} instance.
*
*
* @param comp
* the {@link Comparator} used for sorting
*/
public int[] sort(final Comparator comp) {
final int[] compact = compact();
new IntroSorter() {
@Override
protected void swap(int i, int j) {
final int o = compact[i];
compact[i] = compact[j];
compact[j] = o;
}
@Override
protected int compare(int i, int j) {
final int id1 = compact[i], id2 = compact[j];
assert bytesStart.length > id1 && bytesStart.length > id2;
pool.setBytesRef(scratch1, bytesStart[id1]);
pool.setBytesRef(scratch2, bytesStart[id2]);
return comp.compare(scratch1, scratch2);
}
@Override
protected void setPivot(int i) {
final int id = compact[i];
assert bytesStart.length > id;
pool.setBytesRef(pivot, bytesStart[id]);
}
@Override
protected int comparePivot(int j) {
final int id = compact[j];
assert bytesStart.length > id;
pool.setBytesRef(scratch2, bytesStart[id]);
return comp.compare(pivot, scratch2);
}
private final BytesRef pivot = new BytesRef(),
scratch1 = new BytesRef(), scratch2 = new BytesRef();
}.sort(0, count);
return compact;
}
private boolean equals(int id, BytesRef b) {
pool.setBytesRef(scratch1, bytesStart[id]);
return scratch1.bytesEquals(b);
}
private boolean shrink(int targetSize) {
// Cannot use ArrayUtil.shrink because we require power
// of 2:
int newSize = hashSize;
while (newSize >= 8 && newSize / 4 > targetSize) {
newSize /= 2;
}
if (newSize != hashSize) {
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -(hashSize - newSize));
hashSize = newSize;
ids = new int[hashSize];
Arrays.fill(ids, -1);
hashHalfSize = newSize / 2;
hashMask = newSize - 1;
return true;
} else {
return false;
}
}
/**
* Clears the {@link BytesRef} which maps to the given {@link BytesRef}
*/
public void clear(boolean resetPool) {
lastCount = count;
count = 0;
if (resetPool) {
pool.reset(false, false); // we don't need to 0-fill the buffers
}
bytesStart = bytesStartArray.clear();
if (lastCount != -1 && shrink(lastCount)) {
// shrink clears the hash entries
return;
}
Arrays.fill(ids, -1);
}
public void clear() {
clear(true);
}
/**
* Closes the BytesRefHash and releases all internally used memory
*/
public void close() {
clear(true);
ids = null;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * -hashSize);
}
/**
* Adds a new {@link BytesRef}
*
* @param bytes
* the bytes to hash
* @return the id the given bytes are hashed if there was no mapping for the
* given bytes, otherwise (-(id)-1)
. This guarantees
* that the return value will always be >= 0 if the given bytes
* haven't been hashed before.
*
* @throws MaxBytesLengthExceededException
* if the given bytes are {@code > 2 +}
* {@link ByteBlockPool#BYTE_BLOCK_SIZE}
*/
public int add(BytesRef bytes) {
assert bytesStart != null : "Bytesstart is null - not initialized";
final int length = bytes.length;
// final position
final int hashPos = findHash(bytes);
int e = ids[hashPos];
if (e == -1) {
// new entry
final int len2 = 2 + bytes.length;
if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) {
if (len2 > BYTE_BLOCK_SIZE) {
throw new MaxBytesLengthExceededException("bytes can be at most "
+ (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
}
pool.nextBuffer();
}
final byte[] buffer = pool.buffer;
final int bufferUpto = pool.byteUpto;
if (count >= bytesStart.length) {
bytesStart = bytesStartArray.grow();
assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ bytesStart.length;
}
e = count++;
bytesStart[e] = bufferUpto + pool.byteOffset;
// We first encode the length, followed by the
// bytes. Length is encoded as vInt, but will consume
// 1 or 2 bytes at most (we reject too-long terms,
// above).
if (length < 128) {
// 1 byte to store length
buffer[bufferUpto] = (byte) length;
pool.byteUpto += length + 1;
assert length >= 0: "Length must be positive: " + length;
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
length);
} else {
// 2 byte to store length
buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
pool.byteUpto += length + 2;
System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
length);
}
assert ids[hashPos] == -1;
ids[hashPos] = e;
if (count == hashHalfSize) {
rehash(2 * hashSize, true);
}
return e;
}
return -(e + 1);
}
/**
* Returns the id of the given {@link BytesRef}.
*
* @param bytes
* the bytes to look for
*
* @return the id of the given bytes, or {@code -1} if there is no mapping for the
* given bytes.
*/
public int find(BytesRef bytes) {
return ids[findHash(bytes)];
}
private int findHash(BytesRef bytes) {
assert bytesStart != null : "bytesStart is null - not initialized";
int code = doHash(bytes.bytes, bytes.offset, bytes.length);
// final position
int hashPos = code & hashMask;
int e = ids[hashPos];
if (e != -1 && !equals(e, bytes)) {
// Conflict; use linear probe to find an open slot
// (see LUCENE-5604):
do {
code++;
hashPos = code & hashMask;
e = ids[hashPos];
} while (e != -1 && !equals(e, bytes));
}
return hashPos;
}
/** Adds a "arbitrary" int offset instead of a BytesRef
* term. This is used in the indexer to hold the hash for term
* vectors, because they do not redundantly store the byte[] term
* directly and instead reference the byte[] term
* already stored by the postings BytesRefHash. See
* add(int textStart) in TermsHashPerField. */
public int addByPoolOffset(int offset) {
assert bytesStart != null : "Bytesstart is null - not initialized";
// final position
int code = offset;
int hashPos = offset & hashMask;
int e = ids[hashPos];
if (e != -1 && bytesStart[e] != offset) {
// Conflict; use linear probe to find an open slot
// (see LUCENE-5604):
do {
code++;
hashPos = code & hashMask;
e = ids[hashPos];
} while (e != -1 && bytesStart[e] != offset);
}
if (e == -1) {
// new entry
if (count >= bytesStart.length) {
bytesStart = bytesStartArray.grow();
assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ bytesStart.length;
}
e = count++;
bytesStart[e] = offset;
assert ids[hashPos] == -1;
ids[hashPos] = e;
if (count == hashHalfSize) {
rehash(2 * hashSize, false);
}
return e;
}
return -(e + 1);
}
/**
* Called when hash is too small ({@code > 50%} occupied) or too large ({@code < 20%}
* occupied).
*/
private void rehash(final int newSize, boolean hashOnData) {
final int newMask = newSize - 1;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize));
final int[] newHash = new int[newSize];
Arrays.fill(newHash, -1);
for (int i = 0; i < hashSize; i++) {
final int e0 = ids[i];
if (e0 != -1) {
int code;
if (hashOnData) {
final int off = bytesStart[e0];
final int start = off & BYTE_BLOCK_MASK;
final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT];
final int len;
int pos;
if ((bytes[start] & 0x80) == 0) {
// length is 1 byte
len = bytes[start];
pos = start + 1;
} else {
len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
pos = start + 2;
}
code = doHash(bytes, pos, len);
} else {
code = bytesStart[e0];
}
int hashPos = code & newMask;
assert hashPos >= 0;
if (newHash[hashPos] != -1) {
// Conflict; use linear probe to find an open slot
// (see LUCENE-5604):
do {
code++;
hashPos = code & newMask;
} while (newHash[hashPos] != -1);
}
newHash[hashPos] = e0;
}
}
hashMask = newMask;
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ids.length));
ids = newHash;
hashSize = newSize;
hashHalfSize = newSize / 2;
}
// TODO: maybe use long? But our keys are typically short...
private int doHash(byte[] bytes, int offset, int length) {
return StringHelper.murmurhash3_x86_32(bytes, offset, length, StringHelper.GOOD_FAST_HASH_SEED);
}
/**
* reinitializes the {@link BytesRefHash} after a previous {@link #clear()}
* call. If {@link #clear()} has not been called previously this method has no
* effect.
*/
public void reinit() {
if (bytesStart == null) {
bytesStart = bytesStartArray.init();
}
if (ids == null) {
ids = new int[hashSize];
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * hashSize);
}
}
/**
* Returns the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given bytesID
*
* @param bytesID
* the id to look up
* @return the bytesStart offset into the internally used
* {@link ByteBlockPool} for the given id
*/
public int byteStart(int bytesID) {
assert bytesStart != null : "bytesStart is null - not initialized";
assert bytesID >= 0 && bytesID < count : bytesID;
return bytesStart[bytesID];
}
/**
* Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of
* {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
*/
@SuppressWarnings("serial")
public static class MaxBytesLengthExceededException extends RuntimeException {
MaxBytesLengthExceededException(String message) {
super(message);
}
}
/** Manages allocation of the per-term addresses. */
public abstract static class BytesStartArray {
/**
* Initializes the BytesStartArray. This call will allocate memory
*
* @return the initialized bytes start array
*/
public abstract int[] init();
/**
* Grows the {@link BytesStartArray}
*
* @return the grown array
*/
public abstract int[] grow();
/**
* clears the {@link BytesStartArray} and returns the cleared instance.
*
* @return the cleared instance, this might be null
*/
public abstract int[] clear();
/**
* A {@link Counter} reference holding the number of bytes used by this
* {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to
* track it memory usage
*
* @return a {@link AtomicLong} reference holding the number of bytes used
* by this {@link BytesStartArray}.
*/
public abstract Counter bytesUsed();
}
/** A simple {@link BytesStartArray} that tracks
* memory allocation using a private {@link Counter}
* instance. */
public static class DirectBytesStartArray extends BytesStartArray {
// TODO: can't we just merge this w/
// TrackingDirectBytesStartArray...? Just add a ctor
// that makes a private bytesUsed?
protected final int initSize;
private int[] bytesStart;
private final Counter bytesUsed;
public DirectBytesStartArray(int initSize, Counter counter) {
this.bytesUsed = counter;
this.initSize = initSize;
}
public DirectBytesStartArray(int initSize) {
this(initSize, Counter.newCounter());
}
@Override
public int[] clear() {
return bytesStart = null;
}
@Override
public int[] grow() {
assert bytesStart != null;
return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
}
@Override
public int[] init() {
return bytesStart = new int[ArrayUtil.oversize(initSize,
RamUsageEstimator.NUM_BYTES_INT)];
}
@Override
public Counter bytesUsed() {
return bytesUsed;
}
}
}