
org.apache.lucene.util.SparseFixedBitSet Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.servicemix.bundles.lucene
Show all versions of org.apache.servicemix.bundles.lucene
This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * A bit set that only stores longs that have at least one bit which is set. The way it works is
 * that the space of bits is divided into blocks of 4096 bits, which is 64 longs. Then for each
 * block, we have:
 *
 * <ul>
 *   <li>a {@code long[]} which stores the non-zero longs for that block
 *   <li>a {@code long} so that bit {@code i} being set means that the {@code i}-th long of the
 *       block is non-zero, and its offset in the array of longs is the number of one bits on the
 *       right of the {@code i}-th bit.
 * </ul>
 *
 * @lucene.internal
 */
public class SparseFixedBitSet extends BitSet {

  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(SparseFixedBitSet.class);
  private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]);
  /** Mask that extracts the offset of a bit within its block of 4096 bits. */
  private static final int MASK_4096 = (1 << 12) - 1;

  /** Returns the number of 4096-bit blocks required to hold {@code length} bits, rounded up. */
  private static int blockCount(int length) {
    int blockCount = length >>> 12;
    if ((blockCount << 12) < length) {
      ++blockCount;
    }
    // widen to long: when length > 2^31 - 4096, blockCount << 12 overflows an int and would make
    // this assertion fail spuriously
    assert ((long) blockCount << 12) >= length;
    return blockCount;
  }

  /** Per block: bit {@code i} is set iff the i-th long of the block is stored in {@link #bits}. */
  final long[] indices;

  /**
   * Per block: the non-zero longs of the block in increasing order, possibly followed by zeros
   * that are spare capacity. The entry is {@code null} when the block has no bit set.
   */
  final long[][] bits;

  /** Number of bits that this set can hold. */
  final int length;

  /** Number of non-zero longs that are currently stored across all blocks. */
  int nonZeroLongCount;

  /** Running estimate of the memory used by this instance, in bytes. */
  long ramBytesUsed;

  /**
   * Create a {@link SparseFixedBitSet} that can contain bits between {@code 0} included and {@code
   * length} excluded.
   *
   * @param length the number of bits, must be at least 1
   * @throws IllegalArgumentException if {@code length} is less than 1
   */
  public SparseFixedBitSet(int length) {
    if (length < 1) {
      throw new IllegalArgumentException("length needs to be >= 1");
    }
    this.length = length;
    final int blockCount = blockCount(length);
    indices = new long[blockCount];
    bits = new long[blockCount][];
    ramBytesUsed =
        BASE_RAM_BYTES_USED
            + RamUsageEstimator.sizeOf(indices)
            + RamUsageEstimator.shallowSizeOf(bits);
  }

  /** Clears all bits and resets the memory accounting to that of an empty set. */
  @Override
  public void clear() {
    Arrays.fill(bits, null);
    Arrays.fill(indices, 0L);
    nonZeroLongCount = 0;
    ramBytesUsed =
        BASE_RAM_BYTES_USED
            + RamUsageEstimator.sizeOf(indices)
            + RamUsageEstimator.shallowSizeOf(bits);
  }

  @Override
  public int length() {
    return length;
  }

  /** Bounds check meant to be called from an {@code assert}; always returns {@code true}. */
  private boolean consistent(int index) {
    assert index >= 0 && index < length : "index=" + index + ",length=" + length;
    return true;
  }

  /** Returns the exact number of set bits by counting the bits of every stored long. */
  @Override
  public int cardinality() {
    int cardinality = 0;
    for (long[] bitArray : bits) {
      if (bitArray != null) {
        for (long word : bitArray) {
          cardinality += Long.bitCount(word);
        }
      }
    }
    return cardinality;
  }

  @Override
  public int approximateCardinality() {
    // we are assuming that bits are uniformly set and use the linear counting
    // algorithm to estimate the number of bits that are set based on the number
    // of longs that are different from zero
    final int totalLongs = (length + 63) >>> 6; // total number of longs in the space
    assert totalLongs >= nonZeroLongCount;
    final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros
    // No need to guard against division by zero, it will return +Infinity and things will work as
    // expected
    final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs));
    return (int) Math.min(length, estimate);
  }

  @Override
  public boolean get(int i) {
    assert consistent(i);
    final int i4096 = i >>> 12;
    final long index = indices[i4096];
    final int i64 = i >>> 6;
    final long i64bit = 1L << i64;
    // first check the index, if the i64-th bit is not set, then i is not set
    // note: this relies on the fact that shifts are mod 64 in java
    if ((index & i64bit) == 0) {
      return false;
    }

    // if it is set, then we count the number of bits that are set on the right
    // of i64, and that gives us the index of the long that stores the bits we
    // are interested in
    final long bits = this.bits[i4096][Long.bitCount(index & (i64bit - 1))];
    return (bits & (1L << i)) != 0;
  }

  /** Sets the bit at index {@code i} and returns whether it was set before the call. */
  @Override
  public boolean getAndSet(int i) {
    assert consistent(i);
    final int i4096 = i >>> 12;
    final long index = indices[i4096];
    final int i64 = i >>> 6;
    final long i64bit = 1L << i64;
    if ((index & i64bit) != 0) {
      // in that case the sub 64-bits block we are interested in already exists,
      // we just need to set a bit in an existing long: the number of ones on
      // the right of i64 gives us the index of the long we need to update
      final int location = Long.bitCount(index & (i64bit - 1));
      final long bit = 1L << i; // shifts are mod 64 in java
      boolean v = (bits[i4096][location] & bit) != 0;
      bits[i4096][location] |= bit;
      return v;
    } else if (index == 0) {
      // if the index is 0, it means that we just found a block of 4096 bits
      // that has no bit that is set yet. So let's initialize a new block:
      insertBlock(i4096, i64bit, i);
      return false;
    } else {
      // in that case we found a block of 4096 bits that has some values, but
      // the sub-block of 64 bits that we are interested in has no value yet,
      // so we need to insert a new long
      insertLong(i4096, i64bit, i, index);
      return false;
    }
  }

  /**
   * Returns the capacity to allocate for a block that needs room for {@code s} longs: 1.5 times
   * {@code s}, except that anything above 50 becomes 64, the maximum number of longs in a block.
   */
  private static int oversize(int s) {
    int newSize = s + (s >>> 1);
    if (newSize > 50) {
      newSize = 64;
    }
    return newSize;
  }

  /** Set the bit at index {@code i}. */
  @Override
  public void set(int i) {
    assert consistent(i);
    final int i4096 = i >>> 12;
    final long index = indices[i4096];
    final int i64 = i >>> 6;
    final long i64bit = 1L << i64;
    if ((index & i64bit) != 0) {
      // in that case the sub 64-bits block we are interested in already exists,
      // we just need to set a bit in an existing long: the number of ones on
      // the right of i64 gives us the index of the long we need to update
      bits[i4096][Long.bitCount(index & (i64bit - 1))] |= 1L << i; // shifts are mod 64 in java
    } else if (index == 0) {
      // if the index is 0, it means that we just found a block of 4096 bits
      // that has no bit that is set yet. So let's initialize a new block:
      insertBlock(i4096, i64bit, i);
    } else {
      // in that case we found a block of 4096 bits that has some values, but
      // the sub-block of 64 bits that we are interested in has no value yet,
      // so we need to insert a new long
      insertLong(i4096, i64bit, i, index);
    }
  }

  /** Initializes the empty block {@code i4096} with a single long that has only bit {@code i}. */
  private void insertBlock(int i4096, long i64bit, int i) {
    indices[i4096] = i64bit;
    assert bits[i4096] == null;
    bits[i4096] = new long[] {1L << i}; // shifts are mod 64 in java
    ++nonZeroLongCount;
    ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED;
  }

  /**
   * Inserts a new long that has only bit {@code i} set into the non-empty block {@code i4096}.
   * {@code index} is the value of {@code indices[i4096]} before the insertion.
   */
  private void insertLong(int i4096, long i64bit, int i, long index) {
    indices[i4096] |= i64bit;
    // we count the number of bits that are set on the right of i64
    // this gives us the index at which to perform the insertion
    final int o = Long.bitCount(index & (i64bit - 1));
    final long[] bitArray = bits[i4096];
    if (bitArray[bitArray.length - 1] == 0) {
      // since we only store non-zero longs, if the last value is 0, it means
      // that we already have extra space, make use of it
      System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1);
      bitArray[o] = 1L << i;
    } else {
      // we don't have extra space so we need to resize to insert the new long
      final int newSize = oversize(bitArray.length + 1);
      final long[] newBitArray = new long[newSize];
      System.arraycopy(bitArray, 0, newBitArray, 0, o);
      newBitArray[o] = 1L << i;
      System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o);
      bits[i4096] = newBitArray;
      // we may slightly overestimate size here, but keep it cheap
      ramBytesUsed += (newBitArray.length - bitArray.length) << 3;
    }
    ++nonZeroLongCount;
  }

  /** Clear the bit at index {@code i}. */
  @Override
  public void clear(int i) {
    assert consistent(i);
    final int i4096 = i >>> 12;
    final int i64 = i >>> 6;
    and(i4096, i64, ~(1L << i));
  }

  /**
   * Applies {@code mask} with a bitwise AND to the long {@code i64} of block {@code i4096}, and
   * removes that long if it becomes zero. Does nothing if the long is not stored.
   */
  private void and(int i4096, int i64, long mask) {
    final long index = indices[i4096];
    if ((index & (1L << i64)) != 0) {
      // offset of the long bits we are interested in in the array
      final int o = Long.bitCount(index & ((1L << i64) - 1));
      long bits = this.bits[i4096][o] & mask;
      if (bits == 0) {
        removeLong(i4096, i64, index, o);
      } else {
        this.bits[i4096][o] = bits;
      }
    }
  }

  /**
   * Removes the long stored at offset {@code o} of block {@code i4096}. {@code index} is the value
   * of {@code indices[i4096]} before the removal.
   */
  private void removeLong(int i4096, int i64, long index, int o) {
    index &= ~(1L << i64);
    indices[i4096] = index;
    if (index == 0) {
      // release memory, there is nothing in this block anymore
      releaseBlock(i4096);
    } else {
      final int length = Long.bitCount(index);
      final long[] bitArray = bits[i4096];
      System.arraycopy(bitArray, o + 1, bitArray, o, length - o);
      bitArray[length] = 0L;
    }
    nonZeroLongCount -= 1;
  }

  /**
   * Drops the array of block {@code i4096}, if any, and subtracts what was accounted for it from
   * {@link #ramBytesUsed}. Every code path that allocates or grows a block array accounts {@code
   * SINGLE_ELEMENT_ARRAY_BYTES_USED + 8 * (array.length - 1)} bytes for it, so subtracting the
   * same amount keeps the estimate from drifting upwards across set/clear cycles.
   */
  private void releaseBlock(int i4096) {
    final long[] bitArray = bits[i4096];
    if (bitArray != null) {
      ramBytesUsed -= SINGLE_ELEMENT_ARRAY_BYTES_USED + ((long) (bitArray.length - 1) << 3);
      bits[i4096] = null;
    }
  }

  /** Clears the bits between {@code from} included and {@code to} excluded. */
  @Override
  public void clear(int from, int to) {
    assert from >= 0;
    assert to <= length;
    if (from >= to) {
      return;
    }
    final int firstBlock = from >>> 12;
    final int lastBlock = (to - 1) >>> 12;
    if (firstBlock == lastBlock) {
      clearWithinBlock(firstBlock, from & MASK_4096, (to - 1) & MASK_4096);
    } else {
      clearWithinBlock(firstBlock, from & MASK_4096, MASK_4096);
      // blocks strictly between the first and the last one are dropped entirely
      for (int i = firstBlock + 1; i < lastBlock; ++i) {
        nonZeroLongCount -= Long.bitCount(indices[i]);
        indices[i] = 0;
        releaseBlock(i);
      }
      clearWithinBlock(lastBlock, 0, (to - 1) & MASK_4096);
    }
  }

  // create a long that has bits set to one between from and to (both included)
  // note: this relies on shifts being mod 64, callers pass offsets that may be greater than 63
  private static long mask(int from, int to) {
    return ((1L << (to - from) << 1) - 1) << from;
  }

  /**
   * Clears the bits of block {@code i4096} between the block-relative offsets {@code from} and
   * {@code to}, both included.
   */
  private void clearWithinBlock(int i4096, int from, int to) {
    int firstLong = from >>> 6;
    int lastLong = to >>> 6;

    if (firstLong == lastLong) {
      and(i4096, firstLong, ~mask(from, to));
    } else {
      assert firstLong < lastLong;
      // go backwards: removing a long shifts the longs stored after it, not those before it
      and(i4096, lastLong, ~mask(0, to));
      for (int i = lastLong - 1; i >= firstLong + 1; --i) {
        and(i4096, i, 0L);
      }
      and(i4096, firstLong, ~mask(from, 63));
    }
  }

  /**
   * Return the first document that occurs on or after the provided block index, looking at blocks
   * up to {@code i4096upper} excluded, or {@link DocIdSetIterator#NO_MORE_DOCS} if there is none.
   */
  private int firstDoc(int i4096, int i4096upper) {
    assert i4096upper <= indices.length
        : "i4096upper=" + i4096upper + ", indices.length=" + indices.length;
    long index = 0;
    while (i4096 < i4096upper) {
      index = indices[i4096];
      if (index != 0) {
        final int i64 = Long.numberOfTrailingZeros(index);
        return (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]);
      }
      i4096 += 1;
    }
    return DocIdSetIterator.NO_MORE_DOCS;
  }

  @Override
  public int nextSetBit(int i) {
    // Override with a version that skips the bound check on the result since we know it will not
    // go OOB:
    return nextSetBitInRange(i, length);
  }

  @Override
  public int nextSetBit(int start, int upperBound) {
    int res = nextSetBitInRange(start, upperBound);
    return res < upperBound ? res : DocIdSetIterator.NO_MORE_DOCS;
  }

  /**
   * Returns the next set bit in the specified range, but treats `upperBound` as a best-effort hint
   * rather than a hard requirement. Note that this may return a result that is >= upperBound in
   * some cases, so callers must add their own check if `upperBound` is a hard requirement.
   */
  private int nextSetBitInRange(int start, int upperBound) {
    assert start < length;
    assert upperBound > start && upperBound <= length
        : "upperBound=" + upperBound + ", start=" + start + ", length=" + length;
    final int i4096 = start >>> 12;
    final long index = indices[i4096];
    final long[] bitArray = this.bits[i4096];
    int i64 = start >>> 6;
    final long i64bit = 1L << i64;
    int o = Long.bitCount(index & (i64bit - 1));
    if ((index & i64bit) != 0) {
      // There is at least one bit that is set in the current long, check if
      // one of them is after i
      final long bits = bitArray[o] >>> start; // shifts are mod 64
      if (bits != 0) {
        return start + Long.numberOfTrailingZeros(bits);
      }
      o += 1;
    }
    final long indexBits = index >>> i64 >>> 1;
    if (indexBits == 0) {
      // no more bits are set in the current block of 4096 bits, go to the next one
      int i4096upper = upperBound == length ? indices.length : blockCount(upperBound);
      return firstDoc(i4096 + 1, i4096upper);
    }
    // there are still set bits
    i64 += 1 + Long.numberOfTrailingZeros(indexBits);
    final long bits = bitArray[o];
    return (i64 << 6) | Long.numberOfTrailingZeros(bits);
  }

  /**
   * Return the last document that occurs on or before the provided block index, or {@code -1} if
   * there is none.
   */
  private int lastDoc(int i4096) {
    long index;
    while (i4096 >= 0) {
      index = indices[i4096];
      if (index != 0) {
        final int i64 = 63 - Long.numberOfLeadingZeros(index);
        final long bits = this.bits[i4096][Long.bitCount(index) - 1];
        return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
      }
      i4096 -= 1;
    }
    return -1;
  }

  /** Returns the index of the last set bit at or before {@code i}, or {@code -1} if none. */
  @Override
  public int prevSetBit(int i) {
    assert i >= 0;
    final int i4096 = i >>> 12;
    final long index = indices[i4096];
    final long[] bitArray = this.bits[i4096];
    int i64 = i >>> 6;
    final long indexBits = index & ((1L << i64) - 1);
    final int o = Long.bitCount(indexBits);
    if ((index & (1L << i64)) != 0) {
      // There is at least one bit that is set in the same long, check if there
      // is one bit that is set that is lower than i
      final long bits = bitArray[o] & ((1L << i << 1) - 1);
      if (bits != 0) {
        return (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
      }
    }
    if (indexBits == 0) {
      // no more bits are set in this block, go find the last bit in the
      // previous block
      return lastDoc(i4096 - 1);
    }
    // go to the previous long
    i64 = 63 - Long.numberOfLeadingZeros(indexBits);
    final long bits = bitArray[o - 1];
    return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits));
  }

  /** Return the long bits at the given {@code i64} index, or 0 if that long is not stored. */
  private long longBits(long index, long[] bits, int i64) {
    if ((index & (1L << i64)) == 0) {
      return 0L;
    } else {
      return bits[Long.bitCount(index & ((1L << i64) - 1))];
    }
  }

  /**
   * Merges a block into block {@code i4096} of this set with a bitwise OR.
   *
   * @param i4096 index of the block to update
   * @param index index long of the block to merge, must have {@code nonZeroLongCount} bits set
   * @param bits non-zero longs of the block to merge, only the first {@code nonZeroLongCount}
   *     entries are read
   * @param nonZeroLongCount number of longs of {@code bits} to merge
   */
  private void or(final int i4096, final long index, long[] bits, int nonZeroLongCount) {
    assert Long.bitCount(index) == nonZeroLongCount;
    final long currentIndex = indices[i4096];
    if (currentIndex == 0) {
      // fast path: if we currently have nothing in the block, just copy the data
      // this especially happens all the time if you call OR on an empty set
      indices[i4096] = index;
      long[] newBits = ArrayUtil.copyOfSubArray(bits, 0, nonZeroLongCount);
      this.bits[i4096] = newBits;
      // we may slightly overestimate size here, but keep it cheap
      this.ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED + ((long) newBits.length - 1 << 3);
      this.nonZeroLongCount += nonZeroLongCount;
      return;
    }
    final long[] currentBits = this.bits[i4096];
    final long[] newBits;
    final long newIndex = currentIndex | index;
    final int requiredCapacity = Long.bitCount(newIndex);
    if (currentBits.length >= requiredCapacity) {
      newBits = currentBits;
    } else {
      newBits = new long[oversize(requiredCapacity)];
      // we may slightly overestimate size here, but keep it cheap
      this.ramBytesUsed += (long) (newBits.length - currentBits.length) << 3;
    }
    // we iterate backwards in order to not override data we might need on the next iteration if the
    // array is reused
    for (int i = Long.numberOfLeadingZeros(newIndex), newO = Long.bitCount(newIndex) - 1;
        i < 64;
        i += 1 + Long.numberOfLeadingZeros(newIndex << (i + 1)), newO -= 1) {
      // bitIndex is the index of a bit which is set in newIndex and newO is the number of 1 bits on
      // its right
      final int bitIndex = 63 - i;
      assert newO == Long.bitCount(newIndex & ((1L << bitIndex) - 1));
      newBits[newO] =
          longBits(currentIndex, currentBits, bitIndex) | longBits(index, bits, bitIndex);
    }
    indices[i4096] = newIndex;
    this.bits[i4096] = newBits;
    // longs that exist on both sides were already counted
    this.nonZeroLongCount += nonZeroLongCount - Long.bitCount(currentIndex & index);
  }

  /** Merges every non-empty block of {@code other} into this set. */
  private void or(SparseFixedBitSet other) {
    for (int i = 0; i < other.indices.length; ++i) {
      final long index = other.indices[i];
      if (index != 0) {
        or(i, index, other.bits[i], Long.bitCount(index));
      }
    }
  }

  /** {@link #or(DocIdSetIterator)} impl that works best when {@code it} is dense. */
  private void orDense(DocIdSetIterator it) throws IOException {
    checkUnpositioned(it);
    // The goal here is to try to take advantage of the ordering of documents
    // to build the data-structure more efficiently
    // NOTE: this heavily relies on the fact that shifts are mod 64
    final int firstDoc = it.nextDoc();
    if (firstDoc == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    int i4096 = firstDoc >>> 12;
    int i64 = firstDoc >>> 6;
    long index = 1L << i64;
    long currentLong = 1L << firstDoc;
    // we store at most 64 longs per block so preallocate in order never to have to resize
    long[] longs = new long[64];
    int numLongs = 0;

    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      final int doc64 = doc >>> 6;
      if (doc64 == i64) {
        // still in the same long, just set the bit
        currentLong |= 1L << doc;
      } else {
        longs[numLongs++] = currentLong;

        final int doc4096 = doc >>> 12;
        if (doc4096 == i4096) {
          index |= 1L << doc64;
        } else {
          // we are on a new block, flush what we buffered
          or(i4096, index, longs, numLongs);
          // and reset state for the new block
          i4096 = doc4096;
          index = 1L << doc64;
          numLongs = 0;
        }

        // we are on a new long, reset state
        i64 = doc64;
        currentLong = 1L << doc;
      }
    }

    // flush
    longs[numLongs++] = currentLong;
    or(i4096, index, longs, numLongs);
  }

  @Override
  public void or(DocIdSetIterator it) throws IOException {
    {
      // specialize union with another SparseFixedBitSet
      final SparseFixedBitSet other = BitSetIterator.getSparseFixedBitSetOrNull(it);
      if (other != null) {
        checkUnpositioned(it);
        or(other);
        return;
      }
    }

    // We do not specialize the union with a FixedBitSet since FixedBitSets are
    // supposed to be used for dense data and sparse fixed bit sets for sparse
    // data, so a sparse set would likely get upgraded by DocIdSetBuilder before
    // being or'ed with a FixedBitSet

    if (it.cost() < indices.length) {
      // the default impl is good for sparse iterators
      super.or(it);
    } else {
      orDense(it);
    }
  }

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed;
  }

  @Override
  public String toString() {
    return "SparseFixedBitSet(size=" + length + ",cardinality=~" + approximateCardinality() + ")";
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy