// com.bigdata.bop.ap.SampleIndex Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 16, 2010
*/
package com.bigdata.bop.ap;
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.Callable;
import com.bigdata.bop.AbstractAccessPathOp;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContextBase;
import com.bigdata.bop.IPredicate;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.ILeafCursor;
import com.bigdata.btree.ILinearList;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleCursor;
import com.bigdata.btree.filter.Advancer;
import com.bigdata.btree.view.FusedView;
import com.bigdata.relation.IRelation;
import com.bigdata.relation.accesspath.AccessPath;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.rule.IAccessPathExpander;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.Bytes;
import cutthecrap.utils.striterators.IFilter;
/**
* Sampling operator for the {@link IAccessPath} implied by an
* {@link IPredicate}.
*
* @author Bryan Thompson
* @version $Id: AbstractSampleIndex.java 3672 2010-09-28 23:39:42Z thompsonbry
* $
* @param
* The generic type of the elements materialized from that index.
*
* @todo This is a basic operator which is designed to support adaptive query
* optimization. However, there are a lot of possible semantics for
* sampling, including: uniform distribution, random distribution, tuple
* at a time versus clustered (sampling with leaves), adaptive sampling
* until the sample reflects some statistical property of the underlying
* population, etc. Support for different kinds of sampling could be added
* using appropriate annotations.
*/
public class SampleIndex extends AbstractAccessPathOp {
/**
*
*/
private static final long serialVersionUID = 1L;
/**
* Typesafe enumeration of different kinds of index sampling strategies.
*
* @todo It is much more efficient to take clusters of samples when you can
* accept the bias. Taking a clustered sample really requires knowing
* where the leaf boundaries are in the index, e.g., using
* {@link ILeafCursor}. Taking all tuples from a few leaves in each
* sample might produce a faster estimation of the correlation when
* sampling join paths.
*/
public static enum SampleType {
/**
* Samples are taken at evenly spaced offsets. This produces a sample
* without any random effects. Re-sampling an index having the same data
* with the same key-range and the same limit will always return the same
* results. This is useful to make unit tests repeatable.
*/
EVEN,
/**
* Sample offsets are computed randomly.
*/
RANDOM,
/**
* The samples will be dense and may have a front bias. This mode
* emphasizes the locality of the samples on the index pages and
* minimizes the IO associated with sampling.
*/
DENSE;
}
/**
* Known annotations.
*/
public interface Annotations extends BOp.Annotations {
/**
* The sample limit (default {@value #DEFAULT_LIMIT}).
*/
String LIMIT = (SampleIndex.class.getName() + ".limit").intern();
/** Default for {@link #LIMIT}. */
int DEFAULT_LIMIT = 100;
/**
* The random number generator seed -or- ZERO (0L) for a random seed
* (default {@value #DEFAULT_SEED}). A non-zero value may be used to
* create a repeatable sample.
*/
String SEED = (SampleIndex.class.getName() + ".seed").intern();
/** Default for {@link #SEED}. */
long DEFAULT_SEED = 0L;
/**
* The {@link IPredicate} describing the access path to be sampled
* (required).
*/
String PREDICATE = (SampleIndex.class.getName() + ".predicate").intern();
/**
* The type of sample to take, given as the name of a {@link SampleType}
* constant (default is the name of {@link SampleType#RANDOM}).
*/
String SAMPLE_TYPE = (SampleIndex.class.getName() + ".sampleType").intern();
/**
* Default for {@link #SAMPLE_TYPE}. This is the enum constant's name; it
* is converted back with {@link SampleType#valueOf(String)} by
* {@link SampleIndex#getSampleType()}.
*/
String DEFAULT_SAMPLE_TYPE = SampleType.RANDOM.name();
}
/**
* Constructs an instance from another {@link SampleIndex} by delegating
* to the superclass constructor.
*
* @param op
* The operator passed through to the superclass.
*/
public SampleIndex(SampleIndex op) {
super(op);
}
/**
* Constructs an instance from operator arguments and annotations by
* delegating to the superclass constructor.
*
* @param args
* The operator arguments, passed through to the superclass.
* @param annotations
* The operator annotations; the keys read by this class are
* declared in {@link Annotations}.
*/
public SampleIndex(BOp[] args, Map annotations) {
super(args, annotations);
}
/**
 * The maximum number of elements to collect in the sample.
 *
 * @return The value of the {@link Annotations#LIMIT} annotation, with
 *         {@link Annotations#DEFAULT_LIMIT} supplied as the default.
 *
 * @see Annotations#LIMIT
 */
public int limit() {
    final int sampleLimit = getProperty(Annotations.LIMIT,
            Annotations.DEFAULT_LIMIT);
    return sampleLimit;
}
/**
 * The seed for the random number generator used by random sampling.
 *
 * @return The value of the {@link Annotations#SEED} annotation, with
 *         {@link Annotations#DEFAULT_SEED} supplied as the default.
 *
 * @see Annotations#SEED
 */
public long seed() {
    final long rngSeed = getProperty(Annotations.SEED,
            Annotations.DEFAULT_SEED);
    return rngSeed;
}
/**
 * The sampling strategy to apply.
 *
 * @return The {@link SampleType} whose name is stored under the
 *         {@link Annotations#SAMPLE_TYPE} annotation, with
 *         {@link Annotations#DEFAULT_SAMPLE_TYPE} supplied as the default.
 *
 * @see Annotations#SAMPLE_TYPE
 */
public SampleType getSampleType() {
    final String typeName = getProperty(Annotations.SAMPLE_TYPE,
            Annotations.DEFAULT_SAMPLE_TYPE);
    return SampleType.valueOf(typeName);
}
/**
 * The predicate describing the access path to be sampled.
 *
 * @return The value of the required {@link Annotations#PREDICATE}
 *         annotation, cast to {@link IPredicate}.
 *
 * @see Annotations#PREDICATE
 */
@SuppressWarnings("unchecked")
public IPredicate getPredicate() {
    final Object required = getRequiredProperty(Annotations.PREDICATE);
    return (IPredicate) required;
}
/**
 * Return a sample from the access path associated with the
 * {@link Annotations#PREDICATE}.
 *
 * @param context
 *            The evaluation context handed to the sampling task.
 *
 * @return The sampled elements, as produced by the sampling task.
 *
 * @throws RuntimeException
 *             wrapping any exception thrown while sampling.
 */
public E[] eval(final BOpContextBase context) {
    try {
        final SampleTask task = new SampleTask(context);
        return task.call();
    } catch (Exception cause) {
        // Callable#call() declares a checked exception; rethrow unchecked.
        throw new RuntimeException(cause);
    }
}
/**
* Sample an {@link IAccessPath}.
*
* FIXME This needs to handle each of the following conditions:
*
* Timestamp {read-historical, read-committed, read-write tx, unisolated}
* Index view {standalone, partitioned,global view of partitioned}
*
* @todo The general approach uses the {@link ILinearList} interface to take
* evenly distributed or randomly distributed samples from the
* underlying index. This is done using an {@link IFilter} which is
* evaluated local to the index. This works whether or not the access
* path is using a partitioned view of the index.
*
* When sampling an index shard the {@link ILinearList} API is not
* defined for the {@link FusedView}. Since this sampling operator
* exists for the purposes of estimating the cardinality of an access
* path, we can dispense with the fused view and collect a number of
* samples from each component of that view which is proportional to
* the range count of the view divided by the range count of the
* component index. This may cause tuples which have since been
* deleted to become visible, but this should not cause problems when
* estimating the cardinality of a join path as long as we always
* report the actual tuples from the fused view in the case where the
* desired sample size is LTE the estimated range count of the access
* path.
*
* @todo Better performance could be realized by accepting all tuples in a
* leaf. This requires a sensitivity to the leaf boundaries which
* might be obtained with an {@link ITupleCursor} extension interface
* for local indices or with the {@link ILeafCursor} interface if that
* can be exposed from a sufficiently low level {@link ITupleCursor}
* implementation. However, when they are further constraints layered
* onto the access path by the {@link IPredicate} it may be that such
* clustered (leaf at once) sampling is not practical.
*
* @todo When sampling a global view of a partitioned index, we should focus
* the sample on a subset of the index partitions in order to
* "cluster" the effort. This can of course introduce bias. However,
* if there are a lot of index partitions then the sample will of
* necessity be very small in proportion to the data volume and the
* opportunity for bias will be correspondingly large.
*
* @todo If there is an {@link IAccessPathExpander} then
*/
private class SampleTask implements Callable {

    /** The context used to resolve the relation and the access path. */
    private final BOpContextBase context;

    SampleTask(final BOpContextBase context) {
        this.context = context;
    }

    /**
     * Samples the access path using the limit, sample type and predicate
     * configured on the enclosing operator.
     */
    public E[] call() throws Exception {
        final AccessPathSample result = sample(limit(), getSampleType(),
                getPredicate());
        return result.getSample();
    }

    /**
     * Return a sample from the access path.
     * <p>
     * When the (non-exact) range count does not exceed the limit, the
     * access path is sampled directly, without an advancer. Otherwise the
     * predicate is re-issued with the CURSOR and PARALLEL flags and an
     * index-local advancer implementing the requested strategy.
     *
     * @param limit
     *            The maximum number of elements in the sample.
     * @param sampleType
     *            The sampling strategy.
     * @param predicate
     *            The predicate describing the access path to sample.
     *
     * @return The sample.
     *
     * @throws UnsupportedOperationException
     *             if the sample type is not recognized.
     */
    public AccessPathSample sample(final int limit,
            final SampleType sampleType, IPredicate predicate) {

        final IRelation relation = context.getRelation(predicate);

        // @todo assumes raw AP.
        final AccessPath accessPath = (AccessPath) context.getAccessPath(
                relation, predicate);

        final long rangeCount = accessPath.rangeCount(false/* exact */);

        if (rangeCount <= limit) {
            // The sample will contain everything in the access path.
            return new AccessPathSample(limit, accessPath);
        }

        /*
         * Add the CURSOR and PARALLEL flags to the predicate.
         *
         * @todo turn off REVERSE if specified.
         */
        final int flags = predicate.getProperty(
                IPredicate.Annotations.FLAGS,
                IPredicate.Annotations.DEFAULT_FLAGS)
                | IRangeQuery.CURSOR
                | IRangeQuery.PARALLEL;

        final IPredicate flagged = (IPredicate) predicate.setProperty(
                IPredicate.Annotations.FLAGS, flags);

        // Attach the advancer which collects the sample.
        final Advancer advancer = newAdvancer(sampleType, limit, accessPath);

        final IPredicate filtered = ((Predicate) flagged)
                .addIndexLocalFilter(advancer);

        return new AccessPathSample(limit, context.getAccessPath(relation,
                filtered));
    }

    /** Create the advancer implementing the given sampling strategy. */
    private Advancer newAdvancer(final SampleType sampleType,
            final int limit, final AccessPath accessPath) {
        switch (sampleType) {
        case EVEN:
            return new EvenSampleAdvancer(// rangeCount,
                    limit, accessPath.getFromKey(), accessPath.getToKey());
        case RANDOM:
            return new RandomSampleAdvancer(// rangeCount,
                    seed(), limit, accessPath.getFromKey(),
                    accessPath.getToKey());
        case DENSE:
            return new DenseSampleAdvancer();
        default:
            throw new UnsupportedOperationException("SampleType="
                    + sampleType);
        }
    }
}
/**
* Dense samples in key order (simple index scan). The advancer never
* repositions the source, so no tuples are skipped by this class.
*
* @author Bryan Thompson
*/
private static class DenseSampleAdvancer extends Advancer {
private static final long serialVersionUID = 1L;
/**
* Intentionally does nothing.
*
* @param tuple
* The tuple which was just visited (ignored).
*/
@Override
protected void advance(final ITuple tuple) {
// NOP
}
}
/**
 * An advancer pattern which is designed to take evenly distributed samples
 * from an index. The caller specifies the #of tuples to be sampled. On the
 * first tuple visited this class computes the range count of the key range
 * from the index positions of its bounds and derives the #of tuples to be
 * skipped after each tuple visited.
 * <p>
 * Note: This can fail to gather the desired number of samples if additional
 * filters are applied which further restrict the elements selected by the
 * predicate. However, it will still faithfully represent the expected
 * cardinality of the sampled access path (tuples tested).
 * <p>
 * Note: This class does not itself stop after <code>limit</code> tuples;
 * the consumer of the iterator is expected to enforce the limit.
 */
private static class EvenSampleAdvancer extends Advancer {

    private static final long serialVersionUID = 1L;

    /** The desired total limit on the sample. */
    private final int limit;

    /** The exclusive upper bound key -or- null if there is no upper bound. */
    private final byte[] /*fromKey,*/ toKey;

    /*
     * Transient data. This gets initialized when we visit the first tuple.
     */

    /** The #of tuples to be skipped after every tuple visited. */
    private transient long skipCount;

    /** The #of tuples accepted so far. */
    private transient int nread = 0;

    /** The inclusive lower bound of the first tuple actually visited. */
    private transient long fromIndex;

    /** The exclusive upper bound of the last tuple which could be visited. */
    private transient long toIndex;

    /**
     * @param limit
     *            The #of samples to visit.
     * @param fromKey
     *            Unused; the lower bound is taken from the first tuple
     *            actually visited.
     * @param toKey
     *            The exclusive upper bound key -or- null.
     */
    public EvenSampleAdvancer(final int limit, final byte[] fromKey,
            final byte[] toKey) {
        this.limit = limit;
        this.toKey = toKey;
    }

    /**
     * Invoked after each tuple is visited. Initializes the skip count on
     * the first call and then seeks ahead by that many tuples.
     */
    @Override
    protected void advance(final ITuple tuple) {

        final AbstractBTree ndx = (AbstractBTree) src.getIndex();

        final long currentIndex = ndx.indexOf(tuple.getKey());

        if (nread == 0) {

            // inclusive lower bound.
            fromIndex = currentIndex;

            // exclusive upper bound.
            toIndex = toKey == null ? ndx.getEntryCount() : ndx
                    .indexOf(toKey);

            if (toIndex < 0) {
                /*
                 * The key was not found, so indexOf() returned
                 * (-(insertion point) - 1). Recover the insertion point,
                 * which is the exclusive upper bound. (This used to be
                 * computed as -toIndex + 1, which is off by two.)
                 */
                toIndex = -toIndex - 1;
            }

            final long rangeCount = (toIndex - fromIndex);

            skipCount = Math.max(1L, rangeCount / limit);

            // minus one since src.next() already consumed one tuple.
            skipCount -= 1;

        }

        nread++;

        if (skipCount > 0) {

            /*
             * If the skip count is positive, then skip over N tuples,
             * never seeking past the last tuple in the index.
             */
            final long nextIndex = Math.min(ndx.getEntryCount() - 1,
                    currentIndex + skipCount);

            src.seek(ndx.keyAt(nextIndex));

        }

    }

} // class EvenSampleAdvancer
/**
 * An advancer pattern which is designed to take randomly distributed
 * samples from an index. The caller specifies the #of tuples to be sampled.
 * This class computes the index positions bounding the key range and then
 * computes a set of random offsets into that range from which it will
 * collect the desired #of samples.
 * <p>
 * Note: This can fail to gather the desired number of samples if additional
 * filters are applied which further restrict the elements selected by the
 * predicate. However, it will still faithfully represent the expected
 * cardinality of the sampled access path (tuples tested).
 */
private static class RandomSampleAdvancer extends Advancer {

    private static final long serialVersionUID = 1L;

    /** The random number generator seed. */
    private final long seed;

    /** The desired total limit on the sample. */
    private final int limit;

    /** The bounds of the key range; either may be null (unbounded). */
    private final byte[] fromKey, toKey;

    /*
     * Transient data. This gets initialized by init().
     */

    /** The offset of each tuple to be sampled. */
    private transient long[] offsets;

    /** The #of tuples accepted so far. */
    private transient int nread = 0;

    /** The inclusive lower bound of the first tuple actually visited. */
    private transient long fromIndex;

    /** The exclusive upper bound of the last tuple which could be visited. */
    private transient long toIndex;

    /**
     * @param seed
     *            The random number generator seed (passed through to the
     *            offset sampler).
     * @param limit
     *            The #of samples to visit.
     * @param fromKey
     *            The inclusive lower bound key -or- null.
     * @param toKey
     *            The exclusive upper bound key -or- null.
     */
    public RandomSampleAdvancer(final long seed, final int limit,
            final byte[] fromKey, final byte[] toKey) {
        this.seed = seed;
        this.limit = limit;
        this.fromKey = fromKey;
        this.toKey = toKey;
    }

    /**
     * Convert the result of an indexOf() lookup into an index position.
     * A key which was not found is reported as
     * <code>(-(insertion point) - 1)</code>; the insertion point is
     * recovered in that case. (This used to be computed as
     * <code>-n + 1</code>, which is off by two.)
     */
    private static long toPosition(final long indexOfResult) {
        return indexOfResult < 0 ? -indexOfResult - 1 : indexOfResult;
    }

    /**
     * Computes the bounds of the key range, chooses the offsets to be
     * sampled and seeks to the first of them.
     *
     * @return <code>true</code> always.
     */
    @Override
    protected boolean init() {

        final AbstractBTree ndx = (AbstractBTree) src.getIndex();

        // inclusive lower bound.
        fromIndex = fromKey == null ? 0 : toPosition(ndx.indexOf(fromKey));

        // exclusive upper bound.
        toIndex = toKey == null ? ndx.getEntryCount() : toPosition(ndx
                .indexOf(toKey));

        // get offsets to be sampled.
        offsets = new SmartOffsetSampler().getOffsets(seed, limit,
                fromIndex, toIndex);

        // Skip to the first tuple.
        src.seek(ndx.keyAt(offsets[0]));

        return true;

    }

    /**
     * Invoked after each tuple is visited; seeks to the next offset to be
     * sampled, if any remain.
     */
    @Override
    protected void advance(final ITuple tuple) {

        final AbstractBTree ndx = (AbstractBTree) src.getIndex();

        if (nread < offsets.length - 1) {

            /*
             * Skip to the next tuple.
             *
             * NOTE(review): init() has already sought to offsets[0], yet
             * the first call here (nread == 0) seeks to offsets[0] again
             * and offsets[length - 1] is never used. Confirm against the
             * seek()/next() contract of the source cursor whether this
             * should read offsets[nread + 1].
             */
            final long nextIndex = offsets[nread];

            src.seek(ndx.keyAt(nextIndex));

        }

        nread++;

    }

} // class RandomSampleAdvancer
/**
 * A sample from an access path.
 * <p>
 * An empty access path yields an empty sample, which is represented by a
 * <code>null</code> sample array (see {@link #getSample()}).
 */
public static class AccessPathSample implements Serializable {

    private static final long serialVersionUID = 1L;

    /** The predicate of the sampled access path. */
    private final IPredicate pred;

    /** The key order of the sampled access path. */
    private final IKeyOrder keyOrder;

    /** The limit which was imposed on the sample. */
    private final int limit;

    /** The sampled elements -or- <code>null</code> if the sample is empty. */
    private final E[] sample;

    /**
     * Constructor populates the sample using the caller's
     * {@link IAccessPath#iterator()}. The caller is responsible for setting
     * up the {@link IAccessPath} such that it provides an efficient sample
     * of the access path with the appropriate constraints.
     *
     * @param limit
     *            The maximum #of elements to drain from the access path.
     * @param accessPath
     *            The access path to be drained.
     *
     * @throws IllegalArgumentException
     *             if limit is non-positive.
     * @throws IllegalArgumentException
     *             if accessPath is <code>null</code>.
     */
    @SuppressWarnings("unchecked")
    private AccessPathSample(final int limit,
            final IAccessPath accessPath) {

        if (limit <= 0)
            throw new IllegalArgumentException();

        if (accessPath == null)
            throw new IllegalArgumentException();

        this.pred = accessPath.getPredicate();

        this.keyOrder = accessPath.getKeyOrder();

        this.limit = limit;

        // drain the access path iterator.
        final ArrayList tmp = new ArrayList(limit);

        final Iterator src = accessPath.iterator(0L/* offset */, limit,
                limit/* capacity */);

        while (src.hasNext() && tmp.size() < limit) {
            tmp.add(src.next());
        }

        if (tmp.isEmpty()) {
            /*
             * Nothing was visited. There is no element from which the
             * array's component type could be taken (tmp.get(0) used to
             * throw here), so the empty sample is represented by null as
             * documented on getSample().
             */
            sample = null;
        } else {
            // convert to an array typed after the first element's class.
            sample = tmp.toArray((E[]) java.lang.reflect.Array.newInstance(
                    tmp.get(0).getClass(), tmp.size()));
        }

    }

    /** The predicate of the sampled access path. */
    public IPredicate getPredicate() {
        return pred;
    }

    /**
     * Return <code>true</code> iff the sample contains no elements. (This
     * used to test <code>sample != null</code>, which was inverted.)
     */
    public boolean isEmpty() {
        return sample == null;
    }

    /** The #of elements in the sample (zero if the sample is empty). */
    public int sampleSize() {
        return sample == null ? 0 : sample.length;
    }

    /** The limit which was imposed on the sample. */
    public int limit() {
        return limit;
    }

    /**
     * The sample.
     *
     * @return The sample -or- <code>null</code> if the sample was empty.
     */
    public E[] getSample() {
        return sample;
    }

} // AccessPathSample
/**
* Interface for obtaining an array of tuple offsets to be sampled.
*
* @author thompsonbry
*/
public interface IOffsetSampler {
/**
* Return an array of tuple indices which may be used to sample a key
* range of some index.
*
* Note: The caller must stop when it runs out of offsets, not when the
* limit is satisfied, as there will be fewer offsets returned when the
* half open range is smaller than the limit.
*
* @param seed
* The seed for the random number generator -or- ZERO (0L)
* for a random seed. A non-zero value may be used to create
* a repeatable sample.
* @param limit
* The maximum #of tuples to sample.
* @param fromIndex
* The inclusive lower bound.
* @param toIndex
* The exclusive upper bound.
*
* @return An array of at most limit offsets into the index. The
* offsets will lie in the half open range [fromIndex,toIndex).
* The elements of the array will be in ascending order. No
* offsets will be repeated.
*
* @throws IllegalArgumentException
* if limit is non-positive.
* @throws IllegalArgumentException
* if fromIndex is negative.
* @throws IllegalArgumentException
* if toIndex is negative.
* @throws IllegalArgumentException
* unless toIndex is GT fromIndex.
*/
long[] getOffsets(long seed, int limit, long fromIndex, long toIndex);
}
/**
 * Dispatching sampler which delegates to whichever implementation is
 * considered most efficient for the limit and key range to be sampled.
 *
 * @author thompsonbry
 */
public static class SmartOffsetSampler implements IOffsetSampler {

    /**
     * {@inheritDoc}
     */
    public long[] getOffsets(final long seed, int limit,
            final long fromIndex, final long toIndex) {

        if (limit < 1 || fromIndex < 0 || toIndex < 0
                || toIndex <= fromIndex) {
            throw new IllegalArgumentException();
        }

        final long rangeCount = toIndex - fromIndex;

        /*
         * Never ask for more samples than there are tuples. The cast is
         * safe since the result is no larger than [limit], an int32.
         */
        final int n = (int) Math.min((long) limit, rangeCount);

        if (n == rangeCount) {
            // Visit everything.
            return new EntireRangeOffsetSampler().getOffsets(seed, n,
                    fromIndex, toIndex);
        }

        /*
         * Otherwise choose random offsets covering a subset of the key
         * range, selecting without replacement (the same tuple is never
         * visited twice).
         *
         * FIXME When the limit approaches the range count and the range
         * count is large (too large for a bit vector or acceptance set
         * approach), it would be cheaper to build a hash set of the
         * (rangeCount-limit) offsets NOT to be visited. To really benefit
         * from the smaller memory profile the IOffsetSampler interface
         * would also need to become an iterator rather than returning an
         * array.
         */

        // Disabled: BitVectorOffsetSampler is flagged as broken (see the
        // FIXME on that class). Do not enable without re-running the
        // stress test.
        if (false && rangeCount < Bytes.kilobyte32 * 8) {
            // NB: 32k range count uses a 4k bit vector.
            return new BitVectorOffsetSampler().getOffsets(seed, n,
                    fromIndex, toIndex);
        }

        /*
         * Track the accepted offsets in a hash set. This is a good fit when
         * the limit is small (or significantly smaller than the range
         * count) and remains good unless the limit is very large.
         */
        return new AcceptanceSetOffsetSampler().getOffsets(seed, n,
                fromIndex, toIndex);

    }

}
/**
 * Sampler which returns every offset in the half-open range. It may only be
 * used when the limit is GTE the range count.
 */
public static class EntireRangeOffsetSampler implements IOffsetSampler {

    /**
     * {@inheritDoc}
     *
     * @throws UnsupportedOperationException
     *             if limit!=rangeCount (after adjusting for limits
     *             greater than the rangeCount).
     */
    public long[] getOffsets(final long seed, int limit,
            final long fromIndex, final long toIndex) {

        if (limit < 1 || fromIndex < 0 || toIndex < 0
                || toIndex <= fromIndex) {
            throw new IllegalArgumentException();
        }

        final long rangeCount = toIndex - fromIndex;

        /*
         * Clamp the limit to the range count. The cast is safe since the
         * result is no larger than [limit], an int32.
         */
        final int n = (int) Math.min((long) limit, rangeCount);

        if (n != rangeCount) {
            // The limit is smaller than the range: not the entire range.
            throw new UnsupportedOperationException();
        }

        // Every offset in [fromIndex, toIndex), in ascending order.
        final long[] offsets = new long[n];

        long next = fromIndex;

        for (int i = 0; i < offsets.length; i++) {
            offsets[i] = next++;
        }

        return offsets;

    }

}
/**
 * Return a randomly selected ordered array of offsets in the given
 * half-open range.
 * <p>
 * This approach is based on a bit vector with one bit per offset in the
 * range. If the randomly chosen bit is already marked, then the offset has
 * been used and we scan for the nearest free offset. This requires
 * [rangeCount] bits, so it works well when the rangeCount of the key range
 * is small. For example, a range count of 32k requires a 4kb bit vector,
 * which is quite manageable.
 * <p>
 * Note: This class used to mark an offset with
 * <code>BitVector#add(index, value)</code>, which <em>inserts</em> a new
 * bit at that index (shifting the bits after it and growing the vector)
 * rather than setting the existing bit. The bit is now set in place.
 * {@link SmartOffsetSampler} still has this implementation disabled;
 * re-run the stress test before enabling it there.
 */
public static class BitVectorOffsetSampler implements IOffsetSampler {

    /**
     * {@inheritDoc}
     * <p>
     * Note: The utility of this class is limited to smaller range counts
     * (32k is fine, 2x or 4x that is also Ok) so it will reject anything
     * with a very large range count.
     *
     * @throws UnsupportedOperationException
     *             if the rangeCount is GT {@link Integer#MAX_VALUE}
     */
    public long[] getOffsets(final long seed, int limit,
            final long fromIndex, final long toIndex) {

        if (limit < 1)
            throw new IllegalArgumentException();
        if (fromIndex < 0)
            throw new IllegalArgumentException();
        if (toIndex < 0)
            throw new IllegalArgumentException();
        if (toIndex <= fromIndex)
            throw new IllegalArgumentException();

        final long rangeCount2 = (toIndex - fromIndex);

        if (rangeCount2 > Integer.MAX_VALUE) {
            /*
             * The utility of this class is limited to smaller range counts
             * so it will reject anything with a very large range count.
             */
            throw new UnsupportedOperationException();
        }

        // known to be an int32 value.
        final int rangeCount = (int) rangeCount2;

        if (limit > rangeCount) {
            // can not sample more tuples than exist in the range.
            limit = rangeCount;
        }

        // offsets of tuples to visit.
        final long[] offsets = new long[limit];

        // create a cleared bit vector with one bit per offset in the range.
        final BitVector v = LongArrayBitVector.ofLength(//
                rangeCount// length (in bits)
                );

        // Random number generator using caller's seed (if given).
        final Random rnd = seed == 0L ? new Random() : new Random(seed);

        // Choose [limit] distinct random positions within the range.
        for (int i = 0; i < limit; i++) {

            // k in (0:rangeCount-1).
            int k = rnd.nextInt(rangeCount);

            if (v.getBoolean((long) k)) {

                /*
                 * This bit is already taken. Look for a free bit, first
                 * scanning forward from k and then, if the rest of the
                 * vector is full, scanning backward from k. One must exist
                 * since fewer than [rangeCount] bits have been set.
                 */
                final long nextZero = v.nextZero((long) k);

                if (nextZero != -1L) {

                    k = (int) nextZero;

                } else {

                    final long priorZero = v.previousZero((long) k);

                    if (priorZero != -1L) {

                        k = (int) priorZero;

                    } else {

                        // No empty bit found?
                        throw new AssertionError();

                    }

                }

            }

            assert !v.getBoolean((long) k);

            // Mark the bit in place (set, NOT add: add would insert a bit).
            v.set((long) k, true);

            assert v.getBoolean((long) k);

            offsets[i] = fromIndex + k;

            assert offsets[i] < toIndex;

        }

        // put them into sorted order for more efficient traversal.
        Arrays.sort(offsets);

        return offsets;

    }

}
/**
 * An implementation based on a hash set of the offsets which have been
 * accepted. This implementation is a good choice when the limit is moderate
 * (~100k) and the rangeCount is significantly greater than the limit. The
 * memory demand is O(limit).
 *
 * @author thompsonbry
 */
public static class AcceptanceSetOffsetSampler implements IOffsetSampler {

    /**
     * {@inheritDoc}
     *
     * @throws UnsupportedOperationException
     *             if the rangeCount is GT {@link Integer#MAX_VALUE}
     */
    public long[] getOffsets(final long seed, int limit,
            final long fromIndex, final long toIndex) {

        if (limit < 1)
            throw new IllegalArgumentException();
        if (fromIndex < 0)
            throw new IllegalArgumentException();
        if (toIndex < 0)
            throw new IllegalArgumentException();
        if (toIndex <= fromIndex)
            throw new IllegalArgumentException();

        final long rangeCount2 = (toIndex - fromIndex);

        if (rangeCount2 > Integer.MAX_VALUE)
            throw new UnsupportedOperationException();

        // known to be an int32 value.
        final int rangeCount = (int) rangeCount2;

        if (limit > rangeCount) {
            // can not sample more tuples than exist in the range.
            limit = rangeCount;
        }

        // offsets of tuples to visit.
        final long[] offsets = new long[limit];

        /*
         * Hash set of accepted positions. At most [limit] entries are ever
         * added, so the set is sized for [limit]. (It used to be sized for
         * [rangeCount], which can approach Integer.MAX_VALUE even for a
         * tiny sample and defeated the O(limit) memory demand.)
         */
        final IntOpenHashSet v = new IntOpenHashSet(//
                limit// expected #of elements
                );

        // Random number generator using caller's seed (if given).
        final Random rnd = seed == 0L ? new Random() : new Random(seed);

        // Choose [limit] distinct random positions within the range.
        for (int i = 0; i < limit; i++) {

            // k in (0:rangeCount-1).
            int k = rnd.nextInt(rangeCount);

            /*
             * If this position was already accepted, probe forward for the
             * next position which was not, wrapping around to zero at most
             * once.
             */
            int round = 0;

            while (v.contains(k)) {

                k++;

                if (k == rangeCount) {

                    // wrap around.
                    if (++round > 1) {
                        // no free position found?
                        throw new AssertionError();
                    }

                    // reset starting index.
                    k = 0;

                }

            }

            assert !v.contains(k);

            // Accept this position.
            v.add(k);

            offsets[i] = fromIndex + k;

            assert offsets[i] < toIndex;

        }

        // put them into sorted order for more efficient traversal.
        Arrays.sort(offsets);

        return offsets;

    }

}
}