com.yahoo.documentapi.VisitorIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of documentapi Show documentation
There is a newer version: 8.441.21
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.documentapi;

import com.yahoo.document.BucketId;
import com.yahoo.document.BucketIdFactory;
import com.yahoo.document.select.BucketSelector;
import com.yahoo.document.select.parser.ParseException;
import java.util.logging.Level;

import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Logger;

/**
 * Enables transparent iteration of super/sub-buckets
 *
 * Thread safety: safe for threads to hold their own iterators (no shared state),
 * as long as they also hold the ProgressToken object associated with it. No two
 * VisitorIterator instances may share the same progress token instance at the
 * same time.
 * Concurrent access to a single VisitorIterator instance is not safe and must
 * be handled atomically by the caller.
 *
 * @author vekterli
 */
public class VisitorIterator {

    private final ProgressToken progressToken;
    private final BucketSource bucketSource;
    private int distributionBitCount;

    private static final Logger log = Logger.getLogger(VisitorIterator.class.getName());

    /**
     * Pairs a superbucket with the progress bucket recorded for it. Instances are
     * what bucket sources and the iterator hand out from their {@code getNext()} methods.
     */
    public static class BucketProgress {
        private BucketId superbucket;
        private BucketId progress;

        /**
         * @param superbucket the superbucket being iterated
         * @param progress the progress bucket associated with the superbucket
         */
        public BucketProgress(BucketId superbucket, BucketId progress) {
            this.progress = progress;
            this.superbucket = superbucket;
        }

        /** @return the superbucket given at construction */
        public BucketId getSuperbucket() {
            return this.superbucket;
        }

        /** @return the progress bucket given at construction */
        public BucketId getProgress() {
            return this.progress;
        }
    }

    /**
     * Provides an abstract interface to VisitorIterator for
     * how pending buckets are acquired, decoupling this from the iteration
     * itself.
     *
     * Important: it is the responsibility of the {@link BucketSource} implementation
     * to ensure that progress information is honored for (partially) finished buckets.
     * From the point of view of the iterator itself, it should not have to deal with
     * filtering away already finished buckets, as this is a detail best left to
     * bucket sources.
     */
    protected static interface BucketSource {
        public boolean hasNext();
        public boolean shouldYield();
        public boolean visitsAllBuckets();
        public BucketProgress getNext();
        public long getTotalBucketCount();
        public int getDistributionBitCount();
        public void setDistributionBitCount(int distributionBitCount,
                                            ProgressToken progress);
        public void update(BucketId superbucket, BucketId progress,
                           ProgressToken token);
    }

    /**
     * Provides a bucket source that encompasses the entire range available
     * through a given value of distribution bits
     */
    protected static class DistributionRangeBucketSource implements BucketSource {
        private boolean flushActive = false;
        private int distributionBitCount;
        private long totalBucketsSplit;
        private long totalBucketsMerged;
        private final int slices;
        private final int sliceId;
        // Wouldn't need this if this were a non-static class, but do it for
        // the sake of keeping things identical in Java and C++
        private ProgressToken progressToken;

        public DistributionRangeBucketSource(int distributionBitCount,
                                             ProgressToken progress,
                                             int slices, int sliceId) {
            if (slices < 1) {
                throw new IllegalArgumentException("slices must be positive, but was " + slices);
            }
            if (sliceId < 0 || sliceId >= slices) {
                throw new IllegalArgumentException("sliceId must be in [0, " + slices + "), but was " + sliceId);
            }

            this.slices = slices;
            this.sliceId = sliceId;
            this.totalBucketsSplit = 0;
            this.totalBucketsMerged = 0;
            this.progressToken = progress;

            // New progress token (could also be empty, in which this is a
            // no-op anyway)
            if (progressToken.getTotalBucketCount() == 0) {
                assert(progressToken.isEmpty()) : "inconsistent progress state";
                progressToken.setTotalBucketCount(1L << distributionBitCount);
                progressToken.setDistributionBitCount(distributionBitCount);
                progressToken.setBucketCursor(0);
                progressToken.setFinishedBucketCount(0);
                this.distributionBitCount = distributionBitCount;
            }
            else {
                this.distributionBitCount = progressToken.getDistributionBitCount();
                // Quick consistency check to ensure the user isn't trying to eg.
                // pass a progress token for an explicit document selection
                if (progressToken.getTotalBucketCount() != (1L << progressToken.getDistributionBitCount())) {
                    throw new IllegalArgumentException("Total bucket count in existing progress is not "
                            + "consistent with that of the current document selection");
                }
            }

            if (!progress.isFinished()) {
                if (log.isLoggable(Level.FINE)) {
                    log.log(Level.FINE, "Importing unfinished progress token with " +
                            "bits: " + progressToken.getDistributionBitCount() +
                            ", active: " + progressToken.getActiveBucketCount() +
                            ", pending: " + progressToken.getPendingBucketCount() +
                            ", cursor: " + progressToken.getBucketCursor() +
                            ", finished: " + progressToken.getFinishedBucketCount() +
                            ", total: " + progressToken.getTotalBucketCount());
                }
                if (!progress.isEmpty()) {
                    // Lower all active to pending
                    if (progressToken.getActiveBucketCount() > 0) {
                        if (log.isLoggable(Level.FINE)) {
                            log.log(Level.FINE, "Progress token had active buckets upon range " +
                                    "construction. Setting these as pending");
                        }
                        progressToken.setAllBucketsToState(ProgressToken.BucketState.BUCKET_PENDING);
                    }
                    // Fixup for any buckets that were active when progress was written
                    // but are now pending and with wrong dist bits (used-bits). Buckets
                    // split here may very well be split/merged again if we set a new dist
                    // bit count, but that is the desired process
                    correctInconsistentPending(progressToken.getDistributionBitCount());
                    // Fixup for bucket cursor in case of bucket space downscaling
                    correctTruncatedBucketCursor();

                    if (log.isLoggable(Level.FINE)) {
                        log.log(Level.FINE, "Partial bucket space progress; continuing "+
                                "from position " + progressToken.getBucketCursor());
                    }
                }
                progressToken.setFinishedBucketCount(progressToken.getBucketCursor() -
                        progressToken.getPendingBucketCount());
            } else {
                assert(progressToken.getBucketCursor() == progressToken.getTotalBucketCount());
            }
            // Should be all fixed up and good to go
            progressToken.setInconsistentState(false);
            skipToSlice();
        }

        /**
         * Checks whether the token's bucket state can simply be cleared and restarted
         * without losing any recorded progress.
         *
         * This is the case when the number of pending buckets equals the bucket cursor
         * (every bucket ever fetched is still in the pending set) and every tracked bucket
         * is pending with a progress id of 0.
         *
         * @return true if clearing all buckets and resetting the cursor loses no progress
         */
        protected boolean isLosslessResetPossible() {
            // #pending must equal the cursor, i.e. all buckets ever fetched
            // must still be located in the set of pending
            if (progressToken.getPendingBucketCount() != progressToken.getBucketCursor()) {
                return false;
            }
            // Only the entries are needed here, so iterate the (typed) values directly
            for (ProgressToken.BucketEntry entry : progressToken.getBuckets().values()) {
                if (entry.getState() != ProgressToken.BucketState.BUCKET_PENDING) {
                    return false;
                }
                if (entry.getProgress().getId() != 0) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Ensure that a given ProgressToken instance only has
         * buckets pending that have a used-bits count of that of the
         * targetDistBits. This is done by splitting or merging
         * all inconsistent buckets until the desired state is reached.
         *
         * If a lossless reset is possible (see {@link #isLosslessResetPossible()}),
         * all buckets are cleared and the cursor is reset to 0 instead, and no
         * splitting or merging takes place.
         *
         * Time complexity is approx O(4bn) where b is the maximum
         * delta of bits to change anywhere in the set of pending and n
         * is the number of pending. This includes the time spent making shallow
         * map copies.
         *
         * @param targetDistBits The desired distribution bit count of the buckets
         */
        private void correctInconsistentPending(int targetDistBits) {
            boolean maybeInconsistent = true;
            long bucketsSplit = 0, bucketsMerged = 0;
            long pendingBefore = progressToken.getPendingBucketCount();
            ProgressToken p = progressToken;

            // Optimization: before doing any splitting/merging at all, we check
            // to see if we can't simply just reset the entire internal state
            // with the new distribution bit count. This ensures that if we go
            // from eg. 1 bit to 20 bits, we won't have to perform a grueling
            // half a million splits to cover the same bucket space as that 1
            // single-bit bucket once did
            if (isLosslessResetPossible()) {
                if (log.isLoggable(Level.FINE)) {
                    log.log(Level.FINE, "At start of bucket space and all " +
                            "buckets have no progress; doing a lossless reset " +
                            "instead of splitting/merging");
                }
                assert(p.getActiveBucketCount() == 0);
                p.clearAllBuckets();
                p.setBucketCursor(0);
                return;
            }

            while (maybeInconsistent) {
                BucketId lastMergedBucket = null;
                maybeInconsistent = false;
                // Do separate passes for splitting and merging just to make
                // absolutely sure that the two ops won't step on each others'
                // toes. This isn't wildly efficient, but the data sets in question
                // are presumed to be low in size and this is presumed to be a very
                // infrequent operation.
                // Each pass works on a shallow copy of the bucket map, since the
                // token's own map is modified by the split/merge calls. BucketKeyWrapper
                // keys are considered immutable, and should thus not be at risk
                // for being changed during the inner loop
                TreeMap<ProgressToken.BucketKeyWrapper, ProgressToken.BucketEntry> buckets
                        = new TreeMap<>(p.getBuckets());
                for (Map.Entry<ProgressToken.BucketKeyWrapper, ProgressToken.BucketEntry> entry
                        : buckets.entrySet()) {
                    assert(entry.getValue().getState() == ProgressToken.BucketState.BUCKET_PENDING);
                    BucketId pending = new BucketId(ProgressToken.keyToBucketId(entry.getKey().key()));
                    if (pending.getUsedBits() < targetDistBits) {
                        if (pending.getUsedBits() + 1 < targetDistBits) {
                            maybeInconsistent = true; // Do another pass
                        }
                        p.splitPendingBucket(pending);
                        ++bucketsSplit;
                    }
                }

                // Make new map copy with potentially split buckets
                buckets = new TreeMap<>(p.getBuckets());
                for (Map.Entry<ProgressToken.BucketKeyWrapper, ProgressToken.BucketEntry> entry
                        : buckets.entrySet()) {
                    assert(entry.getValue().getState() == ProgressToken.BucketState.BUCKET_PENDING);
                    BucketId pending = new BucketId(ProgressToken.keyToBucketId(entry.getKey().key()));
                    if (pending.getUsedBits() > targetDistBits) {
                        // If this is the right sibling of an already merged left sibling,
                        // it's already been merged away, so we should skip it
                        if (lastMergedBucket != null) {
                            BucketId rightCheck = new BucketId(lastMergedBucket.getUsedBits(),
                                    lastMergedBucket.getId() | (1L << (lastMergedBucket.getUsedBits() - 1)));
                            if (pending.equals(rightCheck)) {
                                if (log.isLoggable(Level.FINEST)) {
                                    log.log(Level.FINEST, "Skipped " + pending +
                                            ", as it was right sibling of " + lastMergedBucket);
                                }
                                continue;
                            }
                        }
                        if (pending.getUsedBits() - 1 > targetDistBits) {
                            maybeInconsistent = true; // Do another pass
                        }
                        p.mergePendingBucket(pending);
                        ++bucketsMerged;

                        lastMergedBucket = pending;
                    }
                }
            }
            if ((bucketsSplit > 0 || bucketsMerged > 0) && log.isLoggable(Level.FINE)) {
                log.log(Level.FINE, "Existing progress' pending buckets had inconsistent " +
                        "distribution bits; performed " + bucketsSplit + " split ops and " +
                        bucketsMerged + " merge ops. Pending: " + pendingBefore + " -> " +
                        p.getPendingBucketCount());
            }
            // Accumulate into the source-wide counters
            totalBucketsSplit += bucketsSplit;
            totalBucketsMerged += bucketsMerged;
        }

        /**
         * Pushes the bucket cursor forward so that it lies beyond every tracked bucket
         * whose used-bits count equals the current distribution bit count.
         *
         * Scaling the cursor down may leave it behind buckets that are still tracked in
         * the token. For example, a cursor of 3 at 31 bits moved to 11 bits truncates
         * to 0, although with one pending bucket it should be 1.
         */
        private void correctTruncatedBucketCursor() {
            for (ProgressToken.BucketKeyWrapper wrapper : progressToken.getBuckets().keySet()) {
                BucketId tracked = wrapper.toBucketId();
                if (tracked.getUsedBits() != distributionBitCount) {
                    continue;
                }
                // The bucket's index within the range is held in the topmost used-bits of the key
                long bucketIndex = wrapper.key() >>> (64 - tracked.getUsedBits());
                if (bucketIndex >= progressToken.getBucketCursor()) {
                    progressToken.setBucketCursor(bucketIndex + 1);
                }
            }
            if (log.isLoggable(Level.FINEST)) {
                log.log(Level.FINEST, "New range bucket cursor is " + progressToken.getBucketCursor());
            }
        }

        /**
         * Tells whether the range still holds a bucket for this slice.
         *
         * The candidate is the first bucket index at or after the cursor that is congruent
         * to sliceId modulo slices. With a distribution bit count of 1 (treated throughout
         * this class as "not yet properly initialised") slicing is ignored and the cursor
         * itself is the candidate.
         *
         * @return true if the candidate index lies inside the bucket space
         */
        public boolean hasNext() {
            final long cursor = progressToken.getBucketCursor();
            final long candidate = (distributionBitCount == 1)
                    ? cursor
                    : cursor + Math.floorMod(sliceId - cursor, slices);
            return candidate < (1L << distributionBitCount);
        }

        /**
         * Tells whether iteration must stall. The flag is raised when a distribution bit
         * change arrives while buckets are active, and cleared by update() once no
         * active buckets remain.
         *
         * @return true while active buckets are being flushed
         */
        public boolean shouldYield() {
            // If we need to flush all active buckets, stall the iteration until
            // this has been done
            return flushActive;
        }

        /**
         * @return always true for the range source
         */
        public boolean visitsAllBuckets() {
            return true;
        }

        /**
         * @return the size of the bucket space at the source's current distribution
         * bit count, i.e. 2^distributionBitCount
         */
        public long getTotalBucketCount() {
            return 1L << distributionBitCount;
        }

        /**
         * Hands out the bucket at the current cursor position with empty progress,
         * then advances the cursor to the next bucket belonging to this slice.
         *
         * Precondition: hasNext() == true
         *
         * @return the current superbucket paired with a default-constructed progress bucket
         */
        public BucketProgress getNext() {
            assert(hasNext()) : "getNext() called with hasNext() == false";

            final BucketId current = progressToken.getCurrentBucketId();
            final BucketProgress handedOut = new BucketProgress(current, new BucketId());

            final long advancedCursor = progressToken.getBucketCursor() + 1;
            progressToken.setBucketCursor(advancedCursor);

            // Move on to our next slice right away, so that the bucket space is also
            // exhausted when hasNext() turns false while superbuckets remain after this one.
            skipToSlice();

            return handedOut;
        }

        /**
         * Advances the wrapped progress token's bucket cursor to the next bucket of our
         * slice, calling {@code skipCurrentBucket()} on the token for every bucket passed
         * over. Does nothing while the distribution bit count is 1, i.e. before the source
         * has been given a proper distribution bit count.
         */
        private void skipToSlice() {
            if (distributionBitCount == 1) {
                return;
            }
            for (long cursor = progressToken.getBucketCursor();
                 cursor < getTotalBucketCount() && (cursor % slices) != sliceId;
                 cursor = progressToken.getBucketCursor()) {
                progressToken.skipCurrentBucket();
            }
        }

        /**
         * @return the distribution bit count held by this source. Note that
         * setDistributionBitCount() updates this value immediately, even when the
         * token's own bit count is only changed after active buckets have been flushed.
         */
        public int getDistributionBitCount() {
            return distributionBitCount;
        }

        /**
         * Switches the source to a new distribution bit count.
         *
         * The source's own bit count is updated immediately. If the token has active
         * buckets, the source only flags that a flush is needed and marks the token
         * inconsistent; the actual correction then happens from update() once no active
         * buckets remain. Otherwise pending buckets are split/merged to the new bit count,
         * the cursor and finished count are scaled by the bit delta, and the token's
         * total bucket count and distribution bit count are updated.
         *
         * @param distributionBitCount the new distribution bit count
         * @param progress not used for state changes; the token given at construction
         *                 is the one operated on
         */
        public void setDistributionBitCount(int distributionBitCount,
                                            ProgressToken progress)
        {
            this.distributionBitCount = distributionBitCount;

            // There might be a case where we're waiting for active buckets
            // already when a new distribution bit change comes in. If so,
            // don't do anything at all yet with the set of pending
            if (progressToken.getActiveBucketCount() > 0) {
                flushActive = true;
                if (log.isLoggable(Level.FINE)) {
                    // Report the count from the same token the decision above was made on
                    log.log(Level.FINE, "Holding off new/pending buckets and consistency " +
                            "correction until all " + progressToken.getActiveBucketCount() +
                            " active buckets have been updated");
                }
                progressToken.setInconsistentState(true);
            } else {
                // Only perform the actual distribution bit bucket ops if we've
                // got no active buckets
                int delta = distributionBitCount - progressToken.getDistributionBitCount();

                // Must do this before setting the bucket cursor to allow
                // reset-checking to be performed
                correctInconsistentPending(distributionBitCount);
                if (delta > 0) {
                    if (log.isLoggable(Level.FINE)) {
                        log.log(Level.FINE, "Increasing distribution bits for full bucket " +
                                "space range source from " + progressToken.getDistributionBitCount() + " to " +
                                distributionBitCount);
                    }
                    progressToken.setFinishedBucketCount(progressToken.getFinishedBucketCount() << delta);
                    // By n-doubling the position, the bucket key ordering ensures
                    // we go from eg. 3:0x02 to 4:0x02 to 5:02 etc.
                    progressToken.setBucketCursor(progressToken.getBucketCursor() << delta);
                } else if (delta < 0) {
                    if (log.isLoggable(Level.FINE)) {
                        log.log(Level.FINE, "Decreasing distribution bits for full bucket " +
                                "space range source from " + progressToken.getDistributionBitCount() +
                                " to " + distributionBitCount + " bits");
                    }
                    // Scale down bucket space and cursor
                    progressToken.setBucketCursor(progressToken.getBucketCursor() >>> -delta);
                    progressToken.setFinishedBucketCount(progressToken.getFinishedBucketCount() >>> -delta);
                }

                progressToken.setTotalBucketCount(1L << distributionBitCount);
                progressToken.setDistributionBitCount(distributionBitCount);

                // Scaling down may have moved the cursor behind still-tracked buckets
                correctTruncatedBucketCursor();
                progressToken.setInconsistentState(false);
                skipToSlice();
            }
        }

        /**
         * Records progress for a superbucket in the token given at construction and, once
         * no active buckets remain, completes any distribution bit change that was held
         * back and recomputes the finished bucket count.
         *
         * @param superbucket the superbucket the progress applies to
         * @param progress the progress reached; ProgressToken.FINISHED_BUCKET marks completion
         * @param token not read by this implementation
         */
        public void update(BucketId superbucket, BucketId progress,
                           ProgressToken token) {
            progressToken.updateProgress(superbucket, progress);

            final boolean wrongBitCount = superbucket.getUsedBits() != distributionBitCount;
            if (wrongBitCount) {
                final boolean finished = progress.equals(ProgressToken.FINISHED_BUCKET);
                if (!finished) {
                    // Active buckets are always flushed before a consistency fix,
                    // so a flush must be in progress here
                    assert(flushActive);
                }
                if (log.isLoggable(Level.FINE)) {
                    final String prefix = finished
                            ? "Received finished bucket "
                            : "Received non-finished bucket ";
                    log.log(Level.FINE, prefix + superbucket
                            + " with wrong distribution bit count (" + superbucket.getUsedBits()
                            + "). Waiting to correct until all active are done");
                }
            }

            if (progressToken.getActiveBucketCount() != 0) {
                return;
            }

            if (flushActive) {
                if (log.isLoggable(Level.FINE)) {
                    log.log(Level.FINE,
                            "All active buckets flushed, correcting progress token and continuing normal operation");
                }
                // With no active buckets left, this call performs the real state change
                setDistributionBitCount(distributionBitCount, progressToken);
                assert(progressToken.getDistributionBitCount() == distributionBitCount);
            }
            flushActive = false;

            // Inconsistent active buckets may have kept #finished from being correct.
            // All pending buckets should presumably share the right bit count by now,
            // so the count can be derived from cursor and pending.
            // TODO: ensure this is consistent
            final long pendingCount = progressToken.getPendingBucketCount();
            final long cursor = progressToken.getBucketCursor();
            if (pendingCount <= cursor) {
                progressToken.setFinishedBucketCount(cursor - pendingCount);
            }
        }
    }

    /**
     * Provides an explicit set of bucket IDs to iterate over. Will immediately
     * set these as pending in the {@link ProgressToken}, as it is presumed this set is
     * rather small. Changing the distribution bit count for this source only
     * records the new value in the source and the token; the explicit bucket IDs
     * themselves are never split, merged or otherwise changed.
     */
    protected static class ExplicitBucketSource implements BucketSource {
        private int distributionBitCount;
        private long totalBucketCount = 0;

        /**
         * Creates the source and, for a token that is neither finished nor already
         * holding buckets, registers every superbucket as pending with empty progress.
         *
         * @param superbuckets the explicit set of buckets to visit
         * @param distributionBitCount distribution bits to use when the token holds no prior state
         * @param progress the token in which progress is tracked, possibly carrying earlier progress
         * @throws IllegalArgumentException if an existing token's bucket counts do not match
         *         the given bucket set, or if its bucket cursor is non-zero
         */
        public ExplicitBucketSource(Set<BucketId> superbuckets,
                                    int distributionBitCount,
                                    ProgressToken progress) {
            this.totalBucketCount = superbuckets.size();

            // New progress token?
            if (progress.getTotalBucketCount() == 0) {
                progress.setTotalBucketCount(this.totalBucketCount);
                progress.setDistributionBitCount(distributionBitCount);
                this.distributionBitCount = distributionBitCount;
            }
            else {
                // Quick consistency check to ensure the user isn't trying to eg.
                // pass a progress token for another document selection
                if (progress.getTotalBucketCount() != totalBucketCount
                        || (progress.getFinishedBucketCount() + progress.getPendingBucketCount()
                            + progress.getActiveBucketCount() != totalBucketCount)) {
                    throw new IllegalArgumentException("Total bucket count in existing progress is not " +
                            "consistent with that of the current document selection");
                }
                if (progress.getBucketCursor() != 0) {
                    // Trying to use a range source progress file
                    throw new IllegalArgumentException("Cannot use given progress file with the "+
                            "current document selection");
                }
                // An existing token dictates the distribution bit count
                this.distributionBitCount = progress.getDistributionBitCount();
            }

            // Nothing to register if the token is already done or already holds buckets
            if (progress.isFinished() || !progress.isEmpty()) return;

            for (BucketId id : superbuckets) {
                // Add all superbuckets with zero sub-bucket progress and pending
                progress.addBucket(id, new BucketId(), ProgressToken.BucketState.BUCKET_PENDING);
            }
        }

        /** @return always false; the source itself never hands out buckets */
        public boolean hasNext() {
            return false;
        }

        /** @return always false; this source never stalls the iteration */
        public boolean shouldYield() {
            return false;
        }

        /** @return always false; only the explicitly given buckets are visited */
        public boolean visitsAllBuckets() {
            return false;
        }

        /** @return the size of the bucket set given at construction */
        public long getTotalBucketCount() {
            return totalBucketCount;
        }

        /**
         * All explicit buckets should have been placed in the progress
         * token during construction, so this method should never be called.
         *
         * @throws IllegalStateException always
         */
        public BucketProgress getNext() {
            throw new IllegalStateException("getNext() called on ExplicitBucketSource");
        }

        /** @return the distribution bit count last recorded by this source */
        public int getDistributionBitCount() {
            return distributionBitCount;
        }

        /**
         * Records the new distribution bit count in the given token and in this source.
         * No buckets are touched.
         *
         * @param distributionBitCount the new distribution bit count
         * @param progress the token to record the new bit count in
         */
        public void setDistributionBitCount(int distributionBitCount,
                                            ProgressToken progress)
        {
            // Setting distribution bits for explicit bucket source is essentially
            // a no-op, since its buckets already are fixed at 32 used bits.
            progress.setDistributionBitCount(distributionBitCount);
            this.distributionBitCount = distributionBitCount;
            if (log.isLoggable(Level.FINE)) {
                log.log(Level.FINE, "Set distribution bit count to "
                        + distributionBitCount + " for explicit bucket source (no-op)");
            }
        }

        /**
         * Forwards the progress update to the given token.
         *
         * @param superbucket the superbucket the progress applies to
         * @param progress the progress reached within the superbucket
         * @param token the token to update
         */
        public void update(BucketId superbucket, BucketId progress,
                           ProgressToken token) {
            // Simply delegate to ProgressToken, as it maintains all progress state
            token.updateProgress(superbucket, progress);
        }
    }

    /**
     * Creates an iterator that draws new buckets from the given source and tracks
     * their progress in the given token. Both must agree on the distribution bit count.
     *
     * @param progressToken A {@link ProgressToken} instance, allowing the progress of
     * finished or partially finished buckets to be tracked
     * @param bucketSource An instance of {@link BucketSource}, providing the working set for
     * the iterator
     *
     * @see BucketSource
     * @see ProgressToken
     */
    private VisitorIterator(ProgressToken progressToken,
                            BucketSource bucketSource) {
        assert(progressToken.getDistributionBitCount() == bucketSource.getDistributionBitCount())
                : "inconsistent distribution bit counts";
        this.bucketSource = bucketSource;
        this.progressToken = progressToken;
        this.distributionBitCount = progressToken.getDistributionBitCount();
    }


    /**
     * @return The pair [superbucket, progress] that specifies the next iterable
     * bucket. When a superbucket is initially returned, the pair is equal to
     * that of [superbucket, 0], as there has been no progress into its sub-buckets
     * yet (if they exist).
     *
     * Precondition: hasNext() == true
     */
    public BucketProgress getNext() {
        assert(progressToken.getDistributionBitCount() == bucketSource.getDistributionBitCount())
                : "inconsistent distribution bit counts for progress and source";
        assert(hasNext());
        // We prioritize returning buckets in the pending map over those
        // that may be in the bucket source, since we want to avoid growing
        // the map too much
        if (progressToken.hasPending()) {
            // Find first pending bucket in token, in the iteration order of the token's bucket map
            ProgressToken.BucketEntry pending = null;
            BucketId superbucket = null;
            for (Map.Entry<ProgressToken.BucketKeyWrapper, ProgressToken.BucketEntry> entry
                    : progressToken.getBuckets().entrySet()) {
                if (entry.getValue().getState() == ProgressToken.BucketState.BUCKET_PENDING) {
                    pending = entry.getValue();
                    superbucket = new BucketId(ProgressToken.keyToBucketId(entry.getKey().key()));
                    break;
                }
            }
            assert(pending != null) : "getNext() called with inconsistent state";

            // Set bucket to active, since it's now awaiting an update
            pending.setState(ProgressToken.BucketState.BUCKET_ACTIVE);

            // Keep the token's counters in step with the state change made on the entry above
            progressToken.setActiveBucketCount(progressToken.getActiveBucketCount() + 1);
            progressToken.setPendingBucketCount(progressToken.getPendingBucketCount() - 1);

            return new BucketProgress(superbucket, pending.getProgress());
        } else {
            // Nothing pending: take a fresh bucket from the source and track it as active
            BucketProgress ret = bucketSource.getNext();
            progressToken.addBucket(ret.getSuperbucket(), ret.getProgress(),
                    ProgressToken.BucketState.BUCKET_ACTIVE);
            return ret;
        }
    }

    /**
     * Check whether or not it is valid to call {@link #getNext()} with the current
     * iterator state.
     *
     * There exists a case wherein hasNext may return false before {@link #update} is
     * called, but true afterwards. This happens when the set of pending buckets is
     * empty, the bucket source is empty but the set of active buckets is
     * not. A future progress update on any of the buckets in the active set may
     * or may not make that bucket available to the pending set again.
     * This must be handled explicitly by the caller by checking {@link #isDone()}
     * and ensuring that {@link #update} is called before retrying hasNext.
     *
     * This method will also return false if the number of distribution bits have
     * changed and there are active buckets needing to be flushed before the
     * iterator will allow new buckets to be handed out.
     *
     * @return Whether or not it is valid to call {@link #getNext()} with the current
     * iterator state.
     */
    public boolean hasNext() {
        // A bucket can come either from the token's pending set or from the source...
        final boolean bucketAvailable = progressToken.hasPending() || bucketSource.hasNext();
        // ...but none is handed out while the source asks the iteration to yield
        return bucketAvailable && !bucketSource.shouldYield();
    }

    /**
     * Check if the iterator is actually done
     *
     * @see #hasNext()
     *
     * @return true iff the bucket source is empty and
     * there are no pending or active buckets in the progress token.
     */
    public boolean isDone() {
        // Done only when nothing more can be handed out and nothing is still in flight
        return !hasNext() && !progressToken.hasActive();
    }

    /**
     * Tell the iterator that we've finished processing up to and
     * including progress. progress may be a sub-bucket or
     * the invalid 0-bucket (in case the caller fails to process the bucket and
     * must return it to the set of pending) or the special case BucketId(Integer.MAX_VALUE),
     * the latter indicating to the iterator that traversal is complete for
     * superbucket's tree. The null bucket should only be used if no
     * non-null updates have yet been given for the superbucket.
     *
     * It is a requirement that each superbucket returned by {@link #getNext()} must
     * eventually result in 1-n update operations, where the last update operation
     * has the special progress==super case.
     *
     * If the document selection used to create the iterator is unknown and there
     * were active buckets at the time of a distribution bit state change, such
     * a bucket passed to update() will be in an inconsistent state
     * with regards to the number of bits it uses. For unfinished buckets, this
     * is handled by splitting or merging it until it's consistent, depending on
     * whether or not it had a lower or higher distribution bit count than that of
     * the current system state. For finished buckets of a lower dist bit count,
     * the amount of finished buckets in the ProgressToken is adjusted upwards
     * to compensate for the fact that a bucket using fewer distribution bits
     * actually covers more of the bucket space than the ones that are currently
     * in use. For finished buckets of a higher dist bit count, the number of
     * finished buckets is not increased at that point in time, since
     * such a bucket doesn't actually cover an entire bucket with the current state.
     *
     * All this is done automatically and transparently to the caller once all
     * active buckets have been updated.
     *
     * @param superbucket A valid bucket ID that has been retrieved earlier through
     * {@link #getNext()}
     * @param progress A bucket logically contained within super. Subsequent
     * updates for the same superbucket must have progress be in an increasing
     * order, where order is defined as the in-order traversal of the bucket split
     * tree. May also be the null bucket if the superbucket has not seen any "proper"
     * progress updates yet or the special case Integer.MAX_VALUE. Note that inconsistent
     * splitting might actually see progress as containing super
     * rather than vice versa, so this is explicitly allowed to pass by the code.
     */
    public void update(BucketId superbucket, BucketId progress) {
        // The bucket source is the component that knows how to reconcile buckets
        // whose distribution bit count is inconsistent with the current state,
        // so the entire update is forwarded to it together with the progress token.
        this.bucketSource.update(superbucket, progress, this.progressToken);
    }

    /**
     * @return The total number of iterable buckets that remain to be processed.
     *
     * Note: this count currently also includes all non-finished (i.e. active and
     * pending) buckets.
     */
    public long getRemainingBucketCount() {
        // Remaining = every bucket the token knows about minus those already finished.
        return this.progressToken.getTotalBucketCount()
                - this.progressToken.getFinishedBucketCount();
    }

    /**
     * @return Internal bucket source instance. Do NOT modify!
     */
    protected BucketSource getBucketSource() {
        // Returns the live instance used by this iterator, not a copy.
        return this.bucketSource;
    }

    /**
     * @return The progress token instance held by this iterator. This is the
     * live object, not a copy.
     */
    public ProgressToken getProgressToken() {
        return this.progressToken;
    }

    /**
     * @return The distribution bit count currently recorded by this iterator
     */
    public int getDistributionBitCount() {
        return this.distributionBitCount;
    }

    /**
     * Set the distribution bit count for the iterator and the buckets it
     * currently maintains and will return in the future.
     *
     * For document selections that result in an explicit set of buckets, this
     * is essentially a no-op, so in such a case, disregard the rest of this text.
     *
     * Changing the number of distribution bits for an unknown document
     * selection will effectively scale the bucket space that will be visited;
     * each bit increase or decrease doubling or halving its size, respectively.
     * When increasing, any pending buckets will be split to ensure the total
     * bucket space covered remains the same. Correspondingly, when decreasing,
     * any pending buckets will be merged appropriately.
     *
     * If there are buckets active at the time of the change, the actual
     * bucket splitting/merging operations are kept on hold until all active
     * buckets have been updated, at which point they will be automatically
     * performed. The iterator will force such an update by not giving out
     * any new or pending buckets until that happens.
     *
     * Note: when decreasing the number of distribution bits,
     * there is a chance of losing superbucket progress in a bucket that
     * is merged with another bucket, leading to potential duplicate
     * results.
     *
     * @param distBits New system state distribution bit count
     */
    public void setDistributionBitCount(int distBits) {
        if (distBits == distributionBitCount) {
            // Same bit count as before; nothing to propagate.
            return;
        }
        // Inform the bucket source first, then record the new value locally.
        bucketSource.setDistributionBitCount(distBits, progressToken);
        distributionBitCount = distBits;

        if (!log.isLoggable(Level.FINE)) {
            // Skip building the message when fine-level logging is disabled.
            return;
        }
        log.log(Level.FINE, "Set visitor iterator distribution bit count to " + distBits);
    }

    /**
     * @return The value reported by the underlying bucket source's
     * visitsAllBuckets()
     */
    public boolean visitsAllBuckets() {
        return this.bucketSource.visitsAllBuckets();
    }

    /**
     * Create a new VisitorIterator instance based on the given document
     * selection string. Forwards to the six-argument overload using a slice
     * count of 1 and a slice id of 0.
     *
     * @param documentSelection Document selection string used to create the
     * VisitorIterator instance
     * @param idFactory Bucket ID factory handed to the overload
     * @param distributionBitCount Distribution bit count handed to the overload
     * @param progress Progress token handed to the overload
     * @return A new VisitorIterator instance
     * @throws ParseException declared by the overload this method forwards to
     */
    public static VisitorIterator createFromDocumentSelection(
            String documentSelection,
            BucketIdFactory idFactory,
            int distributionBitCount,
            ProgressToken progress) throws ParseException {
        final int sliceCount = 1;
        final int sliceIndex = 0;
        return createFromDocumentSelection(
                documentSelection, idFactory, distributionBitCount, progress, sliceCount, sliceIndex);
    }
    /**
     * Create a new VisitorIterator instance based on the given document
     * selection string.
     *
     * @param documentSelection Document selection string used to create the
     * VisitorIterator instance. Depending on the characteristics of the
     * selection, the iterator may iterate over only a small subset of the buckets or
     * every bucket in the system. Both cases will be handled efficiently.
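     * @param idFactory {@link BucketId} factory specifying the number of distribution bits
     * to use et al.
     * @param distributionBitCount Number of distribution bits to use
     * @param progress A unique {@link ProgressToken} instance which is used for maintaining the state
     * of the iterator. Cannot be shared with other iterator instances at the same time.
     * If progress contains work done in an earlier iteration run, the iterator will pick
     * up from where it left off
     * @param slices Slice count handed to the range-based bucket source; not used when the
     * selection resolves to an explicit set of buckets
     * @param sliceId Slice identifier handed to the range-based bucket source; not used when
     * the selection resolves to an explicit set of buckets
     * @return A new VisitorIterator instance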
     * @throws ParseException if documentSelection fails to properly parse
     */
    public static VisitorIterator createFromDocumentSelection(
            String documentSelection,
            BucketIdFactory idFactory,
            int distributionBitCount,
            ProgressToken progress,
            int slices,
            int sliceId) throws ParseException {
        BucketSelector bucketSel = new BucketSelector(idFactory);
        // Parameterized (not raw) so the element type is checked at compile time.
        // A null result signals that the selection could not be narrowed down to
        // an explicit set of buckets.
        Set<BucketId> rawBuckets = bucketSel.getBucketList(documentSelection);
        BucketSource src;

        // Depending on whether the expression yielded an unknown number of
        // buckets, we create either an explicit bucket source or a distribution
        // bit-based range source
        if (rawBuckets == null) {
            // Range source; this is the only path that uses slices and sliceId
            src = new DistributionRangeBucketSource(distributionBitCount, progress, slices, sliceId);
        } else {
            // Explicit source; iterates exactly the buckets found by the selector
            src = new ExplicitBucketSource(rawBuckets, distributionBitCount, progress);
        }

        return new VisitorIterator(progress, src);
    }

    /**
     * Create a new VisitorIterator instance based on the given
     * set of buckets. This is supported for internal use only, and is required
     * by Synchronization. Use {@link #createFromDocumentSelection} instead for
     * all normal purposes.
     *
     * @param bucketsToVisit The set of buckets that will be visited
     * @param distributionBitCount Number of distribution bits to use
     * @param progress A unique ProgressToken instance which is used for maintaining the state
     * of the iterator. Can not be shared with other iterator instances at the same time.
     * If progress contains work done in an earlier iteration run, the iterator will pick
     * up from where it left off
     * @return A new VisitorIterator instance
     */
    public static VisitorIterator createFromExplicitBucketSet(
            Set bucketsToVisit,
            int distributionBitCount,
            ProgressToken progress) {
        // The caller supplies the exact buckets, so an explicit source is always used.
        // NOTE(review): bucketsToVisit is a raw Set; presumably it holds BucketId
        // elements - confirm against callers before parameterizing the signature.
        return new VisitorIterator(
                progress,
                new ExplicitBucketSource(bucketsToVisit, distributionBitCount, progress));
    }
}