/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.api.bulk;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;

import oracle.kv.Consistency;
import oracle.kv.Depth;
import oracle.kv.Direction;
import oracle.kv.Key;
import oracle.kv.KeyRange;
import oracle.kv.KeyValueVersion;
import oracle.kv.ParallelScanIterator;
import oracle.kv.StoreIteratorConfig;
import oracle.kv.impl.api.KVStoreImpl;
import oracle.kv.impl.api.KeySerializer;
import oracle.kv.impl.api.Request;
import oracle.kv.impl.api.StoreIteratorParams;
import oracle.kv.impl.api.ops.InternalOperation;
import oracle.kv.impl.api.ops.MultiGetBatchIterate;
import oracle.kv.impl.api.ops.MultiGetBatchKeysIterate;
import oracle.kv.impl.api.ops.Result;
import oracle.kv.impl.api.ops.ResultKey;
import oracle.kv.impl.api.ops.ResultKeyValueVersion;
import oracle.kv.impl.api.parallelscan.BaseParallelScanIteratorImpl;
import oracle.kv.impl.api.parallelscan.DetailedMetricsImpl;
import oracle.kv.impl.async.IterationHandleNotifier;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.TopologyUtil;
import oracle.kv.impl.util.KVThreadFactory;
import oracle.kv.stats.DetailedMetrics;

import com.sleepycat.je.utilint.PropUtil;

/**
 * Implementation of a bulk get storeIterator and storeKeysIterator.
 *
 * BulkMultiGet.BulkGetIterator<K, V>: Implements the bulk get
 * operation. It provides the common underpinnings for batching both rows and
 * KV pairs.
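 *
 * A minimal usage sketch (illustrative only, not from the original source;
 * the store handle and the key iterators are assumed to already exist, and
 * the parameter values shown are arbitrary):
 *
 * <pre>{@code
 * KVStoreImpl storeImpl = ...;                  // existing store handle
 * List<Iterator<Key>> keyIterators = ...;       // caller-supplied key streams
 * ParallelScanIterator<KeyValueVersion> iter =
 *     BulkMultiGet.createBulkMultiGetIterator(
 *         storeImpl, keyIterators,
 *         100,                                  // batchSize per partition
 *         null,                                 // subRange
 *         Depth.PARENT_AND_DESCENDANTS,
 *         Consistency.NONE_REQUIRED,
 *         0, TimeUnit.MILLISECONDS,             // 0 => default request timeout
 *         new StoreIteratorConfig());
 * try {
 *     while (iter.hasNext()) {
 *         KeyValueVersion kvv = iter.next();
 *         // process kvv ...
 *     }
 * } finally {
 *     iter.close();
 * }
 * }</pre>
 *
 * createBulkMultiGetKeysIterator is used analogously and returns Key elements.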
 */
public class BulkMultiGet {

    /**
     * Creates a bulk get iterator returning KeyValueVersion.
     */
    public static ParallelScanIterator<KeyValueVersion>
        createBulkMultiGetIterator(final KVStoreImpl storeImpl,
                                   final List<Iterator<Key>> parentKeyIterators,
                                   final int batchSize,
                                   final KeyRange subRange,
                                   final Depth depth,
                                   final Consistency consistency,
                                   final long timeout,
                                   final TimeUnit timeoutUnit,
                                   final StoreIteratorConfig config) {

        /* Prohibit iteration of internal keyspace (//). */
        final KeyRange useRange =
            storeImpl.getKeySerializer().restrictRange(null, subRange);

        final StoreIteratorParams params =
            new StoreIteratorParams(Direction.UNORDERED,
                                    batchSize,
                                    null,
                                    useRange,
                                    depth,
                                    consistency,
                                    timeout,
                                    timeoutUnit);

        return new BulkGetIterator<Key, KeyValueVersion>(
            storeImpl, parentKeyIterators, params, config,
            null /* iterationHandle */) {

            @Override
            public void validateKey(Key key) {
                return;
            }

            @Override
            public Key getKey(Key key) {
                return key;
            }

            @Override
            protected InternalOperation
                generateBulkGetOp(List<byte[]> parentKeys, byte[] resumeKey) {

                return new MultiGetBatchIterate(parentKeys,
                                                resumeKey,
                                                params.getSubRange(),
                                                params.getDepth(),
                                                params.getBatchSize());
            }

            @Override
            protected void convertResult(Result result,
                                         List<KeyValueVersion> elementList) {
                final List<ResultKeyValueVersion> results =
                    result.getKeyValueVersionList();
                if (results.size() == 0) {
                    return;
                }
                for (ResultKeyValueVersion entry: results) {
                    final byte[] keyBytes = entry.getKeyBytes();
                    final Key key = keySerializer.fromByteArray(keyBytes);
                    final KeyValueVersion value =
                        KVStoreImpl.createKeyValueVersion(
                            key,
                            entry.getValue(),
                            entry.getVersion(),
                            entry.getExpirationTime());
                    elementList.add(value);
                }
            }
        };
    }

    /**
     * Creates a bulk get iterator returning Keys.
     */
    public static ParallelScanIterator<Key> createBulkMultiGetKeysIterator
        (final KVStoreImpl storeImpl,
         final List<Iterator<Key>> parentKeyiterators,
         final int batchSize,
         final KeyRange subRange,
         final Depth depth,
         final Consistency consistency,
         final long timeout,
         final TimeUnit timeoutUnit,
         final StoreIteratorConfig config) {

        /* Prohibit iteration of internal keyspace (//). */
        final KeyRange useRange =
            storeImpl.getKeySerializer().restrictRange(null, subRange);

        final StoreIteratorParams params =
            new StoreIteratorParams(Direction.UNORDERED,
                                    batchSize,
                                    null,
                                    useRange,
                                    depth,
                                    consistency,
                                    timeout,
                                    timeoutUnit);

        return new BulkGetIterator<Key, Key>(storeImpl, parentKeyiterators,
                                             params, config,
                                             null /* iterationHandle */) {
            @Override
            public void validateKey(Key key) {
                return;
            }

            @Override
            public Key getKey(Key key) {
                return key;
            }

            @Override
            protected InternalOperation
                generateBulkGetOp(List<byte[]> parentKeys, byte[] resumeKey) {

                return new MultiGetBatchKeysIterate(parentKeys,
                                                    resumeKey,
                                                    params.getSubRange(),
                                                    params.getDepth(),
                                                    params.getBatchSize());
            }

            @Override
            protected void convertResult(Result result, List<Key> elementList) {
                final List<ResultKey> byteKeyResults = result.getKeyList();

                int cnt = byteKeyResults.size();
                if (cnt == 0) {
                    assert (!result.hasMoreElements());
                    return;
                }
                for (int i = 0; i < cnt; i += 1) {
                    final byte[] entry = byteKeyResults.get(i).getKeyBytes();
                    elementList.add(keySerializer.fromByteArray(entry));
                }
            }
        };
    }

    /**
     * This class implements the bulk get operation. It provides the common
     * underpinnings for batching both rows and KV pairs.
     *
     * In general, the bulk get operation works as follows:
     *
     * 1) Multiple reader tasks read keys from the user-supplied iterators;
     * the keys are grouped by partition, sorted, and wrapped into batches.
     *
     * 2) Multiple bulk get tasks take key batches and perform the get batch
     * operation.
     *
     * 3) The get batch operation retrieves the records associated with the
     * keys, and may also return their children or dependents, depending on
     * the Depth configuration for the KV API or the child/ancestor table
     * configuration for the Table API.
     *
     * The general flow of a key entry is as follows:
     *
     * 1) The key entries are supplied by the user-supplied iterators. Multiple
     * reader threads read the input keys in parallel.
     *
     * 2) Each reader accumulates the key entries, along with earlier entries,
     * in a sorted tree associated with each partition.
     *
     * 3) When the threshold associated with the partition is exceeded, the
     * leading elements in the sorted tree are assembled into a batch and placed
     * into a queue associated with the partition.
     *
     * 4) The ShardGetStream associated with the queue takes the batch and
     * retrieves the rows associated with the keys in the batch from the store.
     *
     * 5) Multiple ShardGetStreams perform the bulk get operation in parallel;
     * the number of streams is configurable. The flow is sketched below.
     *
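     * Schematically (an informal summary of the flow above, using the class
     * names defined below):
     *
     *   key iterators -> KeysReader threads -> per-partition PartitionValues
     *   (sorted key sets) -> PartitionBatch queues -> ShardGetStream tasks
     *   -> MultiGetBatch(Keys)Iterate operations -> result elements
     *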
     * Two key parameters are used in this operation:
     *
     * 1) Threshold for each partition batch = batchSize
     * 2) Number of shard get tasks =
     *      StoreIteratorConfig.maxConcurrentRequests > 0 ?
     *          MIN(StoreIteratorConfig.maxConcurrentRequests,
     *              2 * #RNs available for reads) :
     *          2 * #RNs available for reads.
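     *
     * For illustration (derived from the constructor logic below, not from the
     * original comment): with 6 RNs eligible for reads and
     * maxConcurrentRequests unset, 2 * 6 = 12 shard get tasks are created;
     * with maxConcurrentRequests = 8, MIN(8, 12) = 8 tasks are created.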
     *
     * @param <K> must be a PrimaryKey or a Key
     * @param <V> must be a Row or a KeyValueVersion
     *
     */
    public static abstract class BulkGetIterator<K, V>
            extends BaseParallelScanIteratorImpl<V> {

        /**
         * The Key comparator used to group keys associated with a partition,
         * so that they can be sent as a contiguous batch.
         */
        private final static Comparator<byte[]> KEY_BYTES_COMPARATOR =
            new Key.BytesComparator();

        /**
         * Canonical PartitionBatch object to signify EOF in the partition
         * batch queue.
         */
        private final PartitionBatch partitionBatchEOF =
            new PartitionBatch(null, null);

        /**
         * The topology associated with the store.
         */
        private final Topology topology;

        /**
         * A map indexed by partition id which yields the keys that are being
         * aggregated for that partition.
         */
        private final PartitionValues pMap[];

        /**
         * Used to manage key reader threads.
         */
        private ExecutorService readerExecutor;
        private HashMap<Future<Integer>, KeysReader> readerTasks;
        private final AtomicInteger remainingReaders = new AtomicInteger();
        private final Set<ShardGetStream> getStreams =
            new HashSet<ShardGetStream>();

        private final Consistency consistency;
        private final TimeUnit timeoutUnit;
        private final long timeout;

        private final Map<RepGroupId, DetailedMetricsImpl> shardMetrics;

        /**
         * Used to hold the aggregate statistics associated with this operation
         */
        private final AggregateStatistics statistics;

        protected final KeySerializer keySerializer;

        public BulkGetIterator(KVStoreImpl store,
                               List<Iterator<K>> parentKeyIterators,
                               StoreIteratorParams params,
                               StoreIteratorConfig config,
                               IterationHandleNotifier iterHandleNotifier) {
            super(store, store.getLogger(), getRequestTimeoutMs(store, params),
                  params.getDirection(), 32 /* maxResultsBatches */,
                  true, /* prefetch */
                  iterHandleNotifier);

            topology = storeImpl.getTopology();
            keySerializer = storeImpl.getKeySerializer();
            shardMetrics = new HashMap<RepGroupId, DetailedMetricsImpl>();
            statistics = new AggregateStatistics();

            /* Initialize parameters */
            consistency = params.getConsistency();
            timeout = params.getTimeout();
            timeoutUnit = params.getTimeoutUnit();

            /* Create partition values array */
            final int partitionThreshold = params.getBatchSize();
            final int nParts = topology.getPartitionMap().size();
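            /*
             * Partition ids are 1-based, so the array is sized nParts + 1 and
             * indexed directly by partition id (index 0 is unused).
             */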
            @SuppressWarnings("unchecked")
            PartitionValues[] partitionValues =
                new BulkGetIterator.PartitionValues[nParts+1];
            pMap = partitionValues;
            for (int i = 0 ; i <= nParts; i++) {
                pMap[i] = new PartitionValues(i, partitionThreshold);
            }

            /*
             * Create shard get tasks, the number of tasks is set to the value
             * of StoreIteratorConfig.getMaxConcurrentRequests() if it is set
             * and greater than 0, otherwise set to MIN(#available RNs,
             * nProcessors).  The shard get tasks need to be created before the
             * reader tasks start running so that the readers can insert stream
             * EOF markers.
             */
            final int maxConcurrentRequests =
                (config != null) ? config.getMaxConcurrentRequests() : 0;
            final int RNThreads = getNumOfShardTasks();
            final int maxShardTasks =
                    (maxConcurrentRequests > 0) ?
                                    Math.min(maxConcurrentRequests, RNThreads) :
                                    RNThreads;
            createShardExecutor(maxShardTasks);

            /* Start reader threads */
            startReaderExecutor(parentKeyIterators);

            /*
             * Start shard get tasks now that all reader threads are started so
             * that it is more likely that keys will be available for the get
             * operations without blocking.
             */
            for (final ShardGetStream task : getStreams) {
                task.submit();
            }

            /* Start a thread to monitor readers. */
            ExecutorService executor = Executors.newSingleThreadExecutor(
                new KVThreadFactory("BulkGetReadersMonitor", logger));
            executor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            logReaderProgress(readerExecutor);
                            for (Future<Integer> f: readerTasks.keySet()) {
                                int nKeys = f.get();
                                statistics.aggregate(nKeys);
                            }
                        } catch (InterruptedException ie) {
                            logger.info(Thread.currentThread() +
                                " caught " + ie);
                        } catch (ExecutionException ee) {
                            logger.info(Thread.currentThread() +
                                " caught " + ee);
                        }
                    }
                });
            executor.shutdown();
        }

        /**
         * Abstract method to validate a key supplied by the iterator.
         */
        protected abstract void validateKey(K key);

        /**
         * Abstract method that extracts the Key from a key entry supplied by
         * the iterator.
         */
        protected abstract Key getKey(K key);

        /**
         * Abstract method to create bulk get operation.
         */
        protected abstract InternalOperation generateBulkGetOp
            (List<byte[]> parentKeys, byte[] resumeKey);

        @Override
        protected boolean close(Throwable reason) {
            if (!super.close(reason)) {
                return false;
            }

            /* Cancel the reader tasks */
            for (Future<Integer> f : readerTasks.keySet()) {
                f.cancel(true);
            }

            List<Runnable> unfinishedBusiness = readerExecutor.shutdownNow();
            if (!unfinishedBusiness.isEmpty()) {
                final int nRemainingTasks = unfinishedBusiness.size();
                logger.log(Level.FINE,
                           "Bulk get reader executor didn't shutdown cleanly. "+
                           "{0} tasks remaining.", nRemainingTasks);
            }

            unfinishedBusiness = getTaskExecutor().shutdownNow();
            if (!unfinishedBusiness.isEmpty()) {
                final int nRemainingTasks = unfinishedBusiness.size();
                logger.log(Level.FINE,
                           "Bulk get shard executor didn't shutdown cleanly. "+
                           "{0} tasks remaining.", nRemainingTasks);
            }

            getStatInfo();
            logger.log(Level.INFO, statistics.toString());
            return true;
        }

        @Override
        public List<DetailedMetrics> getPartitionMetrics() {
            return Collections.emptyList();
        }

        @Override
        public List<DetailedMetrics> getShardMetrics() {
            synchronized (shardMetrics) {
                final ArrayList<DetailedMetrics> ret =
                    new ArrayList<DetailedMetrics>(shardMetrics.size());
                ret.addAll(shardMetrics.values());
                return ret;
            }
        }

        private static int getRequestTimeoutMs(KVStoreImpl storeImpl,
                                               StoreIteratorParams params) {
            final long timeOut = params.getTimeout();
            if (timeOut == 0) {
                return storeImpl.getDefaultRequestTimeoutMs();
            }
            final int requestTimeoutMs =
                PropUtil.durationToMillis(timeOut, params.getTimeoutUnit());
            if (requestTimeoutMs > storeImpl.getReadTimeoutMs()) {
                final String format = "Request timeout parameter: %,d ms " +
                    "exceeds socket read timeout: %,d ms";
                throw new IllegalArgumentException(
                    String.format(format, requestTimeoutMs,
                                  storeImpl.getReadTimeoutMs()));
            }
            return requestTimeoutMs;
        }

        /**
         * Create the tasks that will read batches of partition keys from
         * their respective shard. The number of tasks per shard is defined by
         * the configuration parameter: maxConcurrentRequests. There is at least
         * one task per shard to ensure that all shards are kept busy during
         * the load.
         *
         * The partitions are divided amongst each shard to ensure that no two
         * tasks ever access the same partition.
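         *
         * For illustration (not from the original comment): if a shard owns 10
         * partitions and is assigned 3 tasks, the tasks are given 4, 3 and 3
         * partitions respectively.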
         */
        private void createShardExecutor(int maxShardTasks) {
            final String fmt = "createShardExecutor #ShardTasks:%d, " +
                "#Shards:%d, base parallelism per shard:%d, " +
                "residual parallelism per shard:%d";

            final int nShards = topology.getRepGroupMap().size();
            int basePerShardParallelism;
            int residualPerShardParallelism;
            if (maxShardTasks > nShards) {
                basePerShardParallelism = maxShardTasks / nShards;
                residualPerShardParallelism = maxShardTasks % nShards;
            } else {
                basePerShardParallelism = 1;
                residualPerShardParallelism = 0;
            }

            logger.info(String.format(fmt, maxShardTasks, nShards,
                                      basePerShardParallelism,
                                      residualPerShardParallelism));

            final Map<RepGroupId, List<PartitionId>> map =
                TopologyUtil.getRGIdPartMap(topology);

            setTaskExecutor(maxShardTasks);
            for (RepGroupId rgId : topology.getRepGroupIds()) {
                final List<PartitionId> list = map.get(rgId);
                final int nParts = list.size();
                int perShardParallelism = basePerShardParallelism;
                if (residualPerShardParallelism > 0) {
                    residualPerShardParallelism--;
                    perShardParallelism++;
                }

                /* Divide up the partitions amongst the tasks. */
                final int basePerTaskPartitions = nParts / perShardParallelism;
                int residualPerTaskPartitions = nParts % perShardParallelism;

                doneWithShard:
                for (int i = 0; i < nParts;  ) {
                    int perTaskPartitions = basePerTaskPartitions;
                    if (residualPerTaskPartitions > 0) {
                        perTaskPartitions++;
                        residualPerTaskPartitions--;
                    }

                    if (perTaskPartitions == 0) {
                        /* More parallelism than partitions in shard. */
                        break doneWithShard;
                    }

                    final List<PartitionId> taskPartitions =
                        list.subList(i, i + perTaskPartitions);

                    logger.info("Partitions:" +
                        Arrays.toString(taskPartitions.toArray()) +
                        " assigned to RG task");

                    final ShardGetStream task =
                        new ShardGetStream(rgId, taskPartitions.size());
                    for (PartitionId pid : taskPartitions) {
                        PartitionValues pv = pMap[pid.getPartitionId()];
                        pv.setShardTask(task);
                    }
                    streams.add(task);
                    getStreams.add(task);

                    i += perTaskPartitions;
                }
            }
        }

        /**
         * Returns the consistency used for this operation.
         */
        private Consistency getConsistency() {
            return (consistency != null) ?
                    consistency : storeImpl.getDefaultConsistency();
        }

        /**
         * Returns the total number of threads that can be used for bulk get.
         */
        private int getNumOfShardTasks() {
            final int useNumRepNodes;
            if (getConsistency() == Consistency.ABSOLUTE) {
                useNumRepNodes = topology.getRepGroupMap().size();
            } else {
                final int[] readZoneIds =
                    storeImpl.getDispatcher().getReadZoneIds();
                useNumRepNodes =
                    TopologyUtil.getNumRepNodesForRead(topology, readZoneIds);
            }
            if (useNumRepNodes == 0) {
                throw new IllegalStateException("Store not yet initialized");
            }
            /* The 2x will keep all RNs busy, with a request in transit to/from
             * the RN and a request being processed.
             */
            return useNumRepNodes * 2;
        }

        /**
         * Start key reader tasks
         */
        private void startReaderExecutor(List<Iterator<K>> parentKeyIterators) {
            final ThreadFactory threadFactory =
                new KVThreadFactory("BulkGetReaders", logger);
            final int nReaders = parentKeyIterators.size();
            readerExecutor =
                Executors.newFixedThreadPool(nReaders, threadFactory);
            readerTasks = new HashMap<Future<Integer>, KeysReader>(nReaders);
            remainingReaders.set(nReaders);

            for (int i = 0; i < nReaders; i++) {
                final Iterator<K> keyIterator = parentKeyIterators.get(i);
                final KeysReader kr = new KeysReader(keyIterator);
                Future<Integer> future = null;
                try {
                    future = readerExecutor.submit(kr);
                } catch (RejectedExecutionException ree) {
                    close(ree);
                }
                readerTasks.put(future, kr);
            }
            readerExecutor.shutdown();
        }

        /**
         * Flush all residual values that were queued at their partitions to
         * their respective shards.
         */
        private void flushPartitions()
            throws InterruptedException {

            for (PartitionValues pv : pMap) {
               pv.flush(true);
            }
            logger.info("Flushed all partitions");
        }

        /**
         * Log progress at one minute intervals until all the readers have
         * reached EOF
         */
        private void logReaderProgress(final ExecutorService executor)
            throws InterruptedException {

            final long startMs = System.currentTimeMillis();
            long prevTotalRead = 0;

            while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
                final String fmt = "Reading continues. %,d values read. " +
                    "Throughput:%,d values/sec";
                final long totalRead = totalRead();
                final long throughput = (totalRead * 1000) /
                    (System.currentTimeMillis() - startMs);
                /*
                 * Log at warning level if there was no read progress and the
                 * operation appears to have stalled.
                 */
                logger.log((totalRead > prevTotalRead) ?
                    Level.INFO : Level.WARNING,
                    String.format(fmt, totalRead, throughput));
                prevTotalRead = totalRead;
            }
        }

        /**
         * Total entries read from all readers
         */
        private long totalRead() {
            long totalRead = 0;
            for (KeysReader kr: readerTasks.values()) {
                totalRead += kr.getReadCount();
            }
            return totalRead;
        }

        /**
         * Total aggregate statistics
         */
        private void getStatInfo() {
            for (ShardGetStream sgs : getStreams) {
                statistics.batchCount += sgs.getBatchCount();
                statistics.batchQueueUnderflow += sgs.getBatchQueueUnderflow();
                statistics.batchQueueOverflow += sgs.getBatchQueueOverflow();
                if (statistics.maxBatchRequestRepeated <
                        sgs.getMaxBatchRequestRepeated()) {
                    statistics.maxBatchRequestRepeated =
                        sgs.getMaxBatchRequestRepeated();
                }
            }
        }

        /**
         * This method is a no-op because bulk get does not support sorted
         * iteration.
         */
        @Override
        protected int compare(V one, V two) {
            return 0;
        }

        /**
         * Dedicated task used to read a specific Key stream
         */
        private class KeysReader implements Callable<Integer> {

            /**
             * The KeyStream being read
             */
            private final Iterator<K> keyIterator;

            /**
             * The number of keys read by this stream reader.
             */
            private volatile int readCount = 0;

            KeysReader(Iterator<K> keyIterator) {
                super();
                this.keyIterator = keyIterator;
            }

            @Override
            public Integer call()
                throws Exception {

                final KeySerializer serializer = storeImpl.getKeySerializer();
                logger.info("Started keys reader");
                try {
                    K k;
                    while (keyIterator.hasNext()) {
                        k = keyIterator.next();
                        if (k == null) {
                            throw new IllegalArgumentException("The parent key" +
                                " should not be null");
                        }
                        /* Call validateKey() to check the key */
                        validateKey(k);
                        readCount++;

                        final Key key = getKey(k);
                        final byte[] bytes = serializer.toByteArray(key);
                        final PartitionId pid = topology.getPartitionId(bytes);
                        pMap[pid.getPartitionId()].put(bytes);
                    }

                    /*
                     * Flush partitions and add EOF markers only after all keys
                     * have been generated.  Otherwise, the iterator might
                     * think it was done if it encountered EOFs and some
                     * streams hadn't been added yet.
                     */
                    if (remainingReaders.decrementAndGet() <= 0) {
                        flushPartitions();
                        for (ShardGetStream sgs: getStreams) {
                            sgs.setEOFPartitionBatch();
                        }
                    }
                } catch (RuntimeException e) {
                    close(e);
                } catch (Error e) {
                    close(e);
                } finally {
                    logger.info("Finished keys reader");
                }
                return readCount;
            }

            public int getReadCount() {
                return readCount;
            }
        }

        /**
         * Used to hold a sorted list of key byte arrays. The sorted list
         * ensures locality of reference during reading on the server.
         */
        private class PartitionBatch {
            final PartitionId pid;
            final List<byte[]> entries;

            PartitionBatch(PartitionId pid, List<byte[]> entries) {
                super();
                this.pid = pid;
                this.entries = entries;
            }
        }

        /**
         * Reads records for the partition batches assigned to this task's
         * shard.
         */
        public class ShardGetStream extends Stream {
            /**
             * The shard associated with this task
             */
            private final RepGroupId rgId;

            /**
             *  The queue of batches to be processed by this task.
             */
            private final ArrayBlockingQueue<PartitionBatch> queuedBatchs;

            /**
             * The number of batches processed by this task.
             */
            private long batchCount = 0 ;

            /**
             * The number of times this task was blocked because it did not have
             * a partition batch to execute. Large numbers of queue underflows
             * indicate that the user input streams are not providing data fast
             * enough and increasing stream parallelism could help.
             */
            private long batchQueueUnderflow = 0 ;

            /**
             * The number of times a batch could not be inserted because there
             * was no space in the queue. Large numbers of queue overflows
             * indicate that performance could benefit from increased shard
             * parallelism.
             */
            private long batchQueueOverflows = 0 ;

            /**
             * The maximum number of times a single batch of parent keys had to
             * be re-requested because the batch size was smaller than the total
             * size of its result set.
             */
            private int maxBatchRequestRepeated = 0;
            private int batchRequestRepeated = 0;

            private PartitionBatch currentBatch = null;
            private int resumeParentKeyIndex = -1;
            private byte[] resumeKey = null;

            ShardGetStream(RepGroupId rgId, int numTaskPartitions) {
                this.rgId = rgId;
                queuedBatchs = new ArrayBlockingQueue<PartitionBatch>(
                    numTaskPartitions * 2);
            }

            void add(PartitionBatch partBatch)
                throws InterruptedException {

                if (!queuedBatchs.offer(partBatch)) {
                    batchQueueOverflows++;
                    queuedBatchs.put(partBatch);
                }
            }

            @Override
            protected void updateDetailedMetrics(long timeInMs,
                                                 long recordCount) {

                final String shardName = rgId.toString();
                DetailedMetricsImpl dmi;

                /* Shard Metrics. */
                synchronized (shardMetrics) {
                    dmi = shardMetrics.get(rgId);
                    if (dmi == null) {
                        dmi = new DetailedMetricsImpl
                            (shardName, timeInMs, recordCount);
                        shardMetrics.put(rgId, dmi);
                        return;
                    }
                }
                dmi.inc(timeInMs, recordCount);
            }

            @Override
            protected void setResumeKey(Result result) {
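                /*
                 * Note: the resume index in the result is relative to the
                 * sublist of parent keys sent in the last request (see
                 * makeReadRequest), so it is accumulated into an index within
                 * the full current batch.
                 */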
                if (result.hasMoreElements()) {
                    if (resumeParentKeyIndex == -1) {
                        resumeParentKeyIndex = result.getResumeParentKeyIndex();
                    } else {
                        resumeParentKeyIndex +=
                            result.getResumeParentKeyIndex();
                    }
                    resumeKey = result.getPrimaryResumeKey();
                } else {
                    resetResumeKey();
                }
            }

            @Override
            protected Request makeReadRequest() {
                final List<byte[]> keys;
                if (resumeParentKeyIndex == -1) {
                    if (currentBatch == null) {
                        currentBatch = getPartitionBatch();
                        if (currentBatch == null) {
                            return null;
                        }
                    }
                    keys = currentBatch.entries;

                    batchCount++;
                    logMaxBatchRequestRepeated();
                } else {
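                    /*
                     * Resuming within the current batch: resend only the
                     * parent keys from the accumulated resume index onward,
                     * together with the primary resume key returned by the
                     * previous result.
                     */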
                    final List<byte[]> batchKeys = currentBatch.entries;
                    keys = batchKeys.subList(resumeParentKeyIndex,
                                             batchKeys.size());
                    batchRequestRepeated++;
                }

                final InternalOperation op = generateBulkGetOp(keys, resumeKey);
                return storeImpl.makeReadRequest(op, currentBatch.pid,
                                                 consistency, timeout,
                                                 timeoutUnit, null);
            }

            @Override
            protected boolean hasMoreElements(Result result) {
                if (result.hasMoreElements()) {
                    return true;
                }
                currentBatch = getPartitionBatch();
                resetResumeKey();
                return (currentBatch != null);
            }

            @Override
            public String toString() {
                return "ShardGetStream[" + rgId + "]";
            }

            private PartitionBatch getPartitionBatch() {
                try {
                    PartitionBatch pbatch = queuedBatchs.poll();
                    if (pbatch == null) {
                        batchQueueUnderflow++;

                        /*
                         * TODO: This method will block if the KeysReader
                         * threads have not been able to read enough keys from
                         * the iterators.  That blocking could end up blocking
                         * an async thread, which isn't good.  We should
                         * probably fix this.
                         */
                        pbatch = queuedBatchs.take();
                    }
                    if (pbatch == partitionBatchEOF) {
                        return null;
                    }
                    return pbatch;
                } catch (InterruptedException ie) {
                    logger.info(Thread.currentThread() + " caught " + ie);
                    Thread.currentThread().interrupt();
                }
                return null;
            }

            private void resetResumeKey() {
                resumeParentKeyIndex = -1;
                resumeKey = null;
            }

            long getBatchCount() {
                return batchCount;
            }

            long getBatchQueueUnderflow() {
                return batchQueueUnderflow;
            }

            long getBatchQueueOverflow() {
                return batchQueueOverflows;
            }

            int getMaxBatchRequestRepeated() {
                logMaxBatchRequestRepeated();
                return maxBatchRequestRepeated;
            }

            void setEOFPartitionBatch()
                throws InterruptedException {

                add(partitionBatchEOF);
            }

            private void logMaxBatchRequestRepeated() {
                if (batchRequestRepeated == 0) {
                    return;
                }
                if (batchRequestRepeated > maxBatchRequestRepeated) {
                    maxBatchRequestRepeated = batchRequestRepeated;
                }
                batchRequestRepeated = 0;
            }
        }

        /**
         * The values associated with a specific partition.
         */
        private class PartitionValues {

            /**
             * The partition associated with the values.
             */
            private final int partitionId;

            /**
             * The task designated to write this partition's values to its
             * shard.
             */
            private ShardGetStream getStream;

            /**
             * The number of entries that were actually inserted into the
             * partition.
             */
            private long getCount = 0;

            /**
             * The number of duplicate entries that already existed in the
             * partition.
             */
            private long dupCount = 0;

            /**
             * Holds the sorted values that are waiting to be written to the
             * shard. Tried ConcurrentSkipListMap to eliminate use of
             * synchronized methods but it resulted in lower perf on
             * Nashua machines.
             */
            private final Set<byte[]> keys =
                new TreeSet<byte[]>(KEY_BYTES_COMPARATOR);

            private final int threshold;

            PartitionValues(int pid, int partitionThreshold) {
                this.partitionId = pid;
                this.threshold = partitionThreshold;
            }

            void setShardTask(ShardGetStream stream) {
                getStream = stream;
            }

            synchronized void put(byte[] key)
                throws InterruptedException {

                if (keys.contains(key)) {
                    dupCount++;
                    return;
                }
                keys.add(key);
                flush(false);
            }

            /**
             * Flushes the key set to the batch queue of the shard task, if
             * needed.
             *
             * A flush is typically done when the number of queued keys reaches
             * the threshold.
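             *
             * Note: each emitted batch is additionally capped at roughly 1 MB
             * of serialized key bytes (maxRequestSize below), so a single
             * flush call may queue multiple PartitionBatch objects.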
             *
             * @param force if true the partition is flushed even if the
             * threshold has not been reached
             *
             */
            void flush(boolean force)
                throws InterruptedException {

                final int maxRequestSize = 1024 * 1024;

                final String fmt =
                    "Queued Partition %d flushed. Batch size %,d; Total:%,d;" +
                        " Number of keys:%,d; request size:%,d" ;
                int numKeys = keys.size();
                while ((force && numKeys > 0) || (numKeys >= threshold)) {
                    int getBatchCount = 0;
                    int requestSize = 0;
                    final List<byte[]> le = new ArrayList<byte[]>();
                    synchronized (this) {
                        for (Iterator<byte[]> iter = keys.iterator();
                            iter.hasNext();) {

                            final byte[] kvBytes = iter.next();
                            iter.remove();
                            getBatchCount++;
                            numKeys--;
                            requestSize += kvBytes.length;
                            le.add(kvBytes);
                            if (requestSize > maxRequestSize) {
                                break;
                            }
                        }
                        getCount += getBatchCount;
                    }
                    /* Can block, do it outside sync block */
                    final PartitionBatch batch =
                        new PartitionBatch(new PartitionId(partitionId),le);
                    getStream.add(batch);
                    logger.fine(String.format(fmt, partitionId,
                                              getBatchCount, getCount,
                                              keys.size(), requestSize));
                }
            }
        }

        /**
         * Represents the aggregate statistics across all keyStreams.
         */
        private class AggregateStatistics {

            private long batchCount;
            private long batchQueueUnderflow;
            private long batchQueueOverflow;
            private int maxBatchRequestRepeated;

            /**
             * The total number of entries read from all the streams supplied
             * to the operation. It may be less than the total number of keys
             * supplied by the iterators if they contain duplicate keys.
             */
            private long readCount ;

            public void aggregate(int entriesRead) {
                readCount += entriesRead;
            }

            /**
             * The total number of V actually retrieved from the store
             * as a result of the operation.
             */
            public long totalGetCount() {
                long total = 0;
                for (Entry<RepGroupId, DetailedMetricsImpl> entry:
                     shardMetrics.entrySet()) {
                    total += entry.getValue().getScanRecordCount();
                }
                return total;
            }

            private long getTotalDupCount() {
                long total = 0;
                for (PartitionValues pv: pMap)  {
                    total += pv.dupCount;
                }
                return total;
            }

            @Override
            public String toString() {
                final String fmt =
                    "%,d key streams; %,d shard streams; " +
                    "%,d keys read; %,d duplicated; " +
                    "%,d get; %,d batches; " +
                    "%,d batch queue underflows; " +
                    "%,d batch queue overflows; "  +
                    "%,d av batch size; " +
                    "%,d max batch request repeated;";
                final long getCount = totalGetCount();
                return String.format(fmt, readerTasks.size(), getStreams.size(),
                                     readCount, getTotalDupCount(),
                                     getCount, batchCount,
                                     batchQueueUnderflow,
                                     batchQueueOverflow,
                                     ((batchCount > 0) ?
                                         (getCount / batchCount) : 0),
                                      maxBatchRequestRepeated);
            }
        }
    }
}