/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.api.bulk;

import static oracle.kv.impl.util.SerialVersion.STD_UTF8_VERSION;
import static oracle.kv.impl.util.SerialVersion.TTL_SERIAL_VERSION;
import static oracle.kv.impl.util.SerializationUtil.readNonNullByteArray;
import static oracle.kv.impl.util.SerializationUtil.writeNonNullByteArray;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.logging.Level;
import java.util.logging.Logger;

import oracle.kv.BulkWriteOptions;
import oracle.kv.EntryStream;
import oracle.kv.FaultException;
import oracle.kv.Key;
import oracle.kv.Value;
import oracle.kv.impl.api.KVStoreImpl;
import oracle.kv.impl.api.KeySerializer;
import oracle.kv.impl.api.ops.InternalOperation;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.TopologyUtil;
import oracle.kv.impl.util.FastExternalizable;
import oracle.kv.impl.util.KVThreadFactory;
import oracle.kv.impl.util.SerialVersion;
import oracle.kv.impl.util.SerializationUtil;
import oracle.kv.table.TimeToLive;

import com.sleepycat.je.utilint.JVMSystemUtils;

/**
 * This class represents a single bulk put operation. It provides the common
 * underpinnings for batching both rows and KV pairs.
 *
 * The overall flow of entries is designed to:
 *
 * 1) Utilize the store as completely as possible by ensuring that each shard
 * is kept uniformly busy at all times.
 *
 * 2) Make best use of thread level parallelism.
 *
 * 3) Assemble batches so that key value pairs in a batch are clustered by
 * partition and are sorted within each partition to optimize JE insert
 * performance.
 *
 * The general flow of an entry is as follows:
 *
 * 1) The entry is supplied by a user-supplied stream. Multiple streams may
 * be read in parallel by the BulkStreamReader task associated with each
 * stream, depending on the configurable level of stream parallelism.
 *
 * 2) Each StreamReader accumulates the new entry, along with earlier
 * entries, in a sorted tree associated with each partition.
 *
 * 3) When the storage threshold associated with the partition is exceeded,
 * the leading elements in the sorted tree are assembled into a batch and
 * placed into a queue associated with the partition.
 *
 * 4) The ShardPutTask associated with the queue then takes the batch and
 * writes it to the shard.
 *
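 * As an illustrative sketch (not a prescribed usage pattern), entries reach
 * this class through the {@link EntryStream} interface; the methods shown in
 * the hypothetical stream below are the ones this class invokes:
 *
 * <pre>{@code
 * // Hypothetical stream backed by an in-memory list of entries.
 * class ListStream<E> implements EntryStream<E> {
 *     private final Iterator<E> it;
 *     ListStream(List<E> entries) { it = entries.iterator(); }
 *     public String name() { return "ListStream"; }
 *     public E getNext() { return it.hasNext() ? it.next() : null; }
 *     public void completed() { }
 *     public void keyExists(E entry) { }
 *     public void catchException(RuntimeException e, E entry) { throw e; }
 * }
 * }</pre>
 *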
 * @param <T> must be a Row or a KV pair
 */
public abstract class BulkPut<T> {

    /**
     * Handle to the store
     */
    private final KVStoreImpl store;

    /**
     * The topology associated with the store.
     */
    private final Topology topology;

    /**
     * The key serializer associated with the store.
     */
    private final KeySerializer serializer;

    /**
     * The streams supplying the entries to be loaded.
     */
    private final EntryStream<T> streams[];

    /**
     * The list of stream readers that read the entries supplied by the
     * streams.
     */
    private final List<StreamReader<T>> readers;

    /**
     * A map indexed by partition id which yields the values that are being
     * aggregated for that partition.
     */
    private final PartitionValues pMap[];

    /**
     * The Key comparator used to group KV pairs associated with a
     * partition, so that they can be sent as a contiguous batch. This is the
     * same comparator used to insert KV pairs into the JE btree.
     */
    public final static Comparator<byte[]> KEY_BYTES_COMPARATOR =
        new Key.BytesComparator();

    /**
     * The options in effect for this operation.
     */
    private final BulkWriteOptions options;

    /**
     * This represents the threshold bytes that will be aggregated for each
     * partition, before the bytes are flushed out to the shard holding the
     * partition.
     */
    private final long partitionThresholdBytes;

    /**
     * The min number of bytes required per partition for batching to be
     * effective.
     */
    private static final int partitionHeapMinBytes = 100 * 1024;

    /**
     * Used to hold the aggregate statistics associated with this operation
     */
    final AggregateStatistics statistics = new AggregateStatistics();

    /**
     * The logger to be used on the client.
     */
    private final Logger logger;

    /**
     * The exception to terminate the bulk put operation.
     */
    private final AtomicReference<Exception> terminateException =
        new AtomicReference<Exception>();

    /**
     * Used to manage the put executor tasks.
     */
    private ExecutorService shardExecutor = null;

    /**
     * Used to manage the stream reader tasks.
     */
    private ExecutorService streamExecutor = null;

    public BulkPut(KVStoreImpl store,
                   BulkWriteOptions options,
                   List<EntryStream<T>> streams,
                   Logger logger) {

        this.logger = logger;

        this.store = store;
        topology = store.getTopology();
        serializer = store.getKeySerializer();

        this.options = options;
        @SuppressWarnings("unchecked")
        final EntryStream<T>[] array =
            streams.toArray(new EntryStream[streams.size()]);
        this.streams = array;

        readers = new ArrayList<StreamReader<T>>();

        final int nPartitions = topology.getPartitionMap().size();

        partitionThresholdBytes = computeThresholdBytes(nPartitions);

        @SuppressWarnings("unchecked")
        PartitionValues[] partitionValues =
            new BulkPut.PartitionValues[nPartitions + 1];
        pMap = partitionValues;
        for (int i = 0 ; i <= nPartitions ; i++) {
            pMap[i] = new PartitionValues(i);
        }
    }


    /**
     * Computes the threshold bytes that may be used to hold the KV pairs
     * associated with a partition. The larger the threshold, the larger the
     * potential batch size, and the more efficient the btree insert, since the
     * keys are likely to be more clustered and will impact fewer nodes.
     *
     * @param nPartitions the number of partitions associated with the store
     *
     * @return the threshold bytes associated with each partition
     */
    private long computeThresholdBytes(final int nPartitions) {
        final long maxHeapBytes = JVMSystemUtils.getRuntimeMaxMemory();

        if (maxHeapBytes == Long.MAX_VALUE) {
            final String msg =
                "Could not determine a max heap size. This is unusual. " +
                "Please specify the -Xmx argument to the jvm invocation " +
                "as a workaround";
            throw new IllegalArgumentException(msg);
        }
        final int bulkHeapPercent = options.getBulkHeapPercent();
        final long maxLoadHeapBytes =
            (maxHeapBytes * bulkHeapPercent) / 100;

        /*
         * Factor of two to allow for serialization of kv pairs from
         * map to batch request -- conservative but safe.
         */
        final long thresholdBytes = (maxLoadHeapBytes / 2) / nPartitions;
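        /*
         * Illustrative numbers: with an 8GB max heap, a bulk heap percent of
         * 40 and 1,000 partitions, thresholdBytes works out to roughly
         * ((8GB * 40 / 100) / 2) / 1,000, i.e. about 1.6MB per partition.
         */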

        if (thresholdBytes < partitionHeapMinBytes) {
            final long minHeapBytes =
                (((nPartitions * partitionHeapMinBytes) * 2) * 100) /
                bulkHeapPercent;
            logger.warning("Insufficient heap:" + maxHeapBytes + ". For best " +
                           "performance increase -Xmx on jvm invocation " +
                           "to a min of "  + (minHeapBytes / (1024 * 1024)) +
                           "mb");
        }

        final String fmt = "Buffer bytes per partition:%,d Max heap " +
                           "memory:%,d Bulk heap %% %,d";
        logger.info(String.format(fmt, thresholdBytes, maxHeapBytes,
                                  bulkHeapPercent));

        return thresholdBytes;
    }


    /**
     * Abstract method used to create a Row or KV reader
     *
     * @param streamId the internal unique stream id to be associated with the
     * user supplied stream
     * @param stream the user supplied stream
     *
     * @return the stream reader
     */
    public abstract StreamReader<T>
        createReader(int streamId, EntryStream<T> stream);

    /**
     * Abstract method used to convert a Key/Value pair to a T instance. It is
     * used to construct the entry passed to EntryStream.keyExists(T entry).
     */
    protected abstract T convertToEntry(Key key, Value value);

    /**
     * Implements the bulk put operation.
     */
    public void execute()
        throws InterruptedException {

        shardExecutor = startShardExecutor();

        final ThreadFactory threadFactory =
            new KVThreadFactory("BulkStreamReader", logger);

        streamExecutor =
            Executors.newFixedThreadPool(options.getStreamParallelism(),
                                         threadFactory);
        int streamId = 0;

        ArrayList<Future<Long>>
            futures = new ArrayList<Future<Long>>(streams.length);
        try {
            for (EntryStream<T> s : streams) {
                final StreamReader<T> streamReader =
                    createReader(++streamId, s);
                if (!futures.add(streamExecutor.submit(streamReader))) {
                    throw new IllegalStateException
                        ("failed to add new future for stream:" + s.name());
                }
                readers.add(streamReader);
            }
        } catch (RejectedExecutionException ree) {
            terminateWithException(ree);
        }

        streamExecutor.shutdown();

        logProgress(streamExecutor);

        finishStreams(futures, statistics);

        flushPartitions();

        shutdownShardExecutor(shardExecutor);

        logger.log(Level.INFO, statistics.toString());

        if (terminateException.get() != null) {
            throw new FaultException(terminateException.get(), false);
        }
    }

    /**
     * Log progress at one minute intervals until all the streams have
     * reached EOF
     */
    private void logProgress(final ExecutorService readerExecutor)
        throws InterruptedException {

        final long startMs = System.currentTimeMillis();
        long prevTotalRead = 0;

        while (!readerExecutor.awaitTermination(1, TimeUnit.MINUTES)) {

            final String fmt = "Loading continues. %,d values read. " +
                "Throughput:%,d values/sec";
            final long totalRead = totalRead();
            final long throughput = (totalRead * 1000) /
                (System.currentTimeMillis() - startMs);
            logger.log((totalRead > prevTotalRead) ?
                Level.INFO : Level.WARNING,
                String.format(fmt, totalRead, throughput));
            prevTotalRead = totalRead;
        }
    }

    /**
     * Shutdown executor used to flush batches to shards. This method is only
     * invoked after all user supplied streams have reached EOF and all
     * partition batches have been flushed.
     */
    private void shutdownShardExecutor(final ExecutorService putExecutor)
        throws InterruptedException {

        final Set<ShardPutTask> rgTasks = new HashSet<ShardPutTask>();

        for (PartitionValues pv : pMap) {
            if (pv.partitionId == 0) {
                /* Ignore dummy partition id zero */
                continue;
            }

            rgTasks.add(pv.shardPutTask);
            pv.shardPutTask.add(partitionBatchEOF);
        }

        putExecutor.shutdown();

        while (!putExecutor.awaitTermination(1, TimeUnit.MINUTES)) {
            final String fmt = "Flushing puts";
            logger.info(fmt);
        }

        /* Collect statistics. */
        for (ShardPutTask rgp : rgTasks) {
            statistics.batchCount += rgp.batchCount;
            statistics.batchQueueUnderflow += rgp.batchQueueUnderflow;
            statistics.batchQueueOverflow += rgp.batchQueueOverflow;
            statistics.existingKeys += rgp.existingKeyCount;
            statistics.putCount += rgp.putCount;
        }
    }

    /**
     * Create the tasks that will write batches of partition entries to their
     * respective shard. The number of tasks per shard is defined by the
     * configuration parameter: perShardParallelism. There is at least one task
     * per shard to ensure that all shards are kept busy during the load.
     *
     * The partitions are divided amongst each shard to ensure that no two
     * tasks ever update the same partition and thus never conflict on locks.
     */
    private ExecutorService startShardExecutor() {

        final int perShardParallelism = options.getPerShardParallelism();

        final int numShardTasks =
            topology.getRepGroupMap().size() * perShardParallelism;

        final ExecutorService putExecutor =
            Executors.newFixedThreadPool(numShardTasks,
                                         new KVThreadFactory("RGWriter",
                                                             logger));

        final Map<RepGroupId, List<PartitionId>> map =
            TopologyUtil.getRGIdPartMap(topology);

        for (RepGroupId rgId : topology.getRepGroupIds()) {
            final List<PartitionId> list = map.get(rgId);

            /* Divide up the partitions amongst the tasks. */
            int basePartitionsPerTask = list.size() / perShardParallelism;
            int residualPartitions = list.size() % perShardParallelism;
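            /*
             * For example (illustrative numbers), a shard with 10 partitions
             * and a perShardParallelism of 3 yields tasks covering 4, 3 and 3
             * partitions: a base of 3 each, plus one residual partition
             * assigned to the lead task.
             */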

            doneWithShard:
            for (int i = 0; i < list.size();) {
                int partitionsThisTask = basePartitionsPerTask;
                if (residualPartitions > 0) {
                    /* Distribute residual partitions across lead tasks */
                    residualPartitions--;
                    partitionsThisTask++;
                }

                if (partitionsThisTask == 0) {
                    /* More parallelism than partitions in shard. */
                    break doneWithShard;
                }

                final List<PartitionId> taskPartitions =
                    list.subList(i, i + partitionsThisTask);

                logger.info("Partitions:" +
                            Arrays.toString(taskPartitions.toArray()) +
                            " assigned to RG task");
                final ShardPutTask putTask =
                    new ShardPutTask(rgId, taskPartitions.size());

                for (PartitionId pid : taskPartitions) {
                    PartitionValues pv = pMap[pid.getPartitionId()];
                    pv.setShardPutTask(putTask);
                }

                putExecutor.submit(putTask);
                /* Move forward in the partition list */
                i += partitionsThisTask;
            }
        }

        return putExecutor;
    }

    /*
     * Terminates the whole bulk put operation and records the exception.
     */
    private void terminateWithException(Exception exception) {

        if (!terminateException.compareAndSet(null, exception)) {
            /*
             * Multiple exceptions. Ignore subsequent ones since we are already
             * shutting down.
             */
            return ;
        }

        List<Runnable> unfinishedBusiness = streamExecutor.shutdownNow();
        if (!unfinishedBusiness.isEmpty()) {
            final int nRemainingTasks = unfinishedBusiness.size();
            logger.log(Level.FINE,
                       "Bulk put reader stream executor didn't shutdown " +
                       "cleanly. {0} tasks remaining.", nRemainingTasks);
        }

        unfinishedBusiness = shardExecutor.shutdownNow();
        if (!unfinishedBusiness.isEmpty()) {
            final int nRemainingTasks = unfinishedBusiness.size();
            logger.log(Level.FINE,
                       "Bulk put shard executor didn't shutdown cleanly. "+
                       "{0} tasks remaining.", nRemainingTasks);
        }
    }

    /**
     * Returns true if the current bulk put operation is terminated.
     */
    private boolean isTerminated() {
        return terminateException.get() != null;
    }

    /**
     * Canonical PartitionBatch object to signify EOF in the partition batch
     * queue.
     */
    private static PartitionBatch partitionBatchEOF = new PartitionBatch();

    /**
     * Used to hold a sorted list of KV pairs. The sorted list ensures locality
     * of reference during insertion on the server.
     */
    private static class PartitionBatch {
        final PartitionId pid;
        final List<KVPair> kvPairs;

        /* The collection of table ids for the batch entries */
        /*
         * CRC: revisit to see if we actually need these once the dust has
         * settled around SR 24670
         */
        final Set<Long> tableIds;

        /* The map of stream ids and corresponding entry count */
        final Map<Integer, Integer> perStreamCount;

        PartitionBatch() {
            this(null, null, null, null);
        }

        PartitionBatch(PartitionId pid,
                       List<KVPair> kvPairs,
                       Set<Long> tableIds,
                       Map<Integer, Integer> perStreamCount) {
            super();
            this.pid = pid;
            this.kvPairs = kvPairs;
            this.tableIds = tableIds;
            this.perStreamCount = perStreamCount;
        }

        public Integer[] getStreamIds() {
            final Set<Integer> streamIds = perStreamCount.keySet();
            return streamIds.toArray(new Integer[streamIds.size()]);
        }

        public int getStreamEntryCount(int streamId) {
            return perStreamCount.get(streamId);
        }

        public long[] getTableIds() {
            if (tableIds == null) {
                return null;
            }
            long[] tids = new long[tableIds.size()];
            int i = 0;
            for (Long id : tableIds) {
                tids[i++] = id.longValue();
            }
            return tids;
        }
    }

    /**
     * The task used to write a PartitionBatch to its shard.
     */
    public class ShardPutTask implements Runnable {

        /**
         * The shard associated with this task
         */
        final RepGroupId rgId;

        /**
         * The total number of puts completed by this task
         */
        public long putCount;

        /**
         *  The queue of batches to be processed by this task.
         */
        private final ArrayBlockingQueue<PartitionBatch> queuedKVPairs;

        /**
         * The number of batches processed by this task.
         */
        private long batchCount = 0 ;

        /**
         * The number of times this task was blocked because it did not have
         * a partition batch to write. Large numbers of queue underflows
         * indicate that the user input streams are not providing data fast
         * enough and increasing stream parallelism could help.
         */
        private long batchQueueUnderflow = 0 ;

        /**
         * The number of times a batch could not be inserted because there
         * was no space in the queue. Large numbers of queue overflows
         * indicate that performance could benefit from increased shard
         * parallelism.
         */
        private long batchQueueOverflow = 0 ;

        /**
         * The number of keys that were already present in the store.
         */
        private long existingKeyCount;

        public ShardPutTask(RepGroupId rgId,
                            int numTaskPartitions) {
            this.rgId = rgId;

            queuedKVPairs =
                new ArrayBlockingQueue<PartitionBatch>(numTaskPartitions * 2);
        }
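
        /*
         * Tuning note (illustrative; assumes BulkWriteOptions exposes setters
         * matching the getters used in this class): a steadily growing
         * batchQueueUnderflow count suggests raising the stream parallelism,
         * while a growing batchQueueOverflow count suggests raising the
         * per-shard parallelism, e.g.
         *
         *   bulkWriteOptions.setStreamParallelism(4);   // more reader tasks
         *   bulkWriteOptions.setPerShardParallelism(3); // more writer tasks
         */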

        void add(PartitionBatch partBatch)
            throws InterruptedException {

            if (!queuedKVPairs.offer(partBatch)) {
                batchQueueOverflow++;
                while (!isTerminated() &&
                       !queuedKVPairs.offer(partBatch, 10, TimeUnit.SECONDS)) {
                }
            }
        }

        @Override
        public void run() {
            final String sfmt = "Starting RG thread. Shard:%s";
            Integer[] streamIds = null;
            logger.info(String.format(sfmt, rgId));
            try {
                while (true) {
                    PartitionBatch pbatch = queuedKVPairs.poll();
                    if (pbatch == null) {
                        batchQueueUnderflow++;
                        pbatch = queuedKVPairs.take();
                    }

                    if (pbatch == partitionBatchEOF) {
                        return;
                    }

                    streamIds = pbatch.getStreamIds();
                    batchCount++;

                    try {
                        final List<Integer> existing =
                            store.putBatch(pbatch.pid,
                                           pbatch.kvPairs,
                                           pbatch.getTableIds(),
                                           options.getOverwrite(),
                                           options.getDurability(),
                                           options.getTimeout(),
                                           options.getTimeoutUnit());
                        putCount += pbatch.kvPairs.size();

                        for (int pos : existing) {
                            final KVPair kvPair = pbatch.kvPairs.get(pos);
                            final T entry = convertKVPairToEntry(kvPair);
                            streams[kvPair.getStreamId() - 1].keyExists(entry);
                            existingKeyCount++;
                            logger.info("Existing key at sub-batch pos:" + pos);
                        }
                    } catch (RuntimeException re) {
                        logger.info(Thread.currentThread() + " caught " + re);
                        if (re.getCause() != null) {
                            Throwable e = re.getCause();
                            if (e instanceof InterruptedException) {
                                throw (InterruptedException) e;
                            }
                        }
                        handleRuntimeException(pbatch, re);
                    }

                    tallyEntryCount(streamIds, pbatch);
                }
            } catch (InterruptedException ie) {
                logger.info(Thread.currentThread() + " caught " + ie);
                terminateWithException(new RuntimeException(ie));
            } finally {
                final String fmt = "Exiting RG thread. Shard:%s";
                logger.info(String.format(fmt, rgId));
            }
        }
    }

    /**
     * Tally the entry count for the entry streams involved in this batch. If
     * the last entry of a stream has been put to the store, invoke the
     * EntryStream.completed() method.
     */
    private void tallyEntryCount(final Integer[] streamIds,
                                 final PartitionBatch pbatch) {

        for (Integer streamId : streamIds) {
            final int count = pbatch.getStreamEntryCount(streamId);
            final StreamReader<T> reader = readers.get(streamId - 1);
            synchronized (reader) {
                reader.tallyOpCount(count);
                if (reader.isDone()) {
                    reader.getEntryStream().completed();
                }
            }
        }
    }

    /**
     * Invoke the EntryStream.catchException() method for all the entry
     * streams involved in the batch, and terminate the whole bulk put
     * operation if EntryStream.catchException() throws an exception.
     */
    private void handleRuntimeException(final PartitionBatch pbatch,
                                        RuntimeException re) {
        for (KVPair kv : pbatch.kvPairs) {
            final int streamId = kv.getStreamId();
            final T entry = convertKVPairToEntry(kv);
            try {
                streams[streamId - 1].catchException(re, entry);
            } catch (Exception ex) {
                terminateWithException(ex);
                break;
            }
        }
    }

    /**
     * Convert a KVPair object to an entry object.
     */
    private T convertKVPairToEntry(final KVPair kv) {
        final Key key = serializer.fromByteArray(kv.getKey());
        final Value value = Value.fromByteArray(kv.getValue());
        return convertToEntry(key, value);
    }

    /**
     * Flush all residual values that were queued at their partitions to their
     * respective shards.
     */
    private void flushPartitions()
        throws InterruptedException {

        for (PartitionValues pv : pMap) {
            pv.flush(true);
        }

        logger.info("Flushed all partitions");
    }

    /**
     * Wait for all futures queued to read streams to finish and accumulate
     * read counts.
     */
    private void finishStreams(ArrayList<Future<Long>> futures,
                               AggregateStatistics putResult)
        throws InterruptedException  {

        for (Future<Long> f : futures) {
            if (isTerminated()) {
                f.cancel(true);
                continue;
            }
            try {
                long readCount = f.get();
                putResult.aggregate(readCount);
            } catch (ExecutionException e) {
                final Throwable t = e.getCause();

                if (t instanceof RuntimeException) {
                    throw (RuntimeException) t;
                }

                throw new IllegalStateException(t);
            }
        }
    }

    /**
     * Represents the aggregate statistics across all streams.
     */
    private static class AggregateStatistics {

        private long batchCount;
        private long batchQueueUnderflow ;
        private long batchQueueOverflow;

        /**
         * The total number of entries read from all the streams
         * supplied to the operation.
         */
        private long readCount ;

        /**
         * The total number of entries actually inserted into the store
         * as a result of the operation. This number is typically
         * equal to {@link #readCount} but may be less if entries supplied by
         * a stream have primary keys that are already present in the store,
         * or if reading from a stream was abandoned due to an exception.
         */
        private long putCount ;

        /**
         * The number of entries that were rejected because there was
         * already an entry with the same primary key in the store.
         */
        private long existingKeys ;

        public void aggregate(long entriesRead) {
            readCount += entriesRead;
        }

        @Override
        public String toString() {
            final String fmt =
                "%,d rows read, %,d inserted, %,d pre-existing. " +
                "%,d batches; %,d batch queue underflows; " +
                "%,d batch queue overflows; " +
                "%,d av batch size;";

            return String.format(fmt, readCount, putCount, existingKeys,
                                 batchCount,
                                 batchQueueUnderflow,
                                 batchQueueOverflow,
                                 ((batchCount > 0) ?
                                     (putCount / batchCount) : 0));
        }
    }


    /**
     * Total entries read from all streams
     */
    private long totalRead() {
        long totalRead = 0;
        for (StreamReader<T> reader : readers) {
            totalRead += reader.getReadCount();
        }
        return totalRead;
    }

    /**
     * Dedicated task used to read a specific row or KV stream
     *
     * @param <E> the entry type: a Row or a KV pair
     */
    public abstract class StreamReader<E> implements Callable<Long> {

        /**
         * The internal stream id
         */
        private final int streamId;

        /**
         * The stream being read
         */
        private final EntryStream<E> entryStream;

        /**
         * The number of records read by this stream reader.
         */
        private volatile long readCount = 0;

        /**
         * The number of records with the key that already existed in the
         * partition buffer, that is, they were duplicates in or across streams
         * and were detected as such even before being sent to the store.
         */
        private volatile long dupCount = 0;

        /**
         * The flag to indicate if there are no more elements to read.
         */
        private volatile boolean noMoreElement = false;

        /**
         * The number of records put to the store.
         */
        private final AtomicLong putCount;

        public StreamReader(int streamId, EntryStream<E> entryStream) {
            super();
            this.streamId = streamId;
            this.entryStream = entryStream;
            putCount = new AtomicLong();
        }

        @Override
        public Long call() throws Exception {
           final String sfmt = "Started stream reader for %s(%d)";
           logger.info(String.format(sfmt, entryStream.name(), streamId));

           try {

               for (E e = entryStream.getNext();
                    (e != null) ;
                    e = entryStream.getNext()) {

                   readCount++;

                   final Key pk = getKey(e);
                   final Value value = getValue(e);
                   final long tableId = getTableId(e);
                   final byte[] keyBytes = serializer.toByteArray(pk);
                   final PartitionId pid = topology.getPartitionId(keyBytes);
                   final TimeToLive ttl = getTTL(e);
                   pMap[pid.getPartitionId()].put(keyBytes,
                                                  value.toByteArray(),
                                                  streamId,
                                                  tableId,
                                                  ttl);
               }
               noMoreElement = true;
               /*
                * Invoke the EntryStream.completed() if no element read
                * from the stream
                */
               if (readCount == 0) {
                   entryStream.completed();
               }
           } catch (RuntimeException re) {
               terminateWithException(re);
           } catch (InterruptedException ie) {
               terminateWithException(new RuntimeException(ie));
           } finally {
               final String fmt = "Finished stream reader for %s(%d)";
               logger.info(String.format(fmt, entryStream.name(), streamId));
           }
           return readCount;
        }

        void keyExists(E entry) {
            dupCount++;
            entryStream.keyExists(entry);
        }

        EntryStream<E> getEntryStream() {
            return entryStream;
        }

        long getReadCount() {
            return readCount;
        }

        void tallyOpCount(int count) {
            putCount.addAndGet(count);
        }

        /**
         * Return true if all elements have been read and the writes match
         * the reads after accounting for transiently detected duplicates.
         */
        boolean isDone() {
            return noMoreElement && (putCount.get() == (readCount - dupCount));
        }

        /**
         * Abstract methods used to abstract how the keys and values are
         * obtained.
         */
        protected abstract Key getKey(E entry);

        protected abstract Value getValue(E entry);

        /**
         * Returns the table id of the entry. BulkPut for tables should
         * override this method.
         */
        protected long getTableId(@SuppressWarnings("unused") E entry) {
            return 0;
        }

        /**
         * Returns the ttl of the entry. BulkPut for tables should override
         * this method.
         */
        protected TimeToLive getTTL(@SuppressWarnings("unused") E entry) {
            return null;
        }
    }

    /**
     * The values associated with a specific partition.
     */
    private class PartitionValues {

        /**
         * The partition associated with the values.
         */
        private final int partitionId;

        /**
         * The task designated to write this partition's values to its shard.
         */
        private ShardPutTask shardPutTask;

        /**
         * The number of entries that were actually inserted into the partition.
         */
        private long putCount = 0;

        /**
         * Holds the sorted values that are waiting to be written to the shard.
         * Tried ConcurrentSkipListMap to eliminate use of synchronized methods
         * but it resulted in lower perf on Nashua machines.
         */
        private final SortedMap<byte[], Object> kvPairs =
            new TreeMap<byte[], Object>(KEY_BYTES_COMPARATOR);

        /**
         * The total number of key/value bytes stored in kvPairs. It's
         * compared against the partition threshold to help determine when to
         * flush kv pairs to the partition.
         */
        private long treeBytes = 0;

        public void setShardPutTask(ShardPutTask rgPutThread) {
            this.shardPutTask = rgPutThread;
        }

        PartitionValues(int pid) {
            super();
            this.partitionId = pid;
        }

        @SuppressWarnings("unchecked")
        synchronized void put(byte[] key, byte[] value,
                              int streamId, long tableId,
                              TimeToLive ttl)
            throws InterruptedException {

            final WrappedValue wv = new WrappedValue(value, streamId,
                                                     tableId, ttl);
            final Object old = kvPairs.put(key, wv);
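            /*
             * A non-null previous value means the same key was supplied more
             * than once (within or across streams) before this partition was
             * flushed. Keep every occurrence in a list so that none is
             * dropped; addEntry() later adds each one to the batch sent to
             * the store.
             */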
            if (old != null) {
                List<WrappedValue> list;
                if (old instanceof List) {
                    list = (List<WrappedValue>) old;
                    list.add(wv);
                } else {
                    list = new ArrayList<WrappedValue>();
                    list.add((WrappedValue) old);
                    list.add(wv);
                }
                kvPairs.put(key, list);
            }
            treeBytes += (key.length + wv.getBytesSize());
            flush(false);
        }

        /**
         * Flushes values in the kvPairs tree to the shard if needed.
         *
         * A flush is typically done if the size of the storage occupied by
         * kvPairs exceeds the threshold number of bytes associated with the
         * partition.
         *
         * @param force if true the partition is flushed even if the threshold
         * has not been reached
         *
         */
        private void flush(boolean force)
            throws InterruptedException {

            final int maxRequestSize = options.getMaxRequestSize();

            final String fmt =
                "Queued Partition %d flushed. Batch size %,d; Total:%,d;" +
                    " Tree bytes:%,d; request size:%,d";

            while ((force && kvPairs.size() > 0) ||
                   (treeBytes > partitionThresholdBytes)) {
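                /*
                 * Each pass of this loop assembles one batch of roughly
                 * maxRequestSize bytes (the size check is applied after each
                 * entry is added) and hands it to the shard task; the loop
                 * repeats until the tree drops below the partition threshold
                 * or, when force is true, until the tree is empty.
                 */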
                int putBatchCount = 0;
                int requestSize = 0;
                final List<KVPair> le = new ArrayList<KVPair>();
                final Map<Integer, Integer> streamIdCountMap =
                    new HashMap<Integer, Integer>();
                final Set<Long> tableIds = new HashSet<Long>();
                synchronized (this) {

                    for (Iterator<Entry<byte[], Object>> iter =
                         kvPairs.entrySet().iterator();
                         iter.hasNext();) {

                        final Entry<byte[], Object> e = iter.next();
                        iter.remove();

                        final byte[] key = e.getKey();
                        final Object obj = e.getValue();
                        int size = 0;
                        if (obj instanceof List) {
                            @SuppressWarnings("unchecked")
                            List<WrappedValue> wvs = (List<WrappedValue>) obj;
                            for (WrappedValue wv : wvs) {
                                size += addEntry(key, wv, le, tableIds,
                                                 streamIdCountMap);
                            }
                            putBatchCount += wvs.size();
                        } else {
                            assert (obj instanceof WrappedValue);
                            size = addEntry(key, (WrappedValue)obj, le,
                                            tableIds, streamIdCountMap);
                            putBatchCount++;
                        }
                        treeBytes -= size;
                        requestSize += size;
                        if (requestSize > maxRequestSize) {
                            break;
                        }
                    }

                    putCount += putBatchCount;
                }

                /* Can block, do it outside sync block */
                shardPutTask.add(
                    new PartitionBatch(new PartitionId(partitionId), le,
                                       (tableIds.isEmpty() ? null : tableIds),
                                       streamIdCountMap));

                logger.fine(String.format(fmt, partitionId,
                                          putBatchCount, putCount,
                                          treeBytes, requestSize));
            }
        }

        /**
         * Adds an entry to the KVPair list and returns the size of the entry.
         */
        private int addEntry(byte[] key, WrappedValue wv,
                             List<KVPair> kvpairs, Set<Long> tableIds,
                             Map<Integer, Integer> streamCountMap) {

            final byte[] value = wv.getValue();
            final int streamId = wv.getStreamId();
            final long tableId = wv.getTableId();
            kvpairs.add(new KVPair(key,
                                   value,
                                   wv.getTTLVal(),
                                   wv.getTTLUnitOrdinal(),
                                   streamId));
            if (tableId != 0) {
                tableIds.add(wv.getTableId());
            }
            tallyEntryCount(streamCountMap, wv.getStreamId());
            return key.length + wv.getBytesSize();
        }

        /**
         * Tally the entry count for each stream in the current batch.
         */
        private void tallyEntryCount(final Map<Integer, Integer> streamCountMap,
                                     final int streamId) {
            if (streamCountMap.containsKey(streamId)) {
                final int count = streamCountMap.get(streamId) + 1;
                streamCountMap.put(streamId, count);
            } else {
                streamCountMap.put(streamId, 1);
            }
        }
    }

    /**
     * A class that represents value information: the value bytes along with
     * the corresponding streamId, tableId and TTL.
     */
    private static class WrappedValue {

        private final byte[] value;
        private final int streamId;
        private final long tableId;
        private final int ttlVal;
        private final byte ttlUnitOrdinal;

        WrappedValue(byte[] value, int streamId, long tableId, TimeToLive ttl) {
            this.value = value;
            this.streamId = streamId;
            this.tableId = tableId;
            if (ttl != null) {
                ttlVal = (int)ttl.getValue();
                ttlUnitOrdinal = (byte)ttl.getUnit().ordinal();
            } else {
                ttlVal = 0;
                ttlUnitOrdinal = 0;
            }
        }

        int getStreamId() {
            return streamId;
        }

        long getTableId() {
            return tableId;
        }

        byte[] getValue() {
            return value;
        }

        int getTTLVal() {
            return ttlVal;
        }

        byte getTTLUnitOrdinal() {
            return ttlUnitOrdinal;
        }
        /*
         * Returns the byte size of a WrappedValue object. Currently it is
         * calculated as the total size of its 5 members: value:length,
         * streamId:4, tableId:8, ttlVal:4 and ttlUnitOrdinal:1.
         *
         * TODO: overhead to add?
         */
        int getBytesSize() {
            return value.length + 4 + 8 + 4 + 1;
        }
    }

    /**
     * A simple "struct" used to hold a key/value pair
     *
     * @see #writeFastExternal FastExternalizable format
     */
    public static class KVPair implements FastExternalizable {
        final byte[] key;
        final byte[] value;
        final int ttlVal;
        final TimeUnit ttlUnit;
        final int streamId;

        public KVPair(byte[] key, byte[] value, int ttlVal,
                      byte ttlUnitOrdinal, int streamId) {
            this.key = key;
            this.value = value;
            this.ttlVal = ttlVal;
            ttlUnit = TimeToLive.convertTimeToLiveUnit(ttlVal, ttlUnitOrdinal);
            this.streamId = streamId;
        }

        /** Creates an instance from the input stream. */
        public KVPair(DataInput in, short serialVersion)
            throws IOException {

            if (serialVersion >= STD_UTF8_VERSION) {
                key = readNonNullByteArray(in);
                value = readNonNullByteArray(in);
            } else {
                final int keySize = in.readInt();
                key = new byte[keySize];
                in.readFully(key);
                final int valueSize = in.readInt();
                value = new byte[valueSize];
                in.readFully(value);
            }
            if (serialVersion >= TTL_SERIAL_VERSION) {
                ttlVal = TimeToLive.readTTLValue(in);
                ttlUnit = TimeToLive.readTTLUnit(in, ttlVal);
            } else {
                ttlVal = 0;
                ttlUnit = TimeUnit.DAYS;
            }
            streamId = -1;
        }

        /**
         * Writes the fields of this object to the output stream.  Format for
         * {@code serialVersion} {@link SerialVersion#STD_UTF8_VERSION} and
         * greater:
         * <ol>
         * <li> ({@link SerializationUtil#writeNonNullByteArray non-null byte
         *      array}) {@link #getKey key}
         * <li> ({@link SerializationUtil#writeNonNullByteArray non-null byte
         *      array}) {@link #getValue value}
         * <li> ({@link InternalOperation#writeTimeToLive(DataOutput, short,
         *      int, TimeUnit, String) TimeToLive}) time to live
         * </ol>
         */
        @Override
        public void writeFastExternal(DataOutput out, short serialVersion)
            throws IOException {

            if (serialVersion >= STD_UTF8_VERSION) {
                writeNonNullByteArray(out, key);
                writeNonNullByteArray(out, value);
            } else {
                out.writeInt(key.length);
                out.write(key);
                out.writeInt(value.length);
                out.write(value);
            }
            if (serialVersion >= TTL_SERIAL_VERSION) {
                InternalOperation.writeTimeToLive(out, serialVersion,
                                                  ttlVal, ttlUnit, "bulk put");
            }
        }

        public byte[] getKey() {
            return key;
        }

        public byte[] getValue() {
            return value;
        }

        public int getStreamId() {
            return streamId;
        }

        public int getTTLVal() {
            return ttlVal;
        }

        public TimeUnit getTTLUnit() {
            return ttlUnit;
        }
    }
}