oracle.kv.impl.rep.migration.MigrationTarget Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of oracle-nosql-server Show documentation
NoSQL Database Server - supplies build and runtime support for the server (store) side of the Oracle NoSQL Database.
There is a newer version: 18.3.10
/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.impl.rep.migration;

import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.channels.Channels;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.logging.Level;
import java.util.logging.Logger;

import oracle.kv.impl.admin.param.RepNodeParams;
import oracle.kv.impl.fault.RNUnavailableException;
import oracle.kv.impl.rep.RepNode;
import oracle.kv.impl.rep.RepNodeService.Params;
import oracle.kv.impl.rep.admin.RepNodeAdmin.MigrationState;
import oracle.kv.impl.rep.admin.RepNodeAdmin.PartitionMigrationState;
import oracle.kv.impl.rep.migration.PartitionMigrations.TargetRecord;
import oracle.kv.impl.rep.migration.TransferProtocol.OP;
import oracle.kv.impl.rep.migration.TransferProtocol.TransferRequest;
import oracle.kv.impl.rep.migration.generation.PartitionGenNum;
import oracle.kv.impl.rep.migration.generation.PartitionMDException;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.KVThreadFactory;
import oracle.kv.impl.util.TxnUtil;
import oracle.kv.impl.util.server.LoggerUtils;
import oracle.kv.util.PingCollector;
import oracle.kv.util.PingCollector.RNNameHAPort;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DiskLimitException;
import com.sleepycat.je.Durability;
import com.sleepycat.je.Put;
import com.sleepycat.je.Transaction;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.WriteOptions;
import com.sleepycat.je.rep.InsufficientReplicasException;
import com.sleepycat.je.rep.NoConsistencyRequiredPolicy;
import com.sleepycat.je.rep.ReplicaWriteException;
import com.sleepycat.je.rep.ReplicatedEnvironment;
import com.sleepycat.je.rep.UnknownMasterException;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.net.DataChannel;
import com.sleepycat.je.rep.net.DataChannelFactory.ConnectOptions;
import com.sleepycat.je.rep.utilint.HostPortPair;
import com.sleepycat.je.rep.utilint.RepUtils;
import com.sleepycat.je.rep.utilint.ServiceDispatcher;
import com.sleepycat.je.rep.utilint.ServiceDispatcher.Response;
import com.sleepycat.je.rep.utilint.ServiceDispatcher.ServiceConnectFailedException;
import com.sleepycat.je.utilint.VLSN;

/**
 * Partition migration target. This class is the destination side
 * of a source target pair.
 *
 * The migration process is initiated by the target which will attempt to
 * contact the source and send a migration request. This initial request
 * is the only time the target sends data to the source. All other
 * communication is one-way, source to target.
 *
 * Once a connection is established (migration stream) the source will send
 * records (keys and values) read from the partition's DB and client
 * operations targeted for the partition. Additional messages are needed to
 * handle client transactions.
 *
 * The source sends Ops until all records have been read and sent to the
 * target. At that time the source will send an End of Data (EoD) message.
 * Once the target receives the EoD Op it initiates the Transfer of Ownership
 * protocol (ToO).
 *
 * If the target encounters an error any time during the above steps
 * it may wait and retry. See the MigrationTarget.call() method.
 *
 * During a migration the target consists of two threads. One (Reader) will
 * read operation messages from the source migration stream. Most messages
 * result in Op objects being placed on a queue. The second "consumer" thread
 * (MigrationTarget) removes Ops from the queue and executes them. This
 * continues until an EoD message is encountered.
 *
 * The basic message sequence for a client (non-copy) operation is:
 *
 * {@literal
 *      1. Op (Put, Delete) --> 2. Prepare --> 3. Resolution (Commit/Abort)
 * }
 *
 * On the source, the client operation's transaction is closed (Commit or
 * Abort) between sending the Prepare (2) and sending the Resolution (3).
 *
 * Since sending EoD is based on reading the source DB records it can occur at
 * any time. The target needs to handle the possible cases where EoD
 * interrupts the client messages. They are:
 *
 * Case 1 - If EoD is before 1 the client operation will be rejected on the
 *          source node, to be retried, and the Op is never sent. The retry
 *          should be redirected to the target node. Since the Op is not sent,
 *          the target is not aware of this case.
 *
 * Case 2 - If EoD is between 1 and 2 the operation will be rejected on the
 *          source node as in case 1. The target will see the Op but no other
 *          messages for it. Since the operation was not committed on the
 *          source, it is OK to just drop it on the target.
 *
 * Case 3 - If EoD is between 2 and 3 the operation may commit, abort, or fail.
 *          The target will see the Op and the Prepare messages but no
 *          Resolution. Because of this the target does not know the client
 *          operation's outcome.
 *               a) If the Op is committed, it could have aborted (or failed)
 *                  and the data would be incorrectly written.
 *               b) If the Op is dropped, it could have committed on the source,
 *                  in which case the new partition will be missing that record.
 *          The only thing that can be done is to abandon the migration and
 *          start over again.
 *
 * Case 4 - EoD sent after 3 is the usual steady state while the DB copy is
 *          in progress.
 */
class MigrationTarget implements Callable {

    private final Logger logger;

    private static final int SECOND_MS = 1000;

    /*
     * Limit on retry-able errors. The migration is abandoned (see
     * setErrorRetryWait()) as soon as the error count reaches this value.
     */
    private static final int MAX_ERRORS = 10;

    /* Retry wait period (ms) for when the source or target is busy */
    private final long waitAfterBusy;

    /* Retry wait period (ms) for when there is an error */
    private final long waitAfterError;

    /* Configuration for speedy writes */
    private static final TransactionConfig weakConfig =
        new TransactionConfig().
               setConsistencyPolicy(NoConsistencyRequiredPolicy.NO_CONSISTENCY).
               setDurability(new Durability(Durability.SyncPolicy.NO_SYNC,
                                            Durability.SyncPolicy.NO_SYNC,
                                            Durability.ReplicaAckPolicy.NONE));

    /* Write options for TTL to avoid creating a new instance on each op */
    private final WriteOptions writeOptions;

    /* The partition this target is going to get */
    private final PartitionId partitionId;

    /* The current rep group the partition resides on */
    private final RepGroupId sourceRGId;

    /* The ID of the TargetRecord associated with this target */
    private final long recordId;

    private final RepNode repNode;

    private final MigrationManager manager;

    private final ReplicatedEnvironment repEnv;

    private final ReaderFactory readerFactory;

    private DataChannel channel = null;

    /* The new partition db */
    private Database partitionDb = null;

    /*
     * The following three flags define the state of the migration:
     *                 running          done      canceled
     * PENDING          false           false       false
     * RUNNING          true            false       false
     * SUCCEEDED          -             true         -
     * ERROR              -             false       true
     *
     * The legal transitions are:
     *
     *                                    setDone()
     *          setRunning()            -------------> SUCCEEDED
     * PENDING --------------> RUNNING /
     *     ^                      |    \-------------> ERROR
     *     |______________________|      setCanceled() via error() or
     *          setStopped()              cancel()
     */

    /*
     * True when the target is executing.
     */
    private volatile boolean running = false;

    /*
     * True when the migration is complete and the partition has been
     * made durable. Once set the migration can not be canceled.
     */
    private volatile boolean done = false;

    /* Guard to keep from being canceled while finishing ToO (see setDone()) */
    private volatile boolean inDone = false;

    /*
     * True when the migration is canceled. Could be set from an admin
     * command or because of an unrecoverable error.
     */
    private volatile boolean canceled = false;

    /* Exception that caused the migration to terminate */
    private Exception errorCause = null;

    /*
     * Time (in milliseconds) to wait before retrying the target after an error
     * or busy response.
     */
    private long retryWait = -1;

    /* True if the EOD marker was received */
    private volatile boolean eodReceived;

    /*
     * For logging. When available, this will be set to the master RN of the
     * source group. Otherwise it will be the source group name.
     */
    private String sourceName;

    /* statistics */
    private final long requestTime;
    private long startTime = 0;
    private long endTime = 0;
    private long operations = 0;
    private long copyOps = 0;
    private long copyBytes = 0;
    private long copyBatches = 0;
    private int attempts = 0;
    private int busyResponses = 0;
    private int errors = 0;

    /**
     * Creates a migration target for the partition described by the
     * specified target record.
     *
     * @param record the persistent record describing this migration
     * @param repNode the rep node hosting this target
     * @param manager the migration manager owning this target
     * @param repEnv the replicated environment the partition DB is created in
     * @param params service parameters, used for logging and retry settings
     */
    MigrationTarget(TargetRecord record,
                    RepNode repNode,
                    MigrationManager manager,
                    ReplicatedEnvironment repEnv,
                    Params params) {

        /* Identity of the migration, taken from the record */
        this.partitionId = record.getPartitionId();
        this.sourceRGId = record.getSourceRGId();

        /* Until a connection is made, the source name is just the group name */
        this.sourceName = this.sourceRGId.getGroupName();
        this.recordId = record.getId();

        /* Collaborators */
        this.repNode = repNode;
        this.manager = manager;
        this.repEnv = repEnv;
        this.logger = LoggerUtils.getLogger(getClass(), params);

        /* Retry wait periods come from the rep node parameters */
        final RepNodeParams rnParams = params.getRepNodeParams();
        this.waitAfterBusy = rnParams.getWaitAfterBusy();
        this.waitAfterError = rnParams.getWaitAfterError();

        this.readerFactory = new ReaderFactory();
        this.requestTime = System.currentTimeMillis();
        this.writeOptions = new WriteOptions().setUpdateTTL(true);
    }

    /**
     * Returns the ID of the rep group the partition is being migrated from.
     *
     * @return the source rep group ID
     */
    RepGroupId getSource() {
        return this.sourceRGId;
    }

  /**
   * Gets the ID of the TargetRecord associated with this target.
   *
   * @return the target record ID
   */
    long getRecordId() {
        return recordId;
    }

    /**
     * Returns a snapshot of this target's statistics, tagged with the
     * current migration state.
     *
     * @return a statistics object
     */
    PartitionMigrationStatus getStatus() {
        final PartitionMigrationState current =
            getState().getPartitionMigrationState();
        return getStatus(current);
    }

    /**
     * Builds a status object from the specified state and the current
     * values of this target's counters and timestamps.
     *
     * @param state the migration state to report
     * @return a statistics object
     */
    private PartitionMigrationStatus getStatus(PartitionMigrationState state) {
        final int partition = partitionId.getPartitionId();
        final int targetGroup = repNode.getRepNodeId().getGroupId();
        final int sourceGroup = sourceRGId.getGroupId();

        return new PartitionMigrationStatus(state,
                                            partition,
                                            targetGroup,
                                            sourceGroup,
                                            operations,
                                            requestTime,
                                            startTime,
                                            endTime,
                                            attempts,
                                            busyResponses,
                                            errors);
    }

    /**
     * Gets the state of the migration. The admin will poll for status as part
     * of the ToO protocol. The flags are checked in priority order: done
     * (SUCCEEDED), then canceled (ERROR), then running (RUNNING); if none is
     * set the state is PENDING. SUCCEEDED is returned iff the done flag has
     * been set.
     *
     * @return migration state, paired with the exception (if any) that
     * terminated the migration
     */
    MigrationState getState() {
        final PartitionMigrationState state;

        if (done) {
            state = PartitionMigrationState.SUCCEEDED;
        } else if (canceled) {
            state = PartitionMigrationState.ERROR;
        } else if (running) {
            state = PartitionMigrationState.RUNNING;
        } else {
            state = PartitionMigrationState.PENDING;
        }
        return new MigrationState(state, errorCause);
    }

    /**
     * Returns the ID of the partition being migrated to this node.
     *
     * @return the partition ID
     */
    PartitionId getPartitionId() {
        return this.partitionId;
    }

    /**
     * Attempts to cancel the migration. The request is refused once the
     * migration is done, or while it is in the middle of completing (the
     * inDone guard is set); in that case false is returned and nothing is
     * changed. Otherwise the target is canceled and cleaned up, optionally
     * waiting on the target thread to exit.
     *
     * Note that this does not remove the record from the db, as this cancel
     * could be due to shutdown, and not a failure or admin cancel.
     *
     * @param wait if true, wait for the target thread to exit
     * @return true if the migration was canceled
     */
    synchronized boolean cancel(boolean wait) {
        final boolean tooLate = done || inDone;

        if (!tooLate) {
            setCanceled(wait, new Exception("Migration canceled"));
        }
        return !tooLate;
    }

    /**
     * Cancels the migration by recording the cause, setting the canceled
     * flag, and cleaning up the target.
     *
     * @param wait if true, cleanup waits for the target thread to exit
     * @param cause the exception describing why the migration was canceled
     */
    private synchronized void setCanceled(boolean wait, Exception cause) {
        assert !done;

        /*
         * Record the cause BEFORE raising the volatile canceled flag.
         * getState() is not synchronized: it reads canceled and then
         * errorCause. Writing the (non-volatile) cause first means any thread
         * that observes canceled == true is also guaranteed to observe the
         * matching cause, rather than a stale or null one.
         */
        errorCause = cause;
        canceled = true;
        cleanup(wait);
    }

    /**
     * Terminates the migration because of an unrecoverable condition. The
     * problem is logged, the target is canceled (without waiting), and the
     * migration record is removed from the db. Must not be called while
     * holding this object's monitor.
     *
     * @param msg message to log
     * @param cause exception to log, may be null
     */
    private void error(String msg, Exception cause) {
        assert !Thread.holdsLock(this);

        logger.log(Level.WARNING, msg, cause);

        final Exception failure = new Exception(msg, cause);
        setCanceled(false, failure);

        /* On an unrecoverable error, remove the record from the db */
        try {
            manager.removeRecord(partitionId, recordId, false);
        } catch (DatabaseException de) {
            /* Best effort, a failure to remove the record is only logged */
            final String text =
                "Exception attempting to remove migration record for " +
                partitionId;
            logger.log(Level.INFO, text, de);
        }
    }

    /**
     * Cleans up this target: clears the running flag, closes the channel to
     * the source if one is open, and, if the partition DB is still open,
     * closes and removes it and notifies the table manager of the removal.
     * If wait is true and the DB is open, this method first waits (for a
     * bounded time) for the target thread to close the DB.
     *
     * Exceptions from closing the channel, closing the DB, or removing the
     * DB are logged and not propagated. Exceptions thrown by the table
     * manager notification are not caught here.
     *
     * @param wait wait flag
     */
    private synchronized void cleanup(boolean wait) {
        setStopped();

        /*
         * If requested to wait and the DB is open, wait until it is closed.
         * This is done to avoid DB errors caused by the target still running
         * after a cancel. (A DB error will cause the env to be invalidated,
         * something to avoid.)
         *
         * By setting stopped (above) Reader.remove() will return null,
         * causing the main thread to exit and call cleanup(false) which will
         * then close the DB at an OK time.
         *
         * The wait is on this object's monitor and is bounded to two seconds;
         * closeDB() calls notifyAll() to wake it early. An interrupt is only
         * logged and the cleanup continues.
         */
        if (wait && (partitionDb != null)) {
            try {
                logger.log(Level.INFO, "Waiting for {0} to exit", this);
                wait(2 * SECOND_MS);
            } catch (InterruptedException ie) {
                logger.log(Level.INFO, "Unexpected interrupt", ie);
            }
        }

        /* Close the channel if open. A failure to close is only logged. */
        if (channel != null) {
            try {
                channel.close();
            } catch (Exception ex) {
                logger.log(Level.WARNING, "Exception closing channel", ex);
            }
            channel = null;
        }

        /*
         * If we are not done, remove the partition migration DB if one exists.
         * (Note that a check for done is unnecessary as setDone() closes
         * the DB which clears partitionDb)
         */
        if (partitionDb != null) {
            assert !done;

            final String dbName = partitionId.getPartitionName();
            logger.log(Level.INFO, "Removing migrated DB {0}", dbName);

            /* Removal is attempted even if the close fails */
            try {
                closeDB();
            } catch (Exception ex) {
                logger.log(Level.WARNING, "Exception closing DB", ex);
            }

            try {
                repEnv.removeDatabase(null, dbName);
            } catch (DatabaseNotFoundException dnfe) {
                /* Shouldn't happen, but if it does, not really bad */
            } catch (Exception ex) {
                logger.log(Level.WARNING, "Exception removing DB", ex);
            }

            /*
             * We need to clean any secondary databases that may have been
             * present and populated by records from this aborted migration.
             * Note that if cleaning is required, future partition migrations
             * (including the restart of this one) will be held off until the
             * cleaning is complete (see the TableManager#isBusyCleaning()
             * calls in MigrationManager and MigrationService).
             */
            repNode.getTableManager().notifyRemoval(partitionId);
        }
    }

    /**
     * Closes the partition migration DB, if it is open, and clears the
     * reference to it. Any threads waiting on this object's monitor are then
     * woken, whether or not a DB was open.
     */
    private synchronized void closeDB() {
        final Database db = partitionDb;

        if (db != null) {
            db.close();
            partitionDb = null;
        }

        /* A cleanup may be waiting for the DB to be closed */
        notifyAll();
    }

    /**
     * Attempts to move the target into the running state. The running flag
     * becomes true only when the target is neither done nor canceled,
     * otherwise it stays false. The resulting value of the flag is returned.
     *
     * @return true if the target is now running
     */
    private synchronized boolean setRunning() {
        assert !running;

        final boolean mayRun = !(done || canceled);
        running = mayRun;
        return mayRun;
    }

    /**
     * Clears the running flag, marking the target as no longer executing.
     */
    private synchronized void setStopped() {
        this.running = false;
    }

    /**
     * Attempts to complete the migration: records the end time, logs the
     * transfer statistics (at INFO), closes the partition DB, and persists
     * the partition. The done flag is set to the result of
     * persistTargetDurable(). If the migration has already been canceled
     * this method returns without closing or persisting anything.
     *
     * While the completion is in progress the inDone flag is set so that
     * cancel() is refused; it is always cleared on exit, including when an
     * exception is thrown.
     *
     * @param eod the End of Data marker received from the source, passed to
     * persistTargetDurable()
     */
    private void setDone(Reader.EoD eod) {
        endTime = System.currentTimeMillis();

        if (logger.isLoggable(Level.INFO)) {
            printMigrationStats();
        }

        try {
            /*
             * The normal monitor can't be held when writing the db (in
             * persistTargetDurable()) so we guard from being canceled by
             * setting inDone to true. We can't just set done to true because
             * that will cause getState() to return SUCCEEDED before things
             * are made durable and everyone is informed.
             */
            synchronized (this) {
                /* Doh! Too bad, we were almost there! */
                if (canceled) {
                    return;
                }
                inDone = true;
            }
            closeDB();

            /*
             * ToO #5 & #6 - Make the partition durable and persist that the
             * transfer from the source is complete.
             *
             * Persisting the transfer complete will allow the target node
             * to accept forwarded client ops - ToO #7
             *
             * ToO #9 - Setting done to true will cause SUCCEEDED to be
             * returned from getState()
             */
            done = persistTargetDurable(eod);

        } finally {
            /* Drop the cancel guard whether or not the persist succeeded */
            inDone = false;
        }
    }

    /*
     * Logs (at INFO) the partition migration statistics for the actual data
     * movement: the number of operations, the average number of operations
     * and bytes per copy batch, and the transfer time in whole seconds.
     */
    private void printMigrationStats() {

        /*
         * If no copy batches were counted the averages are reported as zero.
         * Dividing by a zero batch count would be a float 0/0, which is NaN
         * and would be logged as "NaN".
         */
        final float avgOps;
        final float avgBatchBytes;
        if (copyBatches > 0) {
            avgOps = (float)copyOps / (float)copyBatches;
            avgBatchBytes = (float)copyBytes / (float)copyBatches;
        } else {
            avgOps = 0.0f;
            avgBatchBytes = 0.0f;
        }

        final String avgBatchOps = String.format("%.1f", avgOps);

        /* Report the batch size in bytes below 1000, otherwise in kB */
        final String avgBatchSize = (avgBatchBytes < 1000) ?
                           String.format("%.1f B", avgBatchBytes) :
                           String.format("%.1f kB", avgBatchBytes / 1000);
        final long seconds = (endTime - startTime) / 1000;
        logger.log(Level.INFO,
                   "Migration of {0} complete, {1} operations, " +
                   "Avg copy batch {2} ops {3}, transfer time: {4} seconds",
                   new Object[]{partitionId, operations,
                                avgBatchOps, avgBatchSize, seconds});
    }

    /**
     * Main target loop. This may be called repeatedly during the life of the
     * partition migration, potentially in different threads. Each iteration
     * runs one migration attempt and then inspects the retry wait:
     * a positive wait means the caller should retry later, so this object is
     * returned and getRetryWait() reports how long to wait; a negative wait
     * means the migration is done or canceled, so null is returned; a wait of
     * zero causes an immediate retry within this call. Null is also returned
     * if the target could not be set running.
     *
     * @return this object if the target should be retried, otherwise null
     */
    @Override
    public MigrationTarget call() {
        for (;;) {
            if (!setRunning()) {
                return null;
            }
            runMigration();

            final long delay = getRetryWait();

            /* done or canceled */
            if (delay < 0) {
                return null;
            }

            /* Retry-able error or busy response, let the caller reschedule */
            if (delay > 0) {
                return this;
            }

            /*
             * delay == 0, indicating an error occurred after the EOD.
             * In this case immediately re-try because the source will
             * have closed the partition DB (at ToO #4) and is waiting for
             * resolution (ToO #12) which will never come. By retrying the
             * migration the source will detect the restart and re-instate
             * the partition. (Note that the source will respond with BUSY
             * while the partition is being restored)
             */
        }
    }

    /**
     * Runs a single migration attempt. The per-attempt statistics are reset,
     * a channel is opened to the source, a transfer request is sent, and the
     * response is acted on: OK starts consuming operations into the
     * partition DB, BUSY and UNKNOWN_SERVICE schedule a retry, and every
     * other response is treated as fatal.
     *
     * IOException and DatabaseException are treated as retry-able errors.
     * A failure to connect to the migration service and any other exception
     * cancel the migration via error(). In all cases cleanup(false) is called
     * before returning.
     */
    private void runMigration() {
        attempts++;
        startTime = System.currentTimeMillis();
        endTime = 0;
        operations = 0;
        copyOps = 0;
        copyBytes = 0;
        copyBatches = 0;
        eodReceived = false;

        /*
         * Try clause for catching exceptions. The loop will open a channel
         * to the source, create a db if needed, and read operations from
         * the channel to populate the db.
         */
        try {
            final DataInputStream stream = openChannel();

            /* The request is the only data sent from target to source */
            TransferRequest.write(channel,
                                  partitionId.getPartitionId(),
                                  repNode.getRepNodeId());

            final Response response = TransferRequest.readResponse(stream);
            switch (response) {

                /* OK */
                case OK:
                    logger.log(Level.INFO, "Starting {0}", this);

                    createDb();

                    /*
                     * Read loop. This returns when done, or throws an
                     * IOException.
                     */
                    consumeOps(readerFactory.newReader(stream));
                    break;

                /*
                 * If BUSY, the source is suitable for migration but is
                 * currently unavailable. Force the usual retry logic
                 * but without a retry limit since we assume that the
                 * source will eventually be able to service the
                 * request.
                 */
                case BUSY:
                    // TODO - make use of this info
                    TransferRequest.readNumStreams(stream);
                    setBusyRetryWait("Source busy: " +
                                     TransferRequest.readReason(stream), true);
                    break;

                /*
                 * If UNKNOWN_SERVICE the node may be down/coming up or the
                 * partition is missing (from a previously failed migration).
                 * Treat as an error and retry, but not forever.
                 */
                case UNKNOWN_SERVICE:
                    setErrorRetryWait(new Exception("Unknown service: " +
                                           TransferRequest.readReason(stream)));
                    break;

                /* Fatal */
                case FORMAT_ERROR:
                    error("Fatal response " + response + " from source: " +
                          TransferRequest.readReason(stream), null);
                    break;
                case AUTHENTICATE:
                        /* should not occur in this context */
                        error("Authenticate response encountered outside of " +
                              "hello sequence", null);
                        break;
                case PROCEED:
                	/* should not occur in this context */
                	error("Proceed response encountered outside of hello " +
                              "sequence", null);
                	break;
                /* Fatal, handled the same way as FORMAT_ERROR */
                case INVALID:
                    error("Fatal response " + response + " from source: " +
                          TransferRequest.readReason(stream), null);
                    break;
            }
        } catch (IOException | DatabaseException e) {
            /* Retry-able, subject to the error limit */
            setErrorRetryWait(e);
        } catch (ServiceConnectFailedException scfe) {
            error("Failed to connect to migration service at " + sourceName +
                  " for " + partitionId, scfe);
        } catch (Exception ex) {
            error("Unexpected exception, exiting", ex);
        }  finally {
            /* End time will not be set on error */
            if (endTime == 0) {
                endTime = System.currentTimeMillis();
            }

            /*
             * Before the try block exits, one of the following will be called:
             *    setDone() on success,
             *    setBusyRetryWait() or setErrorRetryWait() on a busy response
             *    or a retry-able error, or
             *    error() on an non-retry-able error.
             */
            try {
                cleanup(false);
            } catch (InsufficientReplicasException | DiskLimitException ire) {
                /*
                 * These exceptions can happen since cleanup() calls
                 * TableManager#notifyRemoval() which does
                 * SecondaryInfoMap#persist(). When that fails, we will not be
                 * able to clean the secondary populated by the aborted
                 * migration. But this is OK since the JE will delete obsolete
                 * secondary when it is scanned. So simply move on.
                 */
            }
        }
    }

    /**
     * Schedules a retry after the busy wait period. Nothing is done if the
     * migration has been canceled. If isBusyResponse is true the
     * busyResponses counter is incremented.
     *
     * @param reason description of the busy condition, for logging
     * @param isBusyResponse true if the condition was a busy response
     */
    private void setBusyRetryWait(String reason, boolean isBusyResponse) {
        assert !done;

        if (!canceled) {
            if (isBusyResponse) {
                busyResponses++;
            }

            retryWait = waitAfterBusy;

            final Object[] logArgs =
                new Object[]{partitionId, sourceName, reason, retryWait};
            logger.log(Level.FINE,
                       "Migration of {0} from {1} did not start: {2}, " +
                       "retry in {3} ms",
                       logArgs);
        }
    }

    /**
     * Schedules a retry after a retry-able error. Nothing is done if the
     * migration has been canceled. The error count is incremented and, once
     * it reaches MAX_ERRORS, the migration is canceled through error()
     * instead of being retried. Otherwise the retry wait is set to the error
     * wait period, or to 0 (immediate retry) if the EOD was already received.
     *
     * NOTE(review): the limit check is "errors >= MAX_ERRORS", so the
     * migration gives up on the MAX_ERRORS-th error. Confirm this is the
     * intended limit, given MAX_ERRORS is described as a number of retries.
     *
     * @param ex the exception describing the error
     */
    private void setErrorRetryWait(Exception ex) {
        assert !done;

        if (canceled) {
            return;
        }

        final boolean exhausted = ++errors >= MAX_ERRORS;
        if (exhausted) {
            error("Migration of " + partitionId + " failed. Giving up after " +
                  attempts + " attempt(s)", ex);
            return;
        }

        /*
         * If an error and the EOD was received then use 0 to force
         * an immediate restart
         */
        if (eodReceived) {
            retryWait = 0;
        } else {
            retryWait = waitAfterError;
        }

        final Object[] logArgs = new Object[]{partitionId, sourceName,
                                              ex.getLocalizedMessage(),
                                              retryWait};
        logger.log(Level.FINE,
                   "Migration of {0} from {1} failed, reason: {2}, " +
                   "retry in {3} ms",
                   logArgs);
    }

    /**
     * Gets the retry wait time in milliseconds. If the target has been
     * canceled or is done there is nothing to retry and -1 is returned.
     * Otherwise the current retry wait, a value {@literal >=} 0, is returned.
     * Must only be called while the target is not running.
     *
     * @return the retry wait time or -1
     */
    long getRetryWait() {
        assert !running;

        final boolean finished = canceled || done;
        if (!finished) {
            assert retryWait >= 0;
            return retryWait;
        }
        return -1L;
    }

    /**
     * Opens a channel to the master of the source group, performs the
     * service handshake for the migration service, and wraps the channel in
     * an input stream. As a side effect the channel field is set and the
     * source name used for logging is updated to the master's full name.
     *
     * @return an input stream reading from the new channel
     * @throws IOException if the topology is not yet available, the source
     * master cannot be determined, or the channel cannot be opened
     * @throws ServiceConnectFailedException if the service handshake fails
     * @throws IllegalStateException if the source node does not report an HA
     * host/port, or this node's environment is not available
     */
    private synchronized DataInputStream openChannel()
        throws IOException, ServiceConnectFailedException {

        final Topology topology = repNode.getTopology();
        if (topology == null) {
            throw new IOException("Target node not yet initialized");
        }

        /* Locate the current master of the source group */
        final RNNameHAPort master =
            new PingCollector(topology).getMasterNamePort(sourceRGId);
        if (master == null) {
            throw new IOException("Unable to get mastership status for " +
                                   sourceRGId.getGroupName());
        }

        final String hostPort = master.getHAHostPort();
        sourceName = master.getFullName();

        /* getHAHostPort() returns null for a R1 node */
        if (hostPort == null) {
            throw new IllegalStateException("Source node " + sourceName +
                                            " is running an incompatible " +
                                            "software version");
        }

        final InetSocketAddress address = HostPortPair.getSocket(hostPort);
        logger.log(Level.FINE,
                   "Opening channel to {0} to make migration request",
                   address);

        final RepNodeParams rnParams = repNode.getRepNodeParams();
        final RepImpl envImpl = repNode.getEnvImpl(0L);
        if (envImpl == null) {
            throw new IllegalStateException("Attempt to migrate a partition " +
                                            "on a node that is not available");
        }

        final ConnectOptions options =
            new ConnectOptions()
                .setTcpNoDelay(true)
                .setReceiveBufferSize(0)
                .setReadTimeout(rnParams.getReadWriteTimeout())
                .setOpenTimeout(rnParams.getConnectTImeout());

        channel = RepUtils.openBlockingChannel(address,
                                               envImpl.getChannelFactory(),
                                               options);

        ServiceDispatcher.doServiceHandshake(channel,
                                             MigrationService.SERVICE_NAME);

        return new DataInputStream(Channels.newInputStream(channel));
    }

    /**
     * Opens or creates the partition DB, unless this target already holds an
     * open handle to it. The DB is opened inside its own transaction, which
     * is committed on success and aborted otherwise.
     *
     * @throws IllegalStateException if the node cannot write because it has
     * become a replica or has lost master status
     */
    private synchronized void createDb() {

        /* Already open, nothing to do */
        if (partitionDb != null) {
            return;
        }

        /*
         * The open is done with no consistency requirement. The loop below
         * repeats until a DB handle is obtained; the two exceptions handled
         * here do not retry, they end the attempt with an
         * IllegalStateException.
         */
        final TransactionConfig txnConfig =
                    new TransactionConfig().setConsistencyPolicy(
                                NoConsistencyRequiredPolicy.NO_CONSISTENCY);

        /* Create DB */
        while (partitionDb == null) {

            Transaction txn = null;
            try {
                txn = repEnv.beginTransaction(null, txnConfig);
                partitionDb =
                        repEnv.openDatabase(txn,
                                            partitionId.getPartitionName(),
                                            repNode.getPartitionDbConfig());
                txn.commit();

                /* Committed, so the finally clause has nothing to abort */
                txn = null;

            } catch (ReplicaWriteException rwe) {
                /* Could be transitioning from master to replica. */
                final String msg = "Attempted to start partition migration " +
                                   "target for " + partitionId + " but node " +
                                   "has become a replica";
                logger.log(Level.WARNING, msg, rwe);
                throw new IllegalStateException(msg, rwe);
            } catch (UnknownMasterException ume) {
                /* Could be transitioning from master to replica. */
                final String msg = "Attempted to start partition migration " +
                                   "target for " + partitionId + " but node " +
                                   "has lost master status";
                logger.log(Level.WARNING, msg, ume);
                throw new IllegalStateException(msg, ume);
            } finally {
                /* txn is still set here only if it was not committed */
                TxnUtil.abort(txn);
            }
        }
    }

    /**
     * Executes operations taken from the reader until this target is marked
     * done. Whatever the outcome, the transactions the reader still has open
     * are aborted before this method exits.
     *
     * @param reader the reader supplying the operations
     * @throws IOException if the reader returns null in place of an operation
     * @throws Exception if an operation fails, including an
     * RNUnavailableException which persists through all of the retries
     */
    private void consumeOps(Reader reader) throws Exception {
        int executed = 0;
        try {
            while (!done) {
                final Op next = reader.remove();
                if (next == null) {
                    throw new IOException("Reader returned null after " +
                                          executed + " operations");
                }

                /*
                 * RUE can be thrown by the table manager when a secondary DB
                 * is being created and not everything is in a consistent
                 * state, so the operation is retried, pausing briefly
                 * between attempts, before giving up.
                 */
                for (int retriesLeft = 100; ; retriesLeft--) {
                    try {
                        next.execute();
                        break;
                    } catch (RNUnavailableException rue) {
                        if (retriesLeft <= 0) {
                            throw rue;
                        }
                        Thread.sleep(10);
                    }
                }
                executed++;
            }
        } finally {
            /*
             * Cleanup. Unresolved-unprepared txns may be left over in normal
             * operation. Prepared and batch txns may remain after an error.
             */
            reader.abortAllTxns();
        }
    }

    /**
     * Persists the target record in the db, marking the migration as
     * succeeded and opening a partition generation in the same transaction.
     * On success the target is removed from the manager and the local
     * topology is updated. Note the monitor can't be held during this
     * operation as the db's triggers call back into the manager.
     *
     * @param eod the end of data marker supplying the previous generation
     * number and VLSN
     * @return true if the operation was successful
     */
    private boolean persistTargetDurable(Reader.EoD eod) {
        assert !Thread.holdsLock(this);
        logger.log(Level.FINE,
                   "Persist target transfer durable for {0}", partitionId);

        final Durability syncMajority =
            new Durability(Durability.SyncPolicy.SYNC,
                           Durability.SyncPolicy.SYNC,
                           Durability.ReplicaAckPolicy.SIMPLE_MAJORITY);
        final TransactionConfig durableConfig = new TransactionConfig();
        durableConfig.setConsistencyPolicy(
                                 NoConsistencyRequiredPolicy.NO_CONSISTENCY);
        durableConfig.setDurability(syncMajority);

        final Boolean persisted =
            manager.tryDBOperation(db -> {

                Transaction txn = null;
                try {
                    txn = db.getEnvironment().beginTransaction(null,
                                                               durableConfig);

                    final PartitionMigrations migrations =
                        PartitionMigrations.fetch(db, txn);
                    final TargetRecord target =
                        migrations.getTarget(partitionId);
                    if (target == null) {
                        throw new IllegalStateException(
                            "Unable to find migration record for " +
                            partitionId);
                    }

                    target.setStatus(
                        getStatus(PartitionMigrationState.SUCCEEDED));
                    migrations.persist(db, txn, true);

                    /* Open a partition gen in the same txn. */
                    manager.getPartGenTable()
                           .openGeneration(partitionId,
                                           eod.prevPGN.incrGenNum(),
                                           target.getSourceRGId(),
                                           eod.prevGenVLSN,
                                           txn);
                    txn.commit();

                    /* Committed, so do not hand the txn to the abort. */
                    txn = null;
                    return true;
                } catch (PartitionMDException pmde) {
                    logger.info("Fail to open generation for partition " +
                                pmde.getPartitionId() +
                                " in db " + pmde.getDbName() +
                                ", generation: " + pmde.getGeneration());
                    return false;
                } finally {
                    TxnUtil.abort(txn);
                }
            }, true);

        /* A null result is treated the same as a failure. */
        if (!Boolean.TRUE.equals(persisted)) {
            return false;
        }

        /* With the migration record persisted, we can remove this target. */
        manager.removeTarget(partitionId);

        /*
         * This will update the local topology here (master) and update the
         * partition DBs. The replicas are updated through the DB triggers
         * from persisting the migration record.
         *
         * This is critical to complete, because if the local topo is not
         * updated no one (source or target) will think they own the
         * partition. So fail the node if there is a problem and hopefully
         * the new master will be correct.
         */
        manager.criticalUpdate();

        manager.setLastMigrationDuration(endTime - startTime);
        return true;
    }

    /**
     * Returns a description made up of the partition ID, the source name,
     * and the attempts, running, done and canceled values, in that order.
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("MigrationTarget[");
        sb.append(partitionId).append(", ");
        sb.append(sourceName).append(", ");
        sb.append(attempts).append(", ");
        sb.append(running).append(", ");
        sb.append(done).append(", ");
        sb.append(canceled).append("]");
        return sb.toString();
    }

    /**
     * Base class of the operations executed by the consumer thread.
     */
    private abstract static class Op {

        /**
         * Executes this operation. Called by the consumer thread.
         */
        abstract void execute();
    }

    /**
     * Reader thread. This thread will read operations from the stream and
     * insert them onto the opQueue.
     */
    private class Reader implements Runnable {

        /*
         * Map of local transactions, keyed by transaction ID. No
         * synchronization is needed since it is only accessed by the read
         * thread.
         *
         * NOTE(review): abortAllTxns() also iterates this map and is invoked
         * from consumeOps() - confirm that this cannot overlap with the read
         * thread modifying the map.
         */
        private final Map<Long, LocalTxn> txnMap = new HashMap<>();

        /* Max. number of operations in a single copy batch transaction */
        private final int MAX_BATCH_COPY_OPS = 100;

        /*
         * The transaction used to batch copy operations. This should only be
         * used for copy ops, and the transaction should not overlap other
         * transactions. This means the transaction should be committed before
         * some other transactional operation (put, delete) is started. This is
         * to maintain proper ordering between the on disk copy and client
         * operations.
         */
        private Transaction batchTxn = null;

        /* The number of copy operations made in the current batch txn. */
        private int batchCount = 0;

        /*
         * The operation queue. This thread inserts ops, the target thread
         * removes them. Accesses to the queue must be synchronized.
         */
        private final Queue<Op> opQueue = new LinkedList<>();

        /*
         * The normal capacity of the op queue. This is overridden when the
         * consumer needs to wait for a transaction resolution.
         */
        private static final int DEFAULT_CAPACITY = 100;

        /* The capacity limit of the op queue. */
        private int capacity = DEFAULT_CAPACITY;

        private final DataInputStream stream;

        /* For general use to avoid constructing DatabaseEntrys in the OPS */
        private final DatabaseEntry keyEntry = new DatabaseEntry();
        private final DatabaseEntry valueEntry = new DatabaseEntry();

        /**
         * Creates a reader for the specified migration stream.
         *
         * @param stream the stream from which operations are read
         */
        Reader(DataInputStream stream) {
            this.stream = stream;
        }

        /**
         * Processes the migration stream, stopping the target if the
         * processing fails with an exception.
         */
        @Override
        public void run() {
            try {
                processStream();
            } catch (Exception failure) {

                /* A failure following a cancel is not worth reporting. */
                if (!canceled) {
                    final String msg =
                        "Exception processing migration stream for " +
                        partitionId;
                    logger.log(Level.INFO, msg, failure);
                }

                /*
                 * Clearing running will cause remove() and PrepareOp.execute()
                 * to exit so that the target thread can handle the issue.
                 */
                setStopped();
            }
        }

        /**
         * Reads operations from the migration stream and dispatches them.
         * This method does not return normally until an End Of Data operation
         * is received on the stream or the target is no longer running.
         *
         * If an exception is thrown, the migration should be aborted because
         * the state of the source and the data is unknown.
         *
         * @throws Exception if there were any problems encountered processing
         * the migration stream
         */
        private void processStream() throws Exception {

            while (running) {
                final OP op = OP.get(stream.readByte());
                if (op == null) {
                    throw new IOException("Bad op, or unexpected EOF");
                }
                operations++;

                /*
                 * The fields of each operation are read in the order in
                 * which they appear on the stream, before the operation
                 * object is constructed.
                 */
                switch (op) {
                    case COPY : {
                        copyOps++;
                        final byte[] key = readDbEntry();
                        final byte[] value = readDbEntry();
                        final long expiration = readExpirationTime();
                        insert(new CopyOp(key, value, expiration));
                        break;
                    }
                    case PUT : {
                        final long txnId = readTxnId();
                        final byte[] key = readDbEntry();
                        final byte[] value = readDbEntry();
                        final long expiration = readExpirationTime();
                        insert(new PutOp(txnId, key, value, expiration));
                        break;
                    }
                    case DELETE : {
                        final long txnId = readTxnId();
                        final byte[] key = readDbEntry();
                        insert(new DeleteOp(txnId, key));
                        break;
                    }
                    case PREPARE : {
                        insert(new PrepareOp(readTxnId()));
                        break;
                    }
                    case COMMIT : {
                        resolve(readTxnId(), true);
                        break;
                    }
                    case ABORT: {
                        resolve(readTxnId(), false);
                        break;
                    }
                    case EOD : {
                        /* End of data always ends the processing. */
                        handleEndOfData();
                        return;
                    }
                }
            }
        }

        /**
         * Handles an End Of Data operation. The target is stopped if a
         * prepared but unresolved txn remains, otherwise the rest of the
         * marker is read from the stream and queued for the consumer.
         *
         * @throws IOException if the marker cannot be read from the stream
         */
        private void handleEndOfData() throws IOException {
            logger.log(Level.INFO, "Received EOD for {0}", partitionId);

            eodReceived = true;

            /*
             * It is possible that a txn was started (via PUT or DELETE) but
             * not resolved (a COMMIT or ABORT was never received). At this
             * point the local txns in the map are those which have not been
             * resolved.
             *
             * If the transaction has been prepared then we don't know if the
             * operation has completed on the source. In this case the only
             * option is to cancel the migration and start over.
             *
             * Unresolved txns which have not been prepared are safe to abort
             * (later) at this target; these txns are also aborted on the
             * source and will be re-tried by the client.
             */
            for (LocalTxn unresolved : txnMap.values()) {
                assert !unresolved.resolved;
                if (unresolved.prepared) {

                    /*
                     * Log instead of throwing an exception because even
                     * though this (rare) condition results in an abort of the
                     * migration, it is not unexpected.
                     */
                    logger.log(Level.INFO,
                               "Encountered prepared but unresolved txn, " +
                               "stopping migration for {0}", partitionId);
                    setStopped();
                    return;
                }
            }

            final EoD eod = new EoD();
            readEoDFromStream(stream, eod);
            logger.log(Level.FINE,
                       () -> "Received EOD for " + partitionId +
                             " from shard " + sourceRGId +
                             " at vlsn " + eod.prevGenVLSN  +
                             " with generation # " + eod.prevPGN);

            insert(eod);
        }

        /**
         * Reads the next long value on the migration stream as a
         * transaction ID.
         *
         * @return the transaction ID
         * @throws IOException if the read fails
         */
        private long readTxnId() throws IOException {
            final long txnId = stream.readLong();
            return txnId;
        }

        /**
         * Reads the next long value on the migration stream as an
         * expiration time.
         *
         * @return the expiration time
         * @throws IOException if the read fails
         */
        private long readExpirationTime() throws IOException {
            final long expirationTime = stream.readLong();
            return expirationTime;
        }

        /**
         * Reads a DB entry (as a byte array) from the migration stream. The
         * entry is encoded as an int length followed by that many bytes.
         *
         * @return the bytes of the entry
         * @throws IOException if the length read is negative, or the read
         * fails or reaches the end of the stream before the entry is complete
         */
        private byte[] readDbEntry() throws IOException {
            final int size = stream.readInt();

            /*
             * A negative length can only come from a corrupt or misaligned
             * stream. Report it as a stream problem rather than letting the
             * array allocation fail with a NegativeArraySizeException.
             */
            if (size < 0) {
                throw new IOException("Invalid DB entry length " + size +
                                      " read from migration stream for " +
                                      partitionId);
            }
            final byte[] bytes = new byte[size];
            stream.readFully(bytes);
            return bytes;
        }

        /**
         * Adds an operation to the queue and wakes any waiting thread. The
         * operation is dropped if the target is no longer running. After
         * adding, this method blocks while the queue is over capacity and
         * the target is running.
         *
         * @param op the operation to queue
         */
        private void insert(Op op) {

            synchronized (opQueue) {
                if (running) {
                    opQueue.add(op);
                    opQueue.notifyAll();

                    /*
                     * The wait is bounded so that a change to running is
                     * noticed even when nobody notifies.
                     */
                    while ((opQueue.size() > capacity) && running) {
                        try {
                            opQueue.wait(SECOND_MS);
                        } catch (InterruptedException ie) {
                            logger.log(Level.WARNING,
                                       "Unexpected interrupt", ie);
                        }
                    }
                }
            }
        }

        /**
         * Takes the operation at the head of the queue, blocking while the
         * queue is empty and the target is running.
         *
         * @return the operation, or null if the target is not running
         */
        private Op remove() {

            synchronized (opQueue) {
                Op next = null;
                while (running) {
                    next = opQueue.poll();
                    if (next != null) {
                        break;
                    }

                    /* Nothing queued, wait (bounded) for an insert. */
                    try {
                        opQueue.wait(SECOND_MS);
                    } catch (InterruptedException ie) {
                        logger.log(Level.WARNING, "Unexpected interrupt", ie);
                    }
                }

                /* An op was taken, wake a reader waiting on capacity. */
                if (next != null) {
                    opQueue.notifyAll();
                }
                return next;
            }
        }

        /**
         * Resolves a prepared transaction. The local transaction is removed
         * from the map, marked resolved, and any thread waiting on it or on
         * the op queue is woken.
         *
         * @param txnId the ID of the transaction being resolved
         * @param commit true if the transaction was committed, false if it
         * was aborted
         * @throws IllegalStateException if there is no local transaction for
         * the ID
         */
        private void resolve(long txnId, boolean commit) {

            final LocalTxn txn = txnMap.remove(txnId);

            /*
             * A resolution for an unknown txn means the stream is not in the
             * expected state. Fail with an exception, which the reader's
             * run() method handles, rather than relying on an assert.
             */
            if (txn == null) {
                throw new IllegalStateException(
                    "Received " + (commit ? "COMMIT" : "ABORT") +
                    " for unknown transaction " + txnId +
                    " while migrating " + partitionId);
            }

            /*
             * If the consumer had reached the prepare operation, it is waiting
             * on the transaction. After marking it resolved wake it up.
             */
            synchronized (txn) {
                txn.resolve(commit);
                txn.notifyAll();
            }

            /*
             * Resolve doesn't place an op on the queue, so we need to wake
             * the consumer thread just in case it is waiting there.
             */
            synchronized (opQueue) {
                opQueue.notifyAll();
            }
        }

        /**
         * Gets the batch transaction and increments the batch count. If the
         * count has reached MAX_BATCH_COPY_OPS the current txn is committed
         * first, and a new transaction is started whenever none is open.
         *
         * @return the open batch transaction
         */
        private Transaction getBatchTxn() {
            final boolean batchFull = batchCount >= MAX_BATCH_COPY_OPS;
            if (batchFull) {
                commitBatchTxn();
            }

            /* Either no batch was started yet or it was just committed. */
            if (batchTxn == null) {
                batchTxn = repEnv.beginTransaction(null, weakConfig);
            }
            batchCount += 1;
            return batchTxn;
        }

        /**
         * Commits the batch transaction if there is one open, resets the
         * batch op count and counts the completed batch. Does nothing when
         * no batch transaction is open.
         */
        private void commitBatchTxn() {
            if (batchTxn == null) {
                return;
            }
            batchTxn.commit();

            /* The state is only reset once the commit has returned. */
            batchTxn = null;
            batchCount = 0;
            copyBatches++;
        }

        /**
         * Aborts all open transactions: the batch transaction, if one is
         * open, followed by every local txn remaining in the map.
         */
        private void abortAllTxns() {

            /*
             * An open batch transaction should only be found here if there
             * was an error.
             */
            if (batchTxn != null) {
                assert !done;
                TxnUtil.abort(batchTxn);
                batchTxn = null;
                batchCount = 0;
            }

            /*
             * The txns left in the map are unresolved. Note that if any were
             * prepared, it would be detected and handled in processStream().
             */
            for (LocalTxn pending : txnMap.values()) {
                pending.abort();
            }
        }

        /**
         * Returns a description made up of the operations count and the
         * current size of the op queue.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder("Reader[");
            sb.append(operations).append(", ");
            sb.append(opQueue.size()).append("]");
            return sb.toString();
        }

        /**
         * Copy operation (record read from the DB). The record is written
         * using the batch transaction.
         */
        private class CopyOp extends Op {
            final byte[] key;
            final byte[] value;
            final long expirationTime;

            CopyOp(byte[] key, byte[] value, long expirationTime) {
                this.key = key;
                this.value = value;
                this.expirationTime = expirationTime;
            }

            /**
             * Writes the record to the partition DB, unless it belongs to a
             * dropped table, and adds the value size to the copied bytes.
             */
            @Override
            void execute() {
                final boolean dropped =
                    MigrationManager.isForDroppedTable(repNode, key);

                /* Records of a dropped table are not written. */
                if (!dropped) {
                    keyEntry.setData(key);
                    valueEntry.setData(value);
                    final Transaction copyTxn = getBatchTxn();
                    partitionDb.put(copyTxn, keyEntry, valueEntry,
                                    Put.OVERWRITE,
                                    getWriteOptions(expirationTime));
                    copyBytes += value.length;
                }
            }

            @Override
            public String toString() {
                final StringBuilder sb = new StringBuilder("CopyOp[");
                sb.append(key.length).append(", ");
                sb.append(value.length).append("]");
                return sb.toString();
            }
        }

        /**
         * An operation associated with a source-side transaction. When the
         * first object is created with the specified txn ID a new LocalTxn
         * is created and placed in the txn map. Subsequent creations with the
         * same txn ID are associated with that same LocalTxn.
         */
        private abstract class TxnOp extends Op {
            final LocalTxn txn;

            TxnOp(long txnId) {
                final LocalTxn existing = txnMap.get(txnId);
                if (existing != null) {
                    this.txn = existing;
                } else {
                    /* First op seen for this ID, register a new entry. */
                    this.txn = new LocalTxn(txnId);
                    txnMap.put(txnId, this.txn);
                }
            }

            /**
             * Gets the local transaction for this operation. The first time
             * this is called a new local transaction will be created and
             * started.
             */
            protected Transaction getTransaction() {
                final Transaction localTransaction = txn.getTransaction();
                return localTransaction;
            }
        }

        /**
         * Put operation (client write). The record is written using the
         * local transaction associated with the txn ID.
         */
        private class PutOp extends TxnOp {
            final byte[] key;
            final byte[] value;
            final long expirationTime;

            PutOp(long txnId, byte[] key, byte[] value,
                  long expirationTime) {
                super(txnId);
                this.key = key;
                this.value = value;
                this.expirationTime = expirationTime;
            }

            /**
             * Writes the record to the partition DB, unless it belongs to a
             * dropped table.
             */
            @Override
            void execute() {
                final boolean dropped =
                    MigrationManager.isForDroppedTable(repNode, key);

                /* Records of a dropped table are not written. */
                if (!dropped) {
                    keyEntry.setData(key);
                    valueEntry.setData(value);
                    final Transaction putTxn = getTransaction();
                    partitionDb.put(putTxn, keyEntry, valueEntry,
                                    Put.OVERWRITE,
                                    getWriteOptions(expirationTime));
                }
            }

            @Override
            public String toString() {
                final StringBuilder sb = new StringBuilder("PutOp[");
                sb.append(txn.txnId).append(", ");
                sb.append(key.length).append(", ");
                sb.append(value.length).append("]");
                return sb.toString();
            }
        }

        /**
         * Delete operation (client delete). The key is deleted using the
         * local transaction associated with the txn ID.
         */
        private class DeleteOp extends TxnOp {
            final byte[] key;

            DeleteOp(long txnId, byte[] key) {
                super(txnId);
                this.key = key;
            }

            /**
             * Deletes the key from the partition DB.
             */
            @Override
            void execute() {
                keyEntry.setData(key);
                final Transaction deleteTxn = getTransaction();
                partitionDb.delete(deleteTxn, keyEntry);
            }

            @Override
            public String toString() {
                final StringBuilder sb = new StringBuilder("DeleteOp[");
                sb.append(txn.txnId).append(", ");
                sb.append(key.length).append("]");
                return sb.toString();
            }
        }

        /**
         * Prepare op. This operation indicates that a source-based transaction
         * is about to be committed. When the consumer reaches a PrepareOp it
         * must wait for it to be resolved before consuming any other
         * operations.
         *
         * While the consumer is waiting for the resolution the operation queue
         * becomes unbounded so that the read thread can continue and read the
         * resolution.
         */
        private class PrepareOp extends TxnOp {

            PrepareOp(long txnId) {
                super(txnId);
                txn.prepared = true;
            }

            /**
             * Waits for the transaction to be resolved and then completes it.
             * If the target stops running before the resolution arrives the
             * local transaction is aborted instead.
             */
            @Override
            void execute() {

                /*
                 * Check if no transaction was started for this ID. This could
                 * happen due to key filtering at the source. In this case just
                 * exit.
                 */
                if (txn.transaction == null) {
                    logger.log(Level.FINE,
                               "Prepare with no txn for {0}, {1}",
                               new Object[]{txn, partitionId});
                    return;
                }

                /* Commit copy operations to maintain order. */
                commitBatchTxn();

                final boolean resolved;
                synchronized (txn) {
                    if (!txn.resolved) {
                        synchronized (opQueue) {

                            /*
                             * Make the queue unbounded so we can find the
                             * resolution message.
                             */
                            capacity = Integer.MAX_VALUE;
                            opQueue.notifyAll();
                        }
                    }
                    while (!txn.resolved && running) {
                        logger.log(Level.FINE,
                                   "Waiting for resolution of {0}, {1} {2} ops",
                                   new Object[]{txn, partitionId, operations});
                        try {
                            txn.wait(SECOND_MS);
                        } catch (InterruptedException ie) {
                            logger.log(Level.WARNING,
                                       "Unexpected interrupt", ie);
                        }
                    }

                    /* Capture the outcome while holding the txn's monitor. */
                    resolved = txn.resolved;
                }

                /*
                 * Restore the bound under the queue's monitor, the same
                 * monitor held when the bound was lifted and when the read
                 * thread checks it.
                 */
                synchronized (opQueue) {
                    capacity = DEFAULT_CAPACITY;
                }

                if (resolved) {
                    txn.finish();
                } else {

                    /*
                     * The wait ended because the target is no longer running.
                     * finish() requires a resolved txn, so abort explicitly.
                     */
                    txn.abort();
                }
            }

            @Override
            public String toString() {
                return "PrepareOp[" + txn + "]";
            }
        }

        /**
         * End of data marker. Carries the last VLSN and the generation number
         * of the previous generation of the partition. Both are null until
         * set through the constructor or init().
         */
        private class EoD extends Op {

            /* last vlsn of previous generation */
            VLSN prevGenVLSN;
            /* generation number of previous generation */
            PartitionGenNum prevPGN;

            EoD(VLSN prevGenVLSN, PartitionGenNum prevPGN) {
                super();

                this.prevGenVLSN = prevGenVLSN;
                this.prevPGN = prevPGN;
            }

            /**
             * Creates an empty marker to be populated by init().
             */
            EoD() {
                this(null, null);
            }

            /**
             * Sets the values of this marker.
             *
             * @param prevGenVLSN last vlsn of previous generation
             * @param prevPGN generation number of previous generation
             */
            @SuppressWarnings("hiding")
            void init(VLSN prevGenVLSN, PartitionGenNum prevPGN) {
                this.prevGenVLSN = prevGenVLSN;
                this.prevPGN = prevPGN;
            }

            @Override
            void execute() {
                /* Commit all remaining copy operations */
                commitBatchTxn();
                setDone(this);
            }

            /*
             * Each value is reported once and the description is closed
             * with a bracket.
             */
            @Override
            public String toString() {
                return "EoD[from shard: " + sourceRGId +
                       ", last vlsn  " + prevGenVLSN +
                       ", last generation # " + prevPGN + "]";
            }
        }

        /**
         * Encapsulates a local transaction which is associated with a
         * transaction id, together with its prepared, resolved and committed
         * state.
         */
        private class LocalTxn {
            private final long txnId;

            /* Null until getTransaction() is first called. */
            private Transaction transaction;
            private boolean prepared;
            private boolean resolved;
            private boolean committed;

            LocalTxn(long txnId) {
                this.txnId = txnId;
            }

            /**
             * Gets the local transaction for this id. The first time this
             * is called a new local transaction will be created and started.
             */
            Transaction getTransaction() {
                /* Commit copy operations to prevent overlap. */
                commitBatchTxn();
                if (transaction != null) {
                    return transaction;
                }
                transaction = repEnv.beginTransaction(null, weakConfig);
                return transaction;
            }

            /**
             * Marks the transaction as resolved and records the outcome.
             *
             * @param commit true if the transaction is to be committed
             */
            void resolve(boolean commit) {
                assert prepared;
                assert !resolved;
                resolved = true;
                committed = commit;
            }

            /**
             * Completes the transaction, committing or aborting it according
             * to the recorded outcome.
             */
            void finish() {
                assert resolved;
                assert transaction != null;
                if (!committed) {
                    TxnUtil.abort(transaction);
                    return;
                }
                transaction.commit();
            }

            /**
             * Aborts the transaction if started.
             */
            void abort() {
                TxnUtil.abort(transaction);
            }

            @Override
            public String toString() {
                final StringBuilder sb = new StringBuilder("LocalTxn[");
                sb.append(txnId).append(", ");
                sb.append(transaction);
                sb.append(", prepared=").append(prepared);
                sb.append(", resolved=").append(resolved);
                sb.append(", committed=").append(committed);
                sb.append("]");
                return sb.toString();
            }
        }
    }

    /**
     * Thread factory for the threads which run the migration stream readers.
     */
    private class ReaderFactory extends KVThreadFactory {

        ReaderFactory() {
            super(" migration stream reader for ", logger);
        }

        /**
         * Creates a reader for the specified stream and starts it in a new
         * thread.
         *
         * @param stream the migration stream to read
         * @return the started reader
         */
        private Reader newReader(DataInputStream stream) {
            final Reader streamReader = new Reader(stream);
            final Thread readerThread = newThread(streamReader);
            readerThread.start();
            return streamReader;
        }
    }

    /**
     * Returns the JE WriteOptions to use for a write with the specified
     * expiration time. The result is null if expirationTime is zero,
     * otherwise it is the value returned from setting the expiration time
     * on the writeOptions instance held by this class.
     *
     * @param expirationTime the expiration time of the record, zero if none
     * @return the write options, or null
     */
    private WriteOptions getWriteOptions(long expirationTime) {
        return (expirationTime == 0) ?
            null :
            writeOptions.setExpirationTime(expirationTime, null);
    }


    /**
     * Writes the EoD values to the stream, the VLSN as a long followed by
     * the generation number as an int. Must be consistent with {@link
     * #readEoDFromStream}.
     *
     * @param stream data output stream
     * @param vlsn   vlsn
     * @param gen    generation number
     *
     * @throws IOException if fail to write to stream
     */
    static void writeEoDToStream(DataOutput stream,
                                 long vlsn,
                                 int gen) throws IOException {

        /*
         * Partition generation is not supported today for a mixed version
         * store: source and target must have the same TransferProtocol
         * version in order for migration to start. Sharing the protocol
         * version guarantees that both sides understand the EoD format. See
         * TransferProtocol.read() and TransferProtocol.version for details.
         */
        stream.writeLong(vlsn);
        stream.writeInt(gen);
    }

    /**
     * Reads the EoD values from the stream, a long VLSN followed by an int
     * generation number. Must be consistent with {@link #writeEoDToStream}.
     *
     * An EoD instance cannot be created in a static context outside Reader,
     * therefore the caller needs to provide a placeholder as a parameter.
     * On return the placeholder has been populated.
     *
     * @param stream  data input stream
     * @param eod     an empty EoD instance to populate
     *
     * @throws IOException if fail to read from stream
     */
    private static void readEoDFromStream(DataInputStream stream,
                                          Reader.EoD eod) throws IOException {

        /*
         * Partition generation is not supported today for a mixed version
         * store: source and target must have the same TransferProtocol
         * version in order for migration to start. Sharing the protocol
         * version guarantees that both sides understand the EoD format. See
         * TransferProtocol.read() and TransferProtocol.version for details.
         *
         * The arguments are evaluated left to right, so the VLSN is read
         * from the stream before the generation number.
         */
        eod.init(new VLSN(stream.readLong()),
                 new PartitionGenNum(stream.readInt()));
    }
}