org.apache.cassandra.net.OutboundConnection Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.net;

import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.nio.channels.ClosedChannelException;
import java.util.Objects;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.util.concurrent.Uninterruptibles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.netty.channel.Channel;
import io.netty.channel.ChannelFuture;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelInboundHandlerAdapter;
import io.netty.channel.EventLoop;
import io.netty.channel.unix.Errors;
import io.netty.util.concurrent.Future;
import io.netty.util.concurrent.Promise;
import io.netty.util.concurrent.PromiseNotifier;
import io.netty.util.concurrent.SucceededFuture;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.util.DataOutputBufferFixed;
import org.apache.cassandra.net.OutboundConnectionInitiator.Result.MessagingSuccess;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.NoSpamLogger;

import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.cassandra.net.MessagingService.current_version;
import static org.apache.cassandra.net.OutboundConnectionInitiator.*;
import static org.apache.cassandra.net.OutboundConnections.LARGE_MESSAGE_THRESHOLD;
import static org.apache.cassandra.net.ResourceLimits.*;
import static org.apache.cassandra.net.ResourceLimits.Outcome.*;
import static org.apache.cassandra.net.SocketFactory.*;
import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory;
import static org.apache.cassandra.utils.MonotonicClock.approxTime;
import static org.apache.cassandra.utils.Throwables.isCausedBy;

/**
 * Represents a connection type to a peer, and handles the state transistions on the connection and the netty {@link Channel}.
 * The underlying socket is not opened until explicitly requested (by sending a message).
 *
 * TODO: complete this description
 *
 * Aside from a few administrative methods, the main entry point to sending a message is {@link #enqueue(Message)}.
 * Any thread may send a message (enqueueing it to {@link #queue}), but only one thread may consume messages from this
 * queue.  There is a single delivery thread - either the event loop, or a companion thread - that has logical ownership
 * of the queue, but other threads may temporarily take ownership in order to perform book keeping, pruning, etc.,
 * to ensure system stability.
 *
 * {@link Delivery#run()} is the main entry point for consuming messages from the queue, and executes either on the event
 * loop or on a non-dedicated companion thread.  This processing is activated via {@link Delivery#execute()}.
 *
 * Almost all internal state maintenance on this class occurs on the eventLoop, a single threaded executor which is
 * assigned in the constructor.  Further details are outlined below in the class.  Some behaviours require coordination
 * between the eventLoop and the companion thread (if any).  Some minimal set of behaviours are permitted to occur on
 * producers to ensure the connection remains healthy and does not overcommit resources.
 *
 * All methods are safe to invoke from any thread unless otherwise stated.
 */
@SuppressWarnings({ "WeakerAccess", "FieldMayBeFinal", "NonAtomicOperationOnVolatileField", "SameParameterValue" })
public class OutboundConnection
{
    static final Logger logger = LoggerFactory.getLogger(OutboundConnection.class);
    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 30L, TimeUnit.SECONDS);

    private static final AtomicLongFieldUpdater submittedUpdater = AtomicLongFieldUpdater.newUpdater(OutboundConnection.class, "submittedCount");
    private static final AtomicLongFieldUpdater pendingCountAndBytesUpdater = AtomicLongFieldUpdater.newUpdater(OutboundConnection.class, "pendingCountAndBytes");
    private static final AtomicLongFieldUpdater overloadedCountUpdater = AtomicLongFieldUpdater.newUpdater(OutboundConnection.class, "overloadedCount");
    private static final AtomicLongFieldUpdater overloadedBytesUpdater = AtomicLongFieldUpdater.newUpdater(OutboundConnection.class, "overloadedBytes");
    private static final AtomicReferenceFieldUpdater closingUpdater = AtomicReferenceFieldUpdater.newUpdater(OutboundConnection.class, Future.class, "closing");
    private static final AtomicReferenceFieldUpdater scheduledCloseUpdater = AtomicReferenceFieldUpdater.newUpdater(OutboundConnection.class, Future.class, "scheduledClose");

    private final EventLoop eventLoop;
    private final Delivery delivery;

    private final OutboundMessageCallbacks callbacks;
    private final OutboundDebugCallbacks debug;
    @VisibleForTesting
    final OutboundMessageQueue queue;
    /** the number of bytes we permit to queue to the network without acquiring any shared resource permits */
    private final long pendingCapacityInBytes;
    /** the number of messages and bytes queued for flush to the network,
     * including those that are being flushed but have not been completed,
     * packed into a long (top 20 bits for count, bottom 42 for bytes)*/
    private volatile long pendingCountAndBytes = 0;
    /** global shared limits that we use only if our local limits are exhausted;
     *  we allocate from here whenever queueSize > queueCapacity */
    private final EndpointAndGlobal reserveCapacityInBytes;

    /** Used in logging statements to lazily build a human-readable number of pending bytes. */
    private final Object readablePendingBytes =
        new Object() { @Override public String toString() { return prettyPrintMemory(pendingBytes()); } };

    /** Used in logging statements to lazily build a human-readable number of reserve endpoint bytes in use. */
    private final Object readableReserveEndpointUsing =
        new Object() { @Override public String toString() { return prettyPrintMemory(reserveCapacityInBytes.endpoint.using()); } };

    /** Used in logging statements to lazily build a human-readable number of reserve global bytes in use. */
    private final Object readableReserveGlobalUsing =
        new Object() { @Override public String toString() { return prettyPrintMemory(reserveCapacityInBytes.global.using()); } };

    private volatile long submittedCount = 0;   // updated with cas
    private volatile long overloadedCount = 0;  // updated with cas
    private volatile long overloadedBytes = 0;  // updated with cas
    private long expiredCount = 0;              // updated with queue lock held
    private long expiredBytes = 0;              // updated with queue lock held
    private long errorCount = 0;                // updated only by delivery thread
    private long errorBytes = 0;                // updated by delivery thread only
    private long sentCount;                     // updated by delivery thread only
    private long sentBytes;                     // updated by delivery thread only
    private long successfulConnections;         // updated by event loop only
    private long connectionAttempts;            // updated by event loop only

    private static final int pendingByteBits = 42;
    private static boolean isMaxPendingCount(long pendingCountAndBytes)
    {
        return (pendingCountAndBytes & (-1L << pendingByteBits)) == (-1L << pendingByteBits);
    }

    private static int pendingCount(long pendingCountAndBytes)
    {
        return (int) (pendingCountAndBytes >>> pendingByteBits);
    }

    private static long pendingBytes(long pendingCountAndBytes)
    {
        return pendingCountAndBytes & (-1L >>> (64 - pendingByteBits));
    }

    private static long pendingCountAndBytes(long pendingCount, long pendingBytes)
    {
        return (pendingCount << pendingByteBits) | pendingBytes;
    }

    private final ConnectionType type;

    /**
     * Contains the base settings for this connection, _including_ any defaults filled in.
     *
     */
    private OutboundConnectionSettings template;

    private static class State
    {
        static final State CLOSED  = new State(Kind.CLOSED);

        enum Kind { ESTABLISHED, CONNECTING, DORMANT, CLOSED }

        final Kind kind;

        State(Kind kind)
        {
            this.kind = kind;
        }

        boolean isEstablished()  { return kind == Kind.ESTABLISHED; }
        boolean isConnecting()   { return kind == Kind.CONNECTING; }
        boolean isDisconnected() { return kind == Kind.CONNECTING || kind == Kind.DORMANT; }
        boolean isClosed()       { return kind == Kind.CLOSED; }

        Established  established()  { return (Established)  this; }
        Connecting   connecting()   { return (Connecting)   this; }
        Disconnected disconnected() { return (Disconnected) this; }
    }

    /**
     * We have successfully negotiated a channel, and believe it to still be valid.
     *
     * Before using this, we should check isConnected() to check the Channel hasn't
     * become invalid.
     */
    private static class Established extends State
    {
        final int messagingVersion;
        final Channel channel;
        final FrameEncoder.PayloadAllocator payloadAllocator;
        final OutboundConnectionSettings settings;

        Established(int messagingVersion, Channel channel, FrameEncoder.PayloadAllocator payloadAllocator, OutboundConnectionSettings settings)
        {
            super(Kind.ESTABLISHED);
            this.messagingVersion = messagingVersion;
            this.channel = channel;
            this.payloadAllocator = payloadAllocator;
            this.settings = settings;
        }

        boolean isConnected() { return channel.isOpen(); }
    }

    private static class Disconnected extends State
    {
        /** Periodic message expiry scheduled while we are disconnected; this will be cancelled and cleared each time we connect */
        final Future maintenance;
        Disconnected(Kind kind, Future maintenance)
        {
            super(kind);
            this.maintenance = maintenance;
        }

        public static Disconnected dormant(Future maintenance)
        {
            return new Disconnected(Kind.DORMANT, maintenance);
        }
    }

    private static class Connecting extends Disconnected
    {
        /**
         * Currently (or scheduled to) (re)connect; this may be cancelled (if closing) or waited on (for delivery)
         *
         *  - The work managed by this future is partially performed asynchronously, not necessarily on the eventLoop.
         *  - It is only completed on the eventLoop
         *  - It may not be executing, but might be scheduled to be submitted if {@link #scheduled} is not null
         */
        final Future> attempt;

        /**
         * If we are retrying to connect with some delay, this represents the scheduled inititation of another attempt
         */
        @Nullable
        final Future scheduled;

        /**
         * true iff we are retrying to connect after some failure (immediately or following a delay)
         */
        final boolean isFailingToConnect;

        Connecting(Disconnected previous, Future> attempt)
        {
            this(previous, attempt, null);
        }

        Connecting(Disconnected previous, Future> attempt, Future scheduled)
        {
            super(Kind.CONNECTING, previous.maintenance);
            this.attempt = attempt;
            this.scheduled = scheduled;
            this.isFailingToConnect = scheduled != null || (previous.isConnecting() && previous.connecting().isFailingToConnect);
        }

        /**
         * Cancel the connection attempt
         *
         * No cleanup is needed here, as {@link #attempt} is only completed on the eventLoop,
         * so we have either already invoked the callbacks and are no longer in {@link #state},
         * or the {@link OutboundConnectionInitiator} will handle our successful cancellation
         * when it comes to complete, by closing the channel (if we could not cancel it before then)
         */
        void cancel()
        {
            if (scheduled != null)
                scheduled.cancel(true);

            // we guarantee that attempt is only ever completed by the eventLoop
            boolean cancelled = attempt.cancel(true);
            assert cancelled;
        }
    }

    private volatile State state;

    /** The connection is being permanently closed */
    private volatile Future closing;
    /** The connection is being permanently closed in the near future */
    private volatile Future scheduledClose;

    OutboundConnection(ConnectionType type, OutboundConnectionSettings settings, EndpointAndGlobal reserveCapacityInBytes)
    {
        this.template = settings.withDefaults(ConnectionCategory.MESSAGING);
        this.type = type;
        this.eventLoop = template.socketFactory.defaultGroup().next();
        this.pendingCapacityInBytes = template.applicationSendQueueCapacityInBytes;
        this.reserveCapacityInBytes = reserveCapacityInBytes;
        this.callbacks = template.callbacks;
        this.debug = template.debug;
        this.queue = new OutboundMessageQueue(approxTime, this::onExpired);
        this.delivery = type == ConnectionType.LARGE_MESSAGES
                        ? new LargeMessageDelivery(template.socketFactory.synchronousWorkExecutor)
                        : new EventLoopDelivery();
        setDisconnected();
    }

    /**
     * This is the main entry point for enqueuing a message to be sent to the remote peer.
     */
    public void enqueue(Message message) throws ClosedChannelException
    {
        if (isClosing())
            throw new ClosedChannelException();

        final int canonicalSize = canonicalSize(message);
        if (canonicalSize > DatabaseDescriptor.getInternodeMaxMessageSizeInBytes())
            throw new Message.OversizedMessageException(canonicalSize);

        submittedUpdater.incrementAndGet(this);
        switch (acquireCapacity(canonicalSize))
        {
            case INSUFFICIENT_ENDPOINT:
                // if we're overloaded to one endpoint, we may be accumulating expirable messages, so
                // attempt an expiry to see if this makes room for our newer message.
                // this is an optimisation only; messages will be expired on ~100ms cycle, and by Delivery when it runs
                if (queue.maybePruneExpired() && SUCCESS == acquireCapacity(canonicalSize))
                    break;
            case INSUFFICIENT_GLOBAL:
                onOverloaded(message);
                return;
        }

        queue.add(message);
        delivery.execute();

        // we might race with the channel closing; if this happens, to ensure this message eventually arrives
        // we need to remove ourselves from the queue and throw a ClosedChannelException, so that another channel
        // can be opened in our place to try and send on.
        if (isClosing() && queue.remove(message))
        {
            releaseCapacity(1, canonicalSize);
            throw new ClosedChannelException();
        }
    }

    /**
     * Try to acquire the necessary resource permits for a number of pending bytes for this connection.
     *
     * Since the owner limit is shared amongst multiple connections, our semantics cannot be super trivial.
     * Were they per-connection, we could simply perform an atomic increment of the queue size, then
     * allocate any excess we need in the reserve, and on release free everything we see from both.
     * Since we are coordinating two independent atomic variables we have to track every byte we allocate in reserve
     * and ensure it is matched by a corresponding released byte. We also need to be sure we do not permit another
     * releasing thread to release reserve bytes we have not yet - and may never - actually reserve.
     *
     * As such, we have to first check if we would need reserve bytes, then allocate them *before* we increment our
     * queue size.  We only increment the queue size if the reserve bytes are definitely not needed, or we could first
     * obtain them.  If in the process of obtaining any reserve bytes the queue size changes, we have some bytes that are
     * reserved for us, but may be a different number to that we need.  So we must continue to track these.
     *
     * In the happy path, this is still efficient as we simply CAS
     */
    private Outcome acquireCapacity(long bytes)
    {
        return acquireCapacity(1, bytes);
    }

    private Outcome acquireCapacity(long count, long bytes)
    {
        long increment = pendingCountAndBytes(count, bytes);
        long unusedClaimedReserve = 0;
        Outcome outcome = null;
        loop: while (true)
        {
            long current = pendingCountAndBytes;
            if (isMaxPendingCount(current))
            {
                outcome = INSUFFICIENT_ENDPOINT;
                break;
            }

            long next = current + increment;
            if (pendingBytes(next) <= pendingCapacityInBytes)
            {
                if (pendingCountAndBytesUpdater.compareAndSet(this, current, next))
                {
                    outcome = SUCCESS;
                    break;
                }
                continue;
            }

            State state = this.state;
            if (state.isConnecting() && state.connecting().isFailingToConnect)
            {
                outcome = INSUFFICIENT_ENDPOINT;
                break;
            }

            long requiredReserve = min(bytes, pendingBytes(next) - pendingCapacityInBytes);
            if (unusedClaimedReserve < requiredReserve)
            {
                long extraGlobalReserve = requiredReserve - unusedClaimedReserve;
                switch (outcome = reserveCapacityInBytes.tryAllocate(extraGlobalReserve))
                {
                    case INSUFFICIENT_ENDPOINT:
                    case INSUFFICIENT_GLOBAL:
                        break loop;
                    case SUCCESS:
                        unusedClaimedReserve += extraGlobalReserve;
                }
            }

            if (pendingCountAndBytesUpdater.compareAndSet(this, current, next))
            {
                unusedClaimedReserve -= requiredReserve;
                break;
            }
        }

        if (unusedClaimedReserve > 0)
            reserveCapacityInBytes.release(unusedClaimedReserve);

        return outcome;
    }

    /**
     * Mark a number of pending bytes as flushed to the network, releasing their capacity for new outbound messages.
     */
    private void releaseCapacity(long count, long bytes)
    {
        long decrement = pendingCountAndBytes(count, bytes);
        long prev = pendingCountAndBytesUpdater.getAndAdd(this, -decrement);
        if (pendingBytes(prev) > pendingCapacityInBytes)
        {
            long excess = min(pendingBytes(prev) - pendingCapacityInBytes, bytes);
            reserveCapacityInBytes.release(excess);
        }
    }

    private void onOverloaded(Message message)
    {
        overloadedCountUpdater.incrementAndGet(this);
        
        int canonicalSize = canonicalSize(message);
        overloadedBytesUpdater.addAndGet(this, canonicalSize);
        
        noSpamLogger.warn("{} overloaded; dropping {} message (queue: {} local, {} endpoint, {} global)",
                          this, FBUtilities.prettyPrintMemory(canonicalSize),
                          readablePendingBytes, readableReserveEndpointUsing, readableReserveGlobalUsing);
        
        callbacks.onOverloaded(message, template.to);
    }

    /**
     * Take any necessary cleanup action after a message has been selected to be discarded from the queue.
     *
     * Only to be invoked while holding OutboundMessageQueue.WithLock
     */
    private boolean onExpired(Message message)
    {
        noSpamLogger.warn("{} dropping message of type {} whose timeout expired before reaching the network", id(), message.verb());
        releaseCapacity(1, canonicalSize(message));
        expiredCount += 1;
        expiredBytes += canonicalSize(message);
        callbacks.onExpired(message, template.to);
        return true;
    }

    /**
     * Take any necessary cleanup action after a message has been selected to be discarded from the queue.
     *
     * Only to be invoked by the delivery thread
     */
    private void onFailedSerialize(Message message, int messagingVersion, int bytesWrittenToNetwork, Throwable t)
    {
        logger.warn("{} dropping message of type {} due to error", id(), message.verb(), t);
        JVMStabilityInspector.inspectThrowable(t);
        releaseCapacity(1, canonicalSize(message));
        errorCount += 1;
        errorBytes += message.serializedSize(messagingVersion);
        callbacks.onFailedSerialize(message, template.to, messagingVersion, bytesWrittenToNetwork, t);
    }

    /**
     * Take any necessary cleanup action after a message has been selected to be discarded from the queue on close.
     * Note that this is only for messages that were queued prior to closing without graceful flush, OR
     * for those that are unceremoniously dropped when we decide close has been trying to complete for too long.
     */
    private void onClosed(Message message)
    {
        releaseCapacity(1, canonicalSize(message));
        callbacks.onDiscardOnClose(message, template.to);
    }

    /**
     * Delivery bundles the following:
     *
     *  - the work that is necessary to actually deliver messages safely, and handle any exceptional states
     *  - the ability to schedule delivery for some time in the future
     *  - the ability to schedule some non-delivery work to happen some time in the future, that is guaranteed
     *    NOT to coincide with delivery for its duration, including any data that is being flushed (e.g. for closing channels)
     *      - this feature is *not* efficient, and should only be used for infrequent operations
     */
    private abstract class Delivery extends AtomicInteger implements Runnable
    {
        final ExecutorService executor;

        // the AtomicInteger we extend always contains some combination of these bit flags, representing our current run state

        /** Not running, and will not be scheduled again until transitioned to a new state */
        private static final int STOPPED               = 0;
        /** Currently executing (may only be scheduled to execute, or may be about to terminate);
         *  will stop at end of this run, without rescheduling */
        private static final int EXECUTING             = 1;
        /** Another execution has been requested; a new execution will begin some time after this state is taken */
        private static final int EXECUTE_AGAIN         = 2;
        /** We are currently executing and will submit another execution before we terminate */
        private static final int EXECUTING_AGAIN       = EXECUTING | EXECUTE_AGAIN;
        /** Will begin a new execution some time after this state is taken, but only once some condition is met.
         *  This state will initially be taken in tandem with EXECUTING, but if delivery completes without clearing
         *  the state, the condition will be held on its own until {@link #executeAgain} is invoked */
        private static final int WAITING_TO_EXECUTE    = 4;

        /**
         * Force all task execution to stop, once any currently in progress work is completed
         */
        private volatile boolean terminated;

        /**
         * Is there asynchronous delivery work in progress.
         *
         * This temporarily prevents any {@link #stopAndRun} work from being performed.
         * Once both inProgress and stopAndRun are set we perform no more delivery work until one is unset,
         * to ensure we eventually run stopAndRun.
         *
         * This should be updated and read only on the Delivery thread.
         */
        private boolean inProgress = false;

        /**
         * Request a task's execution while there is no delivery work in progress.
         *
         * This is to permit cleanly tearing down a connection without interrupting any messages that might be in flight.
         * If stopAndRun is set, we should not enter doRun() until a corresponding setInProgress(false) occurs.
         */
        final AtomicReference stopAndRun = new AtomicReference<>();

        Delivery(ExecutorService executor)
        {
            this.executor = executor;
        }

        /**
         * Ensure that any messages or stopAndRun that were queued prior to this invocation will be seen by at least
         * one future invocation of the delivery task, unless delivery has already been terminated.
         */
        public void execute()
        {
            if (get() < EXECUTE_AGAIN && STOPPED == getAndUpdate(i -> i == STOPPED ? EXECUTING: i | EXECUTE_AGAIN))
                executor.execute(this);
        }

        private boolean isExecuting(int state)
        {
            return 0 != (state & EXECUTING);
        }

        /**
         * This method is typically invoked after WAITING_TO_EXECUTE is set.
         *
         * However WAITING_TO_EXECUTE does not need to be set; all this method needs to ensure is that
         * delivery unconditionally performs one new execution promptly.
         */
        void executeAgain()
        {
            // if we are already executing, set EXECUTING_AGAIN and leave scheduling to the currently running one.
            // otherwise, set ourselves unconditionally to EXECUTING and schedule ourselves immediately
            if (!isExecuting(getAndUpdate(i -> !isExecuting(i) ? EXECUTING : EXECUTING_AGAIN)))
                executor.execute(this);
        }

        /**
         * Invoke this when we cannot make further progress now, but we guarantee that we will execute later when we can.
         * This simply communicates to {@link #run} that we should not schedule ourselves again, just unset the EXECUTING bit.
         */
        void promiseToExecuteLater()
        {
            set(EXECUTING | WAITING_TO_EXECUTE);
        }

        /**
         * Called when exiting {@link #run} to schedule another run if necessary.
         *
         * If we are currently executing, we only reschedule if the present state is EXECUTING_AGAIN.
         * If this is the case, we clear the EXECUTE_AGAIN bit (setting ourselves to EXECUTING), and reschedule.
         * Otherwise, we clear the EXECUTING bit and terminate, which will set us to either STOPPED or WAITING_TO_EXECUTE
         * (or possibly WAITING_TO_EXECUTE | EXECUTE_AGAIN, which is logically the same as WAITING_TO_EXECUTE)
         */
        private void maybeExecuteAgain()
        {
            if (EXECUTING_AGAIN == getAndUpdate(i -> i == EXECUTING_AGAIN ? EXECUTING : (i & ~EXECUTING)))
                executor.execute(this);
        }

        /**
         * No more tasks or delivery will be executed, once any in progress complete.
         */
        public void terminate()
        {
            terminated = true;
        }

        /**
         * Only to be invoked by the Delivery task.
         *
         * If true, indicates that we have begun asynchronous delivery work, so that
         * we cannot safely stopAndRun until it completes.
         *
         * Once it completes, we ensure any stopAndRun task has a chance to execute
         * by ensuring delivery is scheduled.
         *
         * If stopAndRun is also set, we should not enter doRun() until a corresponding
         * setInProgress(false) occurs.
         */
        void setInProgress(boolean inProgress)
        {
            boolean wasInProgress = this.inProgress;
            this.inProgress = inProgress;
            if (!inProgress && wasInProgress)
                executeAgain();
        }

        /**
         * Perform some delivery work.
         *
         * Must never be invoked directly, only via {@link #execute()}
         */
        public void run()
        {
            /* do/while handling setup for {@link #doRun()}, and repeat invocations thereof */
            while (true)
            {
                if (terminated)
                    return;

                if (null != stopAndRun.get())
                {
                    // if we have an external request to perform, attempt it - if no async delivery is in progress

                    if (inProgress)
                    {
                        // if we are in progress, we cannot do anything;
                        // so, exit and rely on setInProgress(false) executing us
                        // (which must happen later, since it must happen on this thread)
                        promiseToExecuteLater();
                        break;
                    }

                    stopAndRun.getAndSet(null).run();
                }

                State state = OutboundConnection.this.state;
                if (!state.isEstablished() || !state.established().isConnected())
                {
                    // if we have messages yet to deliver, or a task to run, we need to reconnect and try again
                    // we try to reconnect before running another stopAndRun so that we do not infinite loop in close
                    if (hasPending() || null != stopAndRun.get())
                    {
                        promiseToExecuteLater();
                        requestConnect().addListener(f -> executeAgain());
                    }
                    break;
                }

                if (!doRun(state.established()))
                    break;
            }

            maybeExecuteAgain();
        }

        /**
         * @return true if we should run again immediately;
         *         always false for eventLoop executor, as want to service other channels
         */
        abstract boolean doRun(Established established);

        /**
         * Schedule a task to run later on the delivery thread while delivery is not in progress,
         * i.e. there are no bytes in flight to the network buffer.
         *
         * Does not guarantee to run promptly if there is no current connection to the remote host.
         * May wait until a new connection is established, or a connection timeout elapses, before executing.
         *
         * Update the shared atomic property containing work we want to interrupt message processing to perform,
         * the invoke schedule() to be certain it gets run.
         */
        void stopAndRun(Runnable run)
        {
            stopAndRun.accumulateAndGet(run, OutboundConnection::andThen);
            execute();
        }

        /**
         * Schedule a task to run on the eventLoop, guaranteeing that delivery will not occur while the task is performed.
         */
        abstract void stopAndRunOnEventLoop(Runnable run);

    }

    /**
     * Delivery that runs entirely on the eventLoop
     *
     * Since this has single threaded access to most of its environment, it can be simple and efficient, however
     * it must also have bounded run time, and limit its resource consumption to ensure other channels serviced by the
     * eventLoop can also make progress.
     *
     * This operates on modest buffers, no larger than the {@link OutboundConnections#LARGE_MESSAGE_THRESHOLD} and
     * filling at most one at a time before writing (potentially asynchronously) to the socket.
     *
     * We track the number of bytes we have in flight, ensuring no more than a user-defined maximum at any one time.
     */
    class EventLoopDelivery extends Delivery
    {
        private int flushingBytes;
        private boolean isWritable = true;

        EventLoopDelivery()
        {
            super(eventLoop);
        }

        /**
         * {@link Delivery#doRun}
         *
         * Since we are on the eventLoop, in order to ensure other channels are serviced
         * we never return true to request another run immediately.
         *
         * If there is more work to be done, we submit ourselves for execution once the eventLoop has time.
         */
        @SuppressWarnings("resource")
        boolean doRun(Established established)
        {
            if (!isWritable)
                return false;

            // pendingBytes is updated before queue.size() (which triggers notEmpty, and begins delivery),
            // so it is safe to use it here to exit delivery
            // this number is inaccurate for old versions, but we don't mind terribly - we'll send at least one message,
            // and get round to it eventually (though we could add a fudge factor for some room for older versions)
            int maxSendBytes = (int) min(pendingBytes() - flushingBytes, LARGE_MESSAGE_THRESHOLD);
            if (maxSendBytes == 0)
                return false;

            OutboundConnectionSettings settings = established.settings;
            int messagingVersion = established.messagingVersion;

            FrameEncoder.Payload sending = null;
            int canonicalSize = 0; // number of bytes we must use for our resource accounting
            int sendingBytes = 0;
            int sendingCount = 0;
            try (OutboundMessageQueue.WithLock withLock = queue.lockOrCallback(approxTime.now(), this::execute))
            {
                if (withLock == null)
                    return false; // we failed to acquire the queue lock, so return; we will be scheduled again when the lock is available

                sending = established.payloadAllocator.allocate(true, maxSendBytes);
                DataOutputBufferFixed out = new DataOutputBufferFixed(sending.buffer);

                Message next;
                while ( null != (next = withLock.peek()) )
                {
                    try
                    {
                        int messageSize = next.serializedSize(messagingVersion);

                        // actual message size for this version is larger than permitted maximum
                        if (messageSize > DatabaseDescriptor.getInternodeMaxMessageSizeInBytes())
                            throw new Message.OversizedMessageException(messageSize);

                        if (messageSize > sending.remaining())
                        {
                            // if we don't have enough room to serialize the next message, we have either
                            //  1) run out of room after writing some messages successfully; this might mean that we are
                            //     overflowing our highWaterMark, or that we have just filled our buffer
                            //  2) we have a message that is too large for this connection; this can happen if a message's
                            //     size was calculated for the wrong messaging version when enqueued.
                            //     In this case we want to write it anyway, so simply allocate a large enough buffer.

                            if (sendingBytes > 0)
                                break;

                            sending.release();
                            sending = null; // set to null to prevent double-release if we fail to allocate our new buffer
                            sending = established.payloadAllocator.allocate(true, messageSize);
                            //noinspection IOResourceOpenedButNotSafelyClosed
                            out = new DataOutputBufferFixed(sending.buffer);
                        }

                        Tracing.instance.traceOutgoingMessage(next, messageSize, settings.connectTo);
                        Message.serializer.serialize(next, out, messagingVersion);

                        if (sending.length() != sendingBytes + messageSize)
                            throw new InvalidSerializedSizeException(next.verb(), messageSize, sending.length() - sendingBytes);

                        canonicalSize += canonicalSize(next);
                        sendingCount += 1;
                        sendingBytes += messageSize;
                    }
                    catch (Throwable t)
                    {
                        onFailedSerialize(next, messagingVersion, 0, t);

                        assert sending != null;
                        // reset the buffer to ignore the message we failed to serialize
                        sending.trim(sendingBytes);
                    }
                    withLock.removeHead(next);
                }
                if (0 == sendingBytes)
                    return false;

                sending.finish();
                debug.onSendSmallFrame(sendingCount, sendingBytes);
                ChannelFuture flushResult = AsyncChannelPromise.writeAndFlush(established.channel, sending);
                sending = null;

                if (flushResult.isSuccess())
                {
                    sentCount += sendingCount;
                    sentBytes += sendingBytes;
                    debug.onSentSmallFrame(sendingCount, sendingBytes);
                }
                else
                {
                    flushingBytes += canonicalSize;
                    setInProgress(true);

                    boolean hasOverflowed = flushingBytes >= settings.flushHighWaterMark;
                    if (hasOverflowed)
                    {
                        isWritable = false;
                        promiseToExecuteLater();
                    }

                    int releaseBytesFinal = canonicalSize;
                    int sendingBytesFinal = sendingBytes;
                    int sendingCountFinal = sendingCount;
                    flushResult.addListener(future -> {

                        releaseCapacity(sendingCountFinal, releaseBytesFinal);
                        flushingBytes -= releaseBytesFinal;
                        if (flushingBytes == 0)
                            setInProgress(false);

                        if (!isWritable && flushingBytes <= settings.flushLowWaterMark)
                        {
                            isWritable = true;
                            executeAgain();
                        }

                        if (future.isSuccess())
                        {
                            sentCount += sendingCountFinal;
                            sentBytes += sendingBytesFinal;
                            debug.onSentSmallFrame(sendingCountFinal, sendingBytesFinal);
                        }
                        else
                        {
                            errorCount += sendingCountFinal;
                            errorBytes += sendingBytesFinal;
                            invalidateChannel(established, future.cause());
                            debug.onFailedSmallFrame(sendingCountFinal, sendingBytesFinal);
                        }
                    });
                    canonicalSize = 0;
                }
            }
            catch (Throwable t)
            {
                errorCount += sendingCount;
                errorBytes += sendingBytes;
                invalidateChannel(established, t);
            }
            finally
            {
                if (canonicalSize > 0)
                    releaseCapacity(sendingCount, canonicalSize);

                if (sending != null)
                    sending.release();

                if (pendingBytes() > flushingBytes && isWritable)
                    execute();
            }

            return false;
        }

        void stopAndRunOnEventLoop(Runnable run)
        {
            stopAndRun(run);
        }
    }

    /**
     * Delivery that coordinates between the eventLoop and another (non-dedicated) thread
     *
     * This is to service messages that are too large to fully serialize on the eventLoop, as they could block
     * prompt service of other requests.  Since our serializers assume blocking IO, the easiest approach is to
     * ensure a companion thread performs blocking IO that, under the hood, is serviced by async IO on the eventLoop.
     *
     * Most of the work here is handed off to {@link AsyncChannelOutputPlus}, with our main job being coordinating
     * when and what we should run.
     *
     * To avoid allocating a huge number of threads across a cluster, we utilise the shared methods of {@link Delivery}
     * to ensure that only one run() is actually scheduled to run at a time - this permits us to use any {@link ExecutorService}
     * as a backing, with the number of threads defined only by the maximum concurrency needed to deliver all large messages.
     * We use a shared caching {@link java.util.concurrent.ThreadPoolExecutor}, and rename the Threads that service
     * our connection on entry and exit.
     */
    class LargeMessageDelivery extends Delivery
    {
        static final int DEFAULT_BUFFER_SIZE = 32 * 1024;

        LargeMessageDelivery(ExecutorService executor)
        {
            super(executor);
        }

        /**
         * A simple wrapper of {@link Delivery#run} to set the current Thread name for the duration of its execution.
         */
        public void run()
        {
            String threadName, priorThreadName = null;
            try
            {
                priorThreadName = Thread.currentThread().getName();
                threadName = "Messaging-OUT-" + template.from() + "->" + template.to + '-' + type;
                Thread.currentThread().setName(threadName);

                super.run();
            }
            finally
            {
                if (priorThreadName != null)
                    Thread.currentThread().setName(priorThreadName);
            }
        }

        @SuppressWarnings({ "resource", "RedundantSuppression" }) // make eclipse warnings go away
        boolean doRun(Established established)
        {
            Message send = queue.tryPoll(approxTime.now(), this::execute);
            if (send == null)
                return false;

            AsyncMessageOutputPlus out = null;
            try
            {
                int messageSize = send.serializedSize(established.messagingVersion);
                out = new AsyncMessageOutputPlus(established.channel, DEFAULT_BUFFER_SIZE, messageSize, established.payloadAllocator);
                // actual message size for this version is larger than permitted maximum
                if (messageSize > DatabaseDescriptor.getInternodeMaxMessageSizeInBytes())
                    throw new Message.OversizedMessageException(messageSize);

                Tracing.instance.traceOutgoingMessage(send, messageSize, established.settings.connectTo);
                Message.serializer.serialize(send, out, established.messagingVersion);

                if (out.position() != messageSize)
                    throw new InvalidSerializedSizeException(send.verb(), messageSize, out.position());

                out.close();
                sentCount += 1;
                sentBytes += messageSize;
                releaseCapacity(1, canonicalSize(send));
                return hasPending();
            }
            catch (Throwable t)
            {
                boolean tryAgain = true;

                if (out != null)
                {
                    out.discard();
                    if (out.flushed() > 0 ||
                        isCausedBy(t, cause ->    isConnectionReset(cause)
                                               || cause instanceof Errors.NativeIoException
                                               || cause instanceof AsyncChannelOutputPlus.FlushException))
                    {
                        // close the channel, and wait for eventLoop to execute
                        disconnectNow(established).awaitUninterruptibly();
                        tryAgain = false;
                        try
                        {
                            // after closing, wait until we are signalled about the in flight writes;
                            // this ensures flushedToNetwork() is correct below
                            out.waitUntilFlushed(0, 0);
                        }
                        catch (Throwable ignore)
                        {
                            // irrelevant
                        }
                    }
                }

                onFailedSerialize(send, established.messagingVersion, out == null ? 0 : (int) out.flushedToNetwork(), t);
                return tryAgain;
            }
        }

        void stopAndRunOnEventLoop(Runnable run)
        {
            stopAndRun(() -> {
                try
                {
                    runOnEventLoop(run).await();
                }
                catch (InterruptedException e)
                {
                    throw new RuntimeException(e);
                }
            });
        }
    }

    /*
     * Size used for capacity enforcement purposes. Using current messaging version no matter what the peer's version is.
     */
    private int canonicalSize(Message message)
    {
        return message.serializedSize(current_version);
    }

    private void invalidateChannel(Established established, Throwable cause)
    {
        JVMStabilityInspector.inspectThrowable(cause);

        if (state != established)
            return; // do nothing; channel already invalidated

        if (isCausedByConnectionReset(cause))
            logger.info("{} channel closed by provider", id(), cause);
        else
            logger.error("{} channel in potentially inconsistent state after error; closing", id(), cause);

        disconnectNow(established);
    }

    /**
     *  Attempt to open a new channel to the remote endpoint.
     *
     *  Most of the actual work is performed by OutboundConnectionInitiator, this method just manages
     *  our book keeping on either success or failure.
     *
     *  This method is only to be invoked by the eventLoop, and the inner class' methods should only be evaluated by the eventtLoop
     */
    Future initiate()
    {
        class Initiate
        {
            /**
             * If we fail to connect, we want to try and connect again before any messages timeout.
             * However, we update this each time to ensure we do not retry unreasonably often, and settle on a periodicity
             * that might lead to timeouts in some aggressive systems.
             */
            long retryRateMillis = DatabaseDescriptor.getMinRpcTimeout(MILLISECONDS) / 2;

            // our connection settings, possibly updated on retry
            int messagingVersion = template.endpointToVersion().get(template.to);
            OutboundConnectionSettings settings;

            /**
             * If we failed for any reason, try again
             */
            void onFailure(Throwable cause)
            {
                if (cause instanceof ConnectException)
                    noSpamLogger.info("{} failed to connect", id(), cause);
                else
                    noSpamLogger.error("{} failed to connect", id(), cause);

                JVMStabilityInspector.inspectThrowable(cause);

                if (hasPending())
                {
                    Promise> result = new AsyncPromise<>(eventLoop);
                    state = new Connecting(state.disconnected(), result, eventLoop.schedule(() -> attempt(result), max(100, retryRateMillis), MILLISECONDS));
                    retryRateMillis = min(1000, retryRateMillis * 2);
                }
                else
                {
                    // this Initiate will be discarded
                    state = Disconnected.dormant(state.disconnected().maintenance);
                }
            }

            void onCompletedHandshake(Result result)
            {
                switch (result.outcome)
                {
                    case SUCCESS:
                        // it is expected that close, if successful, has already cancelled us; so we do not need to worry about leaking connections
                        assert !state.isClosed();

                        MessagingSuccess success = result.success();
                        debug.onConnect(success.messagingVersion, settings);
                        state.disconnected().maintenance.cancel(false);

                        FrameEncoder.PayloadAllocator payloadAllocator = success.allocator;
                        Channel channel = success.channel;
                        Established established = new Established(messagingVersion, channel, payloadAllocator, settings);
                        state = established;
                        channel.pipeline().addLast("handleExceptionalStates", new ChannelInboundHandlerAdapter() {
                            @Override
                            public void channelInactive(ChannelHandlerContext ctx)
                            {
                                disconnectNow(established);
                                ctx.fireChannelInactive();
                            }

                            @Override
                            public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause)
                            {
                                try
                                {
                                    invalidateChannel(established, cause);
                                }
                                catch (Throwable t)
                                {
                                    logger.error("Unexpected exception in {}.exceptionCaught", this.getClass().getSimpleName(), t);
                                }
                            }
                        });
                        ++successfulConnections;

                        logger.info("{} successfully connected, version = {}, framing = {}, encryption = {}",
                                    id(true),
                                    success.messagingVersion,
                                    settings.framing,
                                    encryptionConnectionSummary(channel));
                        break;

                    case RETRY:
                        if (logger.isTraceEnabled())
                            logger.trace("{} incorrect legacy peer version predicted; reconnecting", id());

                        // the messaging version we connected with was incorrect; try again with the one supplied by the remote host
                        messagingVersion = result.retry().withMessagingVersion;
                        settings.endpointToVersion.set(settings.to, messagingVersion);

                        initiate();
                        break;

                    case INCOMPATIBLE:
                        // we cannot communicate with this peer given its messaging version; mark this as any other failure, and continue trying
                        Throwable t = new IOException(String.format("Incompatible peer: %s, messaging version: %s",
                                                                    settings.to, result.incompatible().maxMessagingVersion));
                        t.fillInStackTrace();
                        onFailure(t);
                        break;

                    default:
                        throw new AssertionError();
                }
            }

            /**
             * Initiate all the actions required to establish a working, valid connection. This includes
             * opening the socket, negotiating the internode messaging handshake, and setting up the working
             * Netty {@link Channel}. However, this method will not block for all those actions: it will only
             * kick off the connection attempt, setting the @{link #connecting} future to track its completion.
             *
             * Note: this should only be invoked on the event loop.
             */
            private void attempt(Promise> result)
            {
                ++connectionAttempts;

                /*
                 * Re-evaluate messagingVersion before re-attempting the connection in case
                 * endpointToVersion were updated. This happens if the outbound connection
                 * is made before the endpointToVersion table is initially constructed or out
                 * of date (e.g. if outbound connections are established for gossip
                 * as a result of an inbound connection) and can result in the wrong outbound
                 * port being selected if configured with enable_legacy_ssl_storage_port=true.
                 */
                int knownMessagingVersion = messagingVersion();
                if (knownMessagingVersion != messagingVersion)
                {
                    logger.trace("Endpoint version changed from {} to {} since connection initialized, updating.",
                                 messagingVersion, knownMessagingVersion);
                    messagingVersion = knownMessagingVersion;
                }

                settings = template;
                if (messagingVersion > settings.acceptVersions.max)
                    messagingVersion = settings.acceptVersions.max;

                // ensure we connect to the correct SSL port
                settings = settings.withLegacyPortIfNecessary(messagingVersion);

                initiateMessaging(eventLoop, type, settings, messagingVersion, result)
                .addListener(future -> {
                    if (future.isCancelled())
                        return;
                    if (future.isSuccess()) //noinspection unchecked
                        onCompletedHandshake((Result) future.getNow());
                    else
                        onFailure(future.cause());
                });
            }

            Future> initiate()
            {
                Promise> result = new AsyncPromise<>(eventLoop);
                state = new Connecting(state.disconnected(), result);
                attempt(result);
                return result;
            }
        }

        return new Initiate().initiate();
    }

    /**
     * Returns a future that completes when we are _maybe_ reconnected.
     *
     * The connection attempt is guaranteed to have completed (successfully or not) by the time any listeners are invoked,
     * so if a reconnection attempt is needed, it is already scheduled.
     */
    private Future requestConnect()
    {
        // we may race with updates to this variable, but this is fine, since we only guarantee that we see a value
        // that did at some point represent an active connection attempt - if it is stale, it will have been completed
        // and the caller can retry (or utilise the successfully established connection)
        {
            State state = this.state;
            if (state.isConnecting())
                return state.connecting().attempt;
        }

        Promise