com.oracle.coherence.common.internal.net.socketbus.BufferedSocketBus Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of coherence Show documentation
Oracle Coherence Community Edition
There is a newer version: 24.03
/*
 * Copyright (c) 2000, 2022, Oracle and/or its affiliates.
 *
 * Licensed under the Universal Permissive License v 1.0 as shown at
 * http://oss.oracle.com/licenses/upl.
 */
package com.oracle.coherence.common.internal.net.socketbus;

import com.oracle.coherence.common.internal.util.HeapDump;
import com.oracle.coherence.common.io.BufferSequence;
import com.oracle.coherence.common.io.BufferManager;
import com.oracle.coherence.common.io.Buffers;
import com.oracle.coherence.common.net.exabus.EndPoint;
import com.oracle.coherence.common.net.exabus.util.UrlEndPoint;
import com.oracle.coherence.common.net.exabus.util.SimpleEvent;
import com.oracle.coherence.common.net.exabus.Event;
import com.oracle.coherence.common.util.Duration;
import com.oracle.coherence.common.util.MemorySize;
import com.oracle.coherence.common.util.SafeClock;

import java.io.DataInput;
import java.io.IOException;
import java.nio.ByteBuffer;

import java.util.Arrays;

import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.net.SocketException;

import java.util.logging.Level;

import java.util.zip.CRC32;


/**
 * BufferedSocketBus adds write buffering to the AbstractSocketBus.
 *
 * @author mf  2010.12.1
 */
public abstract class BufferedSocketBus
        extends AbstractSocketBus
    {
    // ----- constructors ---------------------------------------------------

    /**
     * Construct a BufferedSocketMessageBus.
     *
     * @param driver      the socket driver
     * @param pointLocal  the local endpoint
     *
     * @throws IOException if an I/O error occurs
     */
    public BufferedSocketBus(SocketBusDriver driver, UrlEndPoint pointLocal)
            throws IOException
        {
        super(driver, pointLocal);
        }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void onOpen()
        {
        final long cMillisFlush = f_driver.getDependencies().getMaximumReceiptDelayMillis();
        if (cMillisFlush > 0)
            {
            scheduleTask(new Runnable()
                {
                @Override
                public void run()
                    {
                    // reschedule self for next periodic flush
                    scheduleTask(this, cMillisFlush);

                    for (Connection conn : getRegisteredConnections())
                        {
                        if (((BufferedConnection) conn).isReceiptFlushRequired())
                            {
                            conn.optimisticFlush();
                            }
                        }
                    }
                }, cMillisFlush);
            }

        super.onOpen();
        }


    // ----- BufferedConnection interface ----------------------------------

    /**
     * BufferedConnection implements a reliable stream connection with
     * I/O offloading.
     */
    public abstract class BufferedConnection
        extends Connection
        {
        /**
         * Create a BufferedConnection for the specified peer.
         *
         * @param peer  the peer
         */
        public BufferedConnection(UrlEndPoint peer)
            {
            super(peer);
            }

        /**
         * Return the threshold at which a send operation should perform
         * an auto-flush of the unflushed write batch.
         *
         * @return  the threshold in bytes
         */
        protected long getAutoFlushThreshold()
            {
            long cb = m_cbAutoFlushThreshold;

            if (cb <= 0)
                {
                try
                    {
                    // goal is to avoid engaging the selection service writer
                    // or more specifically avoid flip-flopping between using
                    // it and not using it.  If we allow our buffered data
                    // to exceed the underlying buffer size then a write is
                    // likely to engage it, so we start writing at the point
                    // that it would only be partially full.  Obviously we
                    // don't want to do micro writes either, so we still want
                    // to be using a decent portion of the buffer before
                    // writing
                    // TODO: consider continuing to auto-resize the threshold based on how
                    // often an flush fails to fully flush the buffer
                    cb = f_driver.getDependencies().getAutoFlushThreshold();
                    if (cb <= 0)
                        {
                        cb = Math.min(getPacketSize() * 9, // try for >90% packet utilization
                                      getSendBufferSize() / 4); // ensure we don't fill tx buffer
                        }
                    m_cbAutoFlushThreshold = cb;
                    }
                catch (SocketException e) {}

                if (cb <= 0)
                    {
                    // not yet connected
                    cb = 64 * 1024;
                    }
                }

            return cb;
            }

        /**
         * Return the threshold at which forced acks of pending receipts is
         * requested from the peer.
         *
         * @return  the threshold in bytes
         */
        protected long getForceAckThreshold()
            {
            long cb = m_cbForceAckThreshold;

            if (cb <= 0)
                {
                cb = f_driver.getDependencies().getReceiptRequestThreshold();

                try
                    {
                    if (cb <= 0)
                        {
                        cb = getSendBufferSize() * 3;
                        }
                    m_cbForceAckThreshold = cb;
                    }
                catch (SocketException e) {}

                if (cb <= 0)
                    {
                    // not yet connected
                    cb = 3 * 64 * 1024;
                    }
                }

            return cb;
            }

        /**
         * Return the threshold at which to declare backlog
         *
         * @return  the threshold in bytes
         */
        protected long getBacklogExcessiveThreshold()
            {
            long cb = m_cbBacklogExcessiveThreshold;

            if (cb <= 0)
                {
                try
                    {
                    m_cbBacklogExcessiveThreshold = cb = getSendBufferSize();
                    }
                catch (SocketException e) {}

                if (cb <= 0)
                    {
                    // not yet connected
                    cb = 1024 * 1024;
                    }
                }

            return cb;
            }

        @Override
        public BufferedConnection ensureValid()
            {
            super.ensureValid();
            return this;
            }

        /**
         * Return the number of concurrently executing writers.
         *
         * @return the number of concurrently executing writers.
         */
        protected int getConcurrentWriters()
            {
            // estimate how contended this connection is by including all threads which contributed to this batch
            // as well as any threads waiting to use the connection

            return m_cWritersWaiting.get() + m_cWritersBatch;
            }

        /**
         * Extracted post "send" logic.
         *
         * The caller must be synchronized on this connection. The parameter ordering allows the caller to inline
         * the computation of each parameter.
         *
         * @param fFlushInProg   true if a flush was in progress before the send
         * @param fFlushPending  true if a flush was required before the send
         * @param cbPending      the number of bytes pending on the connection
         */
        protected void evaluateAutoFlush(boolean fFlushInProg, boolean fFlushPending, long cbPending)
            {
            if (!fFlushInProg)
                {
                if (cbPending > getAutoFlushThreshold())
                    {
                    if (!flush(/*fAuto*/ true))
                        {
                        if (!fFlushPending)
                            {
                            addFlushable(this);
                            }
                        // else; a flush was already pending no need to add again
                        }
                    else if (fFlushPending)
                        {
                        // flush indicated there is nothing left; and we were previously in the flush-set; do cleanup
                        removeFlushable(this);
                        }
                    }
                else if (!fFlushPending)
                    {
                    addFlushable(this);
                    }
                // else; a flush was already pending no need to add again
                }
            // else; another thread is actively waiting on flush, no need to do anything post-send flush processing
            }

        /**
         * Add a thread to the set of concurrent writers.
         */
        protected void addWriter()
            {
            // we loosely track threads using a bitset
            long lBitId = 1L << Thread.currentThread().getId(); // % 64 is not necessary as JLS says << will do it for me
            if ((m_lWritersBatchBitSet & lBitId) == 0L)
                {
                m_lWritersBatchBitSet |= lBitId;
                ++m_cWritersBatch;
                }
            }

        /**
         * Send any scheduled BufferSequences.
         */
        public void flush()
            {
            if (flush(/*fAuto*/ false))
                {
                removeFlushable(this);
                }
            }

        /**
         * Send any scheduled BufferSequences.
         *
         * @param fAuto  true iff it is an auto-flush
         *
         * @return true if the connection has flushed all pending data
         */
        public boolean flush(boolean fAuto)
            {
            SocketBusDriver.Dependencies deps = getSocketDriver().getDependencies();

            WriteBatch batch   = m_batchWriteUnflushed;
            int        cRecReq = m_cReceiptsUnflushed;
            int        cRecRet = m_cReceiptsReturn.getAndSet(0);

            // determine if we need to ask for forced acks
            long cUnackedBytes = f_cBytesUnacked.get();
            if (cRecReq > 0 && cUnackedBytes > getForceAckThreshold())
                {
                cRecReq = -cRecReq; // negative receipts indicates forced acks
                f_cBytesUnacked.set(0); // reset counter to avoid requesting multiple forced acks
                }

            // insert receipt message if necessary
            if (cRecReq != 0 || cRecRet != 0 || m_fIdle)
                {
                m_fIdle = false;
                if (batch == null)
                    {
                    m_batchWriteUnflushed = batch = new WriteBatch();
                    }

                int        cbReceipt        = getReceiptSize();
                ByteBuffer bufferMsgReceipt = m_bufferRecycleOutboundReceipts;
                if (bufferMsgReceipt == null)
                    {
                    bufferMsgReceipt = m_bufferRecycleOutboundReceipts = deps.getBufferManager().acquire(cbReceipt * 1024);
                    int        cbCap = bufferMsgReceipt.capacity();
                    bufferMsgReceipt.limit(cbCap - (cbCap % cbReceipt));
                    }

                if (bufferMsgReceipt.remaining() > cbReceipt)
                    {
                    ByteBuffer buffReceipt = bufferMsgReceipt.slice();
                    writeMsgReceipt(buffReceipt, cRecReq, cRecRet);
                    bufferMsgReceipt.position(bufferMsgReceipt.position() + cbReceipt);
                    batch.append(buffReceipt, /*fRecycle*/ false, /*body*/ null, /*receipt*/ cRecReq == 0 ? null : RECEIPT_ACKREQ_MARKER);
                    }
                else // bufferMsgReceipt.remaining() == MSG_RECEIPT_SIZE
                    {
                    // write last chunk and recycle
                    bufferMsgReceipt.mark();
                    writeMsgReceipt(bufferMsgReceipt, cRecReq, cRecRet);
                    batch.append(bufferMsgReceipt, /*fRecycle*/ true, /*body*/ null, /*receipt*/ cRecReq == 0 ? null : RECEIPT_ACKREQ_MARKER_RECYCLE);
                    m_bufferRecycleOutboundReceipts = null;
                    }
                m_cReceiptsUnflushed = 0;
                }

            if (batch == null)
                {
                return true; // nothing to flush
                }
            else if (f_cbQueued.get() == 0 && getConcurrentWriters() <= deps.getDirectWriteThreadThreshold())
                {
                // SS thread isn't writing, so this is the current head.  Even if the SS thread never sees this
                // batch we want to overwrite any historic batch with the current head to avoid building up
                // garbage which otherwise wouldn't be GCable until the SS thread iterated through many empty batches
                m_batchWriteSendHead = batch;
                try
                    {
                    synchronized (batch) // sync'd because ack can come in while we're still in batch.write
                        {
                        if (batch.write())
                            {
                            m_cWritersBatch       = 0;
                            m_lWritersBatchBitSet = 0;
                            return true;
                            }
                        // else; we didn't write everything, fall through and evaluate if we need to enqueue the batch
                        }
                    }
                catch (IOException e)
                    {
                    // stream is now in an unknown state; queue the remainder and reconnect
                    m_batchWriteUnflushed = null;
                    enqueueWriteBatch(batch);
                    onException(e);
                    return true; // nothing more can be done
                    }
                }
            else if (m_state == ConnectionState.DEFUNCT)
                {
                scheduleDisconnect(null); // ensure receipts are emitted ASAP
                }

            if (fAuto && batch.getLength() < getBacklogExcessiveThreshold() * 2)
                {
                return false; // data remains to be flushed
                }
            else
                {
                // either caller explictly flushed or we're buffering a significant
                // amount, enqueue to SelectionService
                m_batchWriteUnflushed = null;
                m_cWritersBatch       = 0;
                m_lWritersBatchBitSet = 0;

                enqueueWriteBatch(batch);
                return true; // euqueue'd the batch
                }
            }


        @Override
        protected boolean heartbeat()
            {
            long    cbHeartbeat = m_cbWrite;
            boolean fResult     = false;
            if (cbHeartbeat == m_cbHeartbeatLast && // we've sent nothing since the last check
                f_cbQueued.get() == 0 &&            // we have no outbound traffic queued up
                m_state == ConnectionState.ACTIVE)  // COH-25350 - messages can be sent on the connection
                {
                // prevent the network infrastructure from closing the idle socket

                // setting m_fIdle to true will cause the next flush to minimally send an empty receipt
                // flush will also reset idle
                m_fIdle = true;
                optimisticFlush(); // attempt flush in case auto-flush is disabled
                fResult = true;
                }
            // else; the connection has seen writes since the last heartbeat check, no action required

            m_cbHeartbeatLast = cbHeartbeat;
            return fResult;
            }


        /**
         * Offload the specified WriteBatch to the SelectionService for processing.
         *
         * @param batch  the WriteBatch to enqueue
         */
        public void enqueueWriteBatch(WriteBatch batch)
            {
            long cbBatch = batch.getLength();

            long cbQueuedNow = f_cbQueued.addAndGet(cbBatch);

            if (cbQueuedNow == cbBatch)
                {
                // for this to occur the just enqueued batch is at the head of the queue
                // and was not even partially written by the SelectionService. In such
                // a case the SelectionService may not be aware of it and we must
                // re-register with selection service for WRITE interest
                try
                    {
                    wakeup();
                    }
                catch (IOException e)
                    {
                    onException(e);
                    }
                }

            final long cbExcessive = getBacklogExcessiveThreshold();
            if (cbQueuedNow > cbExcessive * 2 &&
                m_fBacklogScheduled.compareAndSet(false, true))
                {
                // handle the case where the SelectionService thread doesn't wake up for any writes
                // we need to prevent an endless backlog from being formed, but also want to avoid
                // issuing this invocable in most cases, so we use an extra large limit.
                // emit BACKLOG_EXCESSIVE from SelectionService
                invoke(new Runnable()
                    {
                    public void run()
                        {
                        m_fBacklogScheduled.set(false);

                        if (isValid() && !m_fBacklog && f_cbQueued.get() > cbExcessive)
                            {
                            m_fBacklog = true;
                            emitEvent(new SimpleEvent(
                                    Event.Type.BACKLOG_EXCESSIVE,
                                    getPeer()));
                            }
                        }
                    });
                }
            }

        /**
         * Write a Receipt message to a buffer.
         *
         * @param  buff     buffer to write the receipt message
         * @param  cRecReq  number of receipts associated with the unflushed WriteBatch
         * @param  cRecRet  number of receipts to return to the peer
         *
         * @return ByteBuffer to which the receipt message was written
         */
        protected ByteBuffer writeMsgReceipt(ByteBuffer buff, int cRecReq, int cRecRet)
            {
            int nProt = getProtocolVersion();
            int nPos0 = buff.position();

            if (nProt > 4)
                {
                buff.putLong(-9)  // negative msg size indicates control message of that size
                    .position(nPos0 + 16); // skip writes for header/body crc
                }
            else
                {
                buff.putInt(-9);
                }

            buff.put(MSG_RECEIPT)
                .putInt(cRecReq)
                .putInt(cRecRet);

            buff.limit(buff.position()).position(nPos0);
            if (nProt > 4)
                {
                // backfill the message header with the header and body CRCs now that they can be computed
                populateCtrlMsgHeaderCrc(buff);
                }

            return buff;
            }

        /**
         * Process a receipt from the supplied stream
         *
         * @param in  the receipt
         *
         * @throws IOException if an IO error occurs
         */
        protected void processReceipt(DataInput in)
                throws IOException
            {
            int cReceiptsRequested = in.readInt();
            if (cReceiptsRequested < 0 || getSocketDriver().getDependencies().getMaximumReceiptDelayMillis() == 0)
                {
                // negative or config indicates force receipt send immediately
                m_cReceiptsReturn.addAndGet(cReceiptsRequested < 0 ? -cReceiptsRequested : cReceiptsRequested);

                if (isReceiptFlushRequired())
                    {
                    // there are no pending flushes for this connection
                    optimisticFlush();
                    }
                }
            else
                {
                m_cReceiptsReturn.addAndGet(cReceiptsRequested);
                }

            int cReturned = in.readInt();
            if (cReturned > 0)
                {
                EndPoint   epPeer      = getPeer();
                WriteBatch batchResend = m_batchWriteResendHead;
                WriteBatch batchNext   = batchResend.next();
                for (;;)
                    {
                    int cEmit;
                    if (batchNext == null && m_batchWriteUnflushed == batchResend)
                        {
                        // the app may be concurrently writing to this (the last) batch, thus we must sync
                        synchronized (batchResend)
                            {
                            cReturned -= cEmit = batchResend.ack(cReturned, f_aoReceiptTmp);
                            }
                        }
                    else
                        {
                        // there is a subsequent batch which means the application is no longer writing
                        // to this batch.  Also reading the next ref acts as a memory barrier ensuring
                        // that we have a clean view of the contents of this batch since it was written
                        // after this batch was done being written to by the app thread
                        cReturned -= cEmit = batchResend.ack(cReturned, f_aoReceiptTmp);
                        }

                    // emit receipts outside of synchronization
                    for (int i = 0; i < cEmit; ++i)
                        {
                        Object oReceipt = f_aoReceiptTmp[i];
                        f_aoReceiptTmp[i] = null;
                        if (oReceipt != RECEIPT_NO_EMIT)
                            {
                            addEvent(new SimpleEvent(Event.Type.RECEIPT, epPeer, oReceipt));
                            ++m_cReceiptsEmitted;
                            }
                        }

                    if (cReturned == 0)
                        {
                        // wait for more receipts
                        break;
                        }
                    else if (cEmit < f_aoReceiptTmp.length)
                        {
                        // this batch must be fully ack'd; move onto next batch

                        // try to null out the resend.next ref so that we don't
                        // end up with tenured garbage referencing live data as
                        // this can force short lived objects into being tenured
                        // as well and cause long GCs.  To be here clearly this batch
                        // is fully ack'd, so we'd think we could just null out it's
                        // next ref, but it is possible that if we've recently done
                        // a migration this batch is still pending a send, i.e. we're
                        // resending it not realizing that it was already ack'd.  In
                        // such a case we can't null out next but in all other cases
                        // we can.  To identify this we check if the head of the send
                        // queue is fully ack'd if so then we're in that odd state

                        WriteBatch batchSend = m_batchWriteSendHead; // yes send not resend
                        if (batchSend.m_ofAck != batchSend.m_ofAdd)
                            {
                            // since this resend batch is fully ack'd and the send head is not
                            // full ack'd that means the resend pointer is in front of the send
                            // pointer (as is normal), and thus this resend batch is now garbage
                            // and we can null out its next pointer
                            batchResend.m_next = null;
                            }

                        batchResend = batchNext;
                        batchNext   = batchResend.next();
                        }
                    // else; more to process in current batch
                    }

                m_batchWriteResendHead = batchResend;

                // Peer has acked. Reset counter; this also serves as a volatile write to make all of the above
                // visible to checkHealth
                f_cBytesUnacked.set(0);
                }
            }

        /**
         * Process a receipt from the supplied stream
         *
         * @param in  the receipt
         *
         * @throws IOException if an IO error occurs
         */
        protected void processSync(DataInput in)
                throws IOException
            {
            long cMsgOut      = m_cMsgOutDelivered;
            long cMsgIn       = m_cMsgIn; // SYNCs aren't ackable and thus aren't reflected in either side's count
            long cMsgAcked    = in.readLong();
            long cMsgReceived = in.readLong();
            byte nCmdSync     = getProtocolVersion() < 3 ? SYNC_CMD_NONE : in.readByte();

            if ((nCmdSync & SYNC_CMD_DUMP) != 0)
                {
                String sDump = HeapDump.dumpHeapForBug("Bug-27585336-tmb-migration");
                getLogger().log(makeRecord(Level.WARNING, "{0} migration with {1} appears to not be progressing on {2}; {3} collected for analysis",
                        getLocalEndPoint(), getPeer(), BufferedConnection.this, sDump));
                }

            if (cMsgAcked > cMsgIn ||     // peer got more acks then we sent to it
                cMsgOut   > cMsgReceived) //   we got more acks then it sent to us
                {
                scheduleDisconnect(new IOException("out of sync during migration in " +
                                        cMsgAcked + "/" + cMsgIn + ", out " + cMsgOut + "/" + cMsgReceived));
                }
            else
                {
                long cSkip      = m_cMsgInSkip = cMsgIn - cMsgAcked;
                long cRedeliver = cMsgReceived - cMsgOut;
                getLogger().log(makeRecord(Level.FINE,
                        "{0} synchronizing migrated connection with {1} will result in {2} skips and {3} re-deliveries: {4}",
                        getLocalEndPoint(), getPeer(), cSkip, cRedeliver, BufferedConnection.this));
                }
            }

        /**
         * Return true iff there are pending receipts that needs to be flushed but no application data to flush
         *
         * @return true iff connection has pending receipts but no unflushed application data
         */
        protected boolean isReceiptFlushRequired()
            {
            return (m_cReceiptsReturn.get() > 0 || m_fIdle) && !isFlushRequired();
            }

        /**
         * Return true there is application data pending a flush.
         *
         * @return true iff there is application data pending a flush
         */
        protected boolean isFlushRequired()
            {
            WriteBatch batch = m_batchWriteUnflushed;
            return batch != null && batch.m_ofSend < batch.m_ofAdd;
            }

        /**
         * {@inheritDoc}
         */
        public int onReadySafe(int nOps)
            throws IOException
            {
            return m_nInterestOpsLast = processReads((nOps & OP_READ) != 0) | processWrites((nOps & OP_WRITE) != 0);
            }

        @Override
        protected void checkHealth(long ldtNow)
            {
            if (m_state == null || m_state.ordinal() > ConnectionState.ACTIVE.ordinal())
                {
                return;
                }

            // establish write health, i.e. we've done a write or have nothing (flushed) to write

            // m_cbWrite is a dirty read as writes may be done off the SS thread, but even
            // if they are f_cbQueued will be updated after each m_cbWrite update, thus worst case we're
            // guaranteed to see the updated m_cbWrite on our next pass.
            long    cbWrite       = m_cbWrite;
            long    cbWriteLast   = m_cbWriteLastCheck;
            boolean fWriteHealthy = cbWrite > cbWriteLast || f_cbQueued.get() == 0;

            // establish read health

            // we're unhealthy if we have an inbound receipt pending and we aren't actively making read progress.
            // note we consider reads of anything to be a good sign simply because our ack could still be coming.

            long    ldtAckTimeout    = m_ldtAckTimeout;
            long    cbRead           = m_cbRead;
            Object  oReceiptUnacked  = null;
            int     ofReceiptUnacked = 0;
            Object  batchUnacked     = null;

            f_cBytesUnacked.get(); // volatile read to allow all other read-health checks to see data recently written
                                   // by the corresponding SS thread.
            boolean fReadHealthy = true;
            if (cbRead == m_cbReadLastCheck)
                {
                // we haven't read anything, but we need to check if we should have read something.  The only thing
                // we check for are acks, so lets see if we are expecting one; scan the resend queue to see if we've
                // sent anything which requires an ack, and also sent the subsequent ack request.  We could also
                // monitor messages which have started to arrive, but then stalled, but those will naturally be protected
                // by our peer running its own health check waiting for us to send the corresponding ack, and while we
                // could monitor these, we couldn't monitor messages until they started to arrive, so the monitoring would
                // be both redundant and incomplete.
                UNHEALTHY: for (WriteBatch batchResend = m_batchWriteResendHead; batchResend != null; batchResend = batchResend.next())
                    {
                    Object[] aReceipt = batchResend.m_aReceipt;
                    int      ofSafe   = aReceipt.length; // defend against concurrent batch.append/ack
                    for (int i = Math.min(ofSafe, batchResend.m_ofAck), e = Math.min(ofSafe, batchResend.m_ofSend); i < e; ++i)
                        {
                        Object oReceipt = aReceipt[i];
                        if (oReceiptUnacked == null)
                            {
                            if (oReceipt != null &&
                                oReceipt != RECEIPT_MSG_MARKER &&                // don't count
                                oReceipt != RECEIPT_HEADER_RECYCLE &&            // artificial receipts
                                oReceipt != RECEIPT_MSG_MARKER_HEADER_RECYCLE && // which
                                oReceipt != RECEIPT_ACKREQ_MARKER &&             // don't
                                oReceipt != RECEIPT_ACKREQ_MARKER_RECYCLE)       // get ack'd
                                {
                                // this indicates that we've sent a message which requires an ack that we've yet
                                // to receive; but we also need to know we've sent the ack request. The ack request
                                // for this would go out in our next receipt, so start searching for that.
                                oReceiptUnacked  = oReceipt; // in coherence this will be the actual Message
                                ofReceiptUnacked = i;
                                batchUnacked     = batchResend;
                                }
                            }
                        else if (oReceipt == RECEIPT_ACKREQ_MARKER ||
                                 oReceipt == RECEIPT_ACKREQ_MARKER_RECYCLE)
                            {
                            // oReceiptUnacked is non-null thus we know we've sent something requiring an ack, and
                            // we've now seen that we've sent the ack request as well, thus our health is now suspect.
                            fReadHealthy = false;
                            break UNHEALTHY;
                            }
                        }
                    }
                }
            else
                {
                // we read something; we have read health
                fReadHealthy = true;

                // but we also need to help out our peer. if we're on a slow network and the peer has a large tx buffer
                // it's possible they finished their write long before the last packet from that write will actually
                // leave TCP's tx buffer.  The peer can't tell when that has happened and started their ack timer at
                // the point they finished writing to the tx buffer.  Since we don't want the ack timeout to need to
                // be relative to message sizes or network speed we need a way to help our peer to not declare a timeout.
                // We can do this by ensuring that maintain read health while awaiting the ack, and we can do that by
                // sending dummy data, i.e. heartbeats.  But of course we only want to do that if we can see that that
                // their TCP stack is still draining the tx buffer, which of course we can infer by us still draining
                // our rx buffer.  To be here we know that we read some bytes, but we also want to know that there is
                // still more coming, so for that we need to see that we're not waiting on a message header.

                if ((m_nInterestOpsLast & OP_EAGER) == 0) // no pending read
                    {
                    m_ldtForceHeartbeat = 0; // disable heartbeat timeout
                    }
                else if (m_ldtForceHeartbeat == 0 ||                   // start of pending read
                        (ldtNow > m_ldtForceHeartbeat && heartbeat())) // or time to force heartbeat
                    {
                    // force a HB multiple times during an ack timeout period, this assumes our peer has the same timeout as us
                    m_ldtForceHeartbeat = ldtNow + f_driver.getDependencies().getAckTimeoutMillis() / 3;
                    }
                // else; not time to force a heartbeat yet
                }

            if (fReadHealthy && fWriteHealthy) // common path
                {
                // we're healthy; disable any active timeout
                m_ldtAckFatalTimeout = m_ldtAckTimeout = 0;

                // only record read/write amounts when healthy
                m_cbWriteLastCheck = cbWrite;
                m_cbReadLastCheck  = cbRead;
                }
            else if (ldtAckTimeout == 0)
                {
                // health is now in doubt, set timeouts
                long cMillisTimeout = f_driver.getDependencies().getAckTimeoutMillis();

                m_ldtAckTimeout      = cMillisTimeout == 0 || getProtocolVersion() == 0 ? Long.MAX_VALUE : ldtNow + cMillisTimeout;
                cMillisTimeout       = f_driver.getDependencies().getAckFatalTimeoutMillis();
                m_ldtAckFatalTimeout = cMillisTimeout == 0 ? Long.MAX_VALUE : ldtNow + cMillisTimeout;
                }
            else if (ldtNow >= m_ldtAckFatalTimeout)
                {
                // fatal timeout expired
                long     cMillisTimeout = f_driver.getDependencies().getAckFatalTimeoutMillis();
                Duration dur            = new Duration(cMillisTimeout, Duration.Magnitude.MILLI);

                getLogger().log(makeRecord(Level.WARNING,
                        "{0} dropping connection with {1} after {2} fatal ack timeout health(read={3}, write={4}), receiptWait={5}: {6}",
                        getLocalEndPoint(), getPeer(), dur, fReadHealthy, fWriteHealthy, oReceiptUnacked, BufferedConnection.this));

                scheduleDisconnect(new IOException("fatal ack timeout after " + dur));
                }
            else if (ldtNow >= ldtAckTimeout)
                {
                // timeout expired
                final int cMultCap = 10;
                long      cMillisTimeout = f_driver.getDependencies().getAckTimeoutMillis();
                Duration  dur            = new Duration(cMillisTimeout * Math.min(cMultCap, m_cUnackLast + 1), Duration.Magnitude.MILLI);

                getLogger().log(makeRecord(Level.WARNING,
                        "{0} initiating connection migration with {1} after {2} ack timeout health(read={3}, write={4}), receiptWait={5}: {6}",
                        getLocalEndPoint(), getPeer(), dur, fReadHealthy, fWriteHealthy, oReceiptUnacked, BufferedConnection.this));

                if (oReceiptUnacked == null)
                    {
                    m_nIdUnackLast = 0;
                    m_cUnackLast   = 0;
                    }
                else
                    {
                    // if we have successive read health timeouts then perhaps our ack timeout is just too small for
                    // an apparently really slow network and big message.  While a needless connection migration is
                    // harmless, endless ones without progress is certainly not.  So lets push up the timeout while
                    // we work to get this larger message successfully ack'd.  Note our health check algorithm already
                    // defends against large message transmission time in that we're happy so long as we see read and
                    // write progress, but it doesn't account for how long it may take to drain the OS tx buffer. So
                    // this time increase defends against that unknown.  We do limit the increase we'll extend the
                    // configured timeout by cMultiCap times the configured timeout.
                    int nId = System.identityHashCode(oReceiptUnacked) ^
                              System.identityHashCode(batchUnacked)    ^ ofReceiptUnacked; // in case the same receipt object is reused frequently
                    if (nId == m_nIdUnackLast)
                        {
                        int cStuck = ++m_cUnackLast;
                        cMillisTimeout *= Math.min(cMultCap, cStuck + 1);

                        if (cStuck == MIGRATION_LIMIT_BEFORE_DUMP)
                            {
                            // see onMigration where we also check m_cUnackLast and request that our peer collect a dump as well
                            String sName = HeapDump.dumpHeapForBug("Bug-27585336-tmb-migration");
                            getLogger().log(makeRecord(Level.WARNING,
                                    "{0} has failed to deliver {1} to {2} after {3} attempts, {4} has been collected for analysis",
                                    getLocalEndPoint(), oReceiptUnacked, BufferedConnection.this, cStuck, sName));
                            ldtNow = SafeClock.INSTANCE.getSafeTimeMillis(); // heap dump may have taken awhile
                            }
                        }
                    else
                        {
                        m_cUnackLast = 0;
                        }
                    m_nIdUnackLast = nId;
                    }

                // reset ack timeout
                m_ldtAckTimeout = ldtNow + cMillisTimeout;

                migrate(new IOException("ack timeout after " + dur));
                }
            // else; progressing towards timeout
            }


        @Override
        public void onMigration()
            {
            super.onMigration();

            // don't log as a warning, we can get here simply because the other process terminated, i.e. delay
            // logging as a warning until we actually re-establish the connection.
            getLogger().log(makeRecord(Level.FINER, "{0} migrating connection with {1}",
                    getLocalEndPoint(), BufferedConnection.this));

            m_cMsgInSkip = 0; // our peer will start with a new SYNC message which will tell us exactly how much we should skip

            WriteBatch batchWriteUnflushed = m_batchWriteUnflushed;
            if (batchWriteUnflushed != null)
                {
                m_batchWriteUnflushed = null;
                f_cbQueued.addAndGet(batchWriteUnflushed.getLength());
                }

            // remove any sends (from f_cbQueued) that were pre-ack'd during a former migration
            for (WriteBatch batch = m_batchWriteSendHead; batch != null && batch.m_ofAck == batch.m_ofAdd; batch = batch.next())
                {
                batch.m_ofSend = batch.m_ofAdd; // pretend we've sent everything
                f_cbQueued.addAndGet(-batch.getLength());
                }

            // rewind batches from the resend queue; Note: we scan the entire queue because it is not trivial
            // to identify where it is safe to stop scanning.  Specifically we can't stop and the first unsent
            // message since we can run into empty batches
            WriteBatch batchResendHead = m_batchWriteResendHead;
            for (WriteBatch batch = batchResendHead; batch != null; batch = batch.next())
                {
                f_cbQueued.addAndGet(batch.rewind());
                }

            int nProt = getProtocolVersion();
            if (nProt > 0)
                {
                // place SYNC message at the start of the send queue, but don't included it in the resend queue
                WriteBatch batchWriteSync = m_batchWriteSendHead = new WriteBatch(/*fLink*/ false);

                int        cbHead  = nProt < 5 ? 4 : 16;       // msg header size
                int        cbBody  = 17 + (nProt < 3 ? 0 : 1); // msg body size
                ByteBuffer bufSync = ByteBuffer.allocate(cbHead + cbBody);

                if (nProt > 4)
                    {
                    bufSync.putLong(-cbBody)  // negative msg size indicates control message of that size
                            .position(16); // skip writes for header/body crc
                    }
                else
                    {
                    bufSync.putInt(-cbBody);
                    }

                bufSync.put(MSG_SYNC)
                        .putLong(m_cMsgOutDelivered)
                        .putLong(m_cMsgIn);

                if (nProt > 2)
                    {
                    bufSync.put(m_cUnackLast == MIGRATION_LIMIT_BEFORE_DUMP ? SYNC_CMD_DUMP : SYNC_CMD_NONE);
                    }

                bufSync.flip();
                if (nProt > 4)
                    {
                    // backfill the message header with the header and body CRCs now that they can be computed
                    populateCtrlMsgHeaderCrc(bufSync);
                    }

                batchWriteSync.append(bufSync, /*fRecycle*/ false, /*body*/ null, /*receipt*/ null);
                batchWriteSync.m_ofAck = batchWriteSync.m_ofAdd; // prevent it from accepting bundles, since it isn't resend eligible
                batchWriteSync.m_next  = batchResendHead;
                f_cbQueued.addAndGet(batchWriteSync.getLength());
                }
            else if (m_state == ConnectionState.ACTIVE)
                {
                // this shouldn't be possible except at protocol v0 and we don't call migrate when running at v0
                scheduleDisconnect(new IOException("protocol error; sync at protocol=" + nProt + ", with in=" + m_cMsgIn + ", out=" + m_cMsgOutDelivered));
                }
            else
                {
                // the disconnect may have occurred before we negotiated a protocol version, we can't send a SYNC
                // if we don't know the version, since by the time it gets processed on the new connection a version will
                // have been negotiated and we would need to use that unknown version here.  But since we haven't negotiated
                // a version yet it also means we could not have exchanged messages either, so it is safe to skip the SYNC.
                m_batchWriteSendHead = batchResendHead;
                }
            }

        /**
         * Populate message header.
         *
         * @param bufHead  the header buffer to be written to
         */
        protected void populateCtrlMsgHeaderCrc(ByteBuffer bufHead)
            {
            int   cbHeader = 16;
            int   nPos     = bufHead.position();
            int   nLimit   = bufHead.limit();
            int   lCrc     = 0;
            CRC32 crc32    = f_crcTx;

            // compute and write body CRC; Note, we still need to write a 0 when crc is disabled
            // as buffers may not be zero'd out to begin with
            if (crc32 != null)
                {
                crc32.reset();
                bufHead.position(nPos + cbHeader);
                lCrc = Buffers.updateCrc(crc32, bufHead);
                lCrc = lCrc == 0 ? 1: lCrc;
                }
            bufHead.putInt(nPos + 8, lCrc); // write body crc

            // compute and write header CRC
            if (crc32 != null)
                {
                crc32.reset();
                bufHead.position(nPos).limit(nPos + cbHeader - 4);
                lCrc = Buffers.updateCrc(crc32, bufHead);
                lCrc = lCrc == 0 ? 1: lCrc;
                bufHead.limit(nLimit);
                }

            bufHead.putInt(nPos + 12, lCrc); // write headed crc
            bufHead.position(nPos);
            }

        /**
         * Handle any incoming data.
         *
         * @param fReady  true iff the channel is readable
         *
         * @return a partial SelectionService.Handler interest set
         *
         * @throws IOException if an I/O error occurs
         */
        protected abstract int processReads(boolean fReady)
                throws IOException;

        /**
         * Write the contents of the WriteQueue to the channel.
         *
         * @param fReady  true iff the channel is writeable
         *
         * @return a partial SelectionService.Handler interest set
         *
         * @throws IOException if an I/O error occurs
         */
        protected int processWrites(boolean fReady)
                throws IOException
            {
            long cbBacklog = f_cbQueued.get();

            if (fReady)
                {
                WriteBatch batch       = m_batchWriteSendHead;
                long       cbBundle    = getAutoFlushThreshold();
                long       cbExcessive = getBacklogExcessiveThreshold();
                boolean    fBacklog    = m_fBacklog;

                // process the queue

                // Note: Avoid the possibility of staying in this loop "forever" if the producer and consumer
                // are matching pace.  The only reason to bail out is to allow other work to be accomplished
                // on this SelectionService thread.
                long cbWritten = 0;
                try
                    {
                    for (int i = 0; cbBacklog > 0 && i < 16 && (cbBacklog <= cbExcessive || fBacklog); ++i)
                        {
                        long cbBatch = batch.getLength();
                        while (cbBatch != 0 &&        // don't bundle into an empty batch
                               cbBatch < cbBundle &&  // batch is small enough that it is worth bundling
                               cbBacklog > cbBatch && // there is more in the backlog, i.e. batch.bundle won't NPE
                               batch.m_ofAck < batch.m_ofAdd) // ensure we don't bundle into fully ack'd batches as this can lead to data loss during migration
                            {
                            cbBatch = batch.bundle();
                            }

                        if (cbBatch == 0 || batch.write()) // unlike in flush we don't need to sync since acks are also processed on this thread
                            {
                            cbWritten += cbBatch;
                            cbBacklog -= cbBatch;

                            if (cbBacklog == 0)
                                {
                                // check to see if more has been queued
                                cbBacklog = f_cbQueued.addAndGet(-cbWritten);
                                cbWritten = 0;

                                if (cbBacklog == 0)
                                    {
                                    break;
                                    }
                                }

                            batch = batch.next();
                            }
                        else // we've exhausted the socket write buffer
                            {
                            cbWritten += (cbBatch - batch.getLength());
                            break;
                            }
                        }
                    }
                finally
                    {
                    cbBacklog = f_cbQueued.addAndGet(-cbWritten);
                    m_batchWriteSendHead = batch;
                    }

                // change backlog status if necessary
                if (fBacklog)
                    {
                    if (cbBacklog < cbExcessive / 2)
                        {
                        m_fBacklog = false;
                        emitEvent(new SimpleEvent(Event.Type.BACKLOG_NORMAL, getPeer()));
                        }
                    }
                else if (cbBacklog > cbExcessive)
                    {
                    m_fBacklog = true;
                    emitEvent(new SimpleEvent(Event.Type.BACKLOG_EXCESSIVE, getPeer()));
                    }
                }

            return cbBacklog > 0 ? OP_WRITE : 0;
            }

        @Override
        public void dispose()
            {
            ByteBuffer bufferMsgReceipt = m_bufferRecycleOutboundReceipts;
            if (bufferMsgReceipt != null)
                {
                // recycle resources
                getSocketDriver().getDependencies().getBufferManager().release(bufferMsgReceipt);
                m_bufferRecycleOutboundReceipts = null;
                }

            super.dispose();
            }

        @Override
        public void drainReceipts()
            {
            // Note: we must drain in send order, specifially
            // resent, current, queue'd, unflushed

            synchronized (this)
                {
                WriteBatch batch = m_batchWriteUnflushed;
                m_batchWriteUnflushed = null;
                if (batch != null)
                    {
                    enqueueWriteBatch(batch);
                    }

                for (batch = m_batchWriteResendHead; batch != null; batch = batch.next())
                    {
                    batch.m_ofSend = batch.m_ofAdd; // pretend we've sent it
                    f_cbQueued.addAndGet(-batch.getLength());
                    int cEmit;
                    while ((cEmit = batch.ack(Integer.MAX_VALUE, f_aoReceiptTmp)) > 0)
                        {
                        for (int i = 0; i < cEmit; ++i)
                            {
                            Object oReceipt = f_aoReceiptTmp[i];
                            f_aoReceiptTmp[i] = null;
                            if (oReceipt != RECEIPT_NO_EMIT)
                                {
                                addEvent(new SimpleEvent(Event.Type.RECEIPT, getPeer(), oReceipt));
                                }
                            }
                        }
                    }

                m_batchWriteResendHead = m_batchWriteSendHead = m_batchWriteTail = new WriteBatch(/*fLink*/ false);
                }
            }

        /**
         * Populate message header.
         *
         * @param buffHead  the header buffer to be written to
         * @param aBuff     the buffer array that contains the message
         * @param of        the offset
         * @param cBuffers  the number of buffers
         * @param cbBuffer  the number of bytes in message
         */
        protected void populateMessageHeader(ByteBuffer buffHead, ByteBuffer[] aBuff, int of, int cBuffers, long cbBuffer)
            {}

        /**
         * Return the receipt size that includes header and body.
         *
         * @return the receipt size
         */
        protected abstract int getReceiptSize();

        @Override
        public String toString()
            {
            WriteBatch batch          = m_batchWriteUnflushed;
            long       ldtAckTimeout  = m_ldtAckTimeout;
            String     sTimeout;

            if (ldtAckTimeout == 0)
                {
                sTimeout = "n/a";
                }
            else
                {
                long ldtNow = SafeClock.INSTANCE.getSafeTimeMillis();
                long ldtAckFatal = m_ldtAckFatalTimeout;
                sTimeout = "ack=" + new Duration(ldtAckTimeout - ldtNow, Duration.Magnitude.MILLI) +
                        (ldtAckFatal == Long.MAX_VALUE ? "" : ", conn=" + new Duration(ldtAckFatal - ldtNow, Duration.Magnitude.MILLI));
                }

            // NOTE: there may be many more pending outs, they just didn't have receipts so we won't count them until the next message with a receipt goes out
            // and the corresponding RECEIPT message comes back, at which point we'll scan the resend queue and count up the former messages which have now
            // been confirmed as delivered.  When comparing the toStrings of each side of a connection, we'll always see our out lag behind our peers in because
            // the peer must receive it before we can *know* that it was delivered, and we defer the ack of the ack until the next application RECEIPT request
            // thus peer's in is *always* greater then local out.
            return super.toString() + ", bufferedOut=" + new MemorySize(f_cbQueued.get()) +
                    ", unflushed=" + new MemorySize(batch == null ? 0 : batch.getLength()) +
                    ", delivered(in=" + m_cMsgIn + ", out=" + m_cMsgOutDelivered + ")" + // see note above regarding out
                    ", timeout(" + sTimeout + "), interestOps=" + m_nInterestOpsLast +
                    ", unflushed receipt=" + m_cReceiptsUnflushed + ", receiptReturn " + m_cReceiptsReturn +
                    ", isReceiptFlushRequired " + isReceiptFlushRequired();
            }

        /**
         * WriteBatch is used to encapsulate an array of ByteBuffers which
         * are to be written to the connection.
         */
        public class WriteBatch
            {
            public WriteBatch()
                {
                this(true);
                }

            public WriteBatch(boolean fLink)
                {
                if (fLink)
                    {
                    m_batchWriteTail = m_batchWriteTail.m_next = this;
                    }
                }

            /**
             * Return the number of bytes remaining to be sent in the batch.
             *
             * @return the number of bytes remaining to be sent in the batch
             */
            public long getLength()
                {
                return m_cbBatch;
                }

            /**
             * Append a message to the batch.
             *
             * This method is synchronized as it is always called from the app thread and
             * the batch may be concurrently being ack processed on the SS thread.
             *
             * @param bufHead       the buffer header to append
             * @param fRecycleHead  true iff the buffer header should be recycled
             * @param bufseqBody    optional buffer body to append
             * @param receipt       optional associated receipt
             *
             * @return the batch size
             */
            public synchronized long append(ByteBuffer bufHead, boolean fRecycleHead, BufferSequence bufseqBody, Object receipt)
                {
                addWriter();

                int cBufferAdd = 1 + (bufseqBody == null ? 0 : bufseqBody.getBufferCount());

                int          ofAdd     = ensureAdditionalBufferCapacity(cBufferAdd);
                Object[]     aoReceipt = m_aReceipt;
                ByteBuffer[] aBuffer   = m_aBuffer;
                long         cbBody    = 0;
                if (bufseqBody != null)
                    {
                    bufseqBody.getBuffers(0, cBufferAdd - 1, aBuffer, ofAdd + 1);
                    cbBody = bufseqBody.getLength();

                    populateMessageHeader(bufHead, aBuffer, ofAdd + 1, cBufferAdd, cbBody);
                    }

                aBuffer[ofAdd] = bufHead;

                long cb        = bufHead.remaining() + cbBody;
                long cbUnacked = cbBody;

                if (fRecycleHead)
                    {
                    // we don't count recycled against unacked
                    aoReceipt[ofAdd] = RECEIPT_HEADER_RECYCLE;
                    }
                else
                    {
                    cbUnacked += bufHead.remaining();
                    }

                // mark each of the append buffers so that we can reset to initial position
                // in case we need to retransmit due to migration
                for (int of = ofAdd, eOf = ofAdd + cBufferAdd; of < eOf; ++of)
                    {
                    aBuffer[of].mark();
                    }

                if (receipt == null && m_cbBatch == 0 &&
                    cBufferAdd > 1) // cBufferAdd > 1 ensures it is not a control message; SYNCs shouldn't be counted and we don't want receipts for receipts
                    {
                    // in order to prevent the resend queue from growing endlessly we ensure that we
                    // have periodic acks by injecting artificial receipts at most once per batch.
                    // this becomes a real receipt from the protocol perspective, but it will never be emitted
                    // to the user
                    receipt = RECEIPT_NO_EMIT;
                    }

                if (receipt == null)
                    {
                    // mark message boundaries with artificial receipts, unlike RECEIPT_NO_EMIT these
                    // are not real from a protocol perspective
                    aoReceipt[ofAdd + cBufferAdd - 1] = fRecycleHead && cBufferAdd == 1
                            ? RECEIPT_MSG_MARKER_HEADER_RECYCLE // combo receipt indicating msg and recycling
                            : RECEIPT_MSG_MARKER;
                    }
                else if (cBufferAdd == 1)
                    {
                    if (receipt == RECEIPT_ACKREQ_MARKER || receipt == RECEIPT_ACKREQ_MARKER_RECYCLE)
                        {
                        aoReceipt[ofAdd] = receipt;
                        }
                    else
                        {
                        // we don't have a way to encode this. There is also no way for a user to cause this
                        // it could only be the result of a bug in the bus
                        throw new IllegalStateException();
                        }
                    // else; it is an ACK with a marker; do nothing
                    }
                else
                    {
                    aoReceipt[ofAdd + cBufferAdd - 1] = receipt;
                    ++m_cReceiptsUnflushed;
                    }

                m_ofAdd = ofAdd + cBufferAdd;

                BufferedConnection.this.f_cBytesUnacked.addAndGet(cbUnacked);

                return m_cbBatch += cb;
                }

            /**
             * Append the next batch into this WriteBatch.
             *
             * @return the new batch size
             */
            public long bundle()
                {
                WriteBatch batchSrc   = m_next;
                int        ofAckSrc   = batchSrc.m_ofAck;
                int        ofSndSrc   = batchSrc.m_ofSend;
                int        ofSrc      = Math.min(ofAckSrc, ofSndSrc); // ofSend can be < ofAck
                int        cBufferSrc = batchSrc.m_ofAdd - ofSrc;
                int        ofAdd      = ensureAdditionalBufferCapacity(cBufferSrc);

                System.arraycopy(batchSrc.m_aBuffer,  ofSrc, m_aBuffer,  ofAdd, cBufferSrc);
                System.arraycopy(batchSrc.m_aReceipt, ofSrc, m_aReceipt, ofAdd, cBufferSrc);

                m_ofAdd = ofAdd + cBufferSrc;

                WriteBatch batchNext = batchSrc.m_next;
                if (batchNext != null)
                    {
                    m_next = batchNext;
                    }
                // else; avoid invalidating the tail

                if (ofAckSrc > ofSndSrc) // the source batch is partially ack'd; thus this batch is fully ack'd
                    {
                    m_ofAck += (ofAckSrc - ofSndSrc);
                    }

                long cbBatch = m_cbBatch += batchSrc.getLength();

                // NOTE: f_cBytesUnacked and m_cReceiptsUnflushed don't need to be updated as it had already been
                // accounted for when producing the source batch

                // make the original batch unusable
                batchSrc.m_aReceipt = batchSrc.m_aBuffer = EMPTY_BUFFER_ARRAY;
                batchSrc.m_ofAck    = batchSrc.m_ofSend  = batchSrc.m_ofAdd = 0;
                batchSrc.m_cbBatch  = 0;
                // we don't null out batchSrc.m_next as there could still be head pointers referencing it

                return cbBatch;
                }

            /**
             * Attempt to write the batch to the connection.  When called from the client thread synchronization
             * must be held since the corresponding ack could come in and be processed on the SS thread before
             * this call returns.
             *
             * @return true iff the entire batch has been written
             *
             * @throws IOException if an I/O error occurs
             */
            public boolean write()
                throws IOException
                {
                ByteBuffer[] aBuffer = m_aBuffer;
                int          ofSend  = m_ofSend;
                int          ofAdd   = m_ofAdd;

                long cb = BufferedConnection.this.write(aBuffer, ofSend, ofAdd - ofSend);

                // advance offset, decrement cBuffer based on amount written
                if (cb > 0)
                    {
                    for (; ofSend < ofAdd && !aBuffer[ofSend].hasRemaining(); ++ofSend)
                        {}

                    m_cbBatch -= cb;
                    m_ofSend   = ofSend;
                    }

                return ofSend == ofAdd;
                }

            /**
             * Ack some the contents of the batch up through the next receipt.
             *
             * This method will emit the receipt(s) as well.
             *
             * @param cReceipts the maximum number of receipts to consume
             * @param aoReceipt array which will be filled with receipts to emit
             *
             * @return the number of receipts copied into aoReceipt
             */
            public int ack(int cReceipts, Object[] aoReceipt)
                {
                BufferManager manager   = getSocketDriver().getDependencies().getBufferManager();
                int           cMsg      = 0;
                ByteBuffer[]  aBuff     = m_aBuffer;
                Object[]      aReceipt  = m_aReceipt;
                int           ofSend    = m_ofSend;
                int           ofAdd     = m_ofAdd;
                int           ofAck     = m_ofAck;
                int           ofReceipt = 0;

                while (ofAck < ofAdd && cReceipts > 0)
                    {
                    Object     oReceipt = aReceipt[ofAck];
                    ByteBuffer buff     = aBuff[ofAck];

                    aBuff[ofAck]    = null;
                    aReceipt[ofAck] = null;

                    if (ofAck >= ofSend)
                        {
                        // we've received an ACK for something we haven't sent yet.  This can only legally happen
                        // after a connection migration, which means that we haven't *re*sent it yet, but our peer
                        // received our original transmission, we just didn't receive the ack.  The ack we're processing
                        // is in response to our original send, and there will *not* be any ack when we resend this
                        // message.  While it would be nice to simply not send the message, we can't do that as we've
                        // already told our peer (indirectly via SYNC) how many messages we'll be resending, so we
                        // must send that many.  This batch already resides in the write queue but we can't retain these
                        // buffers as once they are recycled or a receipt is emitted their contents could be changed and
                        // thus we would be sending garbage to our peer, if we send garbage as a message length then
                        // the peer can't properly skip messages.  We could alternately substitute alternate lengths in
                        // but then we'd have much more bookkeeping to do especially for messages which were partially sent.

                        ByteBuffer buffNew = ByteBuffer.allocate(buff.remaining()); // normal GCable garbage
                        buffNew.put(buff).flip();
                        aBuff[ofAck] = buffNew;
                        }

                    ++ofAck;

                    if (oReceipt == RECEIPT_HEADER_RECYCLE)
                        {
                        manager.release(buff);
                        }
                    else if (oReceipt == RECEIPT_MSG_MARKER_HEADER_RECYCLE ||
                             oReceipt == RECEIPT_ACKREQ_MARKER_RECYCLE)
                        {
                        ++cMsg;
                        manager.release(buff);
                        }
                    else if (oReceipt == RECEIPT_MSG_MARKER ||
                             oReceipt == RECEIPT_ACKREQ_MARKER)
                        {
                        ++cMsg;
                        }
                    else if (oReceipt != null) // real receipt
                        {
                        ++cMsg;
                        --cReceipts;
                        aoReceipt[ofReceipt++] = oReceipt;
                        if (ofReceipt == aoReceipt.length)
                            {
                            break;
                            }
                        }
                    }

                m_ofAck             = ofAck;
                m_cMsgOutDelivered += cMsg;

                return ofReceipt;
                }

            /**
             * Rewind the state of the batch such that any previously {@link #write sent}, but @{link #ack unacked}
             * messages can be resent.
             *
             * @return the number of bytes which were rescheduled
             */
            public long rewind()
                {
                int          ofAck   = m_ofAck;
                int          ofSend  = m_ofSend;
                ByteBuffer[] aBuffer = m_aBuffer;
                long         cbDelta = 0;

                if (ofSend < ofAck) // rare; only if we do a quick double migration
                    {
                    // we don't need to resend these since they've been acked before we rewound
                    // but we do need to adjust our batch size accordingly
                    for (; ofSend < ofAck; ++ofSend)
                        {
                        cbDelta -= aBuffer[ofSend].remaining();
                        }
                    }
                m_ofSend = ofAck; // our new send position

                for (int ofAdd = m_ofAdd; ofAck < ofAdd; ++ofAck)
                    {
                    ByteBuffer buff = aBuffer[ofAck];
                    cbDelta -= buff.remaining();
                    buff.reset(); // see append
                    cbDelta += buff.remaining();
                    }

                m_cbBatch += cbDelta;

                return cbDelta;
                }

            /**
             * Ensure the buffer array has enough capacity to add the specified
             * number of buffers.
             *
             * @param cBufferAdd  the number of buffers that will be added
             *
             * @return the index at which to add
             */
            protected int ensureAdditionalBufferCapacity(int cBufferAdd)
                {
                ByteBuffer[] aBuffer     = m_aBuffer;
                Object[]     aReceipt    = m_aReceipt;
                int          ce          = aBuffer.length;
                int          ofSrc       = Math.min(m_ofAck, m_ofSend);
                int          ofAdd       = m_ofAdd;
                int          cBufferUsed = ofAdd - ofSrc;

                if (cBufferUsed + cBufferAdd > ce)
                    {
                    // reallocate and shift the buffer array
                    ByteBuffer[] aBufferNew = new ByteBuffer[(ce + cBufferAdd) * 2];
                    System.arraycopy(aBuffer, ofSrc, aBufferNew, 0, cBufferUsed);

                    Object[] aReceiptNew = new Object[aBufferNew.length];
                    System.arraycopy(aReceipt, ofSrc, aReceiptNew, 0, cBufferUsed);

                    m_aBuffer  = aBufferNew;
                    m_aReceipt = aReceiptNew;

                    m_ofAck   -= ofSrc;
                    m_ofSend  -= ofSrc;

                    ofAdd = m_ofAdd -= ofSrc;
                    }
                else if (ofSrc + cBufferUsed + cBufferAdd > ce)
                    {
                    // we have enough space, but need to shift the buffers
                    System.arraycopy(aBuffer, ofSrc, aBuffer, 0, cBufferUsed);
                    Arrays.fill(aBuffer, cBufferUsed, ce, null);

                    System.arraycopy(aReceipt, ofSrc, aReceipt, 0, cBufferUsed);
                    Arrays.fill(aReceipt, cBufferUsed, ce, null);

                    m_ofAck  -= ofSrc;
                    m_ofSend -= ofSrc;

                    ofAdd = m_ofAdd -= ofSrc;
                    }
                // else; nothing to do

                return ofAdd;
                }

            /**
             * Return the next batch in the queue
             *
             * @return the next batch
             */
            public WriteBatch next()
                {
                return m_next;
                }

            /**
             * Return true iff there are subsequent batches in the queue.
             *
             * @return true iff there are subsequent batches in the queue.
             */
            public boolean hasNext()
                {
                return m_next != null;
                }

            /**
             * The total number of bytes remaining to write in the batch.
             */
            protected long m_cbBatch;

            /**
             * The ByteBuffer array.
             */
            protected ByteBuffer[] m_aBuffer = new ByteBuffer[16];

            /**
             * An array of potentially null receipts corresponding to the buffers in m_aBuffer.
             */
            protected Object[] m_aReceipt = new Object[m_aBuffer.length];

            /**
             * The next free buffer
             */
            protected int m_ofAdd;

            /**
             * The next offset to process while sending
             */
            protected int m_ofSend;

            /**
             * The next buffer to process when we receive an ack
             */
            protected int m_ofAck;

            /**
             * The next batch in the write queue.
             */
            protected volatile WriteBatch m_next;
            }

        // ----- data members -------------------------------------------

        /**
         * The tail of the write queue.
         *
         * This is only accessed while synchronized on the connection.
         */
        protected WriteBatch m_batchWriteTail = new WriteBatch(/*fLink*/ false);

        /**
         * The send pointer (head) into the write queue.
         *
         * This is only accessed by the SS thread associated with the connection's channel.
         */
        protected WriteBatch m_batchWriteSendHead = m_batchWriteTail;

        /**
         * The resend pointer (head) into the write queue.
         *
         * This is only accessed by the SS thread associated with the connection's channel.
         *
         * Note that this queue has two heads and while generally the resend head is the true
         * head, in some cases the send head can actually come before it.  This can occur for
         * a brief period after a connection migration as the peer resends ACKs for messages
         * which we've yet to resend, thus allowing us to move the resend head behind the
         * send head.
         */
        protected WriteBatch m_batchWriteResendHead = m_batchWriteTail;

        /**
         * The number of unsent bytes in the write queue.
         *
         * Note this atomic also acks as the primary write barrier for the queue.  Producers will first insert
         * into the queue and only then update f_cbQueued, ensuring that the consumer can see the updated value
         * presuming that it has first checked the size.  Note that producers themselves will be sync'd on the
         * connection which will ensure visibility and consistency when updating the tail.
         */
        protected final AtomicLong f_cbQueued = new AtomicLong();

        /**
         * The value of m_cbWrite at the last heartbeat cycle
         */
        protected long m_cbHeartbeatLast;

        /**
         * True if heartbeat has identified this connection as idle and an empty receipt should be sent
         */
        protected boolean m_fIdle;

        /**
         * The auto-flush threshold.
         */
        protected long m_cbAutoFlushThreshold;

        /**
         * The force ack threshold.
         */
        protected long m_cbForceAckThreshold;

        /**
         * The excessive backlog threashold.
         */
        protected long m_cbBacklogExcessiveThreshold;

        /**
         * The last interest ops for this channel.
         */
        protected int m_nInterestOpsLast;

        /**
         * The current unflushed WriteBatch, or null.
         *
         * Unflushed means that its size hasn't been added for f_cbQueued and it is still usable by application threads.
         * 
         * This field is volatile for use by isFlushRequired and isReceiptFlushRequired methods
         */
        protected volatile WriteBatch m_batchWriteUnflushed = m_batchWriteTail;

        /**
        * An estimate of the number of threads which contributed to the current batch.
        */
        protected int m_cWritersBatch;

        /**
         * Bit set used to identify threads contributing to the current batch
         */
        protected long m_lWritersBatchBitSet;

        /**
         * True if in the backlog state, false otherwise.
         */
        protected boolean m_fBacklog;

        /**
         * True if a backlog check has been scheduled.
         */
        protected final AtomicBoolean m_fBacklogScheduled = new AtomicBoolean();

        /**
         * The number of receipts associated with the unflushed WriteBatch.
         */
        protected int m_cReceiptsUnflushed;

        /**
         * The number of receipts to return to the peer on our next send.
         */
        protected final AtomicInteger m_cReceiptsReturn = new AtomicInteger();

        /**
         * As estimate of the number of unacked bytes since the last received ack or sent forced flush.
         *
         * TODO: now that we have a resend queue we could choose to accurately track this
         */
        protected final AtomicLong f_cBytesUnacked = new AtomicLong();

        /**
         * ByteBuffer to write Receipt messages
         */
        protected ByteBuffer m_bufferRecycleOutboundReceipts;

        /**
         * The number of threads waiting to use the connection.
         */
        protected final AtomicInteger m_cWritersWaiting = new AtomicInteger();

        /**
         * The total number of messages (including control messages) received from our peer.
         */
        protected long m_cMsgIn;

        /**
         * The number of upcomming inbound messages to skip (due to migration).
         */
        protected long m_cMsgInSkip;

        /**
         * The total number of receipts emitted.
         */
        protected long m_cReceiptsEmitted;

        /**
         * The total number of messages (including control messages) which have been confirmed as delivered to our peer.
         */
        protected long m_cMsgOutDelivered;

        /**
         * The number of bytes read as of last health check.
         */
        protected long m_cbReadLastCheck;

        /**
         * The number of bytes written as of last health check.
         */
        protected long m_cbWriteLastCheck;

        /**
         * timestamp at which the current pending ack times out
         */
        protected long m_ldtAckTimeout;

        /**
         * The time at which the health check must force a heartbeat
         */
        protected long m_ldtForceHeartbeat;

        /**
         * The id of the receipt of the last message which failed the ack health check
         */
        protected int m_nIdUnackLast;

        /**
         * The number of times the specific last receipt triggered a health check timeout.
         */
        protected int m_cUnackLast;

        /**
         * timestamp at which the current pending ack fatally times out
         */
        protected long m_ldtAckFatalTimeout;

        /**
         * Reusable array for holding receipts.
         */
        protected final Object[] f_aoReceiptTmp = new Object[16];
        }


    // ----- constants ------------------------------------------------------

    /**
     * The number of migrations on the same message before collecting a heap dump.
     */
    protected static final int MIGRATION_LIMIT_BEFORE_DUMP = 4;

    /**
     * Internal message type for exchanging receipts.
     */
    protected static final byte MSG_RECEIPT = 1;

    /**
     * Internal message type for syncing a connection (after a migration)
     */
    protected static final byte MSG_SYNC = 2;

    /**
     * Perform no additional commands upon syncing
     */
    protected static final byte SYNC_CMD_NONE = 0;

    /**
     * Collect a heap dump upon syncing.
     */
    protected static final byte SYNC_CMD_DUMP = 1;

    /**
     * For messages which don't include their own receipt, this dummy marker is used, allowing us
     * to identify message boundaries within a batch.  This receipt is not reflected on the wire
     * it is only used to identify message boundaries.
     *
     * Not reflected on the wire.
     */
    protected static final Object RECEIPT_MSG_MARKER = new Object();

    /**
     * Used to tag header fields as being recycleable.  As these are headers they would never have a receipt of their own.
     *
     * Not reflected on the wire.
     */
    protected static final Object RECEIPT_HEADER_RECYCLE = new Object();

    /**
     * A combination of RECEIPT_MSG_MARKER and RECEIPT_HEADER_RECYCLE for single buffer recyclable messages.
     *
     * Not reflected on the wire.
     */
    protected static final Object RECEIPT_MSG_MARKER_HEADER_RECYCLE = new Object();

    /**
     * Used to tag ack request messages.
     *
     * Not reflected on the wire.
     */
    protected static final Object RECEIPT_ACKREQ_MARKER = new Object();

    /**
     * Used to tag ack request messages which should be recycled.
     *
     * Not reflected on the wire.
     */
    protected static final Object RECEIPT_ACKREQ_MARKER_RECYCLE = new Object();

    /**
     * A dummy over-the-wire receipt, i.e. we will request a receipt from our peer, but we will not emit it to
     * our application when it comes in.  This is used in the case that the application doesn't use receipts
     * or doesn't use them often, and we want to avoid having the resend queue grow too large.
     *
     * This receipt *is* reflected on the wire, but never emitted to the user.
     */
    protected static final Object RECEIPT_NO_EMIT = new Object();

    /**
     * Empty buffer array for use in bundling.
     */
    protected static final ByteBuffer[] EMPTY_BUFFER_ARRAY = new ByteBuffer[0];
    }