com.twitter.finagle.mux.transport.MuxFramer.scala

package com.twitter.finagle.mux.transport

import com.twitter.concurrent.{AsyncQueue, Broker, Offer}
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.finagle.transport.Transport
import com.twitter.finagle.util.{BufReader, BufWriter}
import com.twitter.finagle.{Failure, Status}
import com.twitter.io.Buf
import com.twitter.util.{Future, NonFatal, Promise, Time, Throw, Return}
import java.net.SocketAddress
import java.security.cert.Certificate
import java.util.concurrent.atomic.AtomicInteger

/**
 * Defines a [[com.twitter.finagle.transport.Transport]] which allows a
 * mux session to be shared between multiple tag streams. The transport splits
 * mux messages into fragments with a size defined by a parameter. Writes are
 * then interleaved to achieve equity and goodput over the entire stream.
 * Fragments are aggregated into complete mux messages when read. The fragment size
 * is negotiated when a mux session is initialized.
 *
 * @see [[com.twitter.finagle.mux.Handshake]] for usage details.
 *
 * @note Our current implementation does not offer any mechanism to resize
 * the window after a session is established. However, it is possible to
 * compose a flow-control algorithm on top of this transport which can
 * dynamically control the size of the write window (`writeWindowBytes`).
 */
private[finagle] object MuxFramer {
  /**
   * Defines mux framer keys and values exchanged as part of a
   * mux session header during initialization.
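   *
   * For example (a sketch; the concrete headers exchanged are negotiated via
   * [[com.twitter.finagle.mux.Handshake]]), a peer advertising a 32 KiB
   * maximum frame size would contribute the header entry:
   * {{{
   * Header.KeyBuf -> Header.encodeFrameSize(32 * 1024)
   * }}}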
   */
  object Header {
    val KeyBuf: Buf = Buf.Utf8("mux-framer")

    /**
     * Returns a header value with the given frame `size` encoded.
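     *
     * A round-trip sketch (the size used here is arbitrary; the value is
     * written as a 4-byte big-endian integer and read back by
     * `decodeFrameSize` below):
     * {{{
     * val encoded = Header.encodeFrameSize(0x00010203)
     * // encoded holds the four bytes 0x00, 0x01, 0x02, 0x03
     * assert(Header.decodeFrameSize(encoded) == 0x00010203)
     * }}}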
     */
    def encodeFrameSize(size: Int): Buf = {
      require(size > 0)
      val bw = BufWriter.fixed(4)
      bw.writeIntBE(size)
      bw.owned()
    }

    /**
     * Extracts frame size from the `buf`.
     */
    def decodeFrameSize(buf: Buf): Int = {
      val size = BufReader(buf).readIntBE()
      require(size > 0)
      size
    }
  }

  /**
   * Represents a tag stream while writing fragments. To avoid unnecessary allocations
   * `FragmentStream` carries some mutable state. In particular, `fragments` is a mutable
   * iterator and its contents should not be written concurrently.
   */
  case class FragmentStream(
    tag: Int,
    fragments: Iterator[Buf],
    writePromise: Promise[Unit])

  /**
   * Represents an interrupt for a stream.
   */
  case class Interrupt(tag: Int, exc: Throwable)

  /**
   * Creates a new [[Transport]] which fragments writes into `writeWindowBytes`
   * sized payloads and defragments reads into mux [[Message]]s.
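   *
   * A usage sketch (`trans` and `statsReceiver` are assumed to exist
   * already; the 32 KiB window is an arbitrary example value):
   * {{{
   * val framed: Transport[Message, Message] =
   *   MuxFramer(trans, Some(32 * 1024), statsReceiver)
   * }}}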
   *
   * @param writeWindowBytes messages larger than this value are fragmented on
   * write. If the value is not defined, writes are proxied to the underlying
   * transport. However, the transport is always prepared to read fragments.
   */
  def apply(
    trans: Transport[Buf, Buf],
    writeWindowBytes: Option[Int],
    statsReceiver: StatsReceiver
  ): Transport[Message, Message] = new Transport[Message, Message] {
    require(writeWindowBytes.isEmpty || writeWindowBytes.exists(_ > 0),
      s"writeWindowBytes must be positive: $writeWindowBytes")

    // stats for both read and write paths
    private[this] val pendingWriteStreams, pendingReadStreams = new AtomicInteger(0)
    private[this] val writeStreamBytes = statsReceiver.stat("write_stream_bytes")
    private[this] val readStreamBytes = statsReceiver.stat("read_stream_bytes")
    private[this] val gauges = Seq(
      statsReceiver.addGauge("pending_write_streams") { pendingWriteStreams.get },
      statsReceiver.addGauge("pending_read_streams") { pendingReadStreams.get },
      statsReceiver.addGauge("write_window_bytes") {
        writeWindowBytes match {
          case Some(bytes) => bytes.toFloat
          case None => -1F
        }
      }
    )

    /**
     * Returns an iterator over the fragments of `msg`. Each fragment is sized to
     * be <= `maxSize`. Note, this should not be iterated over on more than one
     * thread at a time.
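     *
     * A sketch of the expected slicing (assuming a 10-byte `msg.buf` and
     * `maxSize = 4`):
     * {{{
     * val frames = fragment(msg, maxSize = 4)
     * // yields three frames: each is the 4-byte fragment header followed by
     * // the payload slices [0, 4), [4, 8) and [8, 10); only the last frame
     * // has the tag MSB cleared in its header.
     * }}}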
     */
    private[this] def fragment(msg: Message, maxSize: Int): Iterator[Buf] =
      if (msg.buf.length <= maxSize) {
        Iterator.single(Message.encode(msg))
      } else new Iterator[Buf] {
        // Create a mux header with the tag MSB set to 1. This signifies
        // that the message is part of a set of fragments. Note that the
        // decoder needs to respect this and not attempt to decode fragments
        // past their headers.
        private[this] val header: Array[Byte] = {
          val tag = Message.Tags.setMsb(msg.tag)
          Array[Byte](msg.typ,
            (tag >> 16 & 0xff).toByte,
            (tag >> 8 & 0xff).toByte,
            (tag & 0xff).toByte
          )
        }

        private[this] val headerBuf = Buf.ByteArray.Owned(header)
        private[this] val buf = msg.buf
        private[this] val readable = buf.length

        @volatile private[this] var read = 0

        def hasNext: Boolean = read < readable

        def next(): Buf = {
          if (!hasNext) throw new NoSuchElementException

          if (readable - read <= maxSize) {
            // Toggle the tag MSB in the header region which signifies
            // the end of the sequence. Note, our header is denormalized
            // across 4 bytes.
            header(1) = (header(1) ^ (1 << 7)).toByte
          }
          // ensure we don't slice past the end of the msg.buf
          val frameLength = math.min(readable - read, maxSize)
          // note that the size of a frame is implicitly prepended by the transport's
          // pipeline and is derived from the readable bytes.
          val b = headerBuf.concat(buf.slice(from = read, until = read + frameLength))
          read += frameLength
          b
        }
      }

    // Queues incoming streams which are dequeued and
    // flushed in `writeLoop`.
    private[this] val writeq = new Broker[FragmentStream]

    // Kicks off a new writeLoop with an incoming stream.
    private[this] val newWriteLoop: Offer[Unit] = writeq.recv.map { stream =>
      writeLoop(Seq(stream))
    }

    // Communicates interrupts for outstanding streams.
    private[this] val interrupts = new Broker[Interrupt]

    // This is lifted out of writeLoop to avoid a closure. Technically, we could
    // inline `Offer.const(writeLoop)` in the loop, but this makes it difficult to
    // feign concurrency over a single thread because of how the LocalScheduler is
    // implemented.
    private[this] val unitOffer = Offer.const(())

    /**
     * Write fragments from `streams` recursively. Each iteration, a layer
     * from `streams` is written to the transport, effectively load balancing
     * across all streams to ensure a diverse and equitable session. New streams
     * join the `writeLoop` via the `writeq` broker and are interrupted via the
     * `interrupts` broker.
     *
     * @note The order in which we iterate over `streams` (and thus write to
     * the transport) isn't strictly guaranteed and can change in the presence
     * of interrupts, for example.
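     *
     * A sketch of one possible interleaving (ordering within a round is not
     * guaranteed, per the note above):
     * {{{
     * // streams: A = [a1, a2, a3], B = [b1, b2]
     * // round 1 writes a1 and b1; round 2 writes a2 and b2; round 3 writes
     * // a3 and completes B's write promise; round 4 completes A's promise,
     * // after which the loop parks on `writeq` awaiting a new stream.
     * }}}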
     */
    private[this] def writeLoop(streams: Seq[FragmentStream]): Future[Unit] =
      if (streams.isEmpty) newWriteLoop.sync() else {
        val round = streams.foldLeft[Seq[Future[FragmentStream]]](Nil) {
          case (writes, s@FragmentStream(_, fragments, writep)) if fragments.hasNext =>
            val buf = fragments.next()
            writeStreamBytes.add(buf.length)
            val write = trans.write(buf).transform {
              case Return(_) => Future.value(s)
              case exc@Throw(_) =>
                // `streams` should only contain streams where the write
                // promise is not complete because interrupted streams are
                // filtered out before entering `writeLoop`.
                writep.update(exc)
                Future.value(s)
            }
            write +: writes
          case (writes, FragmentStream(_, _, writep)) =>
            // We have completed the stream. It's not possible for
            // `writep` to be complete since interrupted streams are
            // guaranteed to be filtered out of `streams`.
            writep.update(Return.Unit)
            writes
        }
        // Note, we don't need to `collectToTry` here because `round` always
        // completes successfully. Failures to write per-stream are encoded in
        // the stream's `writePromise`.
        Future.collect(round).flatMap { nextStreams =>
          // After each round, we choose between the following cases
          // (note, if more than one offer is available we choose the
          // first available w.r.t. the argument order):
          Offer.prioritize[Unit](
            // 1. Remove a stream which has been interrupted. We interrupt first
            // to allow a backup in `writeq` to be drained on interrupts. Note that
            // an interrupt before an element reaches the writeq is possible and
            // handled in `write`.
            interrupts.recv.map { case Interrupt(tag, exc) =>
              writeLoop(nextStreams.foldLeft[Seq[FragmentStream]](Nil) {
                case (ss, FragmentStream(`tag`, _, writep)) =>
                  writep.update(Throw(exc))
                  ss
                case (ss, s) => s +: ss
              })
            },
            // 2. Add an incoming stream.
            writeq.recv.map { s => writeLoop(s +: nextStreams) },
            // 3. Dispatch another round of writes.
            unitOffer.map { _ => writeLoop(nextStreams) }
          ).sync()
        }
      }

    // kick off the loop.
    newWriteLoop.sync()

    def write(msg: Message): Future[Unit] =
      if (writeWindowBytes.isEmpty) {
        trans.write(Message.encode(msg))
      } else msg match {
        // The sender of a Tdispatch has indicated it is no longer
        // interested in the request, in which case, we need to make
        // sure the Tdispatch is removed from the writeLoop if it
        // exists.
        case m@Message.Tdiscarded(tag, why) =>
          val intr = interrupts ! Interrupt(tag, Failure(why))
          intr.before { trans.write(Message.encode(m)) }

        case m: Message =>
          val p = new Promise[Unit]
          p.setInterruptHandler { case NonFatal(exc) =>
            // if an Rdispatch stream is interrupted, we send the
            // receiver an `Rdiscarded` so they can safely relinquish
            // any outstanding fragments and we remove the pending stream
            // from our `writeLoop`. Note, `Tdiscarded` is handled above.
            if (m.typ == Message.Types.Rdispatch) {
              val intr = interrupts ! Interrupt(m.tag, exc)
              // We make sure to interrupt before sending the Rdiscarded
              // so we can sequence the discard relative to fragments sitting
              // in the writeLoop.
              intr.before { trans.write(Message.encode(Message.Rdiscarded(m.tag))) }
            }
          }
          pendingWriteStreams.incrementAndGet()
          // There is no upper bound on writeq and elements can only
          // be removed via interrupts. However, the underlying transport,
          // which is the resource backing the writeq, can be bounded.
          val nq = writeq ! FragmentStream(m.tag, fragment(m, writeWindowBytes.get), p)
          nq.before(p).ensure {
            pendingWriteStreams.decrementAndGet()
          }
      }

    /**
     * Stores fully aggregated mux messages that were read from `trans`.
     */
    private[this] val readq = new AsyncQueue[Message]

    /**
     * The `readLoop` is responsible for demuxing and defragmenting tag streams.
     * If we read a discard for a stream we are currently processing, the
     * corresponding entry is removed from `tags`.
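     *
     * A sketch of the bookkeeping (hypothetical tags 3 and 5; "F" marks a
     * frame whose tag MSB is set):
     * {{{
     * // incoming frames: 3F, 5F, 3F, 5
     * // `tags` after each read: {3}, then {3, 5}, then {3, 5}, then {3}
     * // the final, MSB-clear frame for tag 5 flushes its reassembled
     * // message to `readq`.
     * }}}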
     */
    private[this] def readLoop(tags: Map[Int, Buf]): Future[Unit] =
      trans.read().flatMap { buf =>
        readStreamBytes.add(buf.length)
        val br = BufReader(buf)
        val header = br.readIntBE()
        val typ = Message.Tags.extractType(header)
        val tag = Message.Tags.extractTag(header)

        val isFragment = Message.Tags.isFragment(tag)

        // Normalize the tag by setting its MSB so that every frame of a
        // stream maps to the same key, regardless of the fragment bit.
        val t = Message.Tags.setMsb(tag)

        // Either a transmitter or a receiver can discard a stream.
        val discard = typ == Message.Types.BAD_Tdiscarded ||
          typ == Message.Types.Rdiscarded

        // We only want to intercept discards in this loop if we
        // are processing the stream.
        val nextTags = if (discard && tags.contains(t)) {
          tags - t
        } else if (isFragment) {
          // Append the fragment to the respective `tag` in `tags`.
          // Note, we don't reset the reader index because we want
          // to consume the header for fragments.
          tags.updated(t, tags.get(t) match {
            case Some(buf0) => buf0.concat(br.readAll())
            case None => br.readAll()
          })
        } else {
          // If the fragment bit isn't set, the `buf` is either
          // a fully buffered message or the last fragment for `tag`.
          // We distinguish between the two by checking for the presence
          // of `tag` in `tags`.
          val resBuf = if (!tags.contains(t)) buf else {
            val head = buf.slice(0, 4)
            val rest = tags(t)
            val last = buf.slice(4, buf.length)
            head.concat(rest).concat(last)
          }
          readq.offer(Message.decode(resBuf))
          tags - t
        }
        pendingReadStreams.set(nextTags.size)
        readLoop(nextTags)
      }

    // Failures are pushed to the readq and propagated to the
    // layers above.
    readLoop(Map.empty).onFailure { exc => readq.fail(exc) }

    def read(): Future[Message] = readq.poll()

    def status: Status = trans.status
    val onClose: Future[Throwable] = trans.onClose
    def localAddress: SocketAddress = trans.localAddress
    def remoteAddress: SocketAddress = trans.remoteAddress
    def peerCertificate: Option[Certificate] = trans.peerCertificate
    def close(deadline: Time): Future[Unit] = trans.close(deadline)
  }
}