All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.summingbird.scalding.batch.BatchedSink.scala Maven / Gradle / Ivy

The newest version!
/*
 Copyright 2013 Twitter, Inc.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package com.twitter.summingbird.scalding.batch

import com.twitter.algebird.monad.{ StateWithError, Reader }
import com.twitter.algebird.{ Interval, Intersection, InclusiveLower, ExclusiveUpper, InclusiveUpper }
import com.twitter.summingbird.batch.{ BatchID, Batcher, Timestamp }
import com.twitter.summingbird.scalding._
import com.twitter.scalding.Mode
import cascading.flow.FlowDef

/**
 * A [[Sink]] that stores its stream one batch at a time. Batches whose stream
 * is already materialized are read back through `readStream`; the remaining
 * ones are pulled from the incoming factory and persisted through `writeStream`.
 */
trait BatchedSink[T] extends Sink[T] {
  /** Batcher used to translate each batch into the time interval it covers. */
  def batcher: Batcher

  /**
   * Look up the stored stream for a single batch.
   *
   * @return Some(flow) when the complete stream of `batchID` has already been
   *         materialized, None otherwise
   */
  def readStream(batchID: BatchID, mode: Mode): Option[FlowToPipe[T]]

  /**
   * Hook through which implementations persist the stream of one batch.
   * What is written here is what `readStream` is expected to hand back later.
   */
  def writeStream(batchID: BatchID, stream: TimedPipe[T])(implicit flowDef: FlowDef, mode: Mode): Unit

  /**
   * Wrap `in` so that, when the resulting reader is run, every batch of `inter`
   * is written out as a side effect. The caller guarantees that `in` fully
   * covers those batches. The pipe produced by `in` is returned unchanged.
   */
  protected def writeBatches(inter: Interval[BatchID], in: FlowToPipe[T]): FlowToPipe[T] =
    Reader[FlowInput, TimedPipe[T]] { (flowMode: (FlowDef, Mode)) =>
      val (flowDef, mode) = flowMode
      val batches = BatchID.toIterable(inter)
      val source = in(flowMode)

      // TODO (https://github.com/twitter/summingbird/issues/92): a
      // version of template tap is needed here.

      // One write per batch, each restricted to the records whose timestamp
      // lies inside that batch's interval.
      for (batch <- batches) {
        val batchSpan = batcher.toInterval(batch)
        val onlyThisBatch = source.filter { case (time, _) => batchSpan(time) }
        writeStream(batch, onlyThisBatch)(flowDef, mode)
      }
      source
    }

  /**
   * Build a factory that serves the requested time span from this sink.
   *
   * The leading run of batches that `readStream` reports as materialized is
   * reused as-is. Everything from the first missing batch up to the last
   * missing batch is read from `incoming` and written through `writeBatches`.
   * If reading from `incoming` fails, the error is only reported when there is
   * no materialized batch to fall back on; otherwise the materialized batches
   * alone are returned.
   */
  final def write(incoming: PipeFactory[T]): PipeFactory[T] =
    StateWithError({ request: FactoryInput =>
      val (timeSpan, mode) = request
      // Bundles the common scalding batching operations for our batcher.
      val batchOps = new BatchedOperations(batcher)

      // Every batch covering the requested span, paired with its stored stream (if any).
      val batchStreams = batchOps.coverIt(timeSpan).map { id => (id, readStream(id, mode)) }

      // Split at the first batch that has no stored stream.
      val (materialized, pending) = batchStreams.span { case (_, stored) => stored.isDefined }

      // This data is already on disk and will not be recomputed.
      val existing = materialized.collect { case (id, Some(flow)) => (id, flow) }

      // Maybe an inclusive range of batches that must come from incoming.
      val pendingIds = pending.map { case (id, _) => id }.toList
      val batchesToWrite: Option[(BatchID, BatchID)] =
        if (pendingIds.isEmpty) None
        else Some((pendingIds.min, pendingIds.max))

      val newlyWritten = batchesToWrite.map {
        case (first, last) =>
          // Half-open batch interval [first, last.next) to request from incoming.
          val wanted = Interval.leftClosedRightOpen(first, last.next)
          val fetched = batchOps.readBatched(wanted, mode, incoming)
          fetched.right.map {
            case (covered, pipe) => (covered, writeBatches(covered, pipe))
          }
      }

      // Combine the materialized flows with the freshly built one (when present)
      // and clamp the result to the time range that is actually available.
      def mergeWithExisting(optBuilt: Option[(Interval[BatchID], FlowToPipe[T])]): Try[((Interval[Timestamp], Mode), FlowToPipe[T])] = {
        val (storedBatches, storedFlows) = existing.unzip
        val flows = storedFlows ++ (optBuilt.map { case (_, flow) => flow })
        val batches = storedBatches ++ (optBuilt.map { case (covered, _) => BatchID.toIterable(covered) }.getOrElse(Iterable.empty))

        if (flows.nonEmpty) {
          // If this get ever throws it is a static bug, i.e. one that does not
          // depend on the input.
          val available = batchOps.intersect(batches, timeSpan).get
          val union = flows.reduce(Scalding.merge(_, _))
          Right(((available, mode), Scalding.limitTimes(available, union)))
        } else {
          Left(List("Zero batches requested, should never occur: " + timeSpan.toString))
        }
      }

      newlyWritten match {
        case Some(Right(built)) => mergeWithExisting(Some(built))
        // Nothing on disk to fall back on: surface the read failure.
        case Some(Left(err)) if existing.isEmpty => Left(err)
        // Either nothing needed writing, or the read failed but stored data exists.
        case Some(Left(_)) | None => mergeWithExisting(None)
      }
    })
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy