All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.summingbird.scalding.VersionedState.scala Maven / Gradle / Ivy

There is a newer version: 0.11.0-RC1
Show newest version
/*
 Copyright 2013 Twitter, Inc.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package com.twitter.summingbird.scalding

import com.twitter.algebird.{
  InclusiveUpper,
  Intersection,
  Interval,
  ExclusiveUpper
}
import com.twitter.summingbird.batch.{
  Batcher,
  BatchID,
  PrepareState,
  RunningState,
  Timestamp,
  WaitingState
}
import com.twitter.summingbird.batch.store.HDFSMetadata

import org.slf4j.LoggerFactory

import scala.util.{ Try => ScalaTry, Success, Failure }

/**
 * State representation used by the builder API for compatibility.
 */
private[scalding] object VersionedState {
  def apply(meta: HDFSMetadata, startDate: Option[Timestamp], maxBatches: Int)(implicit batcher: Batcher): VersionedState =
    new VersionedState(meta, startDate, maxBatches)
}

private[scalding] class VersionedState(meta: HDFSMetadata, startDate: Option[Timestamp], maxBatches: Int)(implicit batcher: Batcher) extends WaitingState[Interval[Timestamp]] { outer =>

  private val logger = LoggerFactory.getLogger(classOf[VersionedState])

  def begin: PrepareState[Interval[Timestamp]] = new VersionedPrepareState

  private class VersionedPrepareState extends PrepareState[Interval[Timestamp]] {
    def newestCompleted: Option[BatchID] =
      meta.versions.map { vers =>
        val thisMeta = meta(vers)
        thisMeta
          .get[String]
          .flatMap { str => ScalaTry(BatchID(str)) } match {
            case Success(batchID) => Some(batchID)
            case Failure(ex) =>
              logger.warn("Path: {} missing or corrupt completion file. Ignoring and trying previous",
                thisMeta.path)
              None
          }
      }
        .flatten
        .headOption
    /**
     * Returns a date interval spanning from the beginning of the the
     * batch stored in the most recent metadata file to the current
     * time.
     */
    lazy val requested: Interval[Timestamp] = {
      val beginning: BatchID =
        startDate.map(batcher.batchOf(_))
          .orElse(newestCompleted)
          .getOrElse { sys.error("You must supply a starting date on the job's first run!") }

      val end = beginning + maxBatches
      Interval.leftClosedRightOpen(
        batcher.earliestTimeOf(beginning),
        batcher.earliestTimeOf(end)
      ).right.get
    }

    def willAccept(available: Interval[Timestamp]) =
      available match {
        case intr @ Intersection(_, _) => // is finite:
          Right(new VersionedRunningState(intr))
        case _ => {
          logger.info("Will not accept: %s".format(available))
          Left(outer)
        }
      }
    /**
     * failure has no side effect, since any job writing to a
     * VersionedStore should itself be cleaning up the most recent
     * failed version's data.
     */
    def fail(err: Throwable) = {
      logger.error("Prepare failed", err)
      throw err
    }
  }

  private class VersionedRunningState(succeedPart: Interval.GenIntersection[Timestamp])
      extends RunningState[Interval[Timestamp]] {

    def nextTime: Timestamp = succeedPart match {
      case Intersection(_, ExclusiveUpper(up)) => up
      case Intersection(_, InclusiveUpper(up)) => up.next
      case _ => sys.error("We should always be running for a finite interval")
    }
    def batchID: BatchID = batcher.batchOf(nextTime)
    /**
     * Commit the new maximum completed interval to the backing
     * HDFSMetadata instance.
     */
    def succeed = {
      logger.info("%s succeeded".format(batchID))
      meta.mostRecentVersion.foreach(_.put(Some(batchID.toString)))
      outer
    }

    /**
     * failure has no side effect, since any job writing to a
     * VersionedStore should itself be cleaning up the most recent
     * failed version's data.
     */
    def fail(err: Throwable) = {
      logger.error("%s failed".format(batchID), err)
      throw err
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy