org.apache.spark.streaming.datasource.receiver.DatasourceDStream.scala

/**
 * Copyright (C) 2015 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.Logging
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.datasource.config.ConfigParameters._
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.InputSentences
import org.apache.spark.streaming.datasource.receiver.DatasourceDStream._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.scheduler.rate.RateEstimator
import org.apache.spark.streaming.scheduler.{Job, RateController, StreamInputInfo}
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

private[streaming]
class DatasourceDStream[C <: SQLContext](
                                          @transient val _ssc: StreamingContext,
                                          val inputSentences: InputSentences,
                                          val datasourceParams: Map[String, String],
                                          val sqlContext: C
                                        ) extends InputDStream[Row](_ssc) with Logging with ParametersUtils {

  private[streaming] override def name: String = s"Datasource stream [$id]"

  storageLevel = calculateStorageLevel()

  inputSentences.initialStatements.foreach(statement => sqlContext.sql(statement))

  /**
 * The minimum storage level is MEMORY_ONLY, because the compute function of one RDD can be called from more than one place
   */
  private[streaming] def calculateStorageLevel(): StorageLevel = {
    val levelFromParams = getStorageLevel(datasourceParams)
    if (levelFromParams == StorageLevel.NONE) {
      log.warn("NONE is not a valid storage level for this datasource receiver, setting it to MEMORY_ONLY")
      StorageLevel.MEMORY_ONLY
    } else levelFromParams
  }

  /**
   * Remember duration for the RDDs created by this InputDStream. If the user does not set it,
   * DefaultMinRememberDuration (60s) is rounded up to a whole number of slide durations.
   */
  private val userRememberDuration = getRememberDuration(datasourceParams)
  private val stopGracefully = getStopGracefully(datasourceParams)
  private val stopSparkContext = getStopSparkContext(datasourceParams)


  userRememberDuration match {
    case Some(duration) =>
      remember(Seconds(duration))
    case None =>
      val minRememberDuration = Seconds(ssc.conf.getTimeAsSeconds(
        ssc.conf.get("spark.streaming.minRememberDuration", DefaultMinRememberDuration), DefaultMinRememberDuration))
      val numBatchesToRemember = math.ceil(minRememberDuration.milliseconds.toDouble / slideDuration.milliseconds).toInt

      remember(slideDuration * numBatchesToRemember)
  }
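  // Example: with the default 60s minimum and a 25s slide duration,
  // numBatchesToRemember = ceil(60 / 25) = 3, so RDDs are remembered for 75s.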

  /**
   * Calculate the max number of records that the receiver must receive and process in one batch when
   * backpressure is enabled
   */
  private[streaming] def maxRecords(): Option[(Int, Long)] = {
    val estimatedRateLimit = rateController.map(_.getLatestRate().toInt)
    estimatedRateLimit.flatMap(estimatedRateLimit => {
      if (estimatedRateLimit > 0) {
        val recordsRateController = ((slideDuration.milliseconds.toDouble / 1000) * estimatedRateLimit).toLong

        Option((estimatedRateLimit, getMaxRecordsWithRate(recordsRateController)))
      } else None
    })
  }
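  // Example: with a 2000 ms slide duration and an estimated rate of 500 records/s,
  // recordsRateController = (2000 / 1000) * 500 = 1000 records allowed for the next batch.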

  /**
   * @return the max number of records that the input RDD must receive in the next window
   */
  private[streaming] def getMaxRecordsWithRate(recordsRateController: Long): Long = {
    inputSentences.offsetConditions.fold(recordsRateController) { offsetConditions =>
      offsetConditions.limitRecords.fold(recordsRateController) { maxRecordsPartition =>
        Math.min(recordsRateController, maxRecordsPartition)
      }
    }
  }
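  // Example: if the rate controller allows 1000 records but offsetConditions.limitRecords is 800,
  // the effective limit is min(1000, 800) = 800; with no offset conditions or limit, 1000 is kept.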

  override def compute(validTime: Time): Option[DatasourceRDD] = {

    if (!computingStopped) {
      // Calculate the max number of records for this batch and the metadata that will be reported to the InputInfoTracker
      val maxRecordsCalculation = maxRecords().map { case (estimated, newLimitRecords) =>
        val description =
          s"LimitRecords : ${
            inputSentences.offsetConditions.fold("") {
              _.limitRecords.fold("") {
                _.toString
              }
            }
          }\t:" + s" Estimated: $estimated\t NewLimitRecords: $newLimitRecords"

        (StreamInputInfo.METADATA_KEY_DESCRIPTION -> description, newLimitRecords)
      }
      val metadata = Map("InputSentences" -> inputSentences) ++
        maxRecordsCalculation.fold(Map.empty[String, Any]) { case (description, _) => Map(description) }
      val inputSentencesCalculated = progressInputSentences.getOrElse(inputSentences)
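      // Override limitRecords in the offset conditions with the backpressure-derived maximum for this batch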
      val inputSentencesLimited = maxRecordsCalculation.fold(inputSentencesCalculated) { case (_, maxMessages) =>
        inputSentencesCalculated.copy(offsetConditions = inputSentencesCalculated.offsetConditions.map(conditions =>
          conditions.copy(limitRecords = Option(maxMessages))))
      }
      val datasourceRDD = new DatasourceRDD(sqlContext, inputSentencesLimited, datasourceParams)
      val rddSize = datasourceRDD.count()

      // Publish the record count and metadata of this batch in the Spark UI
      ssc.scheduler.inputInfoTracker.reportInfo(validTime, StreamInputInfo(id, rddSize, metadata))

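      // Keep the offsets advanced by this RDD so the next batch continues where this one ended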
      progressInputSentences = Option(datasourceRDD.progressInputSentences)

      if (rddSize == 0 && inputSentences.stopConditions.fold(false) {_.stopWhenEmpty})
        computingStopped = true

      Option(datasourceRDD)
    } else None
  }

  override private[streaming] def generateJob(time: Time): Option[Job] = {
    getOrCompute(time) match {
      case Some(rdd) =>
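        // The job only iterates the RDD with a no-op function, forcing its evaluation for this batch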
        val jobFunc = () => {
          val emptyFunc = { (iterator: Iterator[Row]) => {} }
          context.sparkContext.runJob(rdd, emptyFunc)
        }
        
        Some(new Job(time, jobFunc))
      case None =>
        if (computingStopped &&
          inputSentences.stopConditions.isDefined &&
          inputSentences.stopConditions.get.finishContextWhenEmpty
        )
          _ssc.stop(stopSparkContext, stopGracefully)

        None
    }
  }

  override def start(): Unit = {}

  override def stop(): Unit = {}

  /**
   * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker.
   */
  override protected[streaming] val rateController: Option[RateController] = {
    if (RateController.isBackPressureEnabled(ssc.conf))
      Option(new DatasourceRateController(id, RateEstimator.create(ssc.conf, context.graph.batchDuration)))
    else None
  }

  /**
   * A RateController to retrieve the rate from RateEstimator.
   */
  private[streaming] class DatasourceRateController(id: Int, estimator: RateEstimator)
    extends RateController(id, estimator) {

    override def publish(rate: Long): Unit = {
      ssc.scheduler.receiverTracker.sendRateUpdate(id, rate)
    }
  }

}

private[streaming] object DatasourceDStream {

  /**
   * Last offsets calculated by the previous batch, used as the starting point for the next one
   */
  var progressInputSentences: Option[InputSentences] = None

  /**
   * Set to true when a batch is empty and the stop conditions indicate that computation must stop
   */
  var computingStopped: Boolean = false
}
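
/*
 * Usage sketch (illustrative only, not part of the original source). The class is private[streaming],
 * so it is normally instantiated from code living in an org.apache.spark.streaming package. The
 * parameter keys read through ParametersUtils (storage level, remember duration, stop flags) are
 * defined in ConfigParameters and are not shown here, so the empty map is only a placeholder and
 * the InputSentences construction is assumed rather than taken from this file:
 *
 *   val ssc = new StreamingContext(sparkConf, Seconds(10))
 *   val sqlContext = SQLContext.getOrCreate(ssc.sparkContext)
 *   val sentences: InputSentences = ???   // query, offset conditions and initial statements
 *   val stream = new DatasourceDStream(ssc, sentences, Map.empty[String, String], sqlContext)
 *   stream.foreachRDD(rdd => rdd.collect().foreach(println))
 *   ssc.start()
 *   ssc.awaitTermination()
 */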
