org.apache.spark.streaming.eventhubs.checkpoint.DirectDStreamProgressTracker.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.streaming.eventhubs.checkpoint

import scala.collection.mutable.ListBuffer
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.spark.eventhubs.common.{ NameAndPartition, EventHubsConnector, OffsetRecord }
import org.apache.spark.eventhubs.common.progress.ProgressTrackerBase

/**
 * EventHubs uses an offset to indicate the start point of each receiver, and uses the number of
 * messages for rate control; these are described by the offset and the sequence number
 * respectively. As a result, we have to build this class to translate the sequence number to the
 * offset at which the next batch should start. The basic idea is that the tasks running on the
 * executors write the offset of the last message to HDFS, and we gather those files into a
 * progress tracking point for a certain batch (see the usage sketch after this class).
 *
 * @param progressDir the directory of checkpoint files
 * @param appName the name of the Spark application
 * @param hadoopConfiguration the hadoop configuration instance
 */
private[spark] class DirectDStreamProgressTracker private[spark] (
    progressDir: String,
    appName: String,
    hadoopConfiguration: Configuration)
    extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {

  // the lock synchronizing the read and commit operations, since they are executed in the driver
  // and listener threads respectively
  private val driverLock = new Object

  override def eventHubNameAndPartitions: Map[String, List[NameAndPartition]] = {
    DirectDStreamProgressTracker.registeredConnectors.map { connector =>
      (connector.uid, connector.namesAndPartitions)
    }.toMap
  }

  private def initProgressFileDirectory(): Unit = {
    try {
      val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
      val checkpointDirExisted = fs.exists(progressDirectoryPath)
      if (checkpointDirExisted) {
        val (validationPass, latestFile) = validateProgressFile(fs)
        if (!validationPass) {
          if (latestFile.isDefined) {
            logWarning(s"latest progress file ${latestFile.get} is corrupt, rolling back...")
            fs.delete(latestFile.get, true)
          }
        }
      } else {
        fs.mkdirs(progressDirectoryPath)
      }
    } catch {
      case ex: Exception =>
        ex.printStackTrace()
        throw ex
    }
  }

  private def initTempProgressFileDirectory(): Unit = {
    try {
      val fs = tempDirectoryPath.getFileSystem(hadoopConfiguration)
      val checkpointTempDirExisted = fs.exists(tempDirectoryPath)
      if (checkpointTempDirExisted) {
        fs.delete(tempDirectoryPath, true)
        logInfo(s"cleanup temp checkpoint $tempDirectoryPath")
      }
      fs.mkdirs(tempDirectoryPath)
    } catch {
      case ex: Exception =>
        ex.printStackTrace()
        throw ex
    }
  }

  private def initMetadataDirectory(): Unit = {
    try {
      val fs = metadataDirectoryPath.getFileSystem(hadoopConfiguration)
      val checkpointMetadataDirExists = fs.exists(metadataDirectoryPath)
      if (!checkpointMetadataDirExists) {
        fs.mkdirs(metadataDirectoryPath)
      }
    } catch {
      case ex: Exception =>
        ex.printStackTrace()
        throw ex
    }
  }

  /**
   * called when the ProgressTracker is referenced for the first time, including when recovering
   * from the Spark Streaming checkpoint; initializes the progress, temp and metadata directories
   */
  override def init(): Unit = {
    initProgressFileDirectory()
    initTempProgressFileDirectory()
    initMetadataDirectory()
  }

  /**
   * read the progress record for the specified namespace and timestamp
   */
  override def read(namespace: String, timestamp: Long, fallBack: Boolean): OffsetRecord =
    driverLock.synchronized {
      super.read(namespace, timestamp, fallBack)
    }

  def close(): Unit = {}

  // called in EventHubDirectDStream's clearCheckpointData method
  override def cleanProgressFile(timestampToClean: Long): Unit = driverLock.synchronized {
    val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
    // clean progress directory
    // NOTE: due to SPARK-19280 (https://issues.apache.org/jira/browse/SPARK-19280)
    // we have to disable cleanup thread
    /*
    val allUselessFiles = fs.listStatus(progressDirectoryPath, new PathFilter {
      override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= checkpointTime
    }).map(_.getPath)
    val sortedFileList = allUselessFiles.sortWith((p1, p2) => fromPathToTimestamp(p1) >
      fromPathToTimestamp(p2))
    if (sortedFileList.nonEmpty) {
      sortedFileList.tail.foreach { filePath =>
        logInfo(s"delete $filePath")
        fs.delete(filePath, true)
      }
    }
     */
    // clean temp directory: list all temp progress files stamped at or before timestampToClean,
    // keep only the most recent group and delete the rest
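    // Worked example (illustrative numbers, not from the original file): with temp files stamped
    // 1000 and 2000 and timestampToClean = 2000, both groups are listed below; after sorting the
    // groups by timestamp in descending order, the head group (2000) is kept and only the files
    // stamped 1000 are deleted, so the latest eligible progress always survives.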
    val allUselessTempFiles = fs
      .listStatus(tempDirectoryPath, new PathFilter {
        override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
      })
      .map(_.getPath)
    if (allUselessTempFiles.nonEmpty) {
      allUselessTempFiles
        .groupBy(fromPathToTimestamp)
        .toList
        .sortWith((p1, p2) => p1._1 > p2._1)
        .tail
        .flatMap(_._2)
        .foreach { filePath =>
          logInfo(s"delete $filePath")
          fs.delete(filePath, true)
        }
    }
  }

  /**
   * commit offsetToCommit to a new progress tracking file
   */
  override def commit(offsetToCommit: Map[String, Map[NameAndPartition, (Long, Long)]],
                      commitTime: Long): Unit = driverLock.synchronized {
    super.commit(offsetToCommit, commitTime)
  }
}
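
// A minimal usage sketch (not part of the original file): it assumes a hypothetical
// `myConnector` implementing EventHubsConnector, a reachable HDFS progress directory, and
// placeholder values for the batch time `t` and the committed (offset, sequence number) pair.
//
//   DirectDStreamProgressTracker.registeredConnectors += myConnector
//   val tracker = DirectDStreamProgressTracker.initInstance(
//     "hdfs://namenode:8020/eventhubs-progress", "myApp", new Configuration())
//   // driver thread: read the offsets from which the next batch should start
//   val record: OffsetRecord = tracker.read("myNamespace", t, fallBack = false)
//   // listener thread: commit the (offset, sequence number) reached by the finished batch
//   tracker.commit(Map("myNamespace" -> Map(
//     NameAndPartition("myEventHub", 0) -> (lastOffset, lastSeqNum))), t)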

object DirectDStreamProgressTracker {

  val registeredConnectors = new ListBuffer[EventHubsConnector]

  private var _progressTracker: DirectDStreamProgressTracker = _

  private[spark] def reset(): Unit = {
    registeredConnectors.clear()
    // guard against reset() being called before initInstance(), which would otherwise NPE here
    if (_progressTracker != null) {
      _progressTracker.metadataCleanupFuture.cancel(true)
      _progressTracker = null
    }
  }

  def getInstance: ProgressTrackerBase[_ <: EventHubsConnector] = _progressTracker

  // should only be used for testing
  private[streaming] def setProgressTracker(progressTracker: DirectDStreamProgressTracker): Unit = {
    _progressTracker = progressTracker
  }

  private[spark] def initInstance(
      progressDirStr: String,
      appName: String,
      hadoopConfiguration: Configuration): ProgressTrackerBase[_ <: EventHubsConnector] = {
    this.synchronized {
      // DirectDStream shall have a singleton progress tracker
      if (_progressTracker == null) {
        _progressTracker =
          new DirectDStreamProgressTracker(progressDirStr, appName, hadoopConfiguration)
      }
      _progressTracker.init()
    }
    _progressTracker
  }
}
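
// A sketch of the singleton contract above (hypothetical driver-side code; the directory,
// application name and Configuration are placeholders):
//
//   val t1 = DirectDStreamProgressTracker.initInstance(
//     "hdfs://namenode:8020/eventhubs-progress", "myApp", new Configuration())
//   val t2 = DirectDStreamProgressTracker.initInstance(
//     "hdfs://namenode:8020/eventhubs-progress", "myApp", new Configuration())
//   assert(t1 eq t2)                       // repeated calls reuse the same tracker instance
//   DirectDStreamProgressTracker.reset()   // e.g. between test cases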



