org.apache.spark.api.csharp.CSharpRDD.scala

// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

package org.apache.spark.api.csharp

import java.io._
import java.nio.channels.{FileChannel, FileLock, OverlappingFileLockException}
import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}

import org.apache.spark.api.python.{PythonBroadcast, PythonRDD}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{TaskContext, Partition, Accumulator, SparkContext}
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.util.csharp.{Utils => CSharpUtils}

/**
 * RDD used to fork an external C# process and pipe data in and out between the
 * JVM and the CLR. Since PythonRDD already provides the required implementation,
 * this class simply extends it without overriding any behavior for now.
 */
class CSharpRDD(
    @transient parent: RDD[_],
    command: Array[Byte],
    envVars: JMap[String, String],
    cSharpIncludes: JList[String],
    preservePartitioning: Boolean,
    cSharpWorkerExecutable: String,
    unUsedVersionIdentifier: String,
    broadcastVars: JList[Broadcast[PythonBroadcast]],
    accumulator: Accumulator[JList[Array[Byte]]])
  extends PythonRDD(parent, command, envVars, cSharpIncludes, preservePartitioning,
    cSharpWorkerExecutable, unUsedVersionIdentifier, broadcastVars, accumulator) {

  override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
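    // make sure any zip archives sitting next to cSharpWorker.exe are extracted
    // before the worker process is launched for this split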
    unzip(new File(cSharpWorkerExecutable).getAbsoluteFile.getParentFile)
    super.compute(split, context)
  }

  /**
   * Uncompress all zip files under directory cSharpWorkerWorkingDir.
   * Since .zip files can be submitted via sparkclr-submit.cmd and may contain runtime
   * dependencies of cSharpWorker.exe, all zip files are extracted before
   * cSharpWorker.exe starts executing.
   *
   * One executor might process multiple splits; if the zip files were already unzipped
   * for a previous split, there is no need to unzip them again. Once uncompression is
   * done, a flag file "doneFlag" is created.
   * @param cSharpWorkerWorkingDir directory where cSharpWorker.exe is located
   */
  private def unzip(cSharpWorkerWorkingDir: File): Unit = {

    val files = cSharpWorkerWorkingDir.list.filter(_.toLowerCase.endsWith(".zip"))

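    // Three coordination files, all created in the worker's directory: the lock file
    // guards the unzip critical section across processes, the unzipping flag marks an
    // extraction in progress, and the done flag marks completion.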
    val lockName = "_unzip_lock"
    val unzippingFlagName = "_unzipping"
    val doneFlagName = "_unzip_done"

    if (files.isEmpty) {
      logWarning("Found no zip files.")
      return
    } else {
      logInfo("Found zip files: " + files.mkString(","))
    }

    val doneFlag = new File(cSharpWorkerWorkingDir, doneFlagName)

    // check whether all zip files have already been uncompressed
    if (doneFlag.exists()) {
      logInfo("Already unzipped all zip files, skip.")
      return
    }

    val unzippingFlag = new File(cSharpWorkerWorkingDir, unzippingFlagName)

    // if another thread is uncompressing the files, the current thread just needs
    // to wait until the operation is done, then return
    if (unzippingFlag.exists()) {
      waitUnzipOperationDone(doneFlag)
      return
    }

    val lockFile = new File(cSharpWorkerWorkingDir, lockName)
    var file: RandomAccessFile = null
    var lock: FileLock = null
    var channel: FileChannel = null

    try {
      file = new RandomAccessFile(lockFile, "rw")
      channel = file.getChannel()
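      // tryLock is non-blocking: it returns null when another process already holds
      // the lock, and throws OverlappingFileLockException when another thread in
      // this JVM holds it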
      lock = channel.tryLock()

      if (lock == null) {
        logWarning("Failed to obtain lock for file " + lockFile.getPath)
        waitUnzipOperationDone(doneFlag)
        return
      }

      // check again whether the uncompression has already been done
      if (new File(cSharpWorkerWorkingDir, doneFlagName).exists()) {
        return
      }

      // the unzippingFlag file is deleted before the lock is released, so if the lock
      // was obtained successfully, the unzippingFlag cannot still exist
      unzippingFlag.createNewFile()

      // unzip each archive into the working directory
      for (zipFile <- files) {
        CSharpUtils.unzip(new File(cSharpWorkerWorkingDir, zipFile), cSharpWorkerWorkingDir)
        logInfo("Unzipped file: " + zipFile)
      }

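      // create the done flag before deleting the in-progress flag, so waiting
      // threads never observe a state where neither flag exists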
      doneFlag.createNewFile()
      unzippingFlag.delete()
      logInfo("Unzip done.")

    } catch {
      case e: OverlappingFileLockException =>
        logInfo("Already obtained the lock.")
        waitUnzipOperationDone(doneFlag)
      case e: Exception => logError("Exception while unzipping files.", e)
    }
    finally {
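      // release the lock explicitly; closing the channel would release it as well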
      if (lock != null && lock.isValid()) lock.release()
      if (channel != null && channel.isOpen) channel.close()
      if (file != null) file.close()
    }
  }

  /**
   * Wait until the doneFlag file is created, or until the total waiting time exceeds the threshold.
   * @param doneFlag flag file whose existence signals that the unzip operation has completed
   */
  private def waitUnzipOperationDone(doneFlag: File): Unit = {
    val maxSleepTimeInSeconds = 30 // max total wait time before giving up
    val intervalInSeconds = 5 // polling interval
    var sleptInSeconds = 0

    // poll for the done flag, sleeping between checks
    while (!doneFlag.exists()) {
      if (sleptInSeconds > maxSleepTimeInSeconds) {
        return
      }

      sleptInSeconds += intervalInSeconds
      Thread.sleep(intervalInSeconds * 1000)
    }
  }
}

object CSharpRDD {
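  /**
   * Create a JavaRDD of serialized (byte array) elements by distributing an
   * in-memory array across numSlices partitions; intended to let the CLR side
   * parallelize locally collected data.
   */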
  def createRDDFromArray(
      sc: SparkContext,
      arr: Array[Array[Byte]],
      numSlices: Int): JavaRDD[Array[Byte]] = {
    JavaRDD.fromRDD(sc.parallelize(arr, numSlices))
  }
}